data_kitten 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/LICENSE.md +20 -0
- data/README.md +73 -0
- data/bin/data_kitten +22 -0
- data/lib/data_kitten.rb +43 -0
- data/lib/data_kitten/agent.rb +38 -0
- data/lib/data_kitten/dataset.rb +227 -0
- data/lib/data_kitten/distribution.rb +156 -0
- data/lib/data_kitten/distribution_format.rb +73 -0
- data/lib/data_kitten/hosts.rb +23 -0
- data/lib/data_kitten/hosts/bitbucket.rb +54 -0
- data/lib/data_kitten/hosts/gist.rb +50 -0
- data/lib/data_kitten/hosts/github.rb +54 -0
- data/lib/data_kitten/license.rb +39 -0
- data/lib/data_kitten/origins.rb +28 -0
- data/lib/data_kitten/origins/git.rb +66 -0
- data/lib/data_kitten/origins/html.rb +32 -0
- data/lib/data_kitten/origins/linked_data.rb +37 -0
- data/lib/data_kitten/origins/web_service.rb +30 -0
- data/lib/data_kitten/publishing_formats.rb +28 -0
- data/lib/data_kitten/publishing_formats/ckan.rb +187 -0
- data/lib/data_kitten/publishing_formats/datapackage.rb +169 -0
- data/lib/data_kitten/publishing_formats/linked_data.rb +102 -0
- data/lib/data_kitten/publishing_formats/rdfa.rb +239 -0
- data/lib/data_kitten/rights.rb +80 -0
- data/lib/data_kitten/source.rb +31 -0
- data/lib/data_kitten/temporal.rb +27 -0
- data/lib/data_kitten/version.rb +3 -0
- metadata +242 -0
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'data_kitten/origins/git'
|
2
|
+
require 'data_kitten/origins/web_service'
|
3
|
+
require 'data_kitten/origins/html'
|
4
|
+
require 'data_kitten/origins/linked_data'
|
5
|
+
|
6
|
+
module DataKitten

  # Origin detection. Supplies the private +detect_origin+ hook that mixes the
  # matching origin module into the receiver.
  module Origins

    private

    # Extends the receiver with the first origin module whose +supported?+
    # check accepts +@access_url+.
    #
    # Candidates are probed in the order listed below and probing stops at the
    # first match, so later candidates are never asked once one has accepted.
    #
    # @return [Module, nil] the origin module that was mixed in, or +nil+ when
    #   no candidate accepted the URL
    def detect_origin
      origin = [
        DataKitten::Origins::Git,
        DataKitten::Origins::HTML,
        DataKitten::Origins::WebService,
        DataKitten::Origins::LinkedData
      ].find { |candidate| candidate.supported?(@access_url) }

      extend origin if origin
      origin
    end

  end

end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module DataKitten

  module Origins

    # Git origin module. Automatically mixed into {Dataset} for datasets that are loaded from Git repositories.
    #
    # @see Dataset
    #
    module Git

      # Whether the URI looks like a Git repository: a +git://+, +http://+ or
      # +https://+ URL ending in +.git+.
      #
      # This is a module-level method, so a preceding +private+ keyword would
      # not apply to it; it is deliberately public because origin detection
      # calls it from outside.
      #
      # @param uri [#to_s] the access URL to test
      # @return [Boolean]
      def self.supported?(uri)
        # \z (not \Z) so that a URL followed by a newline is rejected.
        !!(uri.to_s =~ /\A(git|https?):\/\/.*\.git\z/)
      end

      # The origin type of the dataset.
      # @return [Symbol] +:git+
      # @see Dataset#origin
      def origin
        :git
      end

      # A history of changes to the Dataset, taken from the git changelog.
      #
      # NOTE(review): the number of commits returned is whatever
      # +repository.log+ yields by default - confirm whether that is the
      # complete history.
      #
      # @return [Array] the commits yielded by the repository log
      # @see Dataset#change_history
      def change_history
        @change_history ||= repository.log.map { |commit| commit }
      end

      protected

      # Reads a file from the working copy.
      #
      # @param path [String] path relative to the root of the working copy
      # @return [String] the file contents
      def load_file(path)
        # Make sure we have a working copy
        repository
        # read file
        File.read(File.join(working_copy_path, path))
      end

      private

      # Location of the local working copy for +@access_url+. Creates the
      # shared holding directory (+tmp/repositories+, three levels above this
      # file) if it does not exist yet.
      #
      # @return [String] the working copy directory, named after the access
      #   URL with every "/" replaced by "-"
      def working_copy_path
        holding_dir = File.join(File.dirname(__FILE__), '..', '..', '..', 'tmp', 'repositories')
        # Create holding directory
        FileUtils.mkdir_p(holding_dir)
        # generate working copy dir
        File.join(holding_dir, @access_url.gsub('/', '-'))
      end

      # The repository backing this dataset, memoised per instance.
      #
      # Opens the existing working copy and pulls +origin/master+; if opening
      # raises +ArgumentError+ (presumably because there is no working copy
      # yet - confirm against the git library), the repository is cloned
      # instead.
      #
      # NOTE(review): the branch name "master" is hard-coded.
      def repository
        @repository ||= begin
          repo = ::Git.open(working_copy_path)
          repo.pull("origin", "master")
          repo
        rescue ArgumentError
          ::Git.clone(@access_url, working_copy_path)
        end
      end

    end

  end

end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module DataKitten

  module Origins

    # HTML origin module. Automatically mixed into {Dataset} for datasets whose access URL is served as HTML.
    #
    # @see Dataset
    #
    module HTML

      # Whether the resource at +uri+ is served with a +text/html+ content type.
      #
      # Performs an HTTP GET. Any error raised while fetching (or a response
      # without a content type) is treated as "not supported" rather than
      # propagated, because this is only a best-effort probe.
      #
      # @param uri [String] the access URL to test
      # @return [Boolean]
      def self.supported?(uri)
        content_type = RestClient.get(uri).headers[:content_type]
        !!(content_type =~ /text\/html/)
      rescue StandardError
        false
      end

      # The origin type of the dataset.
      # @return [Symbol] +:html+
      # @see Dataset#origin
      def origin
        :html
      end

    end

  end

end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module DataKitten

  module Origins

    # Linked Data origin module. Automatically mixed into {Dataset} for datasets whose access URL is served
    # with a content type known to +RDF::Format+.
    #
    # @see Dataset
    #
    module LinkedData

      # Whether the resource at +uri+ is served with an RDF content type.
      #
      # Performs an HTTP HEAD request and looks the media type up in
      # +RDF::Format.content_types+. Parameters after ";" are ignored, and the
      # media type is trimmed and lower-cased before the lookup because media
      # types are case-insensitive. Any error raised while probing is treated
      # as "not supported".
      #
      # @param uri [String] the access URL to test
      # @return [Boolean]
      def self.supported?(uri)
        content_type = RestClient.head(uri).headers[:content_type]
        return false unless content_type

        media_type = content_type.split(";").first.to_s.strip.downcase
        RDF::Format.content_types.key?(media_type)
      rescue StandardError
        false
      end

      # The origin type of the dataset.
      # @return [Symbol] +:linkeddata+
      # @see Dataset#origin
      def origin
        :linkeddata
      end

    end

  end

end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module DataKitten

  module Origins

    # Web service origin module. Intended to be mixed into {Dataset} for datasets that are accessed through an API.
    #
    # @see Dataset
    #
    module WebService

      # Whether the URI is a web service. Detection is not implemented: this
      # always answers +false+, whatever the argument.
      #
      # This is a module-level method, so a preceding +private+ keyword would
      # not apply to it; it is public because origin detection calls it from
      # outside.
      #
      # @param uri [String] the access URL to test (currently ignored)
      # @return [Boolean] always +false+
      def self.supported?(uri)
        false
      end

      # The origin type of the dataset.
      # @return [Symbol] +:web_service+
      # @see Dataset#origin
      def origin
        :web_service
      end

    end

  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'data_kitten/publishing_formats/datapackage'
|
2
|
+
require 'data_kitten/publishing_formats/rdfa'
|
3
|
+
require 'data_kitten/publishing_formats/linked_data'
|
4
|
+
require 'data_kitten/publishing_formats/ckan'
|
5
|
+
|
6
|
+
module DataKitten

  # Publishing format detection. Supplies the private
  # +detect_publishing_format+ hook that mixes the matching format module into
  # the receiver.
  module PublishingFormats

    private

    # Extends the receiver with the first publishing format module whose
    # +supported?+ check accepts it.
    #
    # Each candidate is handed the receiver itself. Candidates are probed in
    # the order listed below and probing stops at the first match.
    #
    # @return [Module, nil] the format module that was mixed in, or +nil+ when
    #   no candidate accepted the receiver
    def detect_publishing_format
      format = [
        DataKitten::PublishingFormats::Datapackage,
        DataKitten::PublishingFormats::CKAN,
        DataKitten::PublishingFormats::RDFa,
        DataKitten::PublishingFormats::LinkedData
      ].find { |candidate| candidate.supported?(self) }

      extend format if format
      format
    end

  end

end
|
@@ -0,0 +1,187 @@
|
|
1
|
+
module DataKitten

  module PublishingFormats

    # CKAN metadata format module. Mixed into a dataset by publishing format
    # detection when {CKAN.supported?} accepts it.
    #
    # The package metadata fetched during detection is stored on the dataset
    # instance itself, so separate datasets never see each other's metadata.
    #
    # @see Dataset
    #
    module CKAN

      # Matches a UUID anywhere in a string. A path segment containing one is
      # taken to be a CKAN package ID.
      UUID_PATTERN = /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/

      # Root URL (scheme, host and any non-default port) that the CKAN API
      # paths are appended to.
      #
      # @param uri [URI::Generic]
      # @return [String] e.g. +"http://example.org"+ or +"http://localhost:5000"+
      def self.base_url(uri)
        root = "#{uri.scheme}://#{uri.host}"
        explicit_port = uri.port && uri.port != uri.default_port
        explicit_port ? "#{root}:#{uri.port}" : root
      end

      # Whether the dataset is published through CKAN.
      #
      # The last path segment of +instance.uri+ is used as the package
      # identifier. If it contains a UUID it is used as the ID directly;
      # otherwise it is resolved through the CKAN API. The package metadata is
      # then fetched and stored on +instance+ for the accessors below. Any
      # failure along the way means "not supported".
      #
      # @param instance [#uri] the dataset being probed
      # @return [Boolean]
      def self.supported?(instance)
        uri = URI(instance.uri)
        base = base_url(uri)
        package = uri.path.split("/").last

        # If the package is a UUID - it's more than likely to be a CKAN ID
        id = package.match(UUID_PATTERN) ? package : lookup_package_id(base, package)

        metadata = JSON.parse(RestClient.get("#{base}/api/rest/package/#{id}"))
        instance.instance_variable_set(:@ckan_metadata, metadata)
        !!metadata
      rescue StandardError
        false
      end

      # Resolves a package name to its ID, trying the v3 action API first and
      # falling back to the v2 REST API when that request fails.
      #
      # @param base [String] root URL as built by {base_url}
      # @param package [String] the package name taken from the dataset URI
      # @return [String, nil] the package ID, if the response carried one
      def self.lookup_package_id(base, package)
        response = begin
          RestClient.get "#{base}/api/3/action/package_show", {:params => {:id => package}}
        rescue StandardError
          ""
        end
        response = RestClient.get("#{base}/api/2/rest/dataset/#{package}") if response == ""

        result = JSON.parse(response)
        # v3 nests the package under "result"; v2 returns it at the top level.
        (result["result"] || result)["id"]
      end
      private_class_method :lookup_package_id

      # The publishing format for the dataset.
      # @return [Symbol] +:ckan+
      # @see Dataset#publishing_format
      def publishing_format
        :ckan
      end

      # The human-readable title of the dataset.
      #
      # @return [String, nil] +nil+ when no metadata is available
      # @see Dataset#data_title
      def data_title
        metadata["title"]
      rescue StandardError
        nil
      end

      # A brief description of the dataset
      #
      # @return [String, nil] +nil+ when no metadata is available
      # @see Dataset#description
      def description
        metadata["notes"]
      rescue StandardError
        nil
      end

      # Keywords for the dataset
      #
      # @return [Array] a copy of the package tags, or +[]+ when unavailable
      # @see Dataset#keywords
      def keywords
        metadata["tags"].map { |tag| tag }
      rescue StandardError
        []
      end

      # A list of publishers.
      #
      # Uses the package's organization when it has one and otherwise falls
      # back to the first entry of its groups.
      #
      # @return [Array<Agent>] +[]+ when no publisher can be determined
      # @see Dataset#publishers
      def publishers
        # An absent organization must not raise, or the groups fallback
        # would never be reached.
        organization = metadata['organization'] || {}
        id = organization['id'] || metadata['groups'][0]
        fetch_publisher(id)
      rescue StandardError
        []
      end

      # A list of licenses.
      #
      # @return [Array<License>] +[]+ when no metadata is available
      # @see Dataset#licenses
      def licenses
        uri = metadata["license_url"] || ckan_extra("licence_url")
        name = metadata["license_title"] || ckan_extra("licence_url_title")
        [
          License.new(:id => metadata["license_id"],
                      :uri => uri,
                      :name => name
                     )
        ]
      rescue StandardError
        []
      end

      # A list of distributions, built from the package's +resources+.
      #
      # @return [Array<Distribution>, nil] +nil+ when the resources cannot be read
      # @see Dataset#distributions
      def distributions
        metadata["resources"].map do |resource|
          distribution = {
            :title => resource["description"],
            :accessURL => resource["url"],
            :format => resource["format"]
          }
          Distribution.new(self, ckan_resource: distribution)
        end
      rescue StandardError
        nil
      end

      # How frequently the data is updated.
      #
      # @return [Object, nil] the +update_frequency+ extra, else the
      #   +frequency-of-update+ extra, else +nil+
      # @see Dataset#update_frequency
      def update_frequency
        ckan_extra("update_frequency", "frequency-of-update")
      end

      # Date the dataset was released
      #
      # @return [Date, nil] +nil+ when missing or unparseable
      # @see Dataset#issued
      def issued
        Date.parse(metadata["metadata_created"])
      rescue StandardError
        nil
      end

      # Date the dataset was modified
      #
      # @return [Date, nil] +nil+ when missing or unparseable
      # @see Dataset#modified
      def modified
        Date.parse(metadata["metadata_modified"])
      rescue StandardError
        nil
      end

      # The temporal coverage of the dataset
      #
      # @return [Temporal] either bound is +nil+ when missing or unparseable
      # @see Dataset#temporal
      def temporal
        Temporal.new(:start => ckan_extras_date("temporal_coverage-from"),
                     :end => ckan_extras_date("temporal_coverage-to"))
      end

      private

      # The package metadata stored on this instance by {CKAN.supported?}.
      #
      # @return [Hash, nil] +nil+ if detection has not stored any metadata
      def metadata
        @ckan_metadata
      end

      # First truthy value among the given keys of the package's +extras+.
      #
      # @param keys [Array<String>] keys to try, in order
      # @return [Object, nil] +nil+ when none is set or extras cannot be read
      def ckan_extra(*keys)
        extras = metadata["extras"]
        keys.each do |key|
          value = extras[key]
          return value if value
        end
        nil
      rescue StandardError
        nil
      end

      # Parses one of the package's +extras+ as a date.
      #
      # @param key [String]
      # @return [Date, nil] +nil+ when missing or unparseable
      def ckan_extras_date(key)
        Date.parse(metadata["extras"][key])
      rescue StandardError
        nil
      end

      # Reads an extra from a group document.
      #
      # Tries +group["extras"][key]+ first. Only when that lookup raises or
      # yields an empty string does it search the key/value list under
      # +group["result"]["extras"]+.
      #
      # @param group [Hash] parsed group/organization document
      # @param key [String]
      # @return [Object] the value, or +""+ when it cannot be found
      def select_extras(group, key)
        extra = begin
          group["extras"][key]
        rescue StandardError
          ""
        end
        return extra unless extra == ""

        group['result']['extras'].find { |e| e["key"] == key }['value']
      rescue StandardError
        ""
      end

      # Fetches the group or organization with the given ID and wraps it in an
      # {Agent}. Three API endpoints are tried in turn; the first that does
      # not answer "not found" wins.
      #
      # @param id [String] group or organization identifier
      # @return [Array<Agent>] a single agent, or +[]+ when every endpoint
      #   answered "not found"
      def fetch_publisher(id)
        base = CKAN.base_url(parsed_uri)
        group = nil

        [
          "#{base}/api/rest/group/#{id}",
          "#{base}/api/3/action/group_show?id=#{id}",
          "#{base}/api/3/action/organization_show?id=#{id}"
        ].each do |endpoint|
          begin
            group = JSON.parse(RestClient.get(endpoint))
            break
          rescue RestClient::ResourceNotFound
            next
          end
        end

        return [] unless group

        [
          Agent.new(
            :name => group["display_name"] || group["result"]["name"],
            :homepage => select_extras(group, "website-url"),
            :mbox => select_extras(group, "contact-email")
          )
        ]
      end

      # The dataset's URI, parsed.
      #
      # @return [URI::Generic]
      def parsed_uri
        URI(self.uri)
      end

    end
  end
end
|
@@ -0,0 +1,169 @@
|
|
1
|
+
module DataKitten

  module PublishingFormats

    # Datapackage metadata format module. Automatically mixed into {Dataset} for datasets that include a +datapackage.json+.
    #
    # @see Dataset
    #
    module Datapackage

      # Whether the dataset carries Datapackage metadata.
      #
      # For a +:git+ origin the +datapackage.json+ file is loaded from the
      # working copy; otherwise the package is built straight from the
      # dataset's URI. The dataset is supported when the resulting package
      # reports a +datapackage_version+. Any error means "not supported".
      #
      # @param instance [Dataset] the dataset being probed
      # @return [Boolean]
      def self.supported?(instance)
        datapackage = if instance.send(:origin) == :git
          metadata = instance.send(:load_file, "datapackage.json")
          DataPackage::Package.new( JSON.parse( metadata ) )
        else
          DataPackage::Package.new( instance.uri )
        end
        !datapackage.datapackage_version.nil?
      rescue StandardError
        false
      end

      # The publishing format for the dataset.
      # @return [Symbol] +:datapackage+
      # @see Dataset#publishing_format
      def publishing_format
        :datapackage
      end

      # A list of maintainers.
      #
      # @return [Array<Agent>]
      # @see Dataset#maintainers
      def maintainers
        datapackage_agents(package.maintainers)
      end

      # A list of publishers.
      #
      # @return [Array<Agent>]
      # @see Dataset#publishers
      def publishers
        datapackage_agents(package.publisher)
      end

      # A list of licenses.
      #
      # @return [Array<License>]
      # @see Dataset#licenses
      def licenses
        package.licenses.map do |x|
          License.new(:id => x['id'], :uri => x['url'], :name => x['name'])
        end
      end

      # The rights statement for the dataset, built from the package's
      # +rights+ property with its keys converted to symbols.
      #
      # @return [Rights, nil] +nil+ when the package has no +rights+ property
      def rights
        declared = package.property("rights")
        return nil unless declared

        Rights.new(declared.each_with_object({}) { |(k, v), h| h[k.to_sym] = v })
      end

      # A list of contributors.
      #
      # @return [Array<Agent>]
      # @see Dataset#contributors
      def contributors
        datapackage_agents(package.contributors)
      end

      # A list of distributions, referred to as +resources+ by Datapackage.
      #
      # @return [Array<Distribution>]
      # @see Dataset#distributions
      def distributions
        package.resources.map { |resource| Distribution.new(self, datapackage_resource: resource) }
      end

      # The human-readable title of the dataset.
      #
      # @return [String] the package title, or its name when it has no title
      # @see Dataset#data_title
      def data_title
        package.title || package.name
      end

      # A brief description of the dataset
      #
      # @see Dataset#description
      def description
        package.description
      end

      # Keywords for the dataset
      #
      # @see Dataset#keywords
      def keywords
        package.keywords
      end

      # Where the data is sourced from
      #
      # @return [Array<Source>]
      # @see Dataset#sources
      def sources
        package.sources.map do |x|
          Source.new(:label => x['name'], :resource => x['web'])
        end
      end

      # Date the dataset was modified
      def modified
        package.last_modified
      end

      # A history of changes to the Dataset.
      #
      # If {Dataset#origin} is +:git+, this is the git changelog for the actual distribution files, rather
      # than the full unfiltered log. For any other origin the history is empty.
      #
      # @return [Array] An array of changes, newest first. Exact format depends on the origin.
      #
      # @see Dataset#change_history
      def change_history
        @change_history ||= begin
          if origin == :git
            # Get a log for each file in the local repo
            logs = distributions.map do |file|
              if file.path
                # Convert to list of commits
                repository.log.path(file.path).map { |commit| commit }
              else
                []
              end
            end
            # combine all logs, make unique, and re-sort in date order
            logs.flatten.uniq.sort_by { |x| x.committer.date }.reverse
          else
            []
          end
        end
      end

      private

      # Wraps each person/organisation entry of the package in an {Agent}.
      #
      # NOTE(review): the CKAN format module builds Agent with +:homepage+ and
      # +:mbox+, whereas these entries are passed as +:uri+ and +:email+ -
      # confirm which option keys Agent actually reads.
      #
      # @param entries [Array<Hash>] entries with +name+, +web+ and +email+ keys
      # @return [Array<Agent>]
      def datapackage_agents(entries)
        entries.map do |x|
          Agent.new(:name => x['name'], :uri => x['web'], :email => x['email'])
        end
      end

      # The parsed Datapackage, memoised per instance. Loaded from
      # +datapackage.json+ in the working copy for a +:git+ origin, otherwise
      # built from +access_url+.
      def package
        @datapackage ||= if origin == :git
          metadata = load_file("datapackage.json")
          DataPackage::Package.new( JSON.parse( metadata ) )
        else
          DataPackage::Package.new( access_url )
        end
      end
    end

  end

end
|