data_kitten 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/LICENSE.md +20 -0
- data/README.md +73 -0
- data/bin/data_kitten +22 -0
- data/lib/data_kitten.rb +43 -0
- data/lib/data_kitten/agent.rb +38 -0
- data/lib/data_kitten/dataset.rb +227 -0
- data/lib/data_kitten/distribution.rb +156 -0
- data/lib/data_kitten/distribution_format.rb +73 -0
- data/lib/data_kitten/hosts.rb +23 -0
- data/lib/data_kitten/hosts/bitbucket.rb +54 -0
- data/lib/data_kitten/hosts/gist.rb +50 -0
- data/lib/data_kitten/hosts/github.rb +54 -0
- data/lib/data_kitten/license.rb +39 -0
- data/lib/data_kitten/origins.rb +28 -0
- data/lib/data_kitten/origins/git.rb +66 -0
- data/lib/data_kitten/origins/html.rb +32 -0
- data/lib/data_kitten/origins/linked_data.rb +37 -0
- data/lib/data_kitten/origins/web_service.rb +30 -0
- data/lib/data_kitten/publishing_formats.rb +28 -0
- data/lib/data_kitten/publishing_formats/ckan.rb +187 -0
- data/lib/data_kitten/publishing_formats/datapackage.rb +169 -0
- data/lib/data_kitten/publishing_formats/linked_data.rb +102 -0
- data/lib/data_kitten/publishing_formats/rdfa.rb +239 -0
- data/lib/data_kitten/rights.rb +80 -0
- data/lib/data_kitten/source.rb +31 -0
- data/lib/data_kitten/temporal.rb +27 -0
- data/lib/data_kitten/version.rb +3 -0
- metadata +242 -0
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'data_kitten/origins/git'
|
2
|
+
require 'data_kitten/origins/web_service'
|
3
|
+
require 'data_kitten/origins/html'
|
4
|
+
require 'data_kitten/origins/linked_data'
|
5
|
+
|
6
|
+
module DataKitten

  # Origin detection. Provides the private +detect_origin+ hook, which mixes
  # the first matching origin module into the receiving object.
  module Origins

    private

    # Probes each known origin module, in priority order, with +@access_url+
    # and extends +self+ with the first one that reports support. When no
    # origin matches, the receiver is left untouched.
    def detect_origin
      candidates = [
        DataKitten::Origins::Git,
        DataKitten::Origins::HTML,
        DataKitten::Origins::WebService,
        DataKitten::Origins::LinkedData
      ]

      candidates.each do |candidate|
        next unless candidate.supported?(@access_url)

        extend candidate
        break
      end
    end

  end

end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module DataKitten

  module Origins

    # Git origin module. Automatically mixed into {Dataset} for datasets that are loaded from Git repositories.
    #
    # @see Dataset
    #
    module Git

      # Tests whether a URI looks like a Git repository, i.e. a +git+, +http+
      # or +https+ URL that ends in +.git+.
      #
      # @param uri [String] the access URL to test
      # @return [Integer, nil] the match offset (truthy) when the URI matches, +nil+ otherwise
      def self.supported?(uri)
        uri =~ /\A(git|https?):\/\/.*\.git\Z/
      end

      # The origin type of the dataset.
      # @return [Symbol] +:git+
      # @see Dataset#origin
      def origin
        :git
      end

      # A history of changes to the Dataset, taken from the full git changelog.
      # The result is memoised after the first call.
      # @see Dataset#change_history
      def change_history
        @change_history ||= repository.log.map { |entry| entry }
      end

      protected

      # Reads a file from the local working copy of the repository.
      #
      # @param path [String] path of the file, relative to the repository root
      # @return [String] the file contents
      def load_file(path)
        # Touch the repository first so that a working copy exists on disk.
        repository
        target = File.join(working_copy_path, path)
        File.read(target)
      end

      private

      # Ensures the holding directory (+tmp/repositories+, three levels above
      # this file) exists and returns the working copy path for +@access_url+
      # inside it. Slashes in the URL are replaced with dashes to form the
      # directory name.
      def working_copy_path
        holding_dir = File.join(File.dirname(__FILE__), '..', '..', '..', 'tmp', 'repositories')
        FileUtils.mkdir_p(holding_dir)
        File.join(holding_dir, @access_url.gsub('/','-'))
      end

      # The repository handle, memoised. An existing working copy is opened and
      # updated from +origin+/+master+; if that raises +ArgumentError+ the
      # repository is cloned from +@access_url+ instead.
      def repository
        @repository ||= begin
          ::Git.open(working_copy_path).tap { |checkout| checkout.pull("origin", "master") }
        rescue ArgumentError
          ::Git.clone(@access_url, working_copy_path)
        end
      end

    end

  end

end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module DataKitten

  module Origins

    # HTML origin module. Automatically mixed into {Dataset} for datasets whose
    # access URL is served with a +text/html+ content type.
    #
    # @see Dataset
    #
    module HTML

      # Fetches the URI and checks whether the response's content type
      # mentions +text/html+.
      #
      # @param uri [String] the access URL to test
      # @return [Integer, nil, false] match offset (truthy) for HTML, +nil+ for
      #   any other content type, +false+ if the request raised an error
      def self.supported?(uri)
        response = RestClient.get(uri)
        response.headers[:content_type] =~ /text\/html/
      rescue
        false
      end

      # The origin type of the dataset.
      # @return [Symbol] +:html+
      # @see Dataset#origin
      def origin
        :html
      end

    end

  end

end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module DataKitten

  module Origins

    # Linked Data origin module. Automatically mixed into {Dataset} for datasets
    # whose access URL is served with a content type that RDF.rb recognises.
    #
    # @see Dataset
    #
    module LinkedData

      # Issues a HEAD request and checks whether the returned content type
      # (ignoring any parameters after +;+) is one of the MIME types listed in
      # +RDF::Format.content_types+.
      #
      # @param uri [String] the access URL to test
      # @return [Boolean, nil] +true+/+false+ for a known/unknown content type,
      #   +nil+ when no content type header is present, +false+ if the request
      #   raised an error
      def self.supported?(uri)
        header = RestClient.head(uri).headers[:content_type]
        return nil unless header

        mime_type = header.split(";").first
        RDF::Format.content_types.key?(mime_type)
      rescue
        false
      end

      # The origin type of the dataset.
      # @return [Symbol] +:linkeddata+
      # @see Dataset#origin
      def origin
        :linkeddata
      end

    end

  end

end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module DataKitten

  module Origins

    # Web service origin module. Automatically mixed into {Dataset} for datasets that are accessed through an API.
    #
    # @see Dataset
    #
    module WebService

      # Web service detection always answers +false+, so this origin is never
      # selected by URI probing.
      #
      # @param uri [String] the access URL (ignored)
      # @return [Boolean] always +false+
      def self.supported?(uri)
        false
      end

      # The origin type of the dataset.
      # @return [Symbol] +:web_service+
      # @see Dataset#origin
      def origin
        :web_service
      end

    end

  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'data_kitten/publishing_formats/datapackage'
|
2
|
+
require 'data_kitten/publishing_formats/rdfa'
|
3
|
+
require 'data_kitten/publishing_formats/linked_data'
|
4
|
+
require 'data_kitten/publishing_formats/ckan'
|
5
|
+
|
6
|
+
module DataKitten

  # Publishing format detection. Provides the private
  # +detect_publishing_format+ hook, which mixes the first matching format
  # module into the receiving object.
  module PublishingFormats

    private

    # Asks each known publishing format module, in priority order, whether it
    # supports +self+ and extends +self+ with the first one that does. When no
    # format matches, the receiver is left untouched.
    def detect_publishing_format
      candidates = [
        DataKitten::PublishingFormats::Datapackage,
        DataKitten::PublishingFormats::CKAN,
        DataKitten::PublishingFormats::RDFa,
        DataKitten::PublishingFormats::LinkedData
      ]

      candidates.each do |candidate|
        next unless candidate.supported?(self)

        extend candidate
        break
      end
    end

  end

end
|
@@ -0,0 +1,187 @@
|
|
1
|
+
module DataKitten

  module PublishingFormats

    # CKAN metadata format module. Mixed into a dataset when {CKAN.supported?}
    # manages to load package metadata for the dataset's URI from a CKAN API.
    #
    # The package metadata is stored on the probed dataset itself (in
    # +@ckan_metadata+), so each dataset reads its own package rather than
    # whichever package was probed most recently.
    module CKAN

      # Probes the host of +instance.uri+ for a CKAN package named by the last
      # path segment of the URI and, on success, stores the package metadata on
      # +instance+.
      #
      # If the last path segment contains a UUID it is used directly as the
      # package id; otherwise the id is looked up through the v3
      # +package_show+ action, falling back to the v2 REST dataset endpoint.
      # The metadata itself is then fetched from +/api/rest/package/<id>+ in
      # both cases.
      #
      # @param instance [#uri] the dataset being probed
      # @return [Hash, false] the package metadata (truthy) when the package
      #   could be loaded, +false+ if any step raised an error
      def self.supported?(instance)
        uri = URI(instance.uri)
        package = uri.path.split("/").last
        base = "#{uri.scheme}://#{uri.host}"

        # If the package is a UUID - it's more than likely to be a CKAN ID
        if package.match(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/)
          id = package
        else
          results = begin
            RestClient.get "#{base}/api/3/action/package_show", {:params => {:id => package}}
          rescue
            ""
          end

          if results == ""
            results = RestClient.get "#{base}/api/2/rest/dataset/#{package}"
          end

          result = JSON.parse results
          # v3 wraps the package in "result"; the v2 REST response is the package itself.
          id = begin
            result["result"]["id"]
          rescue
            result["id"]
          end
        end

        # Load the metadata for UUID and named packages alike, so that a UUID
        # URI does not leave the dataset without (or with stale) metadata.
        metadata = JSON.parse RestClient.get "#{base}/api/rest/package/#{id}"
        instance.instance_variable_set(:@ckan_metadata, metadata)
        metadata
      rescue
        false
      end

      # The publishing format for the dataset.
      # @return [Symbol] +:ckan+
      # @see Dataset#publishing_format
      def publishing_format
        :ckan
      end

      # The human-readable title of the dataset.
      #
      # @return [String, nil] the package +title+, or +nil+ if unavailable
      # @see Dataset#data_title
      def data_title
        metadata["title"]
      rescue
        nil
      end

      # A brief description of the dataset
      #
      # @return [String, nil] the package +notes+, or +nil+ if unavailable
      # @see Dataset#description
      def description
        metadata["notes"]
      rescue
        nil
      end

      # Keywords for the dataset
      #
      # @return [Array] a copy of the package +tags+, or +[]+ if unavailable
      # @see Dataset#keywords
      def keywords
        metadata["tags"].map { |tag| tag }
      rescue
        []
      end

      # A list of publishers.
      #
      # The publisher is taken from the package's organization id when there is
      # one, otherwise from the first entry of its +groups+.
      #
      # @return [Array<Agent>] a single-element list, or +[]+ if the publisher
      #   could not be determined or fetched
      # @see Dataset#publishers
      def publishers
        organization = metadata['organization']
        # Guard the lookup so a package without an organization falls through
        # to its groups instead of raising and yielding an empty list.
        id = (organization && organization['id']) || metadata['groups'][0]
        fetch_publisher(id)
      rescue
        []
      end

      # A list of licenses.
      #
      # @return [Array<License>] a single-element list, or +[]+ if the metadata
      #   is unavailable
      # @see Dataset#licenses
      def licenses
        uri = begin
          metadata["license_url"] || metadata["extras"]["licence_url"]
        rescue
          nil
        end
        name = begin
          metadata["license_title"] || metadata["extras"]["licence_url_title"]
        rescue
          nil
        end

        [
          License.new(:id => metadata["license_id"],
                      :uri => uri,
                      :name => name
                     )
        ]
      rescue
        []
      end

      # A list of distributions, referred to as +resources+ by CKAN.
      #
      # @return [Array<Distribution>, nil] one distribution per resource, or
      #   +nil+ if the metadata is unavailable
      # @see Dataset#distributions
      def distributions
        metadata["resources"].map do |resource|
          details = {
            :title => resource["description"],
            :accessURL => resource["url"],
            :format => resource["format"]
          }
          Distribution.new(self, ckan_resource: details)
        end
      rescue
        nil
      end

      # How frequently the data is updated.
      #
      # @return [String, nil] the +update_frequency+ or +frequency-of-update+
      #   extra, or +nil+ if unavailable
      # @see Dataset#update_frequency
      def update_frequency
        metadata["extras"]["update_frequency"] || metadata["extras"]["frequency-of-update"]
      rescue
        nil
      end

      # Date the dataset was released
      #
      # @return [Date, nil] parsed +metadata_created+, or +nil+ if missing or unparseable
      # @see Dataset#issued
      def issued
        Date.parse metadata["metadata_created"]
      rescue
        nil
      end

      # Date the dataset was modified
      #
      # @return [Date, nil] parsed +metadata_modified+, or +nil+ if missing or unparseable
      # @see Dataset#modified
      def modified
        Date.parse metadata["metadata_modified"]
      rescue
        nil
      end

      # The temporal coverage of the dataset
      #
      # @return [Temporal] coverage built from the +temporal_coverage-from+ and
      #   +temporal_coverage-to+ extras; either bound is +nil+ when missing or
      #   unparseable
      # @see Dataset#temporal
      def temporal
        start_date = begin
          Date.parse metadata["extras"]["temporal_coverage-from"]
        rescue
          nil
        end
        end_date = begin
          Date.parse metadata["extras"]["temporal_coverage-to"]
        rescue
          nil
        end
        Temporal.new(:start => start_date, :end => end_date)
      end

      private

      # The package metadata stored on this dataset by {CKAN.supported?};
      # +nil+ if the dataset was never probed successfully.
      def metadata
        @ckan_metadata
      end

      # Looks up an "extras" value in a group response. Tries the flat
      # +extras+ hash first, then the list of key/value pairs under
      # +result.extras+. Returns +""+ when neither lookup succeeds.
      def select_extras(group, key)
        extra = begin
          group["extras"][key]
        rescue
          ""
        end
        if extra == ""
          extra = begin
            group['result']['extras'].select { |e| e["key"] == key }.first['value']
          rescue
            ""
          end
        end
        extra
      end

      # Fetches the group or organization with the given id from the dataset's
      # host and wraps it in a single-element list of {Agent}. The endpoints
      # are tried in order; a 404 moves on to the next one.
      def fetch_publisher(id)
        location = parsed_uri
        base = "#{location.scheme}://#{location.host}"
        [
          "#{base}/api/rest/group/#{id}",
          "#{base}/api/3/action/group_show?id=#{id}",
          "#{base}/api/3/action/organization_show?id=#{id}"
        ].each do |endpoint|
          begin
            @group = JSON.parse RestClient.get endpoint
            break
          rescue RestClient::ResourceNotFound
            nil
          end
        end

        [
          Agent.new(
            :name => @group["display_name"] || @group["result"]["name"],
            :homepage => select_extras(@group, "website-url"),
            :mbox => select_extras(@group, "contact-email")
          )
        ]
      end

      # The dataset's URI, parsed.
      def parsed_uri
        URI(self.uri)
      end

    end
  end
end
|
@@ -0,0 +1,169 @@
|
|
1
|
+
module DataKitten

  module PublishingFormats

    # Datapackage metadata format module. Automatically mixed into {Dataset} for datasets that include a +datapackage.json+.
    #
    # @see Dataset
    #
    module Datapackage

      # Tests whether the dataset carries usable Datapackage metadata. For a
      # +:git+ origin the +datapackage.json+ file is read through the
      # dataset's +load_file+; otherwise the package is built from
      # +instance.uri+.
      #
      # @param instance [Object] the dataset being probed
      # @return [Boolean] +true+ when the package reports a
      #   +datapackage_version+, +false+ otherwise or if anything raised
      def self.supported?(instance)
        source = if instance.send(:origin) == :git
          JSON.parse(instance.send(:load_file, "datapackage.json"))
        else
          instance.uri
        end
        !DataPackage::Package.new(source).datapackage_version.nil?
      rescue
        false
      end

      # The publishing format for the dataset.
      # @return [Symbol] +:datapackage+
      # @see Dataset#publishing_format
      def publishing_format
        :datapackage
      end

      # A list of maintainers.
      #
      # @see Dataset#maintainers
      def maintainers
        agents_from(package.maintainers)
      end

      # A list of publishers.
      #
      # @see Dataset#publishers
      def publishers
        agents_from(package.publisher)
      end

      # A list of licenses.
      #
      # @see Dataset#licenses
      def licenses
        package.licenses.map do |entry|
          License.new(:id => entry['id'], :uri => entry['url'], :name => entry['name'])
        end
      end

      # The rights statement of the package, with its keys converted to
      # symbols, or +nil+ when the package has no +rights+ property.
      def rights
        return nil unless package.property("rights")

        options = {}
        package.property("rights", []).each { |key, value| options[key.to_sym] = value }
        Rights.new(options)
      end

      # A list of contributors.
      #
      # @see Dataset#contributors
      def contributors
        agents_from(package.contributors)
      end

      # A list of distributions, referred to as +resources+ by Datapackage.
      #
      # @see Dataset#distributions
      def distributions
        package.resources.map do |resource|
          Distribution.new(self, datapackage_resource: resource)
        end
      end

      # The human-readable title of the dataset; falls back to the package
      # name when no title is set.
      #
      # @see Dataset#data_title
      def data_title
        package.title || package.name
      end

      # A brief description of the dataset
      #
      # @see Dataset#description
      def description
        package.description
      end

      # Keywords for the dataset
      #
      # @see Dataset#keywords
      def keywords
        package.keywords
      end

      # Where the data is sourced from
      #
      # @see Dataset#sources
      def sources
        package.sources.map do |entry|
          Source.new(:label => entry['name'], :resource => entry['web'])
        end
      end

      # Date the dataset was modified
      def modified
        package.last_modified
      end

      # A history of changes to the Dataset.
      #
      # If the dataset's origin is +:git+, this is the git changelog for the actual distribution files, rather
      # than the full unfiltered log. For any other origin the history is empty.
      #
      # @return [Array] An array of changes. Exact format depends on the source.
      #
      # @see Dataset#change_history
      def change_history
        @change_history ||= if origin == :git
          # Collect the commits touching each distribution file that has a path.
          per_file = distributions.map do |file|
            file.path ? repository.log.path(file.path).map { |entry| entry } : []
          end
          # Merge the per-file logs, drop duplicates, newest commit first.
          per_file.flatten.uniq.sort_by { |entry| entry.committer.date }.reverse
        else
          []
        end
      end

      private

      # Maps raw Datapackage person entries onto {Agent} objects.
      def agents_from(entries)
        entries.map do |entry|
          Agent.new(:name => entry['name'], :uri => entry['web'], :email => entry['email'])
        end
      end

      # The parsed package, built on first use: from the repository's
      # +datapackage.json+ for a +:git+ origin, from +access_url+ otherwise.
      def package
        @datapackage ||= if origin == :git
          DataPackage::Package.new( JSON.parse( load_file("datapackage.json") ) )
        else
          DataPackage::Package.new( access_url )
        end
      end
    end

  end

end
|