data_kitten 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/LICENSE.md +20 -0
- data/README.md +73 -0
- data/bin/data_kitten +22 -0
- data/lib/data_kitten.rb +43 -0
- data/lib/data_kitten/agent.rb +38 -0
- data/lib/data_kitten/dataset.rb +227 -0
- data/lib/data_kitten/distribution.rb +156 -0
- data/lib/data_kitten/distribution_format.rb +73 -0
- data/lib/data_kitten/hosts.rb +23 -0
- data/lib/data_kitten/hosts/bitbucket.rb +54 -0
- data/lib/data_kitten/hosts/gist.rb +50 -0
- data/lib/data_kitten/hosts/github.rb +54 -0
- data/lib/data_kitten/license.rb +39 -0
- data/lib/data_kitten/origins.rb +28 -0
- data/lib/data_kitten/origins/git.rb +66 -0
- data/lib/data_kitten/origins/html.rb +32 -0
- data/lib/data_kitten/origins/linked_data.rb +37 -0
- data/lib/data_kitten/origins/web_service.rb +30 -0
- data/lib/data_kitten/publishing_formats.rb +28 -0
- data/lib/data_kitten/publishing_formats/ckan.rb +187 -0
- data/lib/data_kitten/publishing_formats/datapackage.rb +169 -0
- data/lib/data_kitten/publishing_formats/linked_data.rb +102 -0
- data/lib/data_kitten/publishing_formats/rdfa.rb +239 -0
- data/lib/data_kitten/rights.rb +80 -0
- data/lib/data_kitten/source.rb +31 -0
- data/lib/data_kitten/temporal.rb +27 -0
- data/lib/data_kitten/version.rb +3 -0
- metadata +242 -0
@@ -0,0 +1,102 @@
|
|
1
|
+
module DataKitten
|
2
|
+
|
3
|
+
module PublishingFormats
|
4
|
+
|
5
|
+
module LinkedData
|
6
|
+
|
7
|
+
ACCEPT_HEADER = "text/turtle, application/n-triples, application/ld+json; q=1.0,application/rdf+xml; q=0.8, */*; q=0.5"
|
8
|
+
|
9
|
+
include RDFa
|
10
|
+
|
11
|
+
private
|
12
|
+
|
13
|
+
#Find first resource with one of the specified RDF types
|
14
|
+
def self.first_of_type(graph, classes)
  # Classes are tried in the order given; the first one that has at least
  # one typed subject in the graph wins.
  match = nil
  classes.each do |rdf_class|
    type_pattern = RDF::Query::Pattern.new(nil, RDF.type, rdf_class)
    match = graph.first_subject(type_pattern)
    return match if match
  end
  match
end
|
23
|
+
|
24
|
+
#Attempt to create an RDF graph for this object
|
25
|
+
#
|
26
|
+
#Supports content negotiation for various RDF serializations. Attempts "dataset autodiscovery" if it receives
|
27
|
+
#an HTML response. This leaves the RDFa Publishing Format to just parse RDFa responses
|
28
|
+
def self.create_graph(uri)
  # URL of the document whose body is eventually parsed. It changes when we
  # hop from an HTML page to its RDF alternate.
  document_url = uri.to_s

  resp = RestClient.get(uri, :accept => ACCEPT_HEADER)
  return false if resp.code != 200

  if resp.headers[:content_type].to_s.include?("text/html")
    # Dataset autodiscovery: follow <link rel="alternate" type="application/rdf+xml">.
    doc = Nokogiri::HTML(resp.body)
    link = doc.search('link[rel=alternate]').detect { |n| n[:type] == 'application/rdf+xml' }
    href = link && link["href"]
    return false unless href

    # The href may be relative to the page; URI.join leaves absolute hrefs untouched.
    document_url = URI.join(document_url, href.to_s).to_s
    resp = RestClient.get(document_url, :accept => ACCEPT_HEADER)
    return false if resp.code != 200
  end

  reader = RDF::Reader.for(:content_type => resp.headers[:content_type])

  unless reader
    # Fall back to the file extension of the document actually fetched,
    # ignoring any query string or fragment.
    path = document_url.split(/[?#]/, 2).first.to_s
    extension = File.extname(path).delete(".")
    reader = RDF::Reader.for(:file_extension => extension) unless extension.empty?
  end
  return false unless reader

  graph = RDF::Graph.new
  graph << reader.new(StringIO.new(resp.body))
  graph
rescue
  # Best effort: any failure while fetching or parsing (HTTP error, bad URL,
  # parser error, ...) is swallowed and reported as "no graph".
  nil
end
|
63
|
+
|
64
|
+
#Can we create an RDF graph for this object containing the description of a dataset?
|
65
|
+
def self.supported?(instance)
  graph = create_graph(instance.uri)
  return false unless graph

  # A source is supported when the graph describes a DCAT or VoID dataset.
  dataset_classes = [
    RDF::Vocabulary.new("http://www.w3.org/ns/dcat#").Dataset,
    RDF::Vocabulary.new("http://rdfs.org/ns/void#").Dataset
  ]
  first_of_type(graph, dataset_classes) ? true : false
end
|
73
|
+
|
74
|
+
public
|
75
|
+
|
76
|
+
# The publishing format for the dataset.
|
77
|
+
# @return [Symbol] +:rdf+
|
78
|
+
# @see Dataset#publishing_format
|
79
|
+
def publishing_format
  # Replaces the :rdfa value supplied by the included RDFa module.
  :rdf
end
|
82
|
+
|
83
|
+
# The URI of this dataset, which is simply its access URL.
def uri
  access_url
end
|
86
|
+
|
87
|
+
private
|
88
|
+
|
89
|
+
# The subject under which dataset metadata is looked up: the access URL
# itself, rather than the result of a graph query as in RDFa#dataset_uri.
def dataset_uri
  access_url
end
|
92
|
+
|
93
|
+
# The RDF graph for the access URL, fetched on first use and cached.
# A failed fetch (nil/false) is not cached, so the next call tries again.
def graph
  @graph ||= LinkedData.create_graph(access_url)
end
|
99
|
+
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
@@ -0,0 +1,239 @@
|
|
1
|
+
module DataKitten
|
2
|
+
|
3
|
+
module PublishingFormats
|
4
|
+
|
5
|
+
module RDFa
|
6
|
+
|
7
|
+
private
|
8
|
+
|
9
|
+
# Can a dcat:Dataset be found in the RDFa at the instance's URI?
#
# Returns the dataset URI as a String when one is found, and false when the
# page cannot be loaded or contains no dcat:Dataset (any error is rescued).
def self.supported?(instance)
  rdfa_graph = RDF::Graph.load(instance.uri, :format => :rdfa)
  dcat_dataset = RDF::Vocabulary.new("http://www.w3.org/ns/dcat#").Dataset

  solutions = RDF::Query.new({ :dataset => { RDF.type => dcat_dataset } }).execute(rdfa_graph)

  # With no solutions, solutions[0] is nil and the lookup raises, landing in the rescue.
  solutions[0][:dataset].to_s
rescue
  false
end
|
22
|
+
|
23
|
+
public
|
24
|
+
|
25
|
+
# The publishing format for the dataset.
|
26
|
+
# @return [Symbol] +:rdfa+
|
27
|
+
# @see Dataset#publishing_format
|
28
|
+
def publishing_format
  # Fixed identifier for datasets described with RDFa.
  :rdfa
end
|
31
|
+
|
32
|
+
# A list of maintainers.
|
33
|
+
#
|
34
|
+
# @see Dataset#maintainers
|
35
|
+
def maintainers
  # Nothing is extracted for this format; the list is always empty.
  []
end
|
38
|
+
|
39
|
+
# A list of publishers.
|
40
|
+
#
|
41
|
+
# @see Dataset#publishers
|
42
|
+
def publishers
  publisher_uris = metadata[dataset_uri][RDF::DC.publisher.to_s]

  # One Agent per dct:publisher, described by its FOAF properties.
  publisher_uris.map do |publisher_uri|
    Agent.new(:name => first_value(publisher_uri, RDF::FOAF.name),
              :homepage => first_value(publisher_uri, RDF::FOAF.homepage),
              :mbox => first_value(publisher_uri, RDF::FOAF.mbox))
  end
rescue
  # No dataset, no publisher triples, or any other failure: report none.
  []
end
|
54
|
+
|
55
|
+
# The rights statement for the data
|
56
|
+
#
|
57
|
+
# @see Dataset#rights
|
58
|
+
def rights
  rights_uri = metadata[dataset_uri][RDF::DC.rights.to_s][0]

  # When the graph says nothing about the statement, only its URI is known.
  return Rights.new(:uri => rights_uri) unless metadata[rights_uri]

  statement = {
    # NOTE(review): this passes the page URI (`uri`), not `rights_uri` as the
    # branch above does -- confirm which one is intended.
    :uri => uri,
    :dataLicense => first_value(rights_uri, odrs.dataLicense),
    :contentLicense => first_value(rights_uri, odrs.contentLicense),
    :copyrightNotice => first_value(rights_uri, odrs.copyrightNotice),
    :attributionURL => first_value(rights_uri, odrs.attributionURL),
    :attributionText => first_value(rights_uri, odrs.attributionText),
    :copyrightHolder => first_value(rights_uri, odrs.copyrightHolder),
    :databaseRightHolder => first_value(rights_uri, odrs.databaseRightHolder),
    :copyrightYear => first_value(rights_uri, odrs.copyrightYear),
    :databaseRightYear => first_value(rights_uri, odrs.databaseRightYear),
    :copyrightStatement => first_value(rights_uri, odrs.copyrightStatement),
    :databaseRightStatement => first_value(rights_uri, odrs.databaseRightStatement)
  }
  Rights.new(statement)
rescue
  # Missing dataset / missing dct:rights (or any other error) means "no rights statement".
  nil
end
|
82
|
+
|
83
|
+
# A list of licenses.
|
84
|
+
#
|
85
|
+
# @see Dataset#licenses
|
86
|
+
def licenses
  license_uris = metadata[dataset_uri][RDF::DC.license.to_s]
  return [] if license_uris.nil?

  # Each dct:license becomes a License named after its dct:title, if any.
  license_uris.map do |license_uri|
    License.new(:uri => license_uri, :name => first_value(license_uri, RDF::DC.title))
  end
rescue
  []
end
|
100
|
+
|
101
|
+
# A list of contributors.
|
102
|
+
#
|
103
|
+
# @see Dataset#contributors
|
104
|
+
def contributors
  # Nothing is extracted for this format; the list is always empty.
  []
end
|
107
|
+
|
108
|
+
# A list of distributions, taken from the dataset's +dcat:distribution+ statements.
|
109
|
+
#
|
110
|
+
# @see Dataset#distributions
|
111
|
+
def distributions
  distribution_uris = metadata[dataset_uri][dcat.distribution.to_s]

  distribution_uris.map do |distribution_uri|
    resource = {
      :title => first_value(distribution_uri, RDF::DC.title),
      :accessURL => first_value(distribution_uri, dcat.accessURL)
    }
    Distribution.new(self, dcat_resource: resource)
  end
rescue
  # No dataset, no dcat:distribution triples, or any other failure: report none.
  []
end
|
125
|
+
|
126
|
+
# The human-readable title of the dataset.
|
127
|
+
#
|
128
|
+
# @see Dataset#data_title
|
129
|
+
def data_title
  titles = metadata[dataset_uri][dct.title.to_s]
  titles[0]
rescue
  # Any failure (no dataset, no dct:title, ...) yields nil.
  nil
end
|
132
|
+
|
133
|
+
# A brief description of the dataset
|
134
|
+
#
|
135
|
+
# @see Dataset#description
|
136
|
+
def description
  descriptions = metadata[dataset_uri][dct.description.to_s]
  descriptions[0]
rescue
  # Any failure (no dataset, no dct:description, ...) yields nil.
  nil
end
|
139
|
+
|
140
|
+
# Keywords for the dataset
|
141
|
+
#
|
142
|
+
# @see Dataset#keywords
|
143
|
+
def keywords
  # Return a copy: handing out the cached metadata array itself would let
  # callers mutate the cache. Array() turns a missing dcat:keyword into [].
  Array(metadata[dataset_uri][dcat.keyword.to_s]).dup
rescue
  # No dataset in the graph (or any other lookup failure): no keywords.
  []
end
|
151
|
+
|
152
|
+
# Where the data is sourced from
|
153
|
+
#
|
154
|
+
# @see Dataset#sources
|
155
|
+
def sources
  # Nothing is extracted for this format; the list is always empty.
  []
end
|
158
|
+
|
159
|
+
# How frequently the data is updated.
|
160
|
+
#
|
161
|
+
# @see Dataset#update_frequency
|
162
|
+
def update_frequency
  subject = dataset_uri

  # DCAT expresses update frequency with dct:accrualPeriodicity (Dublin Core
  # Terms). The dcat: namespace lookup is kept as a fallback so data that
  # used the old, non-standard property is still read.
  first_value(subject, dct.accrualPeriodicity) ||
    first_value(subject, dcat.accrualPeriodicity)
end
|
165
|
+
|
166
|
+
# Date the dataset was issued: dct:issued if present, otherwise dct:created.
#
# @return [Date, nil] the parsed date, or nil when neither property is present
def issued
  raw_date = first_value(dataset_uri, RDF::DC.issued) || first_value(dataset_uri, RDF::DC.created)
  raw_date ? Date.parse(raw_date) : nil
end
|
174
|
+
|
175
|
+
# Date the dataset was last modified, from dct:modified.
#
# @return [Date, nil] the parsed date, or nil when the property is absent
def modified
  raw_date = first_value(dataset_uri, RDF::DC.modified)
  raw_date ? Date.parse(raw_date) : nil
end
|
182
|
+
|
183
|
+
private
|
184
|
+
|
185
|
+
# The RDFa graph loaded from this dataset's URI, fetched once and cached.
def graph
  return @graph if @graph

  @graph = RDF::Graph.load(uri, :format => :rdfa)
end
|
188
|
+
|
189
|
+
# First value recorded for +property+ on +resource+ in the metadata index.
#
# @param resource [String] subject URI to look up
# @param property [#to_s] predicate; converted with +to_s+ before lookup
# @param default the value returned when the resource or property is absent
def first_value(resource, property, default=nil)
  statements = metadata[resource]
  values = statements && statements[property.to_s]
  return default unless values

  values[0]
end
|
195
|
+
|
196
|
+
# Index of the graph's triples as nested hashes of strings:
#
#   { subject => { predicate => [object, ...] } }
#
# Duplicate objects for the same subject/predicate are stored once.
# The index is built on the first call and cached; it used to be rebuilt
# from every triple on each call.
def metadata
  return @metadata if @metadata

  index = {}
  # This is UGLY, and exists solely to make getting data out of the graph easier. We will probably change this later
  graph.triples.each do |triple|
    subject = triple[0].to_s
    predicate = triple[1].to_s
    object = triple[2].to_s

    values = ((index[subject] ||= {})[predicate] ||= [])
    values << object unless values.include?(object)
  end

  # Assigned only after a complete pass, so a failure above is not cached.
  @metadata = index
end
|
208
|
+
|
209
|
+
# URI of the first dcat:Dataset found in the graph, as a String.
# Raises (NoMethodError on nil) when the graph contains no dcat:Dataset.
def dataset_uri
  dataset_pattern = { :dataset => { RDF.type => dcat.Dataset } }
  solutions = RDF::Query.new(dataset_pattern).execute(graph)

  solutions[0][:dataset].to_s
end
|
218
|
+
|
219
|
+
# Vocabulary for the DCAT namespace; a new instance is built on each call.
def dcat
  RDF::Vocabulary.new("http://www.w3.org/ns/dcat#")
end
|
222
|
+
|
223
|
+
# Vocabulary for the Dublin Core Terms namespace; a new instance is built on each call.
def dct
  RDF::Vocabulary.new("http://purl.org/dc/terms/")
end
|
226
|
+
|
227
|
+
# Vocabulary for the ODRS namespace (used by #rights); a new instance is built on each call.
def odrs
  RDF::Vocabulary.new("http://schema.theodi.org/odrs#")
end
|
230
|
+
|
231
|
+
# Vocabulary for the VoID namespace; a new instance is built on each call.
def void
  RDF::Vocabulary.new("http://rdfs.org/ns/void#")
end
|
234
|
+
|
235
|
+
end
|
236
|
+
|
237
|
+
end
|
238
|
+
|
239
|
+
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
module DataKitten
|
2
|
+
|
3
|
+
# A rights statement for a {Dataset} or {Distribution}
|
4
|
+
#
|
5
|
+
class Rights

  # @!attribute uri
  #   @return [String] the URI for the rights statement
  attr_accessor :uri

  # @!attribute dataLicense
  #   @return [String] the license for the data in the dataset.
  attr_accessor :dataLicense

  # @!attribute contentLicense
  #   @return [String] the license for the content in the dataset.
  attr_accessor :contentLicense

  # @!attribute copyrightNotice
  #   @return [String] the copyright notice for the dataset.
  attr_accessor :copyrightNotice

  # @!attribute attributionURL
  #   @return [String] the attribution URL for the dataset.
  attr_accessor :attributionURL

  # @!attribute attributionText
  #   @return [String] the attribution text for the dataset.
  attr_accessor :attributionText

  # @!attribute copyrightHolder
  #   @return [String] the URI of the organization that holds copyright for this dataset
  attr_accessor :copyrightHolder

  # @!attribute databaseRightHolder
  #   @return [String] the URI of the organization that owns the database rights for this dataset
  attr_accessor :databaseRightHolder

  # @!attribute copyrightYear
  #   @return [String] the year in which copyright is claimed
  attr_accessor :copyrightYear

  # @!attribute databaseRightYear
  #   @return [String] the year in which the database right is claimed
  attr_accessor :databaseRightYear

  # @!attribute copyrightStatement
  #   @return [String] the URL of a copyright statement for the dataset
  attr_accessor :copyrightStatement

  # @!attribute databaseRightStatement
  #   @return [String] the URL of a database right statement for the dataset
  attr_accessor :databaseRightStatement

  # Create a new Rights object.
  #
  # Every option is optional; an attribute whose option is not supplied is nil.
  #
  # @param options [Hash] A set of options with which to initialise the rights statement.
  # @option options [String] :uri the URI for the rights statement
  # @option options [String] :dataLicense the license for the data in the dataset
  # @option options [String] :contentLicense the license for the content in the dataset
  # @option options [String] :copyrightNotice the copyright notice for the dataset
  # @option options [String] :attributionURL the attribution URL for the dataset
  # @option options [String] :attributionText attribution name for the dataset
  # @option options [String] :copyrightHolder the URI of the organization that holds copyright
  # @option options [String] :databaseRightHolder the URI of the organization that owns the database rights
  # @option options [String] :copyrightYear the year in which copyright is claimed
  # @option options [String] :databaseRightYear the year in which the database right is claimed
  # @option options [String] :copyrightStatement the URL of a copyright statement
  # @option options [String] :databaseRightStatement the URL of a database right statement
  def initialize(options = {})
    @uri = options[:uri]
    @dataLicense = options[:dataLicense]
    @contentLicense = options[:contentLicense]
    @copyrightNotice = options[:copyrightNotice]
    @attributionURL = options[:attributionURL]
    @attributionText = options[:attributionText]
    @copyrightHolder = options[:copyrightHolder]
    @databaseRightHolder = options[:databaseRightHolder]
    @copyrightYear = options[:copyrightYear]
    @databaseRightYear = options[:databaseRightYear]
    @copyrightStatement = options[:copyrightStatement]
    @databaseRightStatement = options[:databaseRightStatement]
  end

end
|
79
|
+
|
80
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module DataKitten
|
2
|
+
|
3
|
+
# Where the data has been sourced from
|
4
|
+
# Follows the pattern of {http://purl.org/dc/terms/source} with a {http://www.w3.org/2000/01/rdf-schema#label} and a {http://www.w3.org/1999/02/22-rdf-syntax-ns#resource}, and with useful aliases for other vocabularies
|
5
|
+
|
6
|
+
class Source

  # Create a new Source
  #
  # Both options are optional; an attribute whose option is not supplied is nil.
  #
  # @param [Hash] options the details of the Source.
  # @option options [String] :label The name of the Source
  # @option options [String] :resource The URI of the Source
  #
  def initialize(options = {})
    @label = options[:label]
    @resource = options[:resource]
  end

  # @!attribute label
  #   @return [String] the name of the Source
  attr_accessor :label
  # Read-only alias: only the reader is aliased, not +label=+.
  alias_method :name, :label

  # @!attribute resource
  #   @return [String] the URI of the Source
  attr_accessor :resource
  # Read-only alias: only the reader is aliased, not +resource=+.
  alias_method :web, :resource

end
|
30
|
+
|
31
|
+
end
|