data_kitten 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/LICENSE.md +20 -0
- data/README.md +73 -0
- data/bin/data_kitten +22 -0
- data/lib/data_kitten.rb +43 -0
- data/lib/data_kitten/agent.rb +38 -0
- data/lib/data_kitten/dataset.rb +227 -0
- data/lib/data_kitten/distribution.rb +156 -0
- data/lib/data_kitten/distribution_format.rb +73 -0
- data/lib/data_kitten/hosts.rb +23 -0
- data/lib/data_kitten/hosts/bitbucket.rb +54 -0
- data/lib/data_kitten/hosts/gist.rb +50 -0
- data/lib/data_kitten/hosts/github.rb +54 -0
- data/lib/data_kitten/license.rb +39 -0
- data/lib/data_kitten/origins.rb +28 -0
- data/lib/data_kitten/origins/git.rb +66 -0
- data/lib/data_kitten/origins/html.rb +32 -0
- data/lib/data_kitten/origins/linked_data.rb +37 -0
- data/lib/data_kitten/origins/web_service.rb +30 -0
- data/lib/data_kitten/publishing_formats.rb +28 -0
- data/lib/data_kitten/publishing_formats/ckan.rb +187 -0
- data/lib/data_kitten/publishing_formats/datapackage.rb +169 -0
- data/lib/data_kitten/publishing_formats/linked_data.rb +102 -0
- data/lib/data_kitten/publishing_formats/rdfa.rb +239 -0
- data/lib/data_kitten/rights.rb +80 -0
- data/lib/data_kitten/source.rb +31 -0
- data/lib/data_kitten/temporal.rb +27 -0
- data/lib/data_kitten/version.rb +3 -0
- metadata +242 -0
@@ -0,0 +1,102 @@
|
|
1
|
+
require "uri"

module DataKitten

  module PublishingFormats

    # Publishing format for datasets described with RDF that is retrieved via
    # HTTP content negotiation (Turtle, N-Triples, JSON-LD, RDF/XML, ...).
    #
    # The metadata accessors themselves come from {RDFa}, which is mixed in
    # below; this module only replaces how the graph is obtained and which URI
    # identifies the dataset. The including object must provide +access_url+.
    module LinkedData

      # Accept header sent with every request made by {create_graph}.
      ACCEPT_HEADER = "text/turtle, application/n-triples, application/ld+json; q=1.0,application/rdf+xml; q=0.8, */*; q=0.5"

      include RDFa

      # Find the first resource with one of the specified RDF types.
      #
      # The classes are tried in order; the first one for which the graph
      # yields a subject wins.
      #
      # @param graph [#first_subject] the graph to search
      # @param classes [Array] RDF class terms to look for, in priority order
      # @return the first matching subject, or +nil+ when none of the classes
      #   has an instance in the graph
      def self.first_of_type(graph, classes)
        classes.each do |clazz|
          term = graph.first_subject(RDF::Query::Pattern.new(nil, RDF.type, clazz))
          return term if term
        end
        nil
      end

      # Attempt to create an RDF graph for this object.
      #
      # Supports content negotiation for various RDF serializations. Attempts
      # "dataset autodiscovery" if it receives an HTML response: the page is
      # searched for a <link rel="alternate" type="application/rdf+xml"> and
      # that document is fetched instead. This leaves the RDFa publishing
      # format to just parse RDFa responses.
      #
      # @param uri [String] the URI to fetch
      # @return [RDF::Graph, false, nil] the parsed graph; +false+ when the
      #   response is not a 200, no alternate link exists, or no reader can be
      #   found; +nil+ when any error is raised along the way
      def self.create_graph(uri)
        fetched_uri = uri
        resp = RestClient.get uri, :accept => ACCEPT_HEADER
        return false if resp.code != 200

        if resp.headers[:content_type] =~ /text\/html/
          doc = Nokogiri::HTML(resp.body)
          link = doc.search('link[rel=alternate]').detect { |n| n[:type] == 'application/rdf+xml' }
          return false unless link

          # The href is frequently relative to the page, so resolve it against
          # the page URI before fetching. An absolute href is returned as-is.
          fetched_uri = URI.join(uri.to_s, link["href"]).to_s
          resp = RestClient.get fetched_uri, :accept => ACCEPT_HEADER
          return false if resp.code != 200
        end

        reader = RDF::Reader.for(:content_type => resp.headers[:content_type])

        unless reader
          # Fall back to the extension of the document that was actually
          # fetched. Only the path is inspected so that a query string or
          # fragment cannot end up inside the extension.
          extension = File.extname(URI.parse(fetched_uri.to_s).path.to_s).delete(".")
          reader = RDF::Reader.for(:file_extension => extension) unless extension.empty?
        end
        return false unless reader

        graph = RDF::Graph.new
        graph << reader.new(StringIO.new(resp.body))
        graph
      rescue StandardError
        # Best effort: any network or parse failure means "no graph".
        nil
      end

      # Can we create an RDF graph for this object containing the description
      # of a dataset (a dcat:Dataset or a void:Dataset)?
      #
      # @param instance [#uri] the object to test
      # @return [Boolean]
      def self.supported?(instance)
        graph = create_graph(instance.uri)
        return false unless graph

        dataset_classes = [
          RDF::Vocabulary.new("http://www.w3.org/ns/dcat#").Dataset,
          RDF::Vocabulary.new("http://rdfs.org/ns/void#").Dataset
        ]
        first_of_type(graph, dataset_classes) ? true : false
      end

      # The publishing format for the dataset.
      # @return [Symbol] +:rdf+
      # @see Dataset#publishing_format
      def publishing_format
        :rdf
      end

      # The URI of the dataset description; this is the object's +access_url+.
      def uri
        access_url
      end

      private

      # The subject used to look up dataset metadata; for linked data this is
      # the +access_url+ itself rather than the result of a graph query.
      def dataset_uri
        access_url
      end

      # The graph for +access_url+, fetched on first use. A failed fetch leaves
      # a falsy value behind, so the fetch is attempted again on the next call.
      def graph
        @graph ||= LinkedData.create_graph(access_url)
      end

    end

  end

end
|
@@ -0,0 +1,239 @@
|
|
1
|
+
module DataKitten

  module PublishingFormats

    # Publishing format for datasets described with DCAT metadata embedded in
    # a page as RDFa.
    #
    # The instance methods read everything through +metadata+, a nested hash
    # built from the triples of +graph+. The including object must provide
    # +uri+ (used by +graph+ and +rights+).
    module RDFa

      # Can a dcat:Dataset be found in the RDFa at +instance.uri+?
      #
      # @param instance [#uri] the object to test
      # @return [String, false] the URI of the first dcat:Dataset found
      #   (truthy), or +false+ when loading or querying raises, which includes
      #   the case where the query has no solutions
      def self.supported?(instance)
        graph = RDF::Graph.load(instance.uri, :format => :rdfa)

        query = RDF::Query.new({
          :dataset => {
            RDF.type => RDF::Vocabulary.new("http://www.w3.org/ns/dcat#").Dataset
          }
        })

        query.execute(graph)[0][:dataset].to_s
      rescue StandardError
        false
      end

      # The publishing format for the dataset.
      # @return [Symbol] +:rdfa+
      # @see Dataset#publishing_format
      def publishing_format
        :rdfa
      end

      # A list of maintainers. Always empty for this format.
      #
      # @see Dataset#maintainers
      def maintainers
        []
      end

      # A list of publishers, one {Agent} per dct:publisher value, described
      # by the publisher's foaf:name, foaf:homepage and foaf:mbox.
      #
      # @return [Array<Agent>] empty when there is no publisher or on any error
      # @see Dataset#publishers
      def publishers
        uris = metadata[dataset_uri][RDF::DC.publisher.to_s]
        uris.map do |publisher_uri|
          Agent.new(:name => first_value(publisher_uri, RDF::FOAF.name),
                    :homepage => first_value(publisher_uri, RDF::FOAF.homepage),
                    :mbox => first_value(publisher_uri, RDF::FOAF.mbox))
        end
      rescue StandardError
        []
      end

      # The rights statement for the data.
      #
      # When the graph says nothing about the dct:rights resource, only its
      # URI is returned; otherwise the ODRS properties of that resource are
      # read into the {Rights} object.
      #
      # @return [Rights, nil] +nil+ when there is no dct:rights or on any error
      # @see Dataset#rights
      def rights
        rights_uri = metadata[dataset_uri][RDF::DC.rights.to_s][0]
        return Rights.new(:uri => rights_uri) unless metadata[rights_uri]

        # NOTE(review): this branch passes +uri+ (the document URI), not
        # +rights_uri+, as the statement URI. Kept as in the original; confirm
        # that this is intended.
        Rights.new(:uri => uri,
                   :dataLicense => first_value(rights_uri, odrs.dataLicense),
                   :contentLicense => first_value(rights_uri, odrs.contentLicense),
                   :copyrightNotice => first_value(rights_uri, odrs.copyrightNotice),
                   :attributionURL => first_value(rights_uri, odrs.attributionURL),
                   :attributionText => first_value(rights_uri, odrs.attributionText),
                   :copyrightHolder => first_value(rights_uri, odrs.copyrightHolder),
                   :databaseRightHolder => first_value(rights_uri, odrs.databaseRightHolder),
                   :copyrightYear => first_value(rights_uri, odrs.copyrightYear),
                   :databaseRightYear => first_value(rights_uri, odrs.databaseRightYear),
                   :copyrightStatement => first_value(rights_uri, odrs.copyrightStatement),
                   :databaseRightStatement => first_value(rights_uri, odrs.databaseRightStatement))
      rescue StandardError
        nil
      end

      # A list of licenses, one {License} per dct:license value, named by the
      # dct:title of the license resource.
      #
      # @return [Array<License>] empty when there is no license or on any error
      # @see Dataset#licenses
      def licenses
        uris = metadata[dataset_uri][RDF::DC.license.to_s]
        return [] if uris.nil?

        uris.map do |license_uri|
          License.new(:uri => license_uri, :name => first_value(license_uri, RDF::DC.title))
        end
      rescue StandardError
        []
      end

      # A list of contributors. Always empty for this format.
      #
      # @see Dataset#contributors
      def contributors
        []
      end

      # A list of distributions, one per dcat:distribution value, each built
      # from the distribution's dct:title and dcat:accessURL.
      #
      # @return [Array<Distribution>] empty when there is none or on any error
      # @see Dataset#distributions
      def distributions
        uris = metadata[dataset_uri][dcat.distribution.to_s]
        uris.map do |distribution_uri|
          distribution = {
            :title => first_value(distribution_uri, RDF::DC.title),
            :accessURL => first_value(distribution_uri, dcat.accessURL)
          }
          Distribution.new(self, dcat_resource: distribution)
        end
      rescue StandardError
        []
      end

      # The human-readable title of the dataset (dct:title).
      #
      # @return [String, nil] +nil+ when missing or on any error
      # @see Dataset#data_title
      def data_title
        first_value(dataset_uri, dct.title)
      rescue StandardError
        nil
      end

      # A brief description of the dataset (dct:description).
      #
      # @return [String, nil] +nil+ when missing or on any error
      # @see Dataset#description
      def description
        first_value(dataset_uri, dct.description)
      rescue StandardError
        nil
      end

      # Keywords for the dataset (dcat:keyword).
      #
      # @return [Array<String>] a copy of the keyword list, so callers cannot
      #   modify the cached metadata; empty when missing or on any error
      # @see Dataset#keywords
      def keywords
        values = metadata[dataset_uri][dcat.keyword.to_s]
        values ? values.dup : []
      rescue StandardError
        []
      end

      # Where the data is sourced from. Always empty for this format.
      #
      # @see Dataset#sources
      def sources
        []
      end

      # How frequently the data is updated.
      #
      # DCAT expresses this with the Dublin Core property
      # dct:accrualPeriodicity, so that is read first. The IRI the original
      # code looked up, in the DCAT namespace, is still consulted afterwards
      # so that data which relied on it keeps working.
      #
      # @see Dataset#update_frequency
      def update_frequency
        first_value(dataset_uri, dct.accrualPeriodicity) ||
          first_value(dataset_uri, dcat.accrualPeriodicity)
      end

      # Date the dataset was issued: dct:issued, falling back to dct:created.
      #
      # @return [Date, nil] +nil+ when neither property is present
      def issued
        date = first_value(dataset_uri, RDF::DC.issued) ||
               first_value(dataset_uri, RDF::DC.created)
        Date.parse(date) if date
      end

      # Date the dataset was last modified (dct:modified).
      #
      # @return [Date, nil] +nil+ when the property is not present
      def modified
        date = first_value(dataset_uri, RDF::DC.modified)
        Date.parse(date) if date
      end

      private

      # The RDFa graph loaded from +uri+, fetched on first use.
      def graph
        @graph ||= RDF::Graph.load(uri, :format => :rdfa)
      end

      # First value of +property+ on +resource+, or +default+ when the
      # resource or the property is unknown.
      def first_value(resource, property, default=nil)
        values = metadata[resource] && metadata[resource][property.to_s]
        values ? values[0] : default
      end

      # Index of the graph as { subject => { predicate => [objects] } }, with
      # every term converted to a String and duplicate objects dropped.
      #
      # The index is built once and cached: every accessor goes through this
      # method, often several times per call, and the graph it is built from
      # is itself memoized. If building raises, nothing is cached and the next
      # call tries again.
      def metadata
        @metadata ||= graph.triples.each_with_object({}) do |triple, index|
          subject = triple[0].to_s
          predicate = triple[1].to_s
          object = triple[2].to_s

          values = ((index[subject] ||= {})[predicate] ||= [])
          values << object unless values.include?(object)
        end
      end

      # URI of the first dcat:Dataset in the graph. Raises when the query has
      # no solutions; the rescuing accessors above turn that into nil or [].
      def dataset_uri
        query = RDF::Query.new({
          :dataset => {
            RDF.type => dcat.Dataset
          }
        })

        query.execute(graph)[0][:dataset].to_s
      end

      # DCAT vocabulary.
      def dcat
        RDF::Vocabulary.new("http://www.w3.org/ns/dcat#")
      end

      # Dublin Core terms vocabulary.
      def dct
        RDF::Vocabulary.new("http://purl.org/dc/terms/")
      end

      # Open Data Rights Statement vocabulary.
      def odrs
        RDF::Vocabulary.new("http://schema.theodi.org/odrs#")
      end

      # VoID vocabulary.
      def void
        RDF::Vocabulary.new("http://rdfs.org/ns/void#")
      end

    end

  end

end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
module DataKitten

  # A rights statement for a {Dataset} or {Distribution}.
  #
  # A plain value object: each attribute is copied unchanged from the options
  # hash given to the constructor and is +nil+ when its key is absent.
  class Rights

    # @!attribute uri
    #   @return [String] the URI for the rights statement
    attr_accessor :uri

    # @!attribute dataLicense
    #   @return [String] the license for the data in the dataset.
    attr_accessor :dataLicense

    # @!attribute contentLicense
    #   @return [String] the license for the content in the dataset.
    attr_accessor :contentLicense

    # @!attribute copyrightNotice
    #   @return [String] the copyright notice for the dataset.
    attr_accessor :copyrightNotice

    # @!attribute attributionURL
    #   @return [String] the attribution URL for the dataset.
    attr_accessor :attributionURL

    # @!attribute attributionText
    #   @return [String] the attribution text for the dataset.
    attr_accessor :attributionText

    # @!attribute copyrightHolder
    #   @return [String] the URI of the organization that holds copyright for this dataset
    attr_accessor :copyrightHolder

    # @!attribute databaseRightHolder
    #   @return [String] the URI of the organization that owns the database rights for this dataset
    attr_accessor :databaseRightHolder

    # @!attribute copyrightYear
    #   @return [String] the year in which copyright is claimed
    attr_accessor :copyrightYear

    # @!attribute databaseRightYear
    #   @return [String] the year in which the database right is claimed
    attr_accessor :databaseRightYear

    # @!attribute copyrightStatement
    #   @return [String] the URL of a copyright statement for the dataset
    attr_accessor :copyrightStatement

    # @!attribute databaseRightStatement
    #   @return [String] the URL of a database right statement for the dataset
    attr_accessor :databaseRightStatement

    # Create a new Rights object.
    #
    # @param options [Hash] A set of options with which to initialise the
    #   rights statement. Keys are symbols; missing keys leave the attribute
    #   +nil+. May be omitted entirely.
    # @option options [String] :uri the URI for the rights statement
    # @option options [String] :dataLicense the license for the data in the dataset
    # @option options [String] :contentLicense the license for the content in the dataset
    # @option options [String] :copyrightNotice the copyright notice for the dataset
    # @option options [String] :attributionURL the attribution URL for the dataset
    # @option options [String] :attributionText attribution name for the dataset
    # @option options [String] :copyrightHolder the URI of the copyright holder
    # @option options [String] :databaseRightHolder the URI of the database right holder
    # @option options [String] :copyrightYear the year in which copyright is claimed
    # @option options [String] :databaseRightYear the year in which the database right is claimed
    # @option options [String] :copyrightStatement the URL of a copyright statement
    # @option options [String] :databaseRightStatement the URL of a database right statement
    def initialize(options = {})
      @uri = options[:uri]
      @dataLicense = options[:dataLicense]
      @contentLicense = options[:contentLicense]
      @copyrightNotice = options[:copyrightNotice]
      @attributionURL = options[:attributionURL]
      @attributionText = options[:attributionText]
      @copyrightHolder = options[:copyrightHolder]
      @databaseRightHolder = options[:databaseRightHolder]
      @copyrightYear = options[:copyrightYear]
      @databaseRightYear = options[:databaseRightYear]
      @copyrightStatement = options[:copyrightStatement]
      @databaseRightStatement = options[:databaseRightStatement]
    end

  end

end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module DataKitten

  # Where the data has been sourced from.
  # Follows the pattern of {http://purl.org/dc/terms/source} with a {http://www.w3.org/2000/01/rdf-schema#label} and a {http://www.w3.org/1999/02/22-rdf-syntax-ns#resource}, and with useful aliases for other vocabularies.
  class Source

    # Create a new Source
    #
    # @param [Hash] options the details of the Source. Keys are symbols;
    #   missing keys leave the attribute +nil+. May be omitted entirely.
    # @option options [String] :label The name of the Source
    # @option options [String] :resource The URI of the Source
    #
    def initialize(options = {})
      @label = options[:label]
      @resource = options[:resource]
    end

    # @!attribute label
    #   @return [String] the name of the Source
    attr_accessor :label
    # +name+ is a read-only alias for +label+.
    alias_method :name, :label

    # @!attribute resource
    #   @return [String] the URI of the Source
    attr_accessor :resource
    # +web+ is a read-only alias for +resource+.
    alias_method :web, :resource

  end

end
|