data_kitten 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/LICENSE.md +20 -0
- data/README.md +73 -0
- data/bin/data_kitten +22 -0
- data/lib/data_kitten.rb +43 -0
- data/lib/data_kitten/agent.rb +38 -0
- data/lib/data_kitten/dataset.rb +227 -0
- data/lib/data_kitten/distribution.rb +156 -0
- data/lib/data_kitten/distribution_format.rb +73 -0
- data/lib/data_kitten/hosts.rb +23 -0
- data/lib/data_kitten/hosts/bitbucket.rb +54 -0
- data/lib/data_kitten/hosts/gist.rb +50 -0
- data/lib/data_kitten/hosts/github.rb +54 -0
- data/lib/data_kitten/license.rb +39 -0
- data/lib/data_kitten/origins.rb +28 -0
- data/lib/data_kitten/origins/git.rb +66 -0
- data/lib/data_kitten/origins/html.rb +32 -0
- data/lib/data_kitten/origins/linked_data.rb +37 -0
- data/lib/data_kitten/origins/web_service.rb +30 -0
- data/lib/data_kitten/publishing_formats.rb +28 -0
- data/lib/data_kitten/publishing_formats/ckan.rb +187 -0
- data/lib/data_kitten/publishing_formats/datapackage.rb +169 -0
- data/lib/data_kitten/publishing_formats/linked_data.rb +102 -0
- data/lib/data_kitten/publishing_formats/rdfa.rb +239 -0
- data/lib/data_kitten/rights.rb +80 -0
- data/lib/data_kitten/source.rb +31 -0
- data/lib/data_kitten/temporal.rb +27 -0
- data/lib/data_kitten/version.rb +3 -0
- metadata +242 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MGU0NTIxNWM3YjhlNDU1MDNiYTk2YzkwYzE3ZjZmODU5YWEyMDE4NQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
YWNhOTQ2ODVjZTI0MTUxYmI3MjRhNjZlODBkNjQ5NmZjNjc5MTI1NA==
|
7
|
+
SHA512:
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
Zjc4OWRjYmRmYzcxNzJjZmNkMGVmMmRhZmIyODVmMTAwZmJmZGZlNzdlZmNl
|
10
|
+
Yjk5M2ZlYzU4NTllMTcyYzdkNTk4NTJkMGQ2NzcxMDhhY2MzNjZiOTcwZWU1
|
11
|
+
M2NhOWYwY2Y5M2NhZWRhZjUyM2M5ODg2YjM5NDVkYTlhZDgyMzM=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
OWUyMjlhYzA1YzA0NDY4ZGE1OTNjMTA3ZDA2MjMyNDkzYmVlZDY3N2ZlYzIw
|
14
|
+
MWNmOTNmZjZkNzM2YzE0OTYxZjRkNWMzOGU5NmMwMWRjMTM0NTFkYTZiYmY4
|
15
|
+
YzFjMzljNzcyYzU3ZTQ3YmQzZDJjNWFiOWVhZmNlYjU3NGNmODA=
|
data/LICENSE.md
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright 2013 The Open Data Institute
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
[Build Status](https://travis-ci.org/theodi/data_kitten)
|
2
|
+
[Dependency Status](https://gemnasium.com/theodi/data_kitten)
|
3
|
+
[Coverage Status](https://coveralls.io/r/theodi/data_kitten)
|
4
|
+
[Code Climate](https://codeclimate.com/github/theodi/data_kitten)
|
5
|
+
[Gem Version](https://rubygems.org/gems/data_kitten)
|
6
|
+
[License](http://theodi.mit-license.org)
|
7
|
+
[Badges](https://github.com/pikesley/badger)
|
8
|
+
|
9
|
+
# data_kitten
|
10
|
+
|
11
|
+

|
12
|
+
|
13
|
+
A collection of classes that represent Datasets and other concepts, modeled on [DCAT](http://www.w3.org/TR/vocab-dcat/)
|
14
|
+
|
15
|
+
The module is designed to automatically interrogate data sources and give back data
|
16
|
+
and metadata in a consistent format. The best starting place is probably by having a look at `Dataset`.
|
17
|
+
|
18
|
+
It is designed to handle data from multiple `Origins` (such as git repositories, local files, remote URLs),
|
19
|
+
`Hosts` (GitHub, etc), and `PublishingFormats` (DataPackage, RDFa, microdata, DSPL, etc).
|
20
|
+
|
21
|
+
Currently supports Datapackages in git repositories (including but not limited to GitHub repos).
|
22
|
+
Wider support will follow.
|
23
|
+
|
24
|
+
# Documentation
|
25
|
+
|
26
|
+
Full YARD documentation is available on [Rubydoc.info](http://rubydoc.info/github/theodi/data_kitten/master/frames).
|
27
|
+
|
28
|
+
# Licence
|
29
|
+
|
30
|
+
This code is open source under the MIT license. See the LICENSE.md file for full details.
|
31
|
+
|
32
|
+
# Requirements
|
33
|
+
|
34
|
+
* Git ~> 1.2.6
|
35
|
+
|
36
|
+
# Usage
|
37
|
+
|
38
|
+
Pop the gem into your Gemfile:
|
39
|
+
|
40
|
+
gem 'data_kitten', :git => "https://github.com/theodi/data_kitten.git"
|
41
|
+
|
42
|
+
Require if you need to:
|
43
|
+
|
44
|
+
require 'data_kitten'
|
45
|
+
|
46
|
+
Request a dataset:
|
47
|
+
|
48
|
+
dataset = DataKitten::Dataset.new(access_url: "https://github.com/theodi/dataset-mod-disposals.git")
|
49
|
+
|
50
|
+
Use the results:
|
51
|
+
|
52
|
+
dataset.supported?
|
53
|
+
dataset.origin
|
54
|
+
dataset.host
|
55
|
+
dataset.data_title
|
56
|
+
dataset.documentation_url
|
57
|
+
dataset.release_type
|
58
|
+
dataset.time_sensitive?
|
59
|
+
dataset.publishing_format
|
60
|
+
dataset.maintainers
|
61
|
+
dataset.publishers
|
62
|
+
dataset.licenses
|
63
|
+
dataset.contributors
|
64
|
+
dataset.crowdsourced?
|
65
|
+
dataset.contributor_agreement_url
|
66
|
+
dataset.distributions
|
67
|
+
dataset.change_history
|
68
|
+
|
69
|
+
# And more to come!
|
70
|
+
|
71
|
+
See example usage in a Rails project at [https://github.com/theodi/git-data-viewer](https://github.com/theodi/git-data-viewer)
|
72
|
+
|
73
|
+

|
data/bin/data_kitten
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line entry point for data_kitten.
#
# Usage: data_kitten <access_url>
#
# Builds a DataKitten::Dataset for <access_url> and prints each
# dataset-specific public method together with its pretty-printed value.
# Exits with status 1 when no URL is supplied or when no publishing format
# could be determined for the dataset.

# Put the sibling lib directory on the load path so the script also works
# when run straight from a source checkout.
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'data_kitten'
require 'pp'

if ARGV.empty?
  puts "Usage: data_kitten <access_url>"
  exit 1
end

dataset = DataKitten::Dataset.new(access_url: ARGV[0])

if dataset.publishing_format.nil?
  puts "Unable to determine format for dataset metadata"
  exit 1
end

# Remove the methods every Object instance already has.
# Object.public_instance_methods is used rather than Object.public_methods:
# the latter lists the methods of the Object *class object* (which also
# include Module/Class methods such as #name) and would therefore hide any
# dataset reader that happens to share one of those names.
reportable = dataset.public_methods - Object.public_instance_methods

# Skip writers (anything containing "=") so that only readers are invoked.
reportable.sort.reject { |name| name.to_s.include?('=') }.each do |name|
  puts "#{name}: #{dataset.send(name).pretty_inspect}"
end
|
22
|
+
|
data/lib/data_kitten.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'uri'
|
3
|
+
require 'cgi'
|
4
|
+
require 'git'
|
5
|
+
require 'json'
|
6
|
+
require 'rest-client'
|
7
|
+
require 'rdf'
|
8
|
+
require 'linkeddata'
|
9
|
+
require 'nokogiri'
|
10
|
+
require 'uri'
|
11
|
+
require 'curb'
|
12
|
+
require 'datapackage'
|
13
|
+
|
14
|
+
require 'data_kitten/license'
|
15
|
+
require 'data_kitten/rights'
|
16
|
+
require 'data_kitten/agent'
|
17
|
+
require 'data_kitten/source'
|
18
|
+
require 'data_kitten/temporal'
|
19
|
+
require 'data_kitten/dataset'
|
20
|
+
require 'data_kitten/distribution_format'
|
21
|
+
require 'data_kitten/distribution'
|
22
|
+
|
23
|
+
# A collection of classes that represent Datasets and other concepts, modeled on {http://www.w3.org/TR/vocab-dcat/ DCAT}.
|
24
|
+
#
|
25
|
+
# The module is designed to automatically interrogate data sources and give back data and metadata in a consistent
|
26
|
+
# format. The best starting place is probably by having a look at {Dataset}.
|
27
|
+
#
|
28
|
+
# It is designed to handle data from multiple {Origins} (such as git repositories, local files, remote URLs),
|
29
|
+
# {Hosts} (GitHub, etc), and {PublishingFormats} (DataPackage, RDFa, microdata, DSPL, etc).
|
30
|
+
#
|
31
|
+
# Currently supports Datapackages in git repositories (including but not limited to GitHub repos). Wider support will follow.
|
32
|
+
#
|
33
|
+
# https://gs1.wac.edgecastcdn.net/8019B6/data.tumblr.com/67399f2b335ef62d562dc9eb41c0db16/tumblr_mmy9g7rA8M1s4aj1ho1_500.jpg
|
34
|
+
#
|
35
|
+
# @example Load a Dataset from a git repository
|
36
|
+
# dataset = Dataset.new(access_url: 'git://github.com/theodi/dataset-metadata-survey.git')
|
37
|
+
# dataset.supported? # => true
|
38
|
+
# dataset.origin # => :git
|
39
|
+
# dataset.host # => :github
|
40
|
+
# dataset.publishing_format # => :datapackage
|
41
|
+
# dataset.distributions # => [Distribution<#1>, Distribution<#2>]
|
42
|
+
# dataset.distributions[0].headers # => ['col1', 'col2']
|
43
|
+
# dataset.distributions[0].data[0] # => {'col1' => 'value_1', 'col2' => 'value_2'}
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module DataKitten

  # A person or organisation.
  #
  # Naming is based on {http://xmlns.com/foaf/spec/#term_Agent foaf:Agent}, but with useful aliases for other vocabularies.
  class Agent

    # @!attribute name
    #   @return [String] the name of the Agent
    attr_accessor :name

    # @!attribute homepage
    #   @return [String] the homepage URL of the Agent
    attr_accessor :homepage
    # Reader-only aliases: #url and #uri return the same value as #homepage.
    alias_method :url, :homepage
    alias_method :uri, :homepage

    # @!attribute mbox
    #   @return [String] the email address of the Agent
    attr_accessor :mbox
    # Reader-only alias: #email returns the same value as #mbox.
    alias_method :email, :mbox

    # Create a new Agent
    #
    # Every option is optional; an attribute whose key is absent is left as +nil+.
    # The hash itself may be omitted, in which case all attributes start as +nil+.
    #
    # @param [Hash] options the details of the Agent.
    # @option options [String] :name The Agent's name
    # @option options [String] :homepage The homepage URL for the Agent
    # @option options [String] :mbox Email address for the Agent
    #
    def initialize(options = {})
      @name     = options[:name]
      @homepage = options[:homepage]
      @mbox     = options[:mbox]
    end

  end

end
|
@@ -0,0 +1,227 @@
|
|
1
|
+
require 'data_kitten/origins'
|
2
|
+
require 'data_kitten/hosts'
|
3
|
+
require 'data_kitten/publishing_formats'
|
4
|
+
|
5
|
+
module DataKitten

  # Represents a single dataset from some origin (see {http://www.w3.org/TR/vocab-dcat/#class-dataset dcat:Dataset}
  # for relevant vocabulary).
  #
  # Designed to be created with a URI to the dataset, and then to work out metadata from there.
  #
  # Currently supports Datasets hosted in Git (and optionally on GitHub), and which
  # use the Datapackage metadata format.
  #
  # Every metadata reader defined directly on this class is a neutral default
  # (+nil+, +false+ or an empty Array).
  # NOTE(review): the real values are presumably supplied by origin-, host- and
  # format-specific modules selected during initialisation - confirm against
  # DataKitten::Origins, DataKitten::Hosts and DataKitten::PublishingFormats.
  #
  # @example Load a Dataset from a git repository
  #   dataset = Dataset.new(access_url: 'git://github.com/theodi/dataset-metadata-survey.git')
  #   dataset.supported?         # => true
  #   dataset.origin             # => :git
  #   dataset.host               # => :github
  #   dataset.publishing_format  # => :datapackage
  #
  class Dataset

    include DataKitten::Origins
    include DataKitten::Hosts
    include DataKitten::PublishingFormats

    # @!attribute access_url
    #   @return [String] the URL that gives access to the dataset
    attr_accessor :access_url
    # Reader-only aliases for #access_url.
    alias_method :uri, :access_url
    alias_method :url, :access_url

    # Create a new Dataset object
    #
    # Stores the access URL and then runs origin, host and publishing-format
    # detection, in that order. The three +detect_*+ helpers are not defined in
    # this class; presumably they come from the modules included above.
    #
    # @param [Hash] options the details of the Dataset.
    # @option options [String] :access_url A URL that can be used to access the Dataset.
    #                                      The class will attempt to auto-load metadata from this URL.
    #
    def initialize(options)
      @access_url = options[:access_url]
      detect_origin
      detect_host
      detect_publishing_format
    end

    # Can metadata be loaded for this Dataset?
    #
    # @return [Boolean] true if metadata can be loaded, false if it's
    #                   an unknown origin type, or has an unknown metadata format.
    def supported?
      !(origin.nil? || publishing_format.nil?)
    end

    # The origin type of the dataset.
    #
    # @return [Symbol] The origin type. For instance, datasets loaded from git
    #                  repositories will return +:git+. If no origin type is
    #                  identified, will return +nil+.
    def origin
      nil
    end

    # Where the dataset is hosted.
    #
    # @return [Symbol] The host. For instance, data loaded from github repositories
    #                  will return +:github+. This can be used to control extra host-specific
    #                  behaviour if required. If no host type is identified, will return +nil+.
    def host
      nil
    end

    # The human-readable title of the dataset.
    #
    # @return [String, nil] the title of the dataset; +nil+ by default.
    def data_title
      nil
    end

    # A brief description of the dataset
    #
    # @return [String, nil] the description of the dataset; +nil+ by default.
    def description
      nil
    end

    # Keywords for the dataset
    #
    # @return [Array<String>] an array of keywords; empty by default.
    def keywords
      []
    end

    # Human-readable documentation for the dataset.
    #
    # @return [String, nil] the URL of the documentation; +nil+ by default.
    def documentation_url
      nil
    end

    # What type of dataset is this?
    # Options are: +:web_service+ for API-accessible data, or +:one_off+ for downloadable data dumps.
    #
    # @return [Symbol, nil] the release type; +nil+ when it is not known, in
    #                       line with the other non-predicate defaults.
    def release_type
      nil
    end

    # Date the dataset was released
    #
    # @return [Date, nil] the release date of the dataset; +nil+ by default.
    def issued
      nil
    end

    # Alias for {#issued}.
    #
    # Delegates at call time instead of using +alias_method+ (which binds to this
    # class's default implementation), so an overriding #issued - for example
    # from a module extended onto the instance - is honoured.
    #
    # @return [Date, nil] the release date of the dataset
    def release_date
      issued
    end

    # Date the dataset was last modified
    #
    # @return [Date, nil] the dataset's last modified date; +nil+ by default.
    def modified
      nil
    end

    # The temporal coverage of the dataset
    #
    # @return [Object<Temporal>, nil] the start and end dates of the dataset's temporal coverage;
    #                                 +nil+ by default.
    def temporal
      nil
    end

    # Where the data is sourced from
    #
    # @return [Array<Source>] the sources of the data, each as a Source object; empty by default.
    def sources
      []
    end

    # Is the information time-sensitive?
    #
    # @return [Boolean] whether the information will go out of date; +false+ by default.
    def time_sensitive?
      false
    end

    # The publishing format for the dataset.
    #
    # @return [Symbol] The format. For instance, datasets that publish metadata in
    #                  Datapackage format will return +:datapackage+. If no format
    #                  is identified, will return +nil+.
    def publishing_format
      nil
    end

    # A list of maintainers
    #
    # @return [Array<Agent>] An array of maintainers, each as an Agent object; empty by default.
    def maintainers
      []
    end

    # A list of publishers
    #
    # @return [Array<Agent>] An array of publishers, each as an Agent object; empty by default.
    def publishers
      []
    end

    # A list of licenses
    #
    # @return [Array<License>] An array of licenses, each as a License object; empty by default.
    def licenses
      []
    end

    # The rights statement for the data
    #
    # @return [Object<Rights>, nil] How the content and data can be used, as well as copyright notice
    #                               and attribution URL; +nil+ by default.
    def rights
      nil
    end

    # A list of contributors
    #
    # @return [Array<Agent>] An array of contributors to the dataset, each as an Agent object;
    #                        empty by default.
    def contributors
      []
    end

    # Has the data been crowdsourced?
    #
    # @return [Boolean] Whether the data has been crowdsourced or not; +false+ by default.
    def crowdsourced?
      false
    end

    # The URL of the contributor license agreement
    #
    # @return [String, nil] A URL for the agreement that contributors accept; +nil+ by default.
    def contributor_agreement_url
      nil
    end

    # A list of distributions. Has aliases for popular alternative vocabularies
    # ({#files} and {#resources}).
    #
    # @return [Array<Distribution>] An array of Distribution objects; empty by default.
    def distributions
      []
    end

    # Alias for {#distributions}.
    #
    # Delegates at call time rather than via +alias_method+, so that an
    # overriding #distributions is honoured.
    #
    # @return [Array<Distribution>] An array of Distribution objects.
    def files
      distributions
    end

    # Alias for {#distributions}.
    #
    # Delegates at call time rather than via +alias_method+, so that an
    # overriding #distributions is honoured.
    #
    # @return [Array<Distribution>] An array of Distribution objects.
    def resources
      distributions
    end

    # How frequently the data is updated.
    #
    # @return [String, nil] The frequency of update expressed as a dct:Frequency; +nil+ by default.
    def update_frequency
      nil
    end

    # A history of changes to the Dataset
    #
    # @return [Array] An array of changes. Exact format depends on the origin and publishing format.
    #                 Empty by default.
    def change_history
      []
    end

  end
end
|