data_kitten 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/LICENSE.md +20 -0
- data/README.md +73 -0
- data/bin/data_kitten +22 -0
- data/lib/data_kitten.rb +43 -0
- data/lib/data_kitten/agent.rb +38 -0
- data/lib/data_kitten/dataset.rb +227 -0
- data/lib/data_kitten/distribution.rb +156 -0
- data/lib/data_kitten/distribution_format.rb +73 -0
- data/lib/data_kitten/hosts.rb +23 -0
- data/lib/data_kitten/hosts/bitbucket.rb +54 -0
- data/lib/data_kitten/hosts/gist.rb +50 -0
- data/lib/data_kitten/hosts/github.rb +54 -0
- data/lib/data_kitten/license.rb +39 -0
- data/lib/data_kitten/origins.rb +28 -0
- data/lib/data_kitten/origins/git.rb +66 -0
- data/lib/data_kitten/origins/html.rb +32 -0
- data/lib/data_kitten/origins/linked_data.rb +37 -0
- data/lib/data_kitten/origins/web_service.rb +30 -0
- data/lib/data_kitten/publishing_formats.rb +28 -0
- data/lib/data_kitten/publishing_formats/ckan.rb +187 -0
- data/lib/data_kitten/publishing_formats/datapackage.rb +169 -0
- data/lib/data_kitten/publishing_formats/linked_data.rb +102 -0
- data/lib/data_kitten/publishing_formats/rdfa.rb +239 -0
- data/lib/data_kitten/rights.rb +80 -0
- data/lib/data_kitten/source.rb +31 -0
- data/lib/data_kitten/temporal.rb +27 -0
- data/lib/data_kitten/version.rb +3 -0
- metadata +242 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MGU0NTIxNWM3YjhlNDU1MDNiYTk2YzkwYzE3ZjZmODU5YWEyMDE4NQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
YWNhOTQ2ODVjZTI0MTUxYmI3MjRhNjZlODBkNjQ5NmZjNjc5MTI1NA==
|
7
|
+
SHA512:
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
Zjc4OWRjYmRmYzcxNzJjZmNkMGVmMmRhZmIyODVmMTAwZmJmZGZlNzdlZmNl
|
10
|
+
Yjk5M2ZlYzU4NTllMTcyYzdkNTk4NTJkMGQ2NzcxMDhhY2MzNjZiOTcwZWU1
|
11
|
+
M2NhOWYwY2Y5M2NhZWRhZjUyM2M5ODg2YjM5NDVkYTlhZDgyMzM=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
OWUyMjlhYzA1YzA0NDY4ZGE1OTNjMTA3ZDA2MjMyNDkzYmVlZDY3N2ZlYzIw
|
14
|
+
MWNmOTNmZjZkNzM2YzE0OTYxZjRkNWMzOGU5NmMwMWRjMTM0NTFkYTZiYmY4
|
15
|
+
YzFjMzljNzcyYzU3ZTQ3YmQzZDJjNWFiOWVhZmNlYjU3NGNmODA=
|
data/LICENSE.md
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright 2013 The Open Data Institute
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
[![Build Status](http://img.shields.io/travis/theodi/data_kitten.svg)](https://travis-ci.org/theodi/data_kitten)
|
2
|
+
[![Dependency Status](http://img.shields.io/gemnasium/theodi/data_kitten.svg)](https://gemnasium.com/theodi/data_kitten)
|
3
|
+
[![Coverage Status](http://img.shields.io/coveralls/theodi/data_kitten.svg)](https://coveralls.io/r/theodi/data_kitten)
|
4
|
+
[![Code Climate](http://img.shields.io/codeclimate/github/theodi/data_kitten.svg)](https://codeclimate.com/github/theodi/data_kitten)
|
5
|
+
[![Gem Version](http://img.shields.io/gem/v/data_kitten.svg)](https://rubygems.org/gems/data_kitten)
|
6
|
+
[![License](http://img.shields.io/:license-mit-blue.svg)](http://theodi.mit-license.org)
|
7
|
+
[![Badges](http://img.shields.io/:badges-7/7-ff6799.svg)](https://github.com/pikesley/badger)
|
8
|
+
|
9
|
+
# data_kitten
|
10
|
+
|
11
|
+
![DATAS - I HAZ THEM](https://gs1.wac.edgecastcdn.net/8019B6/data.tumblr.com/67399f2b335ef62d562dc9eb41c0db16/tumblr_mmy9g7rA8M1s4aj1ho1_500.jpg)
|
12
|
+
|
13
|
+
A collection of classes that represent Datasets and other concepts, modeled on [DCAT](http://www.w3.org/TR/vocab-dcat/)
|
14
|
+
|
15
|
+
The module is designed to automatically interrogate data sources and give back data
|
16
|
+
and metadata in a consistent format. The best starting place is probably by having a look at `Dataset`.
|
17
|
+
|
18
|
+
It is designed to handle data from multiple `Origins` (such as git repositories, local files, remote URLs),
|
19
|
+
`Hosts` (GitHub, etc), and `PublishingFormats` (DataPackage, RDFa, microdata, DSPL, etc).
|
20
|
+
|
21
|
+
Currently supports Datapackages in git repositories (including but not limited to GitHub repos).
|
22
|
+
Wider support will follow.
|
23
|
+
|
24
|
+
# Documentation
|
25
|
+
|
26
|
+
Full YARD documentation is available on [Rubydoc.info](http://rubydoc.info/github/theodi/data_kitten/master/frames).
|
27
|
+
|
28
|
+
# Licence
|
29
|
+
|
30
|
+
This code is open source under the MIT license. See the LICENSE.md file for full details.
|
31
|
+
|
32
|
+
# Requirements
|
33
|
+
|
34
|
+
* Git ~> 1.2.6
|
35
|
+
|
36
|
+
# Usage
|
37
|
+
|
38
|
+
Pop the gem into your Gemfile:
|
39
|
+
|
40
|
+
gem 'data_kitten', :git => "git://github.com/theodi/data_kitten.git"
|
41
|
+
|
42
|
+
Require if you need to:
|
43
|
+
|
44
|
+
require 'data_kitten'
|
45
|
+
|
46
|
+
Request a dataset:
|
47
|
+
|
48
|
+
dataset = DataKitten::Dataset.new(access_url: "https://github.com/theodi/dataset-mod-disposals.git")
|
49
|
+
|
50
|
+
Use the results:
|
51
|
+
|
52
|
+
dataset.supported?
|
53
|
+
dataset.origin
|
54
|
+
dataset.host
|
55
|
+
dataset.data_title
|
56
|
+
dataset.documentation_url
|
57
|
+
dataset.release_type
|
58
|
+
dataset.time_sensitive?
|
59
|
+
dataset.publishing_format
|
60
|
+
dataset.maintainers
|
61
|
+
dataset.publishers
|
62
|
+
dataset.licenses
|
63
|
+
dataset.contributors
|
64
|
+
dataset.crowdsourced?
|
65
|
+
dataset.contributor_agreement_url
|
66
|
+
dataset.distributions
|
67
|
+
dataset.change_history
|
68
|
+
|
69
|
+
# And more to come!
|
70
|
+
|
71
|
+
See example usage in a Rails project at [https://github.com/theodi/git-data-viewer](https://github.com/theodi/git-data-viewer)
|
72
|
+
|
73
|
+
![actual_data_kitten](http://i.imgur.com/wXZEkh7.gif)
|
data/bin/data_kitten
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line entry point: builds a DataKitten::Dataset for the access URL
# given as the first argument and prints each of its public attributes
# together with a pretty-printed value.
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'data_kitten'
require 'pp'

# The access URL is mandatory; show usage and exit non-zero when it is absent.
if ARGV.empty?
  puts "Usage: data_kitten <access_url>"
  exit 1
end

dataset = DataKitten::Dataset.new(access_url: ARGV[0])

# A nil publishing format means no metadata format was detected, so stop here.
if dataset.publishing_format.nil?
  puts "Unable to determine format for dataset metadata"
  exit 1
end

# Only report what Dataset adds on top of Object, and skip every method whose
# name contains "=" (for example the access_url= writer).
reportable = (dataset.public_methods - Object.public_methods).sort.reject do |name|
  name.to_s.include?('=')
end

reportable.each do |name|
  puts "#{name}: #{dataset.public_send(name).pretty_inspect}"
end
|
22
|
+
|
data/lib/data_kitten.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'uri'
|
3
|
+
require 'cgi'
|
4
|
+
require 'git'
|
5
|
+
require 'json'
|
6
|
+
require 'rest-client'
|
7
|
+
require 'rdf'
|
8
|
+
require 'linkeddata'
|
9
|
+
require 'nokogiri'
|
10
|
+
require 'uri'
|
11
|
+
require 'curb'
|
12
|
+
require 'datapackage'
|
13
|
+
|
14
|
+
require 'data_kitten/license'
|
15
|
+
require 'data_kitten/rights'
|
16
|
+
require 'data_kitten/agent'
|
17
|
+
require 'data_kitten/source'
|
18
|
+
require 'data_kitten/temporal'
|
19
|
+
require 'data_kitten/dataset'
|
20
|
+
require 'data_kitten/distribution_format'
|
21
|
+
require 'data_kitten/distribution'
|
22
|
+
|
23
|
+
# A collection of classes that represent Datasets and other concepts, modeled on {http://www.w3.org/TR/vocab-dcat/ DCAT}.
|
24
|
+
#
|
25
|
+
# The module is designed to automatically interrogate data sources and give back data and metadata in a consistent
|
26
|
+
# format. The best starting place is probably by having a look at {Dataset}.
|
27
|
+
#
|
28
|
+
# It is designed to handle data from multiple {Origins} (such as git repositories, local files, remote URLs),
|
29
|
+
# {Hosts} (GitHub, etc), and {PublishingFormats} (DataPackage, RDFa, microdata, DSPL, etc).
|
30
|
+
#
|
31
|
+
# Currently supports Datapackages in git repositories (including but not limited to GitHub repos). Wider support will follow.
|
32
|
+
#
|
33
|
+
# https://gs1.wac.edgecastcdn.net/8019B6/data.tumblr.com/67399f2b335ef62d562dc9eb41c0db16/tumblr_mmy9g7rA8M1s4aj1ho1_500.jpg
|
34
|
+
#
|
35
|
+
# @example Load a Dataset from a git repository
|
36
|
+
# dataset = Dataset.new(access_url: 'git://github.com/theodi/dataset-metadata-survey.git')
|
37
|
+
# dataset.supported? # => true
|
38
|
+
# dataset.origin # => :git
|
39
|
+
# dataset.host # => :github
|
40
|
+
# dataset.publishing_format # => :datapackage
|
41
|
+
# dataset.distributions # => [Distribution<#1>, Distribution<#2>]
|
42
|
+
# dataset.distributions[0].headers # => ['col1', 'col2']
|
43
|
+
# dataset.distributions[0].data[0] # => {'col1' => 'value_1', 'col2' => 'value_2'}
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module DataKitten

  # A person or organisation.
  #
  # Naming is based on {http://xmlns.com/foaf/spec/#term_Agent foaf:Agent}, but with useful aliases for other vocabularies.
  class Agent

    # Create a new Agent
    #
    # Every option is optional; an attribute whose option is not supplied is +nil+.
    #
    # @param [Hash] options the details of the Agent. Defaults to an empty Hash.
    # @option options [String] :name The Agent's name
    # @option options [String] :homepage The homepage URL for the Agent
    # @option options [String] :mbox Email address for the Agent
    #
    def initialize(options = {})
      @name     = options[:name]
      @homepage = options[:homepage]
      @mbox     = options[:mbox]
    end

    # @!attribute name
    #   @return [String] the name of the Agent
    attr_accessor :name

    # @!attribute homepage
    #   @return [String] the homepage URL of the Agent
    attr_accessor :homepage
    # Read-only aliases of the homepage reader.
    alias_method :url, :homepage
    alias_method :uri, :homepage

    # @!attribute mbox
    #   @return [String] the email address of the Agent
    attr_accessor :mbox
    # Read-only alias of the mbox reader.
    alias_method :email, :mbox

  end

end
|
@@ -0,0 +1,227 @@
|
|
1
|
+
require 'data_kitten/origins'
|
2
|
+
require 'data_kitten/hosts'
|
3
|
+
require 'data_kitten/publishing_formats'
|
4
|
+
|
5
|
+
module DataKitten

  # A single dataset obtained from some origin (see
  # {http://www.w3.org/TR/vocab-dcat/#class-dataset dcat:Dataset} for the vocabulary used).
  #
  # Construct it with a URL to the dataset; the constructor then runs origin,
  # host and publishing-format detection. Every metadata reader defined here
  # returns a neutral default (+nil+, +false+ or an empty Array).
  # NOTE(review): real values presumably come from the included Origins, Hosts
  # and PublishingFormats modules -- confirm against those files.
  #
  # @example Load a Dataset from a git repository
  #   dataset = Dataset.new(access_url: 'git://github.com/theodi/dataset-metadata-survey.git')
  #   dataset.supported?        # => true
  #   dataset.origin            # => :git
  #   dataset.host              # => :github
  #   dataset.publishing_format # => :datapackage
  #
  class Dataset

    include DataKitten::Origins
    include DataKitten::Hosts
    include DataKitten::PublishingFormats

    # @!attribute access_url
    #   @return [String] the URL that gives access to the dataset
    attr_accessor :access_url
    # +uri+ and +url+ are reader-only aliases of +access_url+.
    alias uri access_url
    alias url access_url

    # Build a Dataset and run detection against its access URL.
    #
    # @param [Hash] options the details of the Dataset.
    # @option options [String] :access_url A URL that can be used to access the Dataset.
    #                                      Stored before the detect_* steps run.
    #
    def initialize(options)
      @access_url = options[:access_url]
      # The detect_* methods are not defined in this class.
      # NOTE(review): presumably supplied by the included modules -- confirm.
      detect_origin
      detect_host
      detect_publishing_format
    end

    # Whether metadata can be loaded for this Dataset.
    #
    # @return [Boolean] true only when both an origin and a publishing format
    #                   are known; false if either is +nil+.
    def supported?
      !origin.nil? && !publishing_format.nil?
    end

    # The kind of origin the dataset was loaded from.
    #
    # @return [Symbol] the origin type, e.g. +:git+ for git repositories.
    #                  This default returns +nil+ (no origin identified).
    def origin
      nil
    end

    # Where the dataset is hosted.
    #
    # @return [Symbol] the host, e.g. +:github+; usable for host-specific
    #                  behaviour. This default returns +nil+ (no host identified).
    def host
      nil
    end

    # The dataset's human-readable title.
    #
    # @return [String] the title; +nil+ by default.
    def data_title
      nil
    end

    # A short description of the dataset.
    #
    # @return [String] the description; +nil+ by default.
    def description
      nil
    end

    # Keywords attached to the dataset.
    #
    # @return [Array<String>] the keywords; empty by default.
    def keywords
      []
    end

    # Human-readable documentation for the dataset.
    #
    # @return [String] the documentation URL; +nil+ by default.
    def documentation_url
      nil
    end

    # The release type of the dataset: +:web_service+ for API-accessible data,
    # or +:one_off+ for downloadable data dumps.
    #
    # NOTE(review): this default is +false+, whereas the other "unknown"
    # defaults in this class are +nil+ -- confirm whether that is intentional.
    #
    # @return [Symbol] the release type; +false+ by default.
    def release_type
      false
    end

    # The date on which the dataset was released.
    #
    # @return [Date] the release date; +nil+ by default.
    def issued
      nil
    end
    alias release_date issued

    # The date on which the dataset was last modified.
    #
    # @return [Date] the last-modified date; +nil+ by default.
    def modified
      nil
    end

    # The period of time the dataset covers.
    #
    # @return [Object<Temporal>] start and end dates of the coverage; +nil+ by default.
    def temporal
      nil
    end

    # Where the data was sourced from.
    #
    # @return [Array<Source>] the data sources, each a Source object; empty by default.
    def sources
      []
    end

    # Whether the information is time-sensitive.
    #
    # @return [Boolean] whether the information will go out of date; +false+ by default.
    def time_sensitive?
      false
    end

    # The format the dataset's metadata is published in.
    #
    # @return [Symbol] the format, e.g. +:datapackage+ for Datapackage metadata.
    #                  This default returns +nil+ (no format identified).
    def publishing_format
      nil
    end

    # The dataset's maintainers.
    #
    # @return [Array<Agent>] the maintainers, each an Agent object; empty by default.
    def maintainers
      []
    end

    # The dataset's publishers.
    #
    # @return [Array<Agent>] the publishers, each an Agent object; empty by default.
    def publishers
      []
    end

    # The licenses that apply to the dataset.
    #
    # @return [Array<License>] the licenses, each a License object; empty by default.
    def licenses
      []
    end

    # The rights statement for the data.
    #
    # @return [Object<Rights>] how the content and data may be used, plus the
    #                          copyright notice and attribution URL; +nil+ by default.
    def rights
      nil
    end

    # The people or organisations who contributed to the dataset.
    #
    # @return [Array<Agent>] the contributors, each an Agent object; empty by default.
    def contributors
      []
    end

    # Whether the data has been crowdsourced.
    #
    # @return [Boolean] +false+ by default.
    def crowdsourced?
      false
    end

    # The contributor license agreement.
    #
    # @return [String] URL of the agreement contributors accept; +nil+ by default.
    def contributor_agreement_url
      nil
    end

    # The dataset's distributions. Aliased as +files+ and +resources+ for
    # popular alternative vocabularies.
    #
    # @return [Array<Distribution>] the Distribution objects; empty by default.
    def distributions
      []
    end
    alias files distributions
    alias resources distributions

    # How often the data is updated.
    #
    # @return [String] the update frequency expressed as a dct:Frequency; +nil+ by default.
    def update_frequency
      nil
    end

    # The history of changes made to the Dataset.
    #
    # @return [Array] the changes; the exact format depends on the origin and
    #                 publishing format. Empty by default.
    def change_history
      []
    end

  end
end
|