data_kitten 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/LICENSE.md +20 -0
- data/README.md +73 -0
- data/bin/data_kitten +22 -0
- data/lib/data_kitten.rb +43 -0
- data/lib/data_kitten/agent.rb +38 -0
- data/lib/data_kitten/dataset.rb +227 -0
- data/lib/data_kitten/distribution.rb +156 -0
- data/lib/data_kitten/distribution_format.rb +73 -0
- data/lib/data_kitten/hosts.rb +23 -0
- data/lib/data_kitten/hosts/bitbucket.rb +54 -0
- data/lib/data_kitten/hosts/gist.rb +50 -0
- data/lib/data_kitten/hosts/github.rb +54 -0
- data/lib/data_kitten/license.rb +39 -0
- data/lib/data_kitten/origins.rb +28 -0
- data/lib/data_kitten/origins/git.rb +66 -0
- data/lib/data_kitten/origins/html.rb +32 -0
- data/lib/data_kitten/origins/linked_data.rb +37 -0
- data/lib/data_kitten/origins/web_service.rb +30 -0
- data/lib/data_kitten/publishing_formats.rb +28 -0
- data/lib/data_kitten/publishing_formats/ckan.rb +187 -0
- data/lib/data_kitten/publishing_formats/datapackage.rb +169 -0
- data/lib/data_kitten/publishing_formats/linked_data.rb +102 -0
- data/lib/data_kitten/publishing_formats/rdfa.rb +239 -0
- data/lib/data_kitten/rights.rb +80 -0
- data/lib/data_kitten/source.rb +31 -0
- data/lib/data_kitten/temporal.rb +27 -0
- data/lib/data_kitten/version.rb +3 -0
- metadata +242 -0
@@ -0,0 +1,156 @@
|
|
1
|
+
module DataKitten
|
2
|
+
|
3
|
+
# A specific available form of a dataset, such as a CSV file, an API, or an RSS feed.
#
# Based on {http://www.w3.org/TR/vocab-dcat/#class-distribution dcat:Distribution}, but
# with useful aliases for other vocabularies.
#
class Distribution

  # @!attribute format
  #   @return [DistributionFormat, nil] the file format of the distribution, or +nil+ when
  #     no format could be determined.
  attr_accessor :format

  # @!attribute access_url
  #   @return [String] a URL to access the distribution.
  attr_accessor :access_url
  alias_method :uri, :access_url
  alias_method :download_url, :access_url

  # @!attribute path
  #   @return [String] the path of the distribution within the source, if appropriate
  attr_accessor :path

  # @!attribute title
  #   @return [String] a short title, unique within the dataset
  attr_accessor :title
  # +name+ is a read-only alias: a usable name for the distribution, unique within the {Dataset}.
  alias_method :name, :title

  # @!attribute description
  #   @return [String] a textual description
  attr_accessor :description

  # @!attribute schema
  #   @return [Hash] a hash representing the schema of the data within the distribution. Will
  #     change to a more structured object later.
  attr_accessor :schema

  # Create a new Distribution from exactly one of the supported resource hashes. The first
  # option present wins, in the order listed below.
  #
  # @param dataset [Dataset] the {Dataset} that this is a part of.
  # @param options [Hash] A set of options with which to initialise the distribution.
  # @option options [Hash] :datapackage_resource the +resource+ section of a Datapackage
  #   representation (string keys: +description+, +url+, +format+, +path+, +dialect+, +schema+).
  # @option options [Hash] :dcat_resource a hash with +:title+ and +:accessURL+ keys.
  # @option options [Hash] :ckan_resource a hash with +:title+, +:accessURL+ and +:format+ keys.
  def initialize(dataset, options)
    @dataset = dataset
    if (resource = options[:datapackage_resource])
      load_datapackage_resource(resource)
    elsif (resource = options[:dcat_resource])
      load_dcat_resource(resource)
    elsif (resource = options[:ckan_resource])
      load_ckan_resource(resource)
    end
    # Fall back to a comma-delimited dialect when the resource did not declare one.
    @dialect ||= { "delimiter" => "," }
  end

  # An array of column headers for the distribution. Loaded from the schema, or from the file
  # directly if no schema is present.
  #
  # @return [Array<String>, nil] an array of column headers, as strings; +nil+ when there is
  #   no schema and the data could not be loaded.
  def headers
    @headers ||= if @schema
      @schema['fields'].map { |field| field['id'] }
    else
      parsed = data
      # data is nil when nothing could be loaded or parsed; do not call #headers on it.
      parsed && parsed.headers
    end
  end

  # Whether the file that the distribution represents actually exists
  #
  # @return [Boolean, nil] +false+ when the stored HTTP response has status 404, +true+ for any
  #   other status; +nil+ when there is no access URL or no HTTP response was recorded.
  def exists?
    # The dcat branch of the constructor never performs a HEAD request, so @response can be
    # nil even though an access URL is set.
    return nil unless @access_url && @response
    @response.response_code != 404
  end

  # The loaded data, parsed according to the distribution's format. Only CSV is parsed.
  # Loading is best-effort: any error while fetching or parsing yields +nil+.
  #
  # @return [CSV::Table, nil] the parsed rows (first row treated as headers), or +nil+ when
  #   the data is unavailable, unparseable, or not CSV.
  def data
    @data ||= parse_datafile(fetch_datafile)
  rescue StandardError
    nil
  end

  private

  # Populate the distribution from a Datapackage +resource+ hash.
  def load_datapackage_resource(resource)
    @description = resource['description']
    # Issue a HEAD request up front; the response is kept for exists? and handed to the format.
    @response = Curl::Easy.http_head(resource['url']) if resource['url']
    @format = datapackage_format(resource)
    @dialect = resource['dialect']
    @schema = resource['schema']
    @path = resource['path']
    @access_url = resource['url']
    # Prefer the in-source path as the title, falling back to the access URL.
    @title = @path || @access_url
  end

  # Work out the format of a Datapackage resource: explicit +format+, else the path's extension.
  def datapackage_format(resource)
    extension = resource['format']
    if extension.nil? && resource['path'].is_a?(String)
      # An empty path splits to an empty array, so the last segment may be nil.
      last_segment = resource['path'].split('.').last
      extension = last_segment && last_segment.upcase
    end
    extension ? DistributionFormat.new(extension, @response) : nil
  end

  # Populate the distribution from a DCAT-style hash. The title doubles as the description.
  def load_dcat_resource(resource)
    @title = resource[:title]
    @description = resource[:title]
    @access_url = resource[:accessURL]
  end

  # Populate the distribution from a CKAN-style hash, recording a HEAD response for the URL.
  def load_ckan_resource(resource)
    @title = resource[:title]
    @description = resource[:title]
    @access_url = resource[:accessURL]
    if @access_url
      @response = Curl::Easy.http_head(@access_url) do |c|
        c.follow_location = true
        c.useragent = "curb"
      end
    end
    @format = resource[:format] ? DistributionFormat.new(resource[:format], @response) : nil
  end

  # Fetch the raw data: from the dataset when a path is known, otherwise over HTTP.
  def fetch_datafile
    if @path
      @dataset.send(:load_file, @path)
    elsif @access_url
      begin
        RestClient.get(@access_url)
      rescue StandardError
        # Deliberately best-effort: an unreachable URL simply means "no data".
        nil
      end
    end
  end

  # Parse raw data according to the format; anything other than CSV is not parsed.
  def parse_datafile(datafile)
    return nil unless datafile && format
    case format.extension
    when :csv
      CSV.parse(
        datafile,
        :headers => true,
        # A declared dialect may omit the delimiter; default to a comma in that case.
        :col_sep => @dialect["delimiter"] || ","
      )
    end
  end

end
|
155
|
+
|
156
|
+
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module DataKitten
|
2
|
+
|
3
|
+
# A file format for a distribution
#
# For instance CSV, XML, etc.
#
class DistributionFormat

  # Known formats, keyed by lowercase extension. +:structured+ flags machine-readable
  # formats and +:open+ flags open formats.
  FORMATS = {
    csv:    { structured: true,  open: true  },
    xls:    { structured: true,  open: false },
    rdf:    { structured: true,  open: true  },
    xml:    { structured: true,  open: true  },
    wms:    { structured: true,  open: true  },
    ods:    { structured: true,  open: true  },
    rdfa:   { structured: true,  open: true  },
    kml:    { structured: true,  open: true  },
    rss:    { structured: true,  open: true  },
    json:   { structured: true,  open: true  },
    ical:   { structured: true,  open: true  },
    sparql: { structured: true,  open: true  },
    georss: { structured: true,  open: true  },
    shp:    { structured: true,  open: true  },
    html:   { structured: false, open: true  },
    doc:    { structured: false, open: false },
    pdf:    { structured: false, open: true  },
  }.freeze

  # @!attribute extension
  #   @return [Symbol] a symbol for the file extension. For instance, :csv.
  attr_reader :extension

  # Create a new DistributionFormat object with the relevant extension
  #
  # @param extension [String] the file extension for the format; stored as a lowercase symbol.
  # @param response [Object, nil] the HTTP response for the distribution; only its
  #   +content_type+ is read, by {#matches?}.
  def initialize(extension, response)
    @extension = extension.to_s.downcase.to_sym
    @response = response
  end

  # Is this a structured format?
  #
  # @return [Boolean, nil] whether the format is machine-readable or not; +nil+ for an
  #   extension that is not in {FORMATS}.
  def structured?
    format_flag(:structured)
  end

  # Is this an open format?
  #
  # @return [Boolean, nil] whether the format is open or not; +nil+ for an extension that
  #   is not in {FORMATS}.
  def open?
    format_flag(:open)
  end

  # Whether the format of the file matches the extension given by the data
  #
  # @return [Boolean, nil] whether the MIME type given in the HTTP response contains one of
  #   the MIME types registered for the extension; +false+ when the extension has no
  #   registered MIME type or the response has no content type; +nil+ when there is no
  #   response or the lookup fails.
  def matches?
    return nil unless @response
    content_type = @response.content_type
    return false unless content_type
    actual = content_type.downcase
    expected = MIME::Types.type_for(@extension.to_s).map { |type| type.content_type.downcase }
    # Plain substring comparison: MIME types contain regex metacharacters such as "+" and
    # ".", and an empty candidate list must not match everything.
    expected.any? { |mime| actual.include?(mime) }
  rescue StandardError
    # Best-effort check: any failure while inspecting the response counts as "unknown".
    nil
  end

  private

  # Look up one flag for this extension, yielding nil for unknown extensions.
  def format_flag(flag)
    entry = FORMATS[@extension]
    entry && entry[flag]
  end

end
|
72
|
+
|
73
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'data_kitten/hosts/github'
|
2
|
+
require 'data_kitten/hosts/bitbucket'
|
3
|
+
require 'data_kitten/hosts/gist'
|
4
|
+
|
5
|
+
module DataKitten
|
6
|
+
|
7
|
+
module Hosts

  private

  # Mix every matching host module into the receiver.
  #
  # Each known host module is asked whether it supports the receiver's +@access_url+;
  # those that do are added to this object with +extend+.
  #
  # @return [Array<Module>] the list of host modules that were considered.
  def detect_host
    candidates = [
      DataKitten::Hosts::Github,
      DataKitten::Hosts::Bitbucket,
      DataKitten::Hosts::Gist
    ]
    candidates.each do |candidate|
      next unless candidate.supported?(@access_url)
      extend candidate
    end
  end

end
|
22
|
+
|
23
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module DataKitten
|
2
|
+
|
3
|
+
module Hosts
|
4
|
+
|
5
|
+
# Bitbucket host module. Automatically mixed into {Dataset} for datasets that are loaded from Bitbucket.
#
# @see Dataset
#
module Bitbucket

  # Whether a URI points at a repository hosted on Bitbucket.
  #
  # Defined on the module itself so it can be asked before the module is mixed in.
  #
  # @param uri [String, nil] the access URL to test.
  # @return [Integer, nil] a truthy match position when the URI is a +git+, +http+ or
  #   +https+ URL whose host ends in +bitbucket.org+, +nil+ otherwise.
  def self.supported?(uri)
    uri =~ %r{\A(git|https?)://[^/]*bitbucket\.org/}
  end

  # Where the dataset is hosted.
  # @return [Symbol] +:bitbucket+
  # @see Dataset#host
  def host
    :bitbucket
  end

  # Helper for generating Bitbucket URLs
  #
  # @param path [String] The path to append to the Bitbucket base URL.
  #
  # @return [String] The supplied path with the Bitbucket base URL prepended
  #
  # @example
  #   dataset = Dataset.new(access_url: 'https://bitbucket.org/floppy/hot-drinks.git')
  #   dataset.bitbucket_path # => 'https://bitbucket.org/floppy/hot-drinks/'
  #   dataset.bitbucket_path('pull-requests') # => 'https://bitbucket.org/floppy/hot-drinks/pull-requests'
  def bitbucket_path(path = '')
    "https://bitbucket.org/#{bitbucket_user_name}/#{bitbucket_repository_name}/#{path}"
  end

  private

  # The owner of the repository: the second-to-last path segment of the URI.
  def bitbucket_user_name
    @bitbucket_user_name ||= uri.split('/')[-2]
  end

  # The repository name: the last path segment of the URI without a trailing ".git".
  # Only the ".git" suffix is removed, so names that themselves contain dots are kept whole.
  def bitbucket_repository_name
    @bitbucket_repository_name ||= uri.split('/').last.sub(/\.git\z/, '')
  end

end
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module DataKitten
|
2
|
+
|
3
|
+
module Hosts
|
4
|
+
|
5
|
+
# Gist host module. Automatically mixed into {Dataset} for datasets that are loaded from Gist.
#
# @see Dataset
#
module Gist

  # Whether a URI points at a repository hosted on Gist.
  #
  # Defined on the module itself so it can be asked before the module is mixed in.
  #
  # @param uri [String, nil] the access URL to test.
  # @return [Integer, nil] a truthy match position when the URI is a +git+, +http+ or
  #   +https+ URL on +gist.github.com+, +nil+ otherwise.
  def self.supported?(uri)
    uri =~ %r{\A(git|https?)://gist\.github\.com/}
  end

  # Where the dataset is hosted.
  # @return [Symbol] +:gist+
  # @see Dataset#host
  def host
    :gist
  end

  # Helper for generating Gist URLs
  #
  # @param path [String] The path to append to the Gist base URL.
  #
  # @return [String] The supplied path with the Gist base URL prepended. The base URL always
  #   ends with a slash, including when +path+ is empty.
  #
  # @example
  #   dataset = Dataset.new(access_url: 'git://gist.github.com/5633865.git')
  #   dataset.gist_path # => 'https://gist.github.com/5633865/'
  #   dataset.gist_path('download') # => 'https://gist.github.com/5633865/download'
  def gist_path(path = '')
    "https://gist.github.com/#{gist_repository_name}/#{path}"
  end

  private

  # The gist identifier: the last path segment of the URI without a trailing ".git".
  def gist_repository_name
    @gist_repository_name ||= uri.split('/').last.sub(/\.git\z/, '')
  end

end
|
47
|
+
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module DataKitten
|
2
|
+
|
3
|
+
module Hosts
|
4
|
+
|
5
|
+
# GitHub host module. Automatically mixed into {Dataset} for datasets that are loaded from GitHub.
#
# @see Dataset
#
module Github

  # Whether a URI points at a repository hosted on GitHub.
  #
  # Defined on the module itself so it can be asked before the module is mixed in.
  #
  # @param uri [String, nil] the access URL to test.
  # @return [Integer, nil] a truthy match position when the URI is a +git+, +http+ or
  #   +https+ URL on +github.com+, +nil+ otherwise.
  def self.supported?(uri)
    uri =~ %r{\A(git|https?)://github\.com/}
  end

  # Where the dataset is hosted.
  # @return [Symbol] +:github+
  # @see Dataset#host
  def host
    :github
  end

  # Helper for generating GitHub URLs
  #
  # @param path [String] The path to append to the GitHub base URL.
  #
  # @return [String] The supplied path with the GitHub base URL prepended
  #
  # @example
  #   dataset = Dataset.new(access_url: 'git://github.com/theodi/dataset-metadata-survey.git')
  #   dataset.github_path # => 'https://github.com/theodi/dataset-metadata-survey/'
  #   dataset.github_path('issues') # => 'https://github.com/theodi/dataset-metadata-survey/issues'
  def github_path(path = '')
    "https://github.com/#{github_user_name}/#{github_repository_name}/#{path}"
  end

  private

  # The owner of the repository: the second-to-last path segment of the URI.
  def github_user_name
    @github_user_name ||= uri.split('/')[-2]
  end

  # The repository name: the last path segment of the URI without a trailing ".git".
  # Only the ".git" suffix is removed, so names that themselves contain dots are kept whole.
  def github_repository_name
    @github_repository_name ||= uri.split('/').last.sub(/\.git\z/, '')
  end

end
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module DataKitten
|
2
|
+
|
3
|
+
# A license for a {Dataset} or {Distribution}
#
class License

  # @!attribute id
  #   @return [String] a short ID that identifies the license.
  attr_accessor :id

  # @!attribute name
  #   @return [String] the human name of the license.
  attr_accessor :name

  # @!attribute uri
  #   @return [String] the URI for the license text.
  attr_accessor :uri

  # @!attribute type
  #   @return [String] the type of information this license applies to. Could be +:data+ or +:content+.
  attr_accessor :type

  # Create a new License object. Every option is optional; attributes for omitted
  # options are left as +nil+.
  #
  # @param options [Hash] A set of options with which to initialise the license.
  # @option options [String] :id the short ID for the license
  # @option options [String] :name the human name for the license
  # @option options [String] :uri the URI of the license text
  # @option options [String] :type the type of information covered by this license.
  def initialize(options = {})
    @id = options[:id]
    @name = options[:name]
    @uri = options[:uri]
    @type = options[:type]
  end

end
|
38
|
+
|
39
|
+
end
|