data_kitten 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/LICENSE.md +20 -0
- data/README.md +73 -0
- data/bin/data_kitten +22 -0
- data/lib/data_kitten.rb +43 -0
- data/lib/data_kitten/agent.rb +38 -0
- data/lib/data_kitten/dataset.rb +227 -0
- data/lib/data_kitten/distribution.rb +156 -0
- data/lib/data_kitten/distribution_format.rb +73 -0
- data/lib/data_kitten/hosts.rb +23 -0
- data/lib/data_kitten/hosts/bitbucket.rb +54 -0
- data/lib/data_kitten/hosts/gist.rb +50 -0
- data/lib/data_kitten/hosts/github.rb +54 -0
- data/lib/data_kitten/license.rb +39 -0
- data/lib/data_kitten/origins.rb +28 -0
- data/lib/data_kitten/origins/git.rb +66 -0
- data/lib/data_kitten/origins/html.rb +32 -0
- data/lib/data_kitten/origins/linked_data.rb +37 -0
- data/lib/data_kitten/origins/web_service.rb +30 -0
- data/lib/data_kitten/publishing_formats.rb +28 -0
- data/lib/data_kitten/publishing_formats/ckan.rb +187 -0
- data/lib/data_kitten/publishing_formats/datapackage.rb +169 -0
- data/lib/data_kitten/publishing_formats/linked_data.rb +102 -0
- data/lib/data_kitten/publishing_formats/rdfa.rb +239 -0
- data/lib/data_kitten/rights.rb +80 -0
- data/lib/data_kitten/source.rb +31 -0
- data/lib/data_kitten/temporal.rb +27 -0
- data/lib/data_kitten/version.rb +3 -0
- metadata +242 -0
@@ -0,0 +1,156 @@
|
|
1
|
+
module DataKitten
|
2
|
+
|
3
|
+
# A specific available form of a dataset, such as a CSV file, an API, or an RSS feed.
|
4
|
+
#
|
5
|
+
# Based on {http://www.w3.org/TR/vocab-dcat/#class-distribution dcat:Distribution}, but
|
6
|
+
# with useful aliases for other vocabularies.
|
7
|
+
#
|
8
|
+
class Distribution

  # @!attribute format
  #   @return [DistributionFormat] the file format of the distribution.
  attr_accessor :format

  # @!attribute access_url
  #   @return [String] a URL to access the distribution.
  attr_accessor :access_url
  alias_method :uri, :access_url
  alias_method :download_url, :access_url

  # @!attribute path
  #   @return [String] the path of the distribution within the source, if appropriate
  attr_accessor :path

  # @!attribute title
  #   @return [String] a short title, unique within the dataset. Also available as +name+.
  attr_accessor :title
  alias_method :name, :title

  # @!attribute description
  #   @return [String] a textual description
  attr_accessor :description

  # @!attribute schema
  #   @return [Hash] a hash representing the schema of the data within the distribution. Will
  #                  change to a more structured object later.
  attr_accessor :schema

  # Create a new Distribution from one of the supported resource descriptions.
  # The first of +:datapackage_resource+, +:dcat_resource+ and +:ckan_resource+
  # that is present in +options+ is used; the others are ignored.
  #
  # @param dataset [Dataset] the {Dataset} that this is a part of.
  # @param options [Hash] A set of options with which to initialise the distribution.
  # @option options [Hash] :datapackage_resource the +resource+ section of a Datapackage
  #                        representation to load information from (string keys).
  # @option options [Hash] :dcat_resource a hash with +:title+, +:accessURL+ and
  #                        optionally +:description+.
  # @option options [Hash] :ckan_resource a hash with +:title+, +:accessURL+, +:format+ and
  #                        optionally +:description+.
  def initialize(dataset, options)
    @dataset = dataset
    if (resource = options[:datapackage_resource])
      load_datapackage_resource(resource)
    elsif (resource = options[:dcat_resource])
      load_dcat_resource(resource)
    elsif (resource = options[:ckan_resource])
      load_ckan_resource(resource)
    end
    # Set default CSV dialect
    @dialect ||= { "delimiter" => "," }
  end

  # An array of column headers for the distribution. Loaded from the schema, or from the
  # file directly if no schema is present.
  #
  # @return [Array<String>] an array of column headers, as strings. Empty when there is no
  #                         schema and the data could not be loaded.
  def headers
    return @headers if @headers
    if @schema
      @headers = @schema['fields'].map { |field| field['id'] }
    elsif (table = data)
      @headers = table.headers
    else
      # Not memoised, so a later successful load can still supply the headers.
      []
    end
  end

  # Whether the file that the distribution represents actually exists.
  #
  # @return [Boolean, nil] +false+ if the HEAD request made at construction time returned
  #                        a 404, +true+ for any other response code, and +nil+ when there
  #                        is no access URL or no response was recorded.
  def exists?
    return nil unless @access_url && @response
    @response.response_code != 404
  end

  # The loaded data, parsed according to the distribution's format. Only CSV is
  # currently understood. Any error while fetching or parsing results in +nil+.
  #
  # @return [CSV::Table, nil] the parsed rows (first row treated as headers), or +nil+ if
  #                           the data is unavailable or not in a supported format.
  def data
    @data ||= begin
      raw = fetch_raw_data
      if raw && format && format.extension == :csv
        CSV.parse(
          raw,
          :headers => true,
          # A custom dialect may omit the delimiter; fall back to a comma.
          :col_sep => @dialect["delimiter"] || ","
        )
      end
    rescue
      nil
    end
  end

  private

  # Populate the distribution from a Datapackage +resource+ hash.
  def load_datapackage_resource(resource)
    @description = resource['description']
    @path        = resource['path']
    @access_url  = resource['url']
    # Load HTTP response for further use
    @response = Curl::Easy.http_head(@access_url) if @access_url
    # Work out format: an explicit format wins, otherwise sniff the path
    extension = resource['format']
    extension = extension_from_path(@path) if extension.nil?
    @format = extension ? DistributionFormat.new(extension, @response) : nil
    # CSV dialect and schema
    @dialect = resource['dialect']
    @schema  = resource['schema']
    # The path is preferred as a title; fall back to the URL
    @title = @path || @access_url
  end

  # Populate the title, description and access URL from a DCAT-style hash.
  def load_dcat_resource(resource)
    @title       = resource[:title]
    # Use a real description when one is supplied; otherwise reuse the title.
    @description = resource[:description] || resource[:title]
    @access_url  = resource[:accessURL]
  end

  # Populate the distribution from a CKAN resource hash, including a HEAD request.
  def load_ckan_resource(resource)
    load_dcat_resource(resource)
    # Load HTTP response for further use
    if @access_url
      @response = Curl::Easy.http_head(@access_url) do |c|
        c.follow_location = true
        c.useragent = "curb"
      end
    end
    @format = resource[:format] ? DistributionFormat.new(resource[:format], @response) : nil
  end

  # Derive an upper-case extension from a path, or nil if it has none.
  def extension_from_path(path)
    return nil unless path.is_a?(String)
    extension = File.extname(path).delete('.')
    extension.empty? ? nil : extension.upcase
  end

  # Fetch the raw file contents from the dataset (by path) or over HTTP (by URL).
  def fetch_raw_data
    if @path
      @dataset.send(:load_file, @path)
    elsif @access_url
      RestClient.get(@access_url)
    end
  end

end
|
155
|
+
|
156
|
+
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module DataKitten
|
2
|
+
|
3
|
+
# A file format for a distribution
|
4
|
+
#
|
5
|
+
# For instance CSV, XML, etc.
|
6
|
+
#
|
7
|
+
class DistributionFormat

  # Known formats, keyed by lower-case extension symbol. +:structured+ records whether
  # the format is machine-readable, +:open+ whether it is an open format.
  FORMATS = {
    csv:    { structured: true,  open: true },
    xls:    { structured: true,  open: false },
    rdf:    { structured: true,  open: true },
    xml:    { structured: true,  open: true },
    wms:    { structured: true,  open: true },
    ods:    { structured: true,  open: true },
    rdfa:   { structured: true,  open: true },
    kml:    { structured: true,  open: true },
    rss:    { structured: true,  open: true },
    json:   { structured: true,  open: true },
    ical:   { structured: true,  open: true },
    sparql: { structured: true,  open: true },
    georss: { structured: true,  open: true },
    shp:    { structured: true,  open: true },
    html:   { structured: false, open: true },
    doc:    { structured: false, open: false },
    pdf:    { structured: false, open: true },
  }.freeze

  # @!attribute extension
  #   @return [Symbol] a symbol for the file extension. For instance, :csv.
  attr_reader :extension

  # Create a new DistributionFormat object with the relevant extension
  #
  # @param extension [String] the file extension for the format (any case)
  # @param response [Object, nil] the HTTP response for the distribution; its
  #                 +content_type+ is consulted by {#matches?}
  def initialize(extension, response)
    # Store extension as a lowercase symbol
    @extension = extension.to_s.downcase.to_sym
    # Store response for later use
    @response = response
  end

  # Is this a structured format?
  #
  # @return [Boolean, nil] whether the format is machine-readable or not; +nil+ if the
  #                        format is not a known one.
  def structured?
    format_property(:structured)
  end

  # Is this an open format?
  #
  # @return [Boolean, nil] whether the format is open or not; +nil+ if the format is not
  #                        a known one.
  def open?
    format_property(:open)
  end

  # Whether the format of the file matches the extension given by the data
  #
  # @return [Boolean, nil] whether the MIME type given in the HTTP response is one of the
  #                        MIME types registered for the extension. +false+ when no MIME
  #                        type is registered for the extension; +nil+ when there is no
  #                        response or the lookup fails.
  def matches?
    return nil unless @response
    expected = MIME::Types.type_for(@extension.to_s).map { |type| type.content_type.downcase }
    actual = @response.content_type.to_s.downcase
    # Plain substring comparison: MIME types contain regex metacharacters ("+", "."),
    # and a header may carry parameters such as "; charset=utf-8".
    expected.any? { |mime| actual.include?(mime) }
  rescue StandardError
    # Best effort: an unusable response or MIME lookup means "unknown".
    nil
  end

  private

  # Look up one property of this format in FORMATS, or nil for an unknown format.
  def format_property(key)
    details = FORMATS[@extension]
    details && details[key]
  end

end
|
72
|
+
|
73
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'data_kitten/hosts/github'
|
2
|
+
require 'data_kitten/hosts/bitbucket'
|
3
|
+
require 'data_kitten/hosts/gist'
|
4
|
+
|
5
|
+
module DataKitten
|
6
|
+
|
7
|
+
module Hosts

  private

  # Mix the matching host module into this object. Every known host module whose
  # +supported?+ check accepts +@access_url+ is extended onto the receiver.
  #
  # @return [Array<Module>] the list of host modules that were checked
  def detect_host
    known_hosts = [
      DataKitten::Hosts::Github,
      DataKitten::Hosts::Bitbucket,
      DataKitten::Hosts::Gist
    ]
    known_hosts.each do |candidate|
      next unless candidate.supported?(@access_url)
      extend candidate
    end
  end

end
|
22
|
+
|
23
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module DataKitten
|
2
|
+
|
3
|
+
module Hosts
|
4
|
+
|
5
|
+
# Bitbucket host module. Automatically mixed into {Dataset} for datasets that are loaded from Bitbucket.
|
6
|
+
#
|
7
|
+
# @see Dataset
|
8
|
+
#
|
9
|
+
module Bitbucket

  # Whether the given URI points at a repository hosted on Bitbucket.
  # This is a module-level method, so it is public and callable as
  # +Bitbucket.supported?+.
  #
  # @param uri [String] the access URL of the dataset
  # @return [Integer, nil] a truthy match position if the URI is a git/http(s)
  #                        Bitbucket URL, +nil+ otherwise
  def self.supported?(uri)
    uri =~ /\A(git|https?):\/\/[^\/]*bitbucket\.org\//
  end

  # Where the dataset is hosted.
  # @return [Symbol] +:bitbucket+
  # @see Dataset#host
  def host
    :bitbucket
  end

  # Helper for generating Bitbucket URLs
  #
  # @param path [String] The path to append to the Bitbucket base URL.
  #
  # @return [String] The supplied path with the Bitbucket base URL prepended
  #
  # @example
  #   dataset = Dataset.new(access_url: 'https://bitbucket.org/floppy/hot-drinks.git')
  #   dataset.bitbucket_path # => 'https://bitbucket.org/floppy/hot-drinks/'
  #   dataset.bitbucket_path('pull-requests') # => 'https://bitbucket.org/floppy/hot-drinks/pull-requests'
  def bitbucket_path(path = '')
    "https://bitbucket.org/#{bitbucket_user_name}/#{bitbucket_repository_name}/#{path}"
  end

  private

  # The account name: the second-to-last segment of the access URL.
  def bitbucket_user_name
    @bitbucket_user_name ||= uri.split('/')[-2]
  end

  # The repository name: the last URL segment with only a trailing ".git" removed,
  # so that names which themselves contain dots are kept intact.
  def bitbucket_repository_name
    @bitbucket_repository_name ||= uri.split('/')[-1].sub(/\.git\z/, '')
  end

end
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module DataKitten
|
2
|
+
|
3
|
+
module Hosts
|
4
|
+
|
5
|
+
# Gist host module. Automatically mixed into {Dataset} for datasets that are loaded from Gist.
|
6
|
+
#
|
7
|
+
# @see Dataset
|
8
|
+
#
|
9
|
+
module Gist

  # Whether the given URI points at a repository hosted on Gist.
  # This is a module-level method, so it is public and callable as
  # +Gist.supported?+.
  #
  # @param uri [String] the access URL of the dataset
  # @return [Integer, nil] a truthy match position if the URI is a git/http(s)
  #                        Gist URL, +nil+ otherwise
  def self.supported?(uri)
    uri =~ /\A(git|https?):\/\/gist\.github\.com\//
  end

  # Where the dataset is hosted.
  # @return [Symbol] +:gist+
  # @see Dataset#host
  def host
    :gist
  end

  # Helper for generating Gist URLs
  #
  # @param path [String] The path to append to the Gist base URL.
  #
  # @return [String] The supplied path with the Gist base URL prepended
  #
  # @example
  #   dataset = Dataset.new(access_url: 'git://gist.github.com/5633865.git')
  #   dataset.gist_path # => 'https://gist.github.com/5633865/'
  #   dataset.gist_path('download') # => 'https://gist.github.com/5633865/download'
  def gist_path(path = '')
    "https://gist.github.com/#{gist_repository_name}/#{path}"
  end

  private

  # The gist identifier: the last URL segment with a trailing ".git" removed.
  def gist_repository_name
    @gist_repository_name ||= uri.split('/')[-1].sub(/\.git\z/, '')
  end

end
|
47
|
+
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module DataKitten
|
2
|
+
|
3
|
+
module Hosts
|
4
|
+
|
5
|
+
# GitHub host module. Automatically mixed into {Dataset} for datasets that are loaded from GitHub.
|
6
|
+
#
|
7
|
+
# @see Dataset
|
8
|
+
#
|
9
|
+
module Github

  # Whether the given URI points at a repository hosted on GitHub.
  # This is a module-level method, so it is public and callable as
  # +Github.supported?+.
  #
  # @param uri [String] the access URL of the dataset
  # @return [Integer, nil] a truthy match position if the URI is a git/http(s)
  #                        GitHub URL, +nil+ otherwise
  def self.supported?(uri)
    uri =~ /\A(git|https?):\/\/github\.com\//
  end

  # Where the dataset is hosted.
  # @return [Symbol] +:github+
  # @see Dataset#host
  def host
    :github
  end

  # Helper for generating GitHub URLs
  #
  # @param path [String] The path to append to the GitHub base URL.
  #
  # @return [String] The supplied path with the GitHub base URL prepended
  #
  # @example
  #   dataset = Dataset.new(access_url: 'git://github.com/theodi/dataset-metadata-survey.git')
  #   dataset.github_path # => 'https://github.com/theodi/dataset-metadata-survey/'
  #   dataset.github_path('issues') # => 'https://github.com/theodi/dataset-metadata-survey/issues'
  def github_path(path = '')
    "https://github.com/#{github_user_name}/#{github_repository_name}/#{path}"
  end

  private

  # The account name: the second-to-last segment of the access URL.
  def github_user_name
    @github_user_name ||= uri.split('/')[-2]
  end

  # The repository name: the last URL segment with only a trailing ".git" removed,
  # so that names which themselves contain dots (e.g. "theodi.github.io") are kept intact.
  def github_repository_name
    @github_repository_name ||= uri.split('/')[-1].sub(/\.git\z/, '')
  end

end
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module DataKitten
|
2
|
+
|
3
|
+
# A license for a {Dataset} or {Distribution}
|
4
|
+
#
|
5
|
+
class License

  # @!attribute id
  #   @return [String] a short ID that identifies the license.
  attr_accessor :id

  # @!attribute name
  #   @return [String] the human name of the license.
  attr_accessor :name

  # @!attribute uri
  #   @return [String] the URI for the license text.
  attr_accessor :uri

  # @!attribute type
  #   @return [Symbol] the type of information this license applies to. Could be +:data+ or +:content+.
  attr_accessor :type

  # Create a new License object. Any option that is not supplied leaves the
  # corresponding attribute as +nil+.
  #
  # @param options [Hash] A set of options with which to initialise the license.
  # @option options [String] :id the short ID for the license
  # @option options [String] :name the human name for the license
  # @option options [String] :uri the URI of the license text
  # @option options [Symbol] :type the type of information covered by this license.
  def initialize(options = {})
    @id   = options[:id]
    @name = options[:name]
    @uri  = options[:uri]
    @type = options[:type]
  end

end
|
38
|
+
|
39
|
+
end
|