data_kitten 1.3.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/README.md +1 -1
- data/bin/data_kitten +1 -1
- data/lib/data_kitten.rb +1 -2
- data/lib/data_kitten/dataset.rb +25 -11
- data/lib/data_kitten/distribution.rb +5 -16
- data/lib/data_kitten/distribution_format.rb +26 -25
- data/lib/data_kitten/fetcher.rb +10 -0
- data/lib/data_kitten/hosts/bitbucket.rb +2 -2
- data/lib/data_kitten/hosts/gist.rb +2 -2
- data/lib/data_kitten/hosts/github.rb +2 -2
- data/lib/data_kitten/publishing_formats/ckan.rb +21 -18
- data/lib/data_kitten/version.rb +1 -1
- metadata +1 -15
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MzYzNmUyZmIxZGNlOGViNGQwMmQ5YThiZDQzYWZiNTc2OTBkZWYwYQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NDVhMTRkZWRlMGI2YWEzNWZhNmRjZTRkNWJmNjAwOTRiNTVkMmQ0Yw==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MDg5ZGZiNjRjYjM3ODgwNjRlYWU2NTlhM2EzMTgyNmRhOGM4MjQ1YWYzMGU3
|
10
|
+
YjU1NTI2ZmY4OGJkNWRiYTZhYzNmM2QzZWUwMjQzMDlhZGViYjlmYjM3ZmU1
|
11
|
+
MzA1ZGJkZGU5MWM0NjIyZWMzY2M5YThiNjA1ZTIwOTg4OGY2ZDY=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
OTVmZTRlODU0ZDJkMmRhMjU3YTViY2VlODlhNDBlYjVkYjdlM2RhNDE4N2Uw
|
14
|
+
MGQ1MTQ4NTY2MGY2YTIwZjY4YmE5NzQ1OWExODJjNWM3MDk0YmU3YjcxZGVm
|
15
|
+
Njg0ZjQ0NzFjN2I0NmM5OTM4OWM0MzhlNTY2OWIyMjYyZDZiM2U=
|
data/README.md
CHANGED
@@ -45,7 +45,7 @@ Require if you need to:
|
|
45
45
|
|
46
46
|
Request a dataset:
|
47
47
|
|
48
|
-
dataset = DataKitten::Dataset.new(
|
48
|
+
dataset = DataKitten::Dataset.new("https://github.com/theodi/dataset-mod-disposals.git")
|
49
49
|
|
50
50
|
Use the results:
|
51
51
|
|
data/bin/data_kitten
CHANGED
data/lib/data_kitten.rb
CHANGED
@@ -8,7 +8,6 @@ require 'rdf'
|
|
8
8
|
require 'linkeddata'
|
9
9
|
require 'nokogiri'
|
10
10
|
require 'uri'
|
11
|
-
require 'curb'
|
12
11
|
require 'datapackage'
|
13
12
|
|
14
13
|
require 'data_kitten/license'
|
@@ -41,4 +40,4 @@ require 'data_kitten/fetcher'
|
|
41
40
|
# dataset.publishing_format # => :datapackage
|
42
41
|
# dataset.distributions # => [Distribution<#1>, Distribution<#2>]
|
43
42
|
# dataset.distributions[0].headers # => ['col1', 'col2']
|
44
|
-
# dataset.distributions[0].data[0] # => {'col1' => 'value_1', 'col2' => 'value_2'}
|
43
|
+
# dataset.distributions[0].data[0] # => {'col1' => 'value_1', 'col2' => 'value_2'}
|
data/lib/data_kitten/dataset.rb
CHANGED
@@ -13,7 +13,7 @@ module DataKitten
|
|
13
13
|
# use the Datapackage metadata format.
|
14
14
|
#
|
15
15
|
# @example Load a Dataset from a git repository
|
16
|
-
# dataset = Dataset.new(
|
16
|
+
# dataset = Dataset.new('git://github.com/theodi/dataset-metadata-survey.git')
|
17
17
|
# dataset.supported? # => true
|
18
18
|
# dataset.origin # => :git
|
19
19
|
# dataset.host # => :github
|
@@ -30,13 +30,25 @@ module DataKitten
|
|
30
30
|
attr_accessor :access_url
|
31
31
|
|
32
32
|
# Create a new Dataset object
|
33
|
-
#
|
34
|
-
# @param [Hash] options the details of the Dataset.
|
35
|
-
# @option options [String] :access_url A URL that can be used to access the Dataset.
|
36
|
-
# The class will attempt to auto-load metadata from this URL.
|
37
33
|
#
|
38
|
-
|
39
|
-
|
34
|
+
# The class will attempt to auto-load metadata from this URL.
|
35
|
+
#
|
36
|
+
# @overload new(url)
|
37
|
+
# @param [String] url A URL that can be used to access the Dataset
|
38
|
+
#
|
39
|
+
# @overload new(options)
|
40
|
+
# @param [Hash] options the details of the Dataset.
|
41
|
+
# @option options [String] :access_url A URL that can be used to access the Dataset.
|
42
|
+
#
|
43
|
+
def initialize(url_or_options)
|
44
|
+
url = case url_or_options
|
45
|
+
when Hash
|
46
|
+
url_or_options[:access_url]
|
47
|
+
else
|
48
|
+
url_or_options
|
49
|
+
end
|
50
|
+
@access_url = DataKitten::Fetcher.wrap(url)
|
51
|
+
|
40
52
|
detect_origin
|
41
53
|
detect_host
|
42
54
|
detect_publishing_format
|
@@ -51,9 +63,11 @@ module DataKitten
|
|
51
63
|
end
|
52
64
|
|
53
65
|
def source
|
54
|
-
@access_url.as_json if @access_url.ok?
|
66
|
+
@source ||= @access_url.as_json if @access_url.ok?
|
55
67
|
end
|
56
68
|
|
69
|
+
attr_writer :source
|
70
|
+
|
57
71
|
# Can metadata be loaded for this Dataset?
|
58
72
|
#
|
59
73
|
# @return [Boolean] true if metadata can be loaded, false if it's
|
@@ -84,9 +98,7 @@ module DataKitten
|
|
84
98
|
#
|
85
99
|
# @return [String] the identifier of the dataset
|
86
100
|
#
|
87
|
-
|
88
|
-
nil
|
89
|
-
end
|
101
|
+
attr_accessor :identifier
|
90
102
|
|
91
103
|
# The human-readable title of the dataset.
|
92
104
|
#
|
@@ -269,5 +281,7 @@ module DataKitten
|
|
269
281
|
nil
|
270
282
|
end
|
271
283
|
|
284
|
+
attr_accessor :metadata
|
285
|
+
|
272
286
|
end
|
273
287
|
end
|
@@ -108,6 +108,8 @@ module DataKitten
|
|
108
108
|
@dialect ||= {
|
109
109
|
"delimiter" => ","
|
110
110
|
}
|
111
|
+
|
112
|
+
@download = Fetcher.wrap(@download_url)
|
111
113
|
end
|
112
114
|
|
113
115
|
# A usable name for the distribution, unique within the {Dataset}.
|
@@ -136,9 +138,7 @@ module DataKitten
|
|
136
138
|
#
|
137
139
|
# @return [Boolean] whether the HTTP response returns a success code or not
|
138
140
|
def exists?
|
139
|
-
|
140
|
-
http_head.response_code != 404
|
141
|
-
end
|
141
|
+
@download.exists?
|
142
142
|
end
|
143
143
|
|
144
144
|
# A CSV object representing the loaded data.
|
@@ -148,8 +148,8 @@ module DataKitten
|
|
148
148
|
@data ||= begin
|
149
149
|
if @path
|
150
150
|
datafile = @dataset.send(:load_file, @path)
|
151
|
-
elsif @
|
152
|
-
datafile =
|
151
|
+
elsif @download.ok?
|
152
|
+
datafile = @download.body
|
153
153
|
end
|
154
154
|
if datafile
|
155
155
|
case format.extension
|
@@ -170,17 +170,6 @@ module DataKitten
|
|
170
170
|
end
|
171
171
|
end
|
172
172
|
|
173
|
-
def http_head
|
174
|
-
if @download_url
|
175
|
-
@http_head ||= begin
|
176
|
-
Curl::Easy.http_head(@download_url) do |c|
|
177
|
-
c.follow_location = true
|
178
|
-
c.useragent = "curb"
|
179
|
-
end
|
180
|
-
end
|
181
|
-
end
|
182
|
-
end
|
183
|
-
|
184
173
|
end
|
185
174
|
|
186
175
|
end
|
@@ -6,6 +6,30 @@ module DataKitten
|
|
6
6
|
#
|
7
7
|
class DistributionFormat
|
8
8
|
|
9
|
+
FORMATS = {
|
10
|
+
csv: { structured: true, open: true },
|
11
|
+
xls: { structured: true, open: false },
|
12
|
+
xlsx: { structured: true, open: true },
|
13
|
+
rdf: { structured: true, open: true },
|
14
|
+
xml: { structured: true, open: true },
|
15
|
+
wms: { structured: true, open: true },
|
16
|
+
ods: { structured: true, open: true },
|
17
|
+
rdfa: { structured: true, open: true },
|
18
|
+
kml: { structured: true, open: true },
|
19
|
+
rss: { structured: true, open: true },
|
20
|
+
json: { structured: true, open: true },
|
21
|
+
ical: { structured: true, open: true },
|
22
|
+
sparql: { structured: true, open: true },
|
23
|
+
kml: { structured: true, open: true },
|
24
|
+
georss: { structured: true, open: true },
|
25
|
+
geojson: { structured: true, open: true },
|
26
|
+
shp: { structured: true, open: true },
|
27
|
+
html: { structured: false, open: true },
|
28
|
+
doc: { structured: false, open: false },
|
29
|
+
pdf: { structured: false, open: true }
|
30
|
+
}
|
31
|
+
FORMATS.default = {}
|
32
|
+
|
9
33
|
#@!attribute extension
|
10
34
|
#@return [Symbol] a symbol for the file extension. For instance, :csv.
|
11
35
|
attr_reader :extension
|
@@ -17,43 +41,20 @@ module DataKitten
|
|
17
41
|
@distribution = distribution
|
18
42
|
# Store extension as a lowercase symbol
|
19
43
|
@extension = distribution.extension.to_s.downcase.to_sym
|
20
|
-
# Set up format lists
|
21
|
-
@@formats ||= {
|
22
|
-
csv: { structured: true, open: true },
|
23
|
-
xls: { structured: true, open: false },
|
24
|
-
xlsx: { structured: true, open: true },
|
25
|
-
rdf: { structured: true, open: true },
|
26
|
-
xml: { structured: true, open: true },
|
27
|
-
wms: { structured: true, open: true },
|
28
|
-
ods: { structured: true, open: true },
|
29
|
-
rdfa: { structured: true, open: true },
|
30
|
-
kml: { structured: true, open: true },
|
31
|
-
rss: { structured: true, open: true },
|
32
|
-
json: { structured: true, open: true },
|
33
|
-
ical: { structured: true, open: true },
|
34
|
-
sparql: { structured: true, open: true },
|
35
|
-
kml: { structured: true, open: true },
|
36
|
-
georss: { structured: true, open: true },
|
37
|
-
geojson: { structured: true, open: true },
|
38
|
-
shp: { structured: true, open: true },
|
39
|
-
html: { structured: false, open: true },
|
40
|
-
doc: { structured: false, open: false },
|
41
|
-
pdf: { structured: false, open: true },
|
42
|
-
}
|
43
44
|
end
|
44
45
|
|
45
46
|
# Is this a structured format?
|
46
47
|
#
|
47
48
|
# @return [Boolean] whether the format is machine-readable or not.
|
48
49
|
def structured?
|
49
|
-
|
50
|
+
FORMATS[extension][:structured]
|
50
51
|
end
|
51
52
|
|
52
53
|
# Is this an open format?
|
53
54
|
#
|
54
55
|
# @return [Boolean] whether the format is open or not
|
55
56
|
def open?
|
56
|
-
|
57
|
+
FORMATS[extension][:open]
|
57
58
|
end
|
58
59
|
|
59
60
|
# Whether the format of the file matches the extension given by the data
|
data/lib/data_kitten/fetcher.rb
CHANGED
@@ -30,7 +30,7 @@ module DataKitten
|
|
30
30
|
# @return [String] The supplied path with the Bitbucket base URL prepended
|
31
31
|
#
|
32
32
|
# @example
|
33
|
-
# dataset = Dataset.new(
|
33
|
+
# dataset = Dataset.new('https://bitbucket.org/floppy/hot-drinks.git')
|
34
34
|
# dataset.bitbucket_path # => 'https://bitbucket.org/floppy/hot-drinks/'
|
35
35
|
# dataset.bitbucket_path('pull-requests') # => 'https://bitbucket.org/floppy/hot-drinks/pull-requests'
|
36
36
|
def bitbucket_path(path = '')
|
@@ -51,4 +51,4 @@ module DataKitten
|
|
51
51
|
|
52
52
|
end
|
53
53
|
|
54
|
-
end
|
54
|
+
end
|
@@ -30,7 +30,7 @@ module DataKitten
|
|
30
30
|
# @return [String] The supplied path with the Gist base URL prepended
|
31
31
|
#
|
32
32
|
# @example
|
33
|
-
# dataset = Dataset.new(
|
33
|
+
# dataset = Dataset.new('git://gist.github.com/5633865.git')
|
34
34
|
# dataset.gist_path # => 'https://gist.github.com/5633865'
|
35
35
|
# dataset.gist_path('download') # => 'https://gist.github.com/5633865/download'
|
36
36
|
def gist_path(path = '')
|
@@ -47,4 +47,4 @@ module DataKitten
|
|
47
47
|
|
48
48
|
end
|
49
49
|
|
50
|
-
end
|
50
|
+
end
|
@@ -30,7 +30,7 @@ module DataKitten
|
|
30
30
|
# @return [String] The supplied path with the GitHub base URL prepended
|
31
31
|
#
|
32
32
|
# @example
|
33
|
-
# dataset = Dataset.new(
|
33
|
+
# dataset = Dataset.new('git://github.com/theodi/dataset-metadata-survey.git')
|
34
34
|
# dataset.github_path # => 'https://github.com/theodi/dataset-metadata-survey/'
|
35
35
|
# dataset.github_path('issues') # => 'https://github.com/theodi/dataset-metadata-survey/issues'
|
36
36
|
def github_path(path = '')
|
@@ -51,4 +51,4 @@ module DataKitten
|
|
51
51
|
|
52
52
|
end
|
53
53
|
|
54
|
-
end
|
54
|
+
end
|
@@ -6,29 +6,36 @@ module DataKitten
|
|
6
6
|
|
7
7
|
module CKAN
|
8
8
|
|
9
|
-
@@metadata = nil
|
10
|
-
|
11
9
|
private
|
12
10
|
|
13
11
|
def self.supported?(instance)
|
14
12
|
uri = instance.uri
|
15
|
-
|
13
|
+
base_uri = uri.merge("/")
|
14
|
+
*base, package = uri.path.split('/')
|
15
|
+
# If the 2nd to last element in the path is 'dataset' then it's probably
|
16
|
+
# the CKAN dataset view page, the last element will be the dataset id
|
17
|
+
# or name
|
18
|
+
if base.last == "dataset"
|
19
|
+
instance.identifier = package
|
20
|
+
# build a base URI ending with a /
|
21
|
+
base_uri = uri.merge(base[0...-1].join('/') + '/')
|
16
22
|
# If the package is a UUID - it's more than likely to be a CKAN ID
|
17
|
-
|
18
|
-
|
23
|
+
elsif package.match(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/)
|
24
|
+
instance.identifier = package
|
19
25
|
else
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
results = RestClient.get "#{uri.scheme}://#{uri.host}/api/2/rest/dataset/#{package}"
|
26
|
+
results = begin
|
27
|
+
RestClient.get base_uri.merge("api/3/action/package_show").to_s, {:params => {:id => package}}
|
28
|
+
rescue RestClient::Exception
|
29
|
+
RestClient.get base_uri.merge("api/2/rest/dataset/#{package}").to_s
|
25
30
|
end
|
26
31
|
|
27
32
|
result = JSON.parse results
|
28
|
-
|
33
|
+
instance.identifier = result.fetch("result", result)["id"]
|
29
34
|
end
|
30
|
-
|
31
|
-
|
35
|
+
instance.metadata = JSON.parse RestClient.get base_uri.merge("api/rest/package/#{instance.identifier}").to_s
|
36
|
+
instance.metadata.extend(GuessableLookup)
|
37
|
+
instance.source = instance.metadata
|
38
|
+
return true
|
32
39
|
rescue
|
33
40
|
false
|
34
41
|
end
|
@@ -62,7 +69,7 @@ module DataKitten
|
|
62
69
|
#
|
63
70
|
# @see Dataset#identifier
|
64
71
|
def identifier
|
65
|
-
metadata.lookup("name") ||
|
72
|
+
metadata.lookup("name") || @identifier
|
66
73
|
end
|
67
74
|
|
68
75
|
# A web page which can be used to gain access to the dataset
|
@@ -210,10 +217,6 @@ module DataKitten
|
|
210
217
|
|
211
218
|
private
|
212
219
|
|
213
|
-
def metadata
|
214
|
-
@@metadata
|
215
|
-
end
|
216
|
-
|
217
220
|
def select_extras(group, key)
|
218
221
|
extra = group["extras"][key] rescue ""
|
219
222
|
if extra == ""
|
data/lib/data_kitten/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_kitten
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.
|
4
|
+
version: 1.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Smith
|
@@ -95,20 +95,6 @@ dependencies:
|
|
95
95
|
- - ! '>='
|
96
96
|
- !ruby/object:Gem::Version
|
97
97
|
version: '0'
|
98
|
-
- !ruby/object:Gem::Dependency
|
99
|
-
name: curb
|
100
|
-
requirement: !ruby/object:Gem::Requirement
|
101
|
-
requirements:
|
102
|
-
- - ! '>='
|
103
|
-
- !ruby/object:Gem::Version
|
104
|
-
version: '0'
|
105
|
-
type: :runtime
|
106
|
-
prerelease: false
|
107
|
-
version_requirements: !ruby/object:Gem::Requirement
|
108
|
-
requirements:
|
109
|
-
- - ! '>='
|
110
|
-
- !ruby/object:Gem::Version
|
111
|
-
version: '0'
|
112
98
|
- !ruby/object:Gem::Dependency
|
113
99
|
name: datapackage
|
114
100
|
requirement: !ruby/object:Gem::Requirement
|