data_kitten 1.3.0 → 1.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/README.md +1 -1
- data/bin/data_kitten +1 -1
- data/lib/data_kitten.rb +1 -2
- data/lib/data_kitten/dataset.rb +25 -11
- data/lib/data_kitten/distribution.rb +5 -16
- data/lib/data_kitten/distribution_format.rb +26 -25
- data/lib/data_kitten/fetcher.rb +10 -0
- data/lib/data_kitten/hosts/bitbucket.rb +2 -2
- data/lib/data_kitten/hosts/gist.rb +2 -2
- data/lib/data_kitten/hosts/github.rb +2 -2
- data/lib/data_kitten/publishing_formats/ckan.rb +21 -18
- data/lib/data_kitten/version.rb +1 -1
- metadata +1 -15
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MzYzNmUyZmIxZGNlOGViNGQwMmQ5YThiZDQzYWZiNTc2OTBkZWYwYQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NDVhMTRkZWRlMGI2YWEzNWZhNmRjZTRkNWJmNjAwOTRiNTVkMmQ0Yw==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MDg5ZGZiNjRjYjM3ODgwNjRlYWU2NTlhM2EzMTgyNmRhOGM4MjQ1YWYzMGU3
|
10
|
+
YjU1NTI2ZmY4OGJkNWRiYTZhYzNmM2QzZWUwMjQzMDlhZGViYjlmYjM3ZmU1
|
11
|
+
MzA1ZGJkZGU5MWM0NjIyZWMzY2M5YThiNjA1ZTIwOTg4OGY2ZDY=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
OTVmZTRlODU0ZDJkMmRhMjU3YTViY2VlODlhNDBlYjVkYjdlM2RhNDE4N2Uw
|
14
|
+
MGQ1MTQ4NTY2MGY2YTIwZjY4YmE5NzQ1OWExODJjNWM3MDk0YmU3YjcxZGVm
|
15
|
+
Njg0ZjQ0NzFjN2I0NmM5OTM4OWM0MzhlNTY2OWIyMjYyZDZiM2U=
|
data/README.md
CHANGED
@@ -45,7 +45,7 @@ Require if you need to:
|
|
45
45
|
|
46
46
|
Request a dataset:
|
47
47
|
|
48
|
-
dataset = DataKitten::Dataset.new(
|
48
|
+
dataset = DataKitten::Dataset.new("https://github.com/theodi/dataset-mod-disposals.git")
|
49
49
|
|
50
50
|
Use the results:
|
51
51
|
|
data/bin/data_kitten
CHANGED
data/lib/data_kitten.rb
CHANGED
@@ -8,7 +8,6 @@ require 'rdf'
|
|
8
8
|
require 'linkeddata'
|
9
9
|
require 'nokogiri'
|
10
10
|
require 'uri'
|
11
|
-
require 'curb'
|
12
11
|
require 'datapackage'
|
13
12
|
|
14
13
|
require 'data_kitten/license'
|
@@ -41,4 +40,4 @@ require 'data_kitten/fetcher'
|
|
41
40
|
# dataset.publishing_format # => :datapackage
|
42
41
|
# dataset.distributions # => [Distribution<#1>, Distribution<#2>]
|
43
42
|
# dataset.distributions[0].headers # => ['col1', 'col2']
|
44
|
-
# dataset.distributions[0].data[0] # => {'col1' => 'value_1', 'col2' => 'value_2'}
|
43
|
+
# dataset.distributions[0].data[0] # => {'col1' => 'value_1', 'col2' => 'value_2'}
|
data/lib/data_kitten/dataset.rb
CHANGED
@@ -13,7 +13,7 @@ module DataKitten
|
|
13
13
|
# use the Datapackage metadata format.
|
14
14
|
#
|
15
15
|
# @example Load a Dataset from a git repository
|
16
|
-
# dataset = Dataset.new(
|
16
|
+
# dataset = Dataset.new('git://github.com/theodi/dataset-metadata-survey.git')
|
17
17
|
# dataset.supported? # => true
|
18
18
|
# dataset.origin # => :git
|
19
19
|
# dataset.host # => :github
|
@@ -30,13 +30,25 @@ module DataKitten
|
|
30
30
|
attr_accessor :access_url
|
31
31
|
|
32
32
|
# Create a new Dataset object
|
33
|
-
#
|
34
|
-
# @param [Hash] options the details of the Dataset.
|
35
|
-
# @option options [String] :access_url A URL that can be used to access the Dataset.
|
36
|
-
# The class will attempt to auto-load metadata from this URL.
|
37
33
|
#
|
38
|
-
|
39
|
-
|
34
|
+
# The class will attempt to auto-load metadata from this URL.
|
35
|
+
#
|
36
|
+
# @overload new(url)
|
37
|
+
# @param [String] url A URL that can be used to access the Dataset
|
38
|
+
#
|
39
|
+
# @overload new(options)
|
40
|
+
# @param [Hash] options the details of the Dataset.
|
41
|
+
# @option options [String] :access_url A URL that can be used to access the Dataset.
|
42
|
+
#
|
43
|
+
def initialize(url_or_options)
|
44
|
+
url = case url_or_options
|
45
|
+
when Hash
|
46
|
+
url_or_options[:access_url]
|
47
|
+
else
|
48
|
+
url_or_options
|
49
|
+
end
|
50
|
+
@access_url = DataKitten::Fetcher.wrap(url)
|
51
|
+
|
40
52
|
detect_origin
|
41
53
|
detect_host
|
42
54
|
detect_publishing_format
|
@@ -51,9 +63,11 @@ module DataKitten
|
|
51
63
|
end
|
52
64
|
|
53
65
|
def source
|
54
|
-
@access_url.as_json if @access_url.ok?
|
66
|
+
@source ||= @access_url.as_json if @access_url.ok?
|
55
67
|
end
|
56
68
|
|
69
|
+
attr_writer :source
|
70
|
+
|
57
71
|
# Can metadata be loaded for this Dataset?
|
58
72
|
#
|
59
73
|
# @return [Boolean] true if metadata can be loaded, false if it's
|
@@ -84,9 +98,7 @@ module DataKitten
|
|
84
98
|
#
|
85
99
|
# @return [String] the identifier of the dataset
|
86
100
|
#
|
87
|
-
|
88
|
-
nil
|
89
|
-
end
|
101
|
+
attr_accessor :identifier
|
90
102
|
|
91
103
|
# The human-readable title of the dataset.
|
92
104
|
#
|
@@ -269,5 +281,7 @@ module DataKitten
|
|
269
281
|
nil
|
270
282
|
end
|
271
283
|
|
284
|
+
attr_accessor :metadata
|
285
|
+
|
272
286
|
end
|
273
287
|
end
|
@@ -108,6 +108,8 @@ module DataKitten
|
|
108
108
|
@dialect ||= {
|
109
109
|
"delimiter" => ","
|
110
110
|
}
|
111
|
+
|
112
|
+
@download = Fetcher.wrap(@download_url)
|
111
113
|
end
|
112
114
|
|
113
115
|
# A usable name for the distribution, unique within the {Dataset}.
|
@@ -136,9 +138,7 @@ module DataKitten
|
|
136
138
|
#
|
137
139
|
# @return [Boolean] whether the HTTP response returns a success code or not
|
138
140
|
def exists?
|
139
|
-
|
140
|
-
http_head.response_code != 404
|
141
|
-
end
|
141
|
+
@download.exists?
|
142
142
|
end
|
143
143
|
|
144
144
|
# A CSV object representing the loaded data.
|
@@ -148,8 +148,8 @@ module DataKitten
|
|
148
148
|
@data ||= begin
|
149
149
|
if @path
|
150
150
|
datafile = @dataset.send(:load_file, @path)
|
151
|
-
elsif @
|
152
|
-
datafile =
|
151
|
+
elsif @download.ok?
|
152
|
+
datafile = @download.body
|
153
153
|
end
|
154
154
|
if datafile
|
155
155
|
case format.extension
|
@@ -170,17 +170,6 @@ module DataKitten
|
|
170
170
|
end
|
171
171
|
end
|
172
172
|
|
173
|
-
def http_head
|
174
|
-
if @download_url
|
175
|
-
@http_head ||= begin
|
176
|
-
Curl::Easy.http_head(@download_url) do |c|
|
177
|
-
c.follow_location = true
|
178
|
-
c.useragent = "curb"
|
179
|
-
end
|
180
|
-
end
|
181
|
-
end
|
182
|
-
end
|
183
|
-
|
184
173
|
end
|
185
174
|
|
186
175
|
end
|
@@ -6,6 +6,30 @@ module DataKitten
|
|
6
6
|
#
|
7
7
|
class DistributionFormat
|
8
8
|
|
9
|
+
FORMATS = {
|
10
|
+
csv: { structured: true, open: true },
|
11
|
+
xls: { structured: true, open: false },
|
12
|
+
xlsx: { structured: true, open: true },
|
13
|
+
rdf: { structured: true, open: true },
|
14
|
+
xml: { structured: true, open: true },
|
15
|
+
wms: { structured: true, open: true },
|
16
|
+
ods: { structured: true, open: true },
|
17
|
+
rdfa: { structured: true, open: true },
|
18
|
+
kml: { structured: true, open: true },
|
19
|
+
rss: { structured: true, open: true },
|
20
|
+
json: { structured: true, open: true },
|
21
|
+
ical: { structured: true, open: true },
|
22
|
+
sparql: { structured: true, open: true },
|
23
|
+
kml: { structured: true, open: true },
|
24
|
+
georss: { structured: true, open: true },
|
25
|
+
geojson: { structured: true, open: true },
|
26
|
+
shp: { structured: true, open: true },
|
27
|
+
html: { structured: false, open: true },
|
28
|
+
doc: { structured: false, open: false },
|
29
|
+
pdf: { structured: false, open: true }
|
30
|
+
}
|
31
|
+
FORMATS.default = {}
|
32
|
+
|
9
33
|
#@!attribute extension
|
10
34
|
#@return [Symbol] a symbol for the file extension. For instance, :csv.
|
11
35
|
attr_reader :extension
|
@@ -17,43 +41,20 @@ module DataKitten
|
|
17
41
|
@distribution = distribution
|
18
42
|
# Store extension as a lowercase symbol
|
19
43
|
@extension = distribution.extension.to_s.downcase.to_sym
|
20
|
-
# Set up format lists
|
21
|
-
@@formats ||= {
|
22
|
-
csv: { structured: true, open: true },
|
23
|
-
xls: { structured: true, open: false },
|
24
|
-
xlsx: { structured: true, open: true },
|
25
|
-
rdf: { structured: true, open: true },
|
26
|
-
xml: { structured: true, open: true },
|
27
|
-
wms: { structured: true, open: true },
|
28
|
-
ods: { structured: true, open: true },
|
29
|
-
rdfa: { structured: true, open: true },
|
30
|
-
kml: { structured: true, open: true },
|
31
|
-
rss: { structured: true, open: true },
|
32
|
-
json: { structured: true, open: true },
|
33
|
-
ical: { structured: true, open: true },
|
34
|
-
sparql: { structured: true, open: true },
|
35
|
-
kml: { structured: true, open: true },
|
36
|
-
georss: { structured: true, open: true },
|
37
|
-
geojson: { structured: true, open: true },
|
38
|
-
shp: { structured: true, open: true },
|
39
|
-
html: { structured: false, open: true },
|
40
|
-
doc: { structured: false, open: false },
|
41
|
-
pdf: { structured: false, open: true },
|
42
|
-
}
|
43
44
|
end
|
44
45
|
|
45
46
|
# Is this a structured format?
|
46
47
|
#
|
47
48
|
# @return [Boolean] whether the format is machine-readable or not.
|
48
49
|
def structured?
|
49
|
-
|
50
|
+
FORMATS[extension][:structured]
|
50
51
|
end
|
51
52
|
|
52
53
|
# Is this an open format?
|
53
54
|
#
|
54
55
|
# @return [Boolean] whether the format is open or not
|
55
56
|
def open?
|
56
|
-
|
57
|
+
FORMATS[extension][:open]
|
57
58
|
end
|
58
59
|
|
59
60
|
# Whether the format of the file matches the extension given by the data
|
data/lib/data_kitten/fetcher.rb
CHANGED
@@ -30,7 +30,7 @@ module DataKitten
|
|
30
30
|
# @return [String] The supplied path with the Bitbucket base URL prepended
|
31
31
|
#
|
32
32
|
# @example
|
33
|
-
# dataset = Dataset.new(
|
33
|
+
# dataset = Dataset.new('https://bitbucket.org/floppy/hot-drinks.git')
|
34
34
|
# dataset.bitbucket_path # => 'https://bitbucket.org/floppy/hot-drinks/'
|
35
35
|
# dataset.bitbucket_path('pull-requests') # => 'https://bitbucket.org/floppy/hot-drinks/pull-requests'
|
36
36
|
def bitbucket_path(path = '')
|
@@ -51,4 +51,4 @@ module DataKitten
|
|
51
51
|
|
52
52
|
end
|
53
53
|
|
54
|
-
end
|
54
|
+
end
|
@@ -30,7 +30,7 @@ module DataKitten
|
|
30
30
|
# @return [String] The supplied path with the Gist base URL prepended
|
31
31
|
#
|
32
32
|
# @example
|
33
|
-
# dataset = Dataset.new(
|
33
|
+
# dataset = Dataset.new('git://gist.github.com/5633865.git')
|
34
34
|
# dataset.gist_path # => 'https://gist.github.com/5633865'
|
35
35
|
# dataset.gist_path('download') # => 'https://gist.github.com/5633865/download'
|
36
36
|
def gist_path(path = '')
|
@@ -47,4 +47,4 @@ module DataKitten
|
|
47
47
|
|
48
48
|
end
|
49
49
|
|
50
|
-
end
|
50
|
+
end
|
@@ -30,7 +30,7 @@ module DataKitten
|
|
30
30
|
# @return [String] The supplied path with the GitHub base URL prepended
|
31
31
|
#
|
32
32
|
# @example
|
33
|
-
# dataset = Dataset.new(
|
33
|
+
# dataset = Dataset.new('git://github.com/theodi/dataset-metadata-survey.git')
|
34
34
|
# dataset.github_path # => 'https://github.com/theodi/dataset-metadata-survey/'
|
35
35
|
# dataset.github_path('issues') # => 'https://github.com/theodi/dataset-metadata-survey/issues'
|
36
36
|
def github_path(path = '')
|
@@ -51,4 +51,4 @@ module DataKitten
|
|
51
51
|
|
52
52
|
end
|
53
53
|
|
54
|
-
end
|
54
|
+
end
|
@@ -6,29 +6,36 @@ module DataKitten
|
|
6
6
|
|
7
7
|
module CKAN
|
8
8
|
|
9
|
-
@@metadata = nil
|
10
|
-
|
11
9
|
private
|
12
10
|
|
13
11
|
def self.supported?(instance)
|
14
12
|
uri = instance.uri
|
15
|
-
|
13
|
+
base_uri = uri.merge("/")
|
14
|
+
*base, package = uri.path.split('/')
|
15
|
+
# If the 2nd to last element in the path is 'dataset' then it's probably
|
16
|
+
# the CKAN dataset view page, the last element will be the dataset id
|
17
|
+
# or name
|
18
|
+
if base.last == "dataset"
|
19
|
+
instance.identifier = package
|
20
|
+
# build a base URI ending with a /
|
21
|
+
base_uri = uri.merge(base[0...-1].join('/') + '/')
|
16
22
|
# If the package is a UUID - it's more than likely to be a CKAN ID
|
17
|
-
|
18
|
-
|
23
|
+
elsif package.match(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/)
|
24
|
+
instance.identifier = package
|
19
25
|
else
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
results = RestClient.get "#{uri.scheme}://#{uri.host}/api/2/rest/dataset/#{package}"
|
26
|
+
results = begin
|
27
|
+
RestClient.get base_uri.merge("api/3/action/package_show").to_s, {:params => {:id => package}}
|
28
|
+
rescue RestClient::Exception
|
29
|
+
RestClient.get base_uri.merge("api/2/rest/dataset/#{package}").to_s
|
25
30
|
end
|
26
31
|
|
27
32
|
result = JSON.parse results
|
28
|
-
|
33
|
+
instance.identifier = result.fetch("result", result)["id"]
|
29
34
|
end
|
30
|
-
|
31
|
-
|
35
|
+
instance.metadata = JSON.parse RestClient.get base_uri.merge("api/rest/package/#{instance.identifier}").to_s
|
36
|
+
instance.metadata.extend(GuessableLookup)
|
37
|
+
instance.source = instance.metadata
|
38
|
+
return true
|
32
39
|
rescue
|
33
40
|
false
|
34
41
|
end
|
@@ -62,7 +69,7 @@ module DataKitten
|
|
62
69
|
#
|
63
70
|
# @see Dataset#identifier
|
64
71
|
def identifier
|
65
|
-
metadata.lookup("name") ||
|
72
|
+
metadata.lookup("name") || @identifier
|
66
73
|
end
|
67
74
|
|
68
75
|
# A web page which can be used to gain access to the dataset
|
@@ -210,10 +217,6 @@ module DataKitten
|
|
210
217
|
|
211
218
|
private
|
212
219
|
|
213
|
-
def metadata
|
214
|
-
@@metadata
|
215
|
-
end
|
216
|
-
|
217
220
|
def select_extras(group, key)
|
218
221
|
extra = group["extras"][key] rescue ""
|
219
222
|
if extra == ""
|
data/lib/data_kitten/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_kitten
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 1.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Smith
|
@@ -95,20 +95,6 @@ dependencies:
|
|
95
95
|
- - ! '>='
|
96
96
|
- !ruby/object:Gem::Version
|
97
97
|
version: '0'
|
98
|
-
- !ruby/object:Gem::Dependency
|
99
|
-
name: curb
|
100
|
-
requirement: !ruby/object:Gem::Requirement
|
101
|
-
requirements:
|
102
|
-
- - ! '>='
|
103
|
-
- !ruby/object:Gem::Version
|
104
|
-
version: '0'
|
105
|
-
type: :runtime
|
106
|
-
prerelease: false
|
107
|
-
version_requirements: !ruby/object:Gem::Requirement
|
108
|
-
requirements:
|
109
|
-
- - ! '>='
|
110
|
-
- !ruby/object:Gem::Version
|
111
|
-
version: '0'
|
112
98
|
- !ruby/object:Gem::Dependency
|
113
99
|
name: datapackage
|
114
100
|
requirement: !ruby/object:Gem::Requirement
|