data_kitten 1.3.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MTU4NmQ1MmU2YjJhY2U2NmVjOWE5NTBhOGM2YjNhNGQzMWIxYTU4MQ==
4
+ MzYzNmUyZmIxZGNlOGViNGQwMmQ5YThiZDQzYWZiNTc2OTBkZWYwYQ==
5
5
  data.tar.gz: !binary |-
6
- ZjNhNWU4NzNlZjI4ODU3ZTRkYTgxZmY4MWM0NTA3OTQyNDNmMmJjMg==
6
+ NDVhMTRkZWRlMGI2YWEzNWZhNmRjZTRkNWJmNjAwOTRiNTVkMmQ0Yw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- ZjY0ODMyZmZjODMxNmU4NmNhMmI4N2YxMWMzMThhNGFlMTAxZmQ0ODA1OTcz
10
- MjcwOWIwMmYyMmRiNDlmNTEyN2ExMWE4N2E4NWJlMDFlNjI3NDU4ZmZjZWFj
11
- Yjk2MWZiZGJmODE5ZDVjMWQ0OTY2ODI4MWRiMzVlZmE5NjM0OGM=
9
+ MDg5ZGZiNjRjYjM3ODgwNjRlYWU2NTlhM2EzMTgyNmRhOGM4MjQ1YWYzMGU3
10
+ YjU1NTI2ZmY4OGJkNWRiYTZhYzNmM2QzZWUwMjQzMDlhZGViYjlmYjM3ZmU1
11
+ MzA1ZGJkZGU5MWM0NjIyZWMzY2M5YThiNjA1ZTIwOTg4OGY2ZDY=
12
12
  data.tar.gz: !binary |-
13
- MjY5NDZiZjliNThjZTk5NTQ4YzZlM2M4OGFiOGYzYjFjZDhmYzQ4NmRlMjJm
14
- ZmE4MTMxZTMyZmQzNjBhODEyODZhODA3ZjIyMDUzNDFmMWJiMWRhNTBlMTU3
15
- MTM2MTU4ZGY4YmZlMjcyM2VkYmM5Y2Q3NzY3YWJmNmNlODdlZTI=
13
+ OTVmZTRlODU0ZDJkMmRhMjU3YTViY2VlODlhNDBlYjVkYjdlM2RhNDE4N2Uw
14
+ MGQ1MTQ4NTY2MGY2YTIwZjY4YmE5NzQ1OWExODJjNWM3MDk0YmU3YjcxZGVm
15
+ Njg0ZjQ0NzFjN2I0NmM5OTM4OWM0MzhlNTY2OWIyMjYyZDZiM2U=
data/README.md CHANGED
@@ -45,7 +45,7 @@ Require if you need to:
45
45
 
46
46
  Request a dataset:
47
47
 
48
- dataset = DataKitten::Dataset.new(access_url: "https://github.com/theodi/dataset-mod-disposals.git")
48
+ dataset = DataKitten::Dataset.new("https://github.com/theodi/dataset-mod-disposals.git")
49
49
 
50
50
  Use the results:
51
51
 
data/bin/data_kitten CHANGED
@@ -9,7 +9,7 @@ if ARGV.length == 0
9
9
  exit 1
10
10
  end
11
11
 
12
- dataset = DataKitten::Dataset.new(access_url: ARGV[0])
12
+ dataset = DataKitten::Dataset.new(ARGV[0])
13
13
 
14
14
  if dataset.publishing_format == nil
15
15
  puts "Unable to determine format for dataset metadata"
data/lib/data_kitten.rb CHANGED
@@ -8,7 +8,6 @@ require 'rdf'
8
8
  require 'linkeddata'
9
9
  require 'nokogiri'
10
10
  require 'uri'
11
- require 'curb'
12
11
  require 'datapackage'
13
12
 
14
13
  require 'data_kitten/license'
@@ -41,4 +40,4 @@ require 'data_kitten/fetcher'
41
40
  # dataset.publishing_format # => :datapackage
42
41
  # dataset.distributions # => [Distribution<#1>, Distribution<#2>]
43
42
  # dataset.distributions[0].headers # => ['col1', 'col2']
44
- # dataset.distributions[0].data[0] # => {'col1' => 'value_1', 'col2' => 'value_2'}
43
+ # dataset.distributions[0].data[0] # => {'col1' => 'value_1', 'col2' => 'value_2'}
@@ -13,7 +13,7 @@ module DataKitten
13
13
  # use the Datapackage metadata format.
14
14
  #
15
15
  # @example Load a Dataset from a git repository
16
- # dataset = Dataset.new(access_url: 'git://github.com/theodi/dataset-metadata-survey.git')
16
+ # dataset = Dataset.new('git://github.com/theodi/dataset-metadata-survey.git')
17
17
  # dataset.supported? # => true
18
18
  # dataset.origin # => :git
19
19
  # dataset.host # => :github
@@ -30,13 +30,25 @@ module DataKitten
30
30
  attr_accessor :access_url
31
31
 
32
32
  # Create a new Dataset object
33
- #
34
- # @param [Hash] options the details of the Dataset.
35
- # @option options [String] :access_url A URL that can be used to access the Dataset.
36
- # The class will attempt to auto-load metadata from this URL.
37
33
  #
38
- def initialize(options)
39
- @access_url = DataKitten::Fetcher.wrap(options[:access_url])
34
+ # The class will attempt to auto-load metadata from this URL.
35
+ #
36
+ # @overload new(url)
37
+ # @param [String] url A URL that can be used to access the Dataset
38
+ #
39
+ # @overload new(options)
40
+ # @param [Hash] options the details of the Dataset.
41
+ # @option options [String] :access_url A URL that can be used to access the Dataset.
42
+ #
43
+ def initialize(url_or_options)
44
+ url = case url_or_options
45
+ when Hash
46
+ url_or_options[:access_url]
47
+ else
48
+ url_or_options
49
+ end
50
+ @access_url = DataKitten::Fetcher.wrap(url)
51
+
40
52
  detect_origin
41
53
  detect_host
42
54
  detect_publishing_format
@@ -51,9 +63,11 @@ module DataKitten
51
63
  end
52
64
 
53
65
  def source
54
- @access_url.as_json if @access_url.ok?
66
+ @source ||= @access_url.as_json if @access_url.ok?
55
67
  end
56
68
 
69
+ attr_writer :source
70
+
57
71
  # Can metadata be loaded for this Dataset?
58
72
  #
59
73
  # @return [Boolean] true if metadata can be loaded, false if it's
@@ -84,9 +98,7 @@ module DataKitten
84
98
  #
85
99
  # @return [String] the identifier of the dataset
86
100
  #
87
- def identifier
88
- nil
89
- end
101
+ attr_accessor :identifier
90
102
 
91
103
  # The human-readable title of the dataset.
92
104
  #
@@ -269,5 +281,7 @@ module DataKitten
269
281
  nil
270
282
  end
271
283
 
284
+ attr_accessor :metadata
285
+
272
286
  end
273
287
  end
@@ -108,6 +108,8 @@ module DataKitten
108
108
  @dialect ||= {
109
109
  "delimiter" => ","
110
110
  }
111
+
112
+ @download = Fetcher.wrap(@download_url)
111
113
  end
112
114
 
113
115
  # A usable name for the distribution, unique within the {Dataset}.
@@ -136,9 +138,7 @@ module DataKitten
136
138
  #
137
139
  # @return [Boolean] whether the HTTP response returns a success code or not
138
140
  def exists?
139
- if @download_url
140
- http_head.response_code != 404
141
- end
141
+ @download.exists?
142
142
  end
143
143
 
144
144
  # A CSV object representing the loaded data.
@@ -148,8 +148,8 @@ module DataKitten
148
148
  @data ||= begin
149
149
  if @path
150
150
  datafile = @dataset.send(:load_file, @path)
151
- elsif @download_url
152
- datafile = RestClient.get @download_url rescue nil
151
+ elsif @download.ok?
152
+ datafile = @download.body
153
153
  end
154
154
  if datafile
155
155
  case format.extension
@@ -170,17 +170,6 @@ module DataKitten
170
170
  end
171
171
  end
172
172
 
173
- def http_head
174
- if @download_url
175
- @http_head ||= begin
176
- Curl::Easy.http_head(@download_url) do |c|
177
- c.follow_location = true
178
- c.useragent = "curb"
179
- end
180
- end
181
- end
182
- end
183
-
184
173
  end
185
174
 
186
175
  end
@@ -6,6 +6,30 @@ module DataKitten
6
6
  #
7
7
  class DistributionFormat
8
8
 
9
+ FORMATS = {
10
+ csv: { structured: true, open: true },
11
+ xls: { structured: true, open: false },
12
+ xlsx: { structured: true, open: true },
13
+ rdf: { structured: true, open: true },
14
+ xml: { structured: true, open: true },
15
+ wms: { structured: true, open: true },
16
+ ods: { structured: true, open: true },
17
+ rdfa: { structured: true, open: true },
18
+ kml: { structured: true, open: true },
19
+ rss: { structured: true, open: true },
20
+ json: { structured: true, open: true },
21
+ ical: { structured: true, open: true },
22
+ sparql: { structured: true, open: true },
23
+ kml: { structured: true, open: true },
24
+ georss: { structured: true, open: true },
25
+ geojson: { structured: true, open: true },
26
+ shp: { structured: true, open: true },
27
+ html: { structured: false, open: true },
28
+ doc: { structured: false, open: false },
29
+ pdf: { structured: false, open: true }
30
+ }
31
+ FORMATS.default = {}
32
+
9
33
  #@!attribute extension
10
34
  #@return [Symbol] a symbol for the file extension. For instance, :csv.
11
35
  attr_reader :extension
@@ -17,43 +41,20 @@ module DataKitten
17
41
  @distribution = distribution
18
42
  # Store extension as a lowercase symbol
19
43
  @extension = distribution.extension.to_s.downcase.to_sym
20
- # Set up format lists
21
- @@formats ||= {
22
- csv: { structured: true, open: true },
23
- xls: { structured: true, open: false },
24
- xlsx: { structured: true, open: true },
25
- rdf: { structured: true, open: true },
26
- xml: { structured: true, open: true },
27
- wms: { structured: true, open: true },
28
- ods: { structured: true, open: true },
29
- rdfa: { structured: true, open: true },
30
- kml: { structured: true, open: true },
31
- rss: { structured: true, open: true },
32
- json: { structured: true, open: true },
33
- ical: { structured: true, open: true },
34
- sparql: { structured: true, open: true },
35
- kml: { structured: true, open: true },
36
- georss: { structured: true, open: true },
37
- geojson: { structured: true, open: true },
38
- shp: { structured: true, open: true },
39
- html: { structured: false, open: true },
40
- doc: { structured: false, open: false },
41
- pdf: { structured: false, open: true },
42
- }
43
44
  end
44
45
 
45
46
  # Is this a structured format?
46
47
  #
47
48
  # @return [Boolean] whether the format is machine-readable or not.
48
49
  def structured?
49
- @@formats[@extension][:structured] rescue nil
50
+ FORMATS[extension][:structured]
50
51
  end
51
52
 
52
53
  # Is this an open format?
53
54
  #
54
55
  # @return [Boolean] whether the format is open or not
55
56
  def open?
56
- @@formats[@extension][:open] rescue nil
57
+ FORMATS[extension][:open]
57
58
  end
58
59
 
59
60
  # Whether the format of the file matches the extension given by the data
@@ -16,6 +16,16 @@ module DataKitten
16
16
  @url = url
17
17
  end
18
18
 
19
+ def exists?
20
+ if @requested
21
+ ok?
22
+ else
23
+ RestClient.head(url).code == 200
24
+ end
25
+ rescue RestClient::ExceptionWithResponse => error
26
+ false
27
+ end
28
+
19
29
  def ok?
20
30
  code == 200
21
31
  end
@@ -30,7 +30,7 @@ module DataKitten
30
30
  # @return [String] The supplied path with the Bitbucket base URL prepended
31
31
  #
32
32
  # @example
33
- # dataset = Dataset.new(access_url: 'https://bitbucket.org/floppy/hot-drinks.git')
33
+ # dataset = Dataset.new('https://bitbucket.org/floppy/hot-drinks.git')
34
34
  # dataset.bitbucket_path # => 'https://bitbucket.org/floppy/hot-drinks/'
35
35
  # dataset.bitbucket_path('pull-requests') # => 'https://bitbucket.org/floppy/hot-drinks/pull-requests'
36
36
  def bitbucket_path(path = '')
@@ -51,4 +51,4 @@ module DataKitten
51
51
 
52
52
  end
53
53
 
54
- end
54
+ end
@@ -30,7 +30,7 @@ module DataKitten
30
30
  # @return [String] The supplied path with the Gist base URL prepended
31
31
  #
32
32
  # @example
33
- # dataset = Dataset.new(access_url: 'git://gist.github.com/5633865.git')
33
+ # dataset = Dataset.new('git://gist.github.com/5633865.git')
34
34
  # dataset.gist_path # => 'https://gist.github.com/5633865'
35
35
  # dataset.gist_path('download') # => 'https://gist.github.com/5633865/download'
36
36
  def gist_path(path = '')
@@ -47,4 +47,4 @@ module DataKitten
47
47
 
48
48
  end
49
49
 
50
- end
50
+ end
@@ -30,7 +30,7 @@ module DataKitten
30
30
  # @return [String] The supplied path with the GitHub base URL prepended
31
31
  #
32
32
  # @example
33
- # dataset = Dataset.new(access_url: 'git://github.com/theodi/dataset-metadata-survey.git')
33
+ # dataset = Dataset.new('git://github.com/theodi/dataset-metadata-survey.git')
34
34
  # dataset.github_path # => 'https://github.com/theodi/dataset-metadata-survey/'
35
35
  # dataset.github_path('issues') # => 'https://github.com/theodi/dataset-metadata-survey/issues'
36
36
  def github_path(path = '')
@@ -51,4 +51,4 @@ module DataKitten
51
51
 
52
52
  end
53
53
 
54
- end
54
+ end
@@ -6,29 +6,36 @@ module DataKitten
6
6
 
7
7
  module CKAN
8
8
 
9
- @@metadata = nil
10
-
11
9
  private
12
10
 
13
11
  def self.supported?(instance)
14
12
  uri = instance.uri
15
- package = uri.path.split("/").last
13
+ base_uri = uri.merge("/")
14
+ *base, package = uri.path.split('/')
15
+ # If the 2nd to last element in the path is 'dataset' then it's probably
16
+ # the CKAN dataset view page, the last element will be the dataset id
17
+ # or name
18
+ if base.last == "dataset"
19
+ instance.identifier = package
20
+ # build a base URI ending with a /
21
+ base_uri = uri.merge(base[0...-1].join('/') + '/')
16
22
  # If the package is a UUID - it's more than likely to be a CKAN ID
17
- if package.match(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/)
18
- @@id = package
23
+ elsif package.match(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/)
24
+ instance.identifier = package
19
25
  else
20
-
21
- results = RestClient.get "#{uri.scheme}://#{uri.host}/api/3/action/package_show", {:params => {:id => package}} rescue ""
22
-
23
- if results == ""
24
- results = RestClient.get "#{uri.scheme}://#{uri.host}/api/2/rest/dataset/#{package}"
26
+ results = begin
27
+ RestClient.get base_uri.merge("api/3/action/package_show").to_s, {:params => {:id => package}}
28
+ rescue RestClient::Exception
29
+ RestClient.get base_uri.merge("api/2/rest/dataset/#{package}").to_s
25
30
  end
26
31
 
27
32
  result = JSON.parse results
28
- @@id = result["result"]["id"] rescue result["id"]
33
+ instance.identifier = result.fetch("result", result)["id"]
29
34
  end
30
- @@metadata = JSON.parse RestClient.get "#{uri.scheme}://#{uri.host}/api/rest/package/#{@@id}"
31
- @@metadata.extend(GuessableLookup)
35
+ instance.metadata = JSON.parse RestClient.get base_uri.merge("api/rest/package/#{instance.identifier}").to_s
36
+ instance.metadata.extend(GuessableLookup)
37
+ instance.source = instance.metadata
38
+ return true
32
39
  rescue
33
40
  false
34
41
  end
@@ -62,7 +69,7 @@ module DataKitten
62
69
  #
63
70
  # @see Dataset#identifier
64
71
  def identifier
65
- metadata.lookup("name") || @@id
72
+ metadata.lookup("name") || @identifier
66
73
  end
67
74
 
68
75
  # A web page which can be used to gain access to the dataset
@@ -210,10 +217,6 @@ module DataKitten
210
217
 
211
218
  private
212
219
 
213
- def metadata
214
- @@metadata
215
- end
216
-
217
220
  def select_extras(group, key)
218
221
  extra = group["extras"][key] rescue ""
219
222
  if extra == ""
@@ -1,3 +1,3 @@
1
1
  module DataKitten
2
- VERSION = "1.3.0"
2
+ VERSION = "1.3.1"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_kitten
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Smith
@@ -95,20 +95,6 @@ dependencies:
95
95
  - - ! '>='
96
96
  - !ruby/object:Gem::Version
97
97
  version: '0'
98
- - !ruby/object:Gem::Dependency
99
- name: curb
100
- requirement: !ruby/object:Gem::Requirement
101
- requirements:
102
- - - ! '>='
103
- - !ruby/object:Gem::Version
104
- version: '0'
105
- type: :runtime
106
- prerelease: false
107
- version_requirements: !ruby/object:Gem::Requirement
108
- requirements:
109
- - - ! '>='
110
- - !ruby/object:Gem::Version
111
- version: '0'
112
98
  - !ruby/object:Gem::Dependency
113
99
  name: datapackage
114
100
  requirement: !ruby/object:Gem::Requirement