data_kitten 1.3.0 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MTU4NmQ1MmU2YjJhY2U2NmVjOWE5NTBhOGM2YjNhNGQzMWIxYTU4MQ==
4
+ MzYzNmUyZmIxZGNlOGViNGQwMmQ5YThiZDQzYWZiNTc2OTBkZWYwYQ==
5
5
  data.tar.gz: !binary |-
6
- ZjNhNWU4NzNlZjI4ODU3ZTRkYTgxZmY4MWM0NTA3OTQyNDNmMmJjMg==
6
+ NDVhMTRkZWRlMGI2YWEzNWZhNmRjZTRkNWJmNjAwOTRiNTVkMmQ0Yw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- ZjY0ODMyZmZjODMxNmU4NmNhMmI4N2YxMWMzMThhNGFlMTAxZmQ0ODA1OTcz
10
- MjcwOWIwMmYyMmRiNDlmNTEyN2ExMWE4N2E4NWJlMDFlNjI3NDU4ZmZjZWFj
11
- Yjk2MWZiZGJmODE5ZDVjMWQ0OTY2ODI4MWRiMzVlZmE5NjM0OGM=
9
+ MDg5ZGZiNjRjYjM3ODgwNjRlYWU2NTlhM2EzMTgyNmRhOGM4MjQ1YWYzMGU3
10
+ YjU1NTI2ZmY4OGJkNWRiYTZhYzNmM2QzZWUwMjQzMDlhZGViYjlmYjM3ZmU1
11
+ MzA1ZGJkZGU5MWM0NjIyZWMzY2M5YThiNjA1ZTIwOTg4OGY2ZDY=
12
12
  data.tar.gz: !binary |-
13
- MjY5NDZiZjliNThjZTk5NTQ4YzZlM2M4OGFiOGYzYjFjZDhmYzQ4NmRlMjJm
14
- ZmE4MTMxZTMyZmQzNjBhODEyODZhODA3ZjIyMDUzNDFmMWJiMWRhNTBlMTU3
15
- MTM2MTU4ZGY4YmZlMjcyM2VkYmM5Y2Q3NzY3YWJmNmNlODdlZTI=
13
+ OTVmZTRlODU0ZDJkMmRhMjU3YTViY2VlODlhNDBlYjVkYjdlM2RhNDE4N2Uw
14
+ MGQ1MTQ4NTY2MGY2YTIwZjY4YmE5NzQ1OWExODJjNWM3MDk0YmU3YjcxZGVm
15
+ Njg0ZjQ0NzFjN2I0NmM5OTM4OWM0MzhlNTY2OWIyMjYyZDZiM2U=
data/README.md CHANGED
@@ -45,7 +45,7 @@ Require if you need to:
45
45
 
46
46
  Request a dataset:
47
47
 
48
- dataset = DataKitten::Dataset.new(access_url: "https://github.com/theodi/dataset-mod-disposals.git")
48
+ dataset = DataKitten::Dataset.new("https://github.com/theodi/dataset-mod-disposals.git")
49
49
 
50
50
  Use the results:
51
51
 
data/bin/data_kitten CHANGED
@@ -9,7 +9,7 @@ if ARGV.length == 0
9
9
  exit 1
10
10
  end
11
11
 
12
- dataset = DataKitten::Dataset.new(access_url: ARGV[0])
12
+ dataset = DataKitten::Dataset.new(ARGV[0])
13
13
 
14
14
  if dataset.publishing_format == nil
15
15
  puts "Unable to determine format for dataset metadata"
data/lib/data_kitten.rb CHANGED
@@ -8,7 +8,6 @@ require 'rdf'
8
8
  require 'linkeddata'
9
9
  require 'nokogiri'
10
10
  require 'uri'
11
- require 'curb'
12
11
  require 'datapackage'
13
12
 
14
13
  require 'data_kitten/license'
@@ -41,4 +40,4 @@ require 'data_kitten/fetcher'
41
40
  # dataset.publishing_format # => :datapackage
42
41
  # dataset.distributions # => [Distribution<#1>, Distribution<#2>]
43
42
  # dataset.distributions[0].headers # => ['col1', 'col2']
44
- # dataset.distributions[0].data[0] # => {'col1' => 'value_1', 'col2' => 'value_2'}
43
+ # dataset.distributions[0].data[0] # => {'col1' => 'value_1', 'col2' => 'value_2'}
@@ -13,7 +13,7 @@ module DataKitten
13
13
  # use the Datapackage metadata format.
14
14
  #
15
15
  # @example Load a Dataset from a git repository
16
- # dataset = Dataset.new(access_url: 'git://github.com/theodi/dataset-metadata-survey.git')
16
+ # dataset = Dataset.new('git://github.com/theodi/dataset-metadata-survey.git')
17
17
  # dataset.supported? # => true
18
18
  # dataset.origin # => :git
19
19
  # dataset.host # => :github
@@ -30,13 +30,25 @@ module DataKitten
30
30
  attr_accessor :access_url
31
31
 
32
32
  # Create a new Dataset object
33
- #
34
- # @param [Hash] options the details of the Dataset.
35
- # @option options [String] :access_url A URL that can be used to access the Dataset.
36
- # The class will attempt to auto-load metadata from this URL.
37
33
  #
38
- def initialize(options)
39
- @access_url = DataKitten::Fetcher.wrap(options[:access_url])
34
+ # The class will attempt to auto-load metadata from this URL.
35
+ #
36
+ # @overload new(url)
37
+ # @param [String] url A URL that can be used to access the Dataset
38
+ #
39
+ # @overload new(options)
40
+ # @param [Hash] options the details of the Dataset.
41
+ # @option options [String] :access_url A URL that can be used to access the Dataset.
42
+ #
43
+ def initialize(url_or_options)
44
+ url = case url_or_options
45
+ when Hash
46
+ url_or_options[:access_url]
47
+ else
48
+ url_or_options
49
+ end
50
+ @access_url = DataKitten::Fetcher.wrap(url)
51
+
40
52
  detect_origin
41
53
  detect_host
42
54
  detect_publishing_format
@@ -51,9 +63,11 @@ module DataKitten
51
63
  end
52
64
 
53
65
  def source
54
- @access_url.as_json if @access_url.ok?
66
+ @source ||= @access_url.as_json if @access_url.ok?
55
67
  end
56
68
 
69
+ attr_writer :source
70
+
57
71
  # Can metadata be loaded for this Dataset?
58
72
  #
59
73
  # @return [Boolean] true if metadata can be loaded, false if it's
@@ -84,9 +98,7 @@ module DataKitten
84
98
  #
85
99
  # @return [String] the identifier of the dataset
86
100
  #
87
- def identifier
88
- nil
89
- end
101
+ attr_accessor :identifier
90
102
 
91
103
  # The human-readable title of the dataset.
92
104
  #
@@ -269,5 +281,7 @@ module DataKitten
269
281
  nil
270
282
  end
271
283
 
284
+ attr_accessor :metadata
285
+
272
286
  end
273
287
  end
@@ -108,6 +108,8 @@ module DataKitten
108
108
  @dialect ||= {
109
109
  "delimiter" => ","
110
110
  }
111
+
112
+ @download = Fetcher.wrap(@download_url)
111
113
  end
112
114
 
113
115
  # A usable name for the distribution, unique within the {Dataset}.
@@ -136,9 +138,7 @@ module DataKitten
136
138
  #
137
139
  # @return [Boolean] whether the HTTP response returns a success code or not
138
140
  def exists?
139
- if @download_url
140
- http_head.response_code != 404
141
- end
141
+ @download.exists?
142
142
  end
143
143
 
144
144
  # A CSV object representing the loaded data.
@@ -148,8 +148,8 @@ module DataKitten
148
148
  @data ||= begin
149
149
  if @path
150
150
  datafile = @dataset.send(:load_file, @path)
151
- elsif @download_url
152
- datafile = RestClient.get @download_url rescue nil
151
+ elsif @download.ok?
152
+ datafile = @download.body
153
153
  end
154
154
  if datafile
155
155
  case format.extension
@@ -170,17 +170,6 @@ module DataKitten
170
170
  end
171
171
  end
172
172
 
173
- def http_head
174
- if @download_url
175
- @http_head ||= begin
176
- Curl::Easy.http_head(@download_url) do |c|
177
- c.follow_location = true
178
- c.useragent = "curb"
179
- end
180
- end
181
- end
182
- end
183
-
184
173
  end
185
174
 
186
175
  end
@@ -6,6 +6,30 @@ module DataKitten
6
6
  #
7
7
  class DistributionFormat
8
8
 
9
+ FORMATS = {
10
+ csv: { structured: true, open: true },
11
+ xls: { structured: true, open: false },
12
+ xlsx: { structured: true, open: true },
13
+ rdf: { structured: true, open: true },
14
+ xml: { structured: true, open: true },
15
+ wms: { structured: true, open: true },
16
+ ods: { structured: true, open: true },
17
+ rdfa: { structured: true, open: true },
18
+ kml: { structured: true, open: true },
19
+ rss: { structured: true, open: true },
20
+ json: { structured: true, open: true },
21
+ ical: { structured: true, open: true },
22
+ sparql: { structured: true, open: true },
23
+ kml: { structured: true, open: true },
24
+ georss: { structured: true, open: true },
25
+ geojson: { structured: true, open: true },
26
+ shp: { structured: true, open: true },
27
+ html: { structured: false, open: true },
28
+ doc: { structured: false, open: false },
29
+ pdf: { structured: false, open: true }
30
+ }
31
+ FORMATS.default = {}
32
+
9
33
  #@!attribute extension
10
34
  #@return [Symbol] a symbol for the file extension. For instance, :csv.
11
35
  attr_reader :extension
@@ -17,43 +41,20 @@ module DataKitten
17
41
  @distribution = distribution
18
42
  # Store extension as a lowercase symbol
19
43
  @extension = distribution.extension.to_s.downcase.to_sym
20
- # Set up format lists
21
- @@formats ||= {
22
- csv: { structured: true, open: true },
23
- xls: { structured: true, open: false },
24
- xlsx: { structured: true, open: true },
25
- rdf: { structured: true, open: true },
26
- xml: { structured: true, open: true },
27
- wms: { structured: true, open: true },
28
- ods: { structured: true, open: true },
29
- rdfa: { structured: true, open: true },
30
- kml: { structured: true, open: true },
31
- rss: { structured: true, open: true },
32
- json: { structured: true, open: true },
33
- ical: { structured: true, open: true },
34
- sparql: { structured: true, open: true },
35
- kml: { structured: true, open: true },
36
- georss: { structured: true, open: true },
37
- geojson: { structured: true, open: true },
38
- shp: { structured: true, open: true },
39
- html: { structured: false, open: true },
40
- doc: { structured: false, open: false },
41
- pdf: { structured: false, open: true },
42
- }
43
44
  end
44
45
 
45
46
  # Is this a structured format?
46
47
  #
47
48
  # @return [Boolean] whether the format is machine-readable or not.
48
49
  def structured?
49
- @@formats[@extension][:structured] rescue nil
50
+ FORMATS[extension][:structured]
50
51
  end
51
52
 
52
53
  # Is this an open format?
53
54
  #
54
55
  # @return [Boolean] whether the format is open or not
55
56
  def open?
56
- @@formats[@extension][:open] rescue nil
57
+ FORMATS[extension][:open]
57
58
  end
58
59
 
59
60
  # Whether the format of the file matches the extension given by the data
@@ -16,6 +16,16 @@ module DataKitten
16
16
  @url = url
17
17
  end
18
18
 
19
+ def exists?
20
+ if @requested
21
+ ok?
22
+ else
23
+ RestClient.head(url).code == 200
24
+ end
25
+ rescue RestClient::ExceptionWithResponse => error
26
+ false
27
+ end
28
+
19
29
  def ok?
20
30
  code == 200
21
31
  end
@@ -30,7 +30,7 @@ module DataKitten
30
30
  # @return [String] The supplied path with the Bitbucket base URL prepended
31
31
  #
32
32
  # @example
33
- # dataset = Dataset.new(access_url: 'https://bitbucket.org/floppy/hot-drinks.git')
33
+ # dataset = Dataset.new('https://bitbucket.org/floppy/hot-drinks.git')
34
34
  # dataset.bitbucket_path # => 'https://bitbucket.org/floppy/hot-drinks/'
35
35
  # dataset.bitbucket_path('pull-requests') # => 'https://bitbucket.org/floppy/hot-drinks/pull-requests'
36
36
  def bitbucket_path(path = '')
@@ -51,4 +51,4 @@ module DataKitten
51
51
 
52
52
  end
53
53
 
54
- end
54
+ end
@@ -30,7 +30,7 @@ module DataKitten
30
30
  # @return [String] The supplied path with the Gist base URL prepended
31
31
  #
32
32
  # @example
33
- # dataset = Dataset.new(access_url: 'git://gist.github.com/5633865.git')
33
+ # dataset = Dataset.new('git://gist.github.com/5633865.git')
34
34
  # dataset.gist_path # => 'https://gist.github.com/5633865'
35
35
  # dataset.gist_path('download') # => 'https://gist.github.com/5633865/download'
36
36
  def gist_path(path = '')
@@ -47,4 +47,4 @@ module DataKitten
47
47
 
48
48
  end
49
49
 
50
- end
50
+ end
@@ -30,7 +30,7 @@ module DataKitten
30
30
  # @return [String] The supplied path with the GitHub base URL prepended
31
31
  #
32
32
  # @example
33
- # dataset = Dataset.new(access_url: 'git://github.com/theodi/dataset-metadata-survey.git')
33
+ # dataset = Dataset.new('git://github.com/theodi/dataset-metadata-survey.git')
34
34
  # dataset.github_path # => 'https://github.com/theodi/dataset-metadata-survey/'
35
35
  # dataset.github_path('issues') # => 'https://github.com/theodi/dataset-metadata-survey/issues'
36
36
  def github_path(path = '')
@@ -51,4 +51,4 @@ module DataKitten
51
51
 
52
52
  end
53
53
 
54
- end
54
+ end
@@ -6,29 +6,36 @@ module DataKitten
6
6
 
7
7
  module CKAN
8
8
 
9
- @@metadata = nil
10
-
11
9
  private
12
10
 
13
11
  def self.supported?(instance)
14
12
  uri = instance.uri
15
- package = uri.path.split("/").last
13
+ base_uri = uri.merge("/")
14
+ *base, package = uri.path.split('/')
15
+ # If the 2nd to last element in the path is 'dataset' then it's probably
16
+ # the CKAN dataset view page, the last element will be the dataset id
17
+ # or name
18
+ if base.last == "dataset"
19
+ instance.identifier = package
20
+ # build a base URI ending with a /
21
+ base_uri = uri.merge(base[0...-1].join('/') + '/')
16
22
  # If the package is a UUID - it's more than likely to be a CKAN ID
17
- if package.match(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/)
18
- @@id = package
23
+ elsif package.match(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/)
24
+ instance.identifier = package
19
25
  else
20
-
21
- results = RestClient.get "#{uri.scheme}://#{uri.host}/api/3/action/package_show", {:params => {:id => package}} rescue ""
22
-
23
- if results == ""
24
- results = RestClient.get "#{uri.scheme}://#{uri.host}/api/2/rest/dataset/#{package}"
26
+ results = begin
27
+ RestClient.get base_uri.merge("api/3/action/package_show").to_s, {:params => {:id => package}}
28
+ rescue RestClient::Exception
29
+ RestClient.get base_uri.merge("api/2/rest/dataset/#{package}").to_s
25
30
  end
26
31
 
27
32
  result = JSON.parse results
28
- @@id = result["result"]["id"] rescue result["id"]
33
+ instance.identifier = result.fetch("result", result)["id"]
29
34
  end
30
- @@metadata = JSON.parse RestClient.get "#{uri.scheme}://#{uri.host}/api/rest/package/#{@@id}"
31
- @@metadata.extend(GuessableLookup)
35
+ instance.metadata = JSON.parse RestClient.get base_uri.merge("api/rest/package/#{instance.identifier}").to_s
36
+ instance.metadata.extend(GuessableLookup)
37
+ instance.source = instance.metadata
38
+ return true
32
39
  rescue
33
40
  false
34
41
  end
@@ -62,7 +69,7 @@ module DataKitten
62
69
  #
63
70
  # @see Dataset#identifier
64
71
  def identifier
65
- metadata.lookup("name") || @@id
72
+ metadata.lookup("name") || @identifier
66
73
  end
67
74
 
68
75
  # A web page which can be used to gain access to the dataset
@@ -210,10 +217,6 @@ module DataKitten
210
217
 
211
218
  private
212
219
 
213
- def metadata
214
- @@metadata
215
- end
216
-
217
220
  def select_extras(group, key)
218
221
  extra = group["extras"][key] rescue ""
219
222
  if extra == ""
@@ -1,3 +1,3 @@
1
1
  module DataKitten
2
- VERSION = "1.3.0"
2
+ VERSION = "1.3.1"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_kitten
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Smith
@@ -95,20 +95,6 @@ dependencies:
95
95
  - - ! '>='
96
96
  - !ruby/object:Gem::Version
97
97
  version: '0'
98
- - !ruby/object:Gem::Dependency
99
- name: curb
100
- requirement: !ruby/object:Gem::Requirement
101
- requirements:
102
- - - ! '>='
103
- - !ruby/object:Gem::Version
104
- version: '0'
105
- type: :runtime
106
- prerelease: false
107
- version_requirements: !ruby/object:Gem::Requirement
108
- requirements:
109
- - - ! '>='
110
- - !ruby/object:Gem::Version
111
- version: '0'
112
98
  - !ruby/object:Gem::Dependency
113
99
  name: datapackage
114
100
  requirement: !ruby/object:Gem::Requirement