datacatalog-importer 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -69,7 +69,7 @@ Put the main logic / algorithm / voodoo of your importer in the `run` method. Th
69
69
  :catalog_name => "...",
70
70
  :catalog_url => "http://...",
71
71
  }
72
-
72
+
73
73
  Note that most of these parameters match up with the properties defined for a [Source in the National Data Catalog API](http://github.com/sunlightlabs/datacatalog-api/blob/master/resources/sources.rb). These parameters are just passed along to the API, which will validate the values.
74
74
 
75
75
  The remaining parameters (`organization` and `downloads`) are handled by the importer framework:
@@ -79,7 +79,7 @@ The remaining parameters (`organization` and `downloads`) are handled by the imp
79
79
  * The downloads array is used to look up or create the associated download formats for a data source.
80
80
 
81
81
  You may have noticed the use of `Kronos.parse` above. We highly recommend the use of the [kronos library](http://github.com/djsun/kronos) for the parsing of dates.
82
-
82
+
83
83
  ### organization parameter
84
84
 
85
85
  `@handler.organization()` expects a hash parameter of this shape:
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.0
1
+ 0.3.1
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{datacatalog-importer}
8
- s.version = "0.3.0"
8
+ s.version = "0.3.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["David James"]
12
- s.date = %q{2010-08-25}
12
+ s.date = %q{2010-08-30}
13
13
  s.description = %q{This framework makes it easier to write importers for the National Data Catalog.}
14
14
  s.email = %q{djames@sunlightfoundation.com}
15
15
  s.extra_rdoc_files = [
@@ -45,7 +45,7 @@ class Puller
45
45
  def initialize(handler)
46
46
  @handler = handler
47
47
  end
48
-
48
+
49
49
  def run
50
50
  common = {
51
51
  :catalog_name => "Example Catalog",
data/lib/handler.rb CHANGED
@@ -4,7 +4,7 @@ module DataCatalog
4
4
  module ImporterFramework
5
5
  class Handler
6
6
  include Shared
7
-
7
+
8
8
  def initialize(options)
9
9
  @options = options
10
10
  @counter = {}
@@ -13,15 +13,15 @@ module DataCatalog
13
13
  @counter[resource] = 1
14
14
  end
15
15
  end
16
-
16
+
17
17
  def source(data)
18
18
  write_data(:source, data)
19
19
  end
20
-
20
+
21
21
  def organization(data)
22
22
  write_data(:organization, data)
23
23
  end
24
-
24
+
25
25
  def write_data(resource, data)
26
26
  file = folder(resource) + ("/%08i.yml" % @counter[resource])
27
27
  Utility.write_yaml(file, data)
data/lib/puller.rb CHANGED
@@ -3,14 +3,14 @@ module DataCatalog
3
3
  class Puller
4
4
 
5
5
  REQUIRED = %w(cache_folder puller)
6
-
6
+
7
7
  def initialize(options)
8
8
  REQUIRED.each do |r|
9
9
  raise Error, "option :#{r} is required" unless options[r.intern]
10
10
  end
11
11
  @options = options
12
12
  end
13
-
13
+
14
14
  def run
15
15
  Utility.report_timing "pull" do
16
16
  handler = Handler.new(@options)
data/lib/pusher.rb CHANGED
@@ -7,9 +7,9 @@ module DataCatalog
7
7
  module ImporterFramework
8
8
  class Pusher
9
9
  include Shared
10
-
10
+
11
11
  REQUIRED = %w(api_key base_uri cache_folder)
12
-
12
+
13
13
  # These keys should not be passed along directly; they need to be
14
14
  # examined so that real ID's can be passed along instead.
15
15
  LOOKUP_KEYS = [:organization, :downloads]
@@ -30,7 +30,7 @@ module DataCatalog
30
30
  push_sources
31
31
  end
32
32
  end
33
-
33
+
34
34
  protected
35
35
 
36
36
  def setup_api
@@ -44,7 +44,7 @@ module DataCatalog
44
44
  create_or_update_organization(data)
45
45
  end
46
46
  end
47
-
47
+
48
48
  def push_sources
49
49
  read_data(:source) do |data|
50
50
  link_to_existing_organization!(data, :organization_id)
@@ -59,8 +59,6 @@ module DataCatalog
59
59
  end
60
60
  end
61
61
  end
62
-
63
- # ---
64
62
 
65
63
  def read_data(resource)
66
64
  folder = folder(resource)
@@ -77,60 +75,53 @@ module DataCatalog
77
75
  yield data
78
76
  end
79
77
  end
80
-
81
- # ---
82
-
78
+
83
79
  def create_or_update_organization(data)
84
80
  url, name = data[:url], data[:name]
85
81
  raise "#{name} has blank URL" if url.blank?
86
- docs = DataCatalog::Organization.all(:url => url)
87
- n = docs.length
88
- if n == 0
89
- puts "Creating Organization: #{name}"
82
+ org, attempts = lookup_organization({ :url => url, :name => name })
83
+ if org
84
+ puts "Updating Organization: #{name}"
90
85
  begin
91
- DataCatalog::Organization.create(data)
86
+ DataCatalog::Organization.update(org.id, data)
92
87
  rescue DataCatalog::BadRequest => e
93
- error("Cannot create Organization", {
88
+ error("Cannot update Organization with id : #{org.id}", {
94
89
  :params => data,
95
90
  :errors => e.errors,
96
91
  })
97
92
  end
98
93
  else
99
- if n > 1
100
- warning("Cannot find unique Source with url : #{url}", {
101
- :warning => "#{n} matches: " + docs.map { |x| x.id }.join(" "),
102
- :workaround => "Using #{docs[0].id}"
103
- })
104
- end
105
- puts "Updating Organization: #{name}"
94
+ puts "Creating Organization: #{name}"
106
95
  begin
107
- DataCatalog::Organization.update(docs[0].id, data)
96
+ DataCatalog::Organization.create(data)
108
97
  rescue DataCatalog::BadRequest => e
109
- error("Cannot update Organization with id : #{docs[0].id}", {
98
+ error("Cannot create Organization", {
110
99
  :params => data,
111
100
  :errors => e.errors,
112
101
  })
113
102
  end
114
103
  end
115
104
  end
116
-
117
- def find_organization_by(field, name)
118
- docs = DataCatalog::Organization.all(field => name)
105
+
106
+ def find_organization_by(field, value)
107
+ docs = DataCatalog::Organization.all(field => value)
119
108
  n = docs.length
120
109
  if n == 0
121
110
  nil
122
111
  else
123
112
  if n > 1
124
- warning("Cannot find unique Organization with #{field} : #{name}", {
125
- :warning => "#{n} matches: " + docs.map { |x| x.id }.join(" "),
126
- :workaround => "Using #{docs[0].id}"
127
- })
113
+ id_matches = docs.map { |x| x.id }.join(" ")
114
+ warning("Cannot find unique Organization with #{field} : " +
115
+ "#{value}", {
116
+ :warning => "#{n} matches: #{id_matches}",
117
+ :workaround => "Using #{docs[0].id}"
118
+ })
128
119
  end
129
120
  docs[0]
130
121
  end
131
122
  end
132
-
133
- # Important: do not modify data
123
+
124
+ # Important: do not modify the `data` parameter
134
125
  def create_or_update_source(data)
135
126
  data = clean_source_data(data)
136
127
  docs = DataCatalog::Source.all(:url => data[:url])
@@ -163,8 +154,8 @@ module DataCatalog
163
154
  })
164
155
  end
165
156
  end
166
-
167
- # Important: do not modify data
157
+
158
+ # Note: it is important to not modify the 'data' parameter.
168
159
  def create_or_update_download(source, data)
169
160
  data = data.merge({:source_id => source.id})
170
161
  docs = DataCatalog::Download.all({
@@ -194,63 +185,65 @@ module DataCatalog
194
185
  })
195
186
  end
196
187
  else
197
- error("Cannot find unique Download with source_id : #{source.id} and format : #{data[:format]}", {
198
- :error => "#{n} matches: " + docs.map { |x| x.id }.join(" ")
199
- })
188
+ id_matches = docs.map { |x| x.id }.join(" ")
189
+ error("Cannot find unique Download with source_id : #{source.id}" +
190
+ " and format : #{data[:format]}", {
191
+ :error => "#{n} matches: #{id_matches}"
192
+ })
200
193
  end
201
194
  end
202
-
203
- # Try to link to an existing organization, first by
204
- # using an URL, then by name.
195
+
196
+ # Try to link to an existing organization, first by using an URL, then
197
+ # by name.
205
198
  #
206
- # Note: modifies data (that is why I use the !)
199
+ # Note: modifies data (hence the ! in the method name)
207
200
  def link_to_existing_organization!(data, organization_id_key)
208
- hash = data.delete(:organization)
209
- raise "Could not find :organization key" unless hash
201
+ organization_data = data.delete(:organization)
202
+ raise "Could not find :organization key" unless organization_data
203
+ org, attempts = lookup_organization(organization_data)
204
+ if org
205
+ data[organization_id_key] = org.id
206
+ else
207
+ puts "- Could not find organization (to link) with " +
208
+ attempts.join(' or ')
209
+ end
210
+ true # return value not important
211
+ end
210
212
 
213
+ def lookup_organization(organization_data)
211
214
  keys = [:url, :home_url, :name]
212
- unless hash.any? { |key, value| keys.include?(key) }
215
+ unless organization_data.any? { |key, value| keys.include?(key) }
213
216
  raise "Need #{keys.join(' or ')} to lookup an organization"
214
217
  end
215
-
216
218
  attempts = []
217
219
  organization = nil
218
220
  keys.each do |key|
219
- value = hash[key]
221
+ value = organization_data[key]
220
222
  organization = if value
221
223
  attempts << "#{key} : #{value}"
222
224
  find_organization_by(key, value)
223
225
  end
224
226
  break if organization
225
227
  end
226
-
227
- if organization
228
- data[organization_id_key] = organization.id
229
- else
230
- puts "- Could not find organization with #{attempts.join(' or ')}"
231
- end
232
- true # return value not important
228
+ [organization, attempts]
233
229
  end
234
-
235
- protected
236
230
 
237
- # Important: do not modify data
231
+ # Important: do not modify the `data` parameter
238
232
  def clean_source_data(data)
239
233
  data = data.reject { |k, v| LOOKUP_KEYS.include?(k) }
240
-
241
234
  frequency = Frequency.new(data[:frequency])
242
235
  data[:frequency] = "other" unless frequency.valid?
243
236
  data
244
237
  end
245
-
238
+
246
239
  def error(text, object)
247
240
  report("Error", text, object)
248
241
  end
249
-
242
+
250
243
  def warning(text, object)
251
244
  report("Warning", text, object)
252
245
  end
253
-
246
+
254
247
  def report(type, text, object)
255
248
  puts "- #{type} : #{text}"
256
249
  puts " Uploading Report to API."
@@ -263,7 +256,7 @@ module DataCatalog
263
256
  rescue DataCatalog::BadRequest => e
264
257
  raise Error, "Could not upload Report to API: #{e.errors.inspect}"
265
258
  end
266
-
259
+
267
260
  end
268
261
  end
269
262
  end
data/lib/tasks.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  module DataCatalog
2
2
  module ImporterFramework
3
3
  class Tasks
4
-
4
+
5
5
  def initialize(options)
6
6
  define(options)
7
7
  end
data/lib/utility.rb CHANGED
@@ -85,7 +85,7 @@ module DataCatalog
85
85
  end
86
86
  end
87
87
  end
88
-
88
+
89
89
  def self.remove_fetch_options(options={})
90
90
  [:max_attempts, :retry_delay, :quiet].each do |opt|
91
91
  options.delete(opt)
@@ -101,9 +101,9 @@ module DataCatalog
101
101
  puts "Elapsed time [#{label}] %.2f s" % diff
102
102
  result
103
103
  end
104
-
104
+
105
105
  # == Parsing ===
106
-
106
+
107
107
  def self.parse_file(format, file, options={})
108
108
  File.open(file) do |f|
109
109
  case format
@@ -122,7 +122,7 @@ module DataCatalog
122
122
  end
123
123
  end
124
124
  end
125
-
125
+
126
126
  def self.parse_uri(format, uri, options={})
127
127
  data = fetch(uri, options)
128
128
  case format
@@ -139,7 +139,7 @@ module DataCatalog
139
139
  raise "Unexpected format : #{format.inspect}"
140
140
  end
141
141
  end
142
-
142
+
143
143
  def self.parse_file_or_uri(format, file, uri, options={})
144
144
  force_fetch = options.delete(:force_fetch) || false
145
145
  if force_fetch || !File.exist?(file)
data/natdat_is_hungry.md CHANGED
@@ -74,7 +74,7 @@ Put the main logic / algorithm / secret recipe / voodoo of your importer in the
74
74
  :catalog_name => "...",
75
75
  :catalog_url => "http://...",
76
76
  }
77
-
77
+
78
78
  Note that most of these parameters match up with the properties defined for a [Source in the National Data Catalog API](http://github.com/sunlightlabs/datacatalog-api/blob/master/resources/sources.rb). These parameters are just passed along to the API, which will validate the values.
79
79
 
80
80
  The remaining parameters (`organization` and `downloads`) are handled by the importer framework:
@@ -84,7 +84,7 @@ The remaining parameters (`organization` and `downloads`) are handled by the imp
84
84
  * The downloads array is used to look up or create the associated download formats for a data source.
85
85
 
86
86
  You may have noticed the use of `Kronos.parse` above. We highly recommend the use of the [kronos library](http://github.com/djsun/kronos) for the parsing of dates.
87
-
87
+
88
88
  **organization parameter**
89
89
 
90
90
  `@handler.organization()` expects a hash parameter of this shape:
data/spec/spec_helper.rb CHANGED
@@ -6,5 +6,5 @@ require 'spec'
6
6
  require 'spec/autorun'
7
7
 
8
8
  Spec::Runner.configure do |config|
9
-
9
+
10
10
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datacatalog-importer
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
4
+ hash: 17
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 3
9
- - 0
10
- version: 0.3.0
9
+ - 1
10
+ version: 0.3.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - David James
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-08-25 00:00:00 -04:00
18
+ date: 2010-08-30 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency