datacatalog-importer 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +2 -2
- data/VERSION +1 -1
- data/datacatalog-importer.gemspec +2 -2
- data/example/lib/puller.rb +1 -1
- data/lib/handler.rb +4 -4
- data/lib/puller.rb +2 -2
- data/lib/pusher.rb +55 -62
- data/lib/tasks.rb +1 -1
- data/lib/utility.rb +5 -5
- data/natdat_is_hungry.md +2 -2
- data/spec/spec_helper.rb +1 -1
- metadata +4 -4
data/README.md
CHANGED
@@ -69,7 +69,7 @@ Put the main logic / algorithm / voodoo of your importer in the `run` method. Th
       :catalog_name => "...",
       :catalog_url => "http://...",
     }
-
+
 Note that most of these parameters match up with the properties defined for a [Source in the National Data Catalog API](http://github.com/sunlightlabs/datacatalog-api/blob/master/resources/sources.rb). These parameters are just passed along to the API, which will validate the values.
 
 The remaining parameters (`organization` and `downloads`) are handled by the importer framework:
@@ -79,7 +79,7 @@ The remaining parameters (`organization` and `downloads`) are handled by the imp
 * The downloads array is used to lookup or create the associate download formats for a data source.
 
 You may have noticed the use of `Kronos.parse` above. We highly recommend the use of the [kronos library](http://github.com/djsun/kronos) for the parsing of dates.
-
+
 ### organization parameter
 
 `@handler.organization()` expects a hash parameter of this shape:
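The hash shape itself sits just below this hunk, but for orientation, here is a sketch of the handler calls this README describes. Only fields that appear elsewhere in this diff are used (`:url`, `:frequency`, `:catalog_name`, `:catalog_url`, `:organization`, `:downloads`, a download `:format`); the concrete values, and the download `:url` key, are hypothetical placeholders rather than the canonical parameter list:

```ruby
# Sketch of calls inside a puller's `run` method (placeholder values, not the real shape).
@handler.organization({
  :name => "Example Agency",
  :url  => "http://agency.example.gov"
})

@handler.source({
  :url          => "http://agency.example.gov/data/widgets",
  :frequency    => "monthly",          # normalized to "other" by Pusher#clean_source_data if invalid
  :catalog_name => "...",
  :catalog_url  => "http://...",
  :organization => {                   # a LOOKUP_KEY; the Pusher resolves it to an organization id
    :name => "Example Agency",
    :url  => "http://agency.example.gov"
  },
  :downloads    => [                   # a LOOKUP_KEY; the Pusher creates/updates Download records
    { :url => "http://agency.example.gov/data/widgets.csv", :format => "csv" }
  ]
})
```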
data/VERSION
CHANGED
@@ -1 +1 @@
-0.3.0
+0.3.1
data/datacatalog-importer.gemspec
CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{datacatalog-importer}
-  s.version = "0.3.0"
+  s.version = "0.3.1"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["David James"]
-  s.date = %q{2010-08-
+  s.date = %q{2010-08-30}
   s.description = %q{This framework makes it easier to write importers for the National Data Catalog.}
   s.email = %q{djames@sunlightfoundation.com}
   s.extra_rdoc_files = [
data/example/lib/puller.rb
CHANGED
data/lib/handler.rb
CHANGED
@@ -4,7 +4,7 @@ module DataCatalog
   module ImporterFramework
     class Handler
       include Shared
-
+
       def initialize(options)
         @options = options
         @counter = {}
@@ -13,15 +13,15 @@ module DataCatalog
           @counter[resource] = 1
         end
       end
-
+
       def source(data)
         write_data(:source, data)
       end
-
+
       def organization(data)
         write_data(:organization, data)
       end
-
+
       def write_data(resource, data)
         file = folder(resource) + ("/%08i.yml" % @counter[resource])
         Utility.write_yaml(file, data)
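One small detail in the (unchanged) `write_data` context lines above: cache files are named with an eight-digit, zero-padded counter kept per resource. A plain-Ruby check of that format string (the counter values here are made up):

```ruby
counter = { :source => 1, :organization => 12 }   # hypothetical counter state
puts "/%08i.yml" % counter[:source]               # => /00000001.yml
puts "/%08i.yml" % counter[:organization]         # => /00000012.yml
```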
data/lib/puller.rb
CHANGED
@@ -3,14 +3,14 @@ module DataCatalog
     class Puller
 
       REQUIRED = %w(cache_folder puller)
-
+
       def initialize(options)
         REQUIRED.each do |r|
           raise Error, "option :#{r} is required" unless options[r.intern]
         end
         @options = options
       end
-
+
       def run
         Utility.report_timing "pull" do
           handler = Handler.new(@options)
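`Puller#initialize` above fails fast on missing options. A self-contained sketch of that check (using `ArgumentError` in place of the framework's own `Error` class; the option values are hypothetical, and how `:puller` is used inside `run` lies outside this hunk):

```ruby
REQUIRED = %w(cache_folder puller)

def check_options!(options)
  REQUIRED.each do |r|
    # the options hash uses symbol keys, so the string name is interned before lookup
    raise ArgumentError, "option :#{r} is required" unless options[r.intern]
  end
  options
end

check_options!(:cache_folder => "/tmp/cache", :puller => Object.new)  # passes

begin
  check_options!(:cache_folder => "/tmp/cache")                       # missing :puller
rescue ArgumentError => e
  puts e.message  # => option :puller is required
end
```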
data/lib/pusher.rb
CHANGED
@@ -7,9 +7,9 @@ module DataCatalog
   module ImporterFramework
     class Pusher
       include Shared
-
+
       REQUIRED = %w(api_key base_uri cache_folder)
-
+
       # These keys should not be passed along directly; they need to be
       # examined so that real ID's can be passed along instead.
       LOOKUP_KEYS = [:organization, :downloads]
@@ -30,7 +30,7 @@ module DataCatalog
           push_sources
         end
       end
-
+
       protected
 
       def setup_api
@@ -44,7 +44,7 @@ module DataCatalog
           create_or_update_organization(data)
         end
       end
-
+
       def push_sources
         read_data(:source) do |data|
           link_to_existing_organization!(data, :organization_id)
@@ -59,8 +59,6 @@ module DataCatalog
           end
         end
       end
-
-      # ---
 
       def read_data(resource)
         folder = folder(resource)
@@ -77,60 +75,53 @@ module DataCatalog
           yield data
         end
       end
-
-      # ---
-
+
       def create_or_update_organization(data)
         url, name = data[:url], data[:name]
         raise "#{name} has blank URL" if url.blank?
-
-
-
-          puts "Creating Organization: #{name}"
+        org, attempts = lookup_organization({ :url => url, :name => name })
+        if org
+          puts "Updating Organization: #{name}"
           begin
-            DataCatalog::Organization.
+            DataCatalog::Organization.update(org.id, data)
           rescue DataCatalog::BadRequest => e
-            error("Cannot
+            error("Cannot update Organization with id : #{org.id}", {
               :params => data,
               :errors => e.errors,
             })
           end
         else
-
-            warning("Cannot find unique Source with url : #{url}", {
-              :warning => "#{n} matches: " + docs.map { |x| x.id }.join(" "),
-              :workaround => "Using #{docs[0].id}"
-            })
-          end
-          puts "Updating Organization: #{name}"
+          puts "Creating Organization: #{name}"
           begin
-            DataCatalog::Organization.
+            DataCatalog::Organization.create(data)
           rescue DataCatalog::BadRequest => e
-            error("Cannot
+            error("Cannot create Organization", {
              :params => data,
              :errors => e.errors,
            })
          end
        end
      end
-
-      def find_organization_by(field,
-        docs = DataCatalog::Organization.all(field =>
+
+      def find_organization_by(field, value)
+        docs = DataCatalog::Organization.all(field => value)
         n = docs.length
         if n == 0
           nil
         else
           if n > 1
-
-
-
-
+            id_matches = docs.map { |x| x.id }.join(" ")
+            warning("Cannot find unique Organization with #{field} : " +
+              "#{value}", {
+              :warning => "#{n} matches: #{id_matches}",
+              :workaround => "Using #{docs[0].id}"
+            })
           end
           docs[0]
         end
       end
-
-      # Important: do not modify data
+
+      # Important: do not modify the `data` parameter
      def create_or_update_source(data)
        data = clean_source_data(data)
        docs = DataCatalog::Source.all(:url => data[:url])
@@ -163,8 +154,8 @@ module DataCatalog
           })
         end
       end
-
-      #
+
+      # Note: it is important to not modify the 'data' parameter.
       def create_or_update_download(source, data)
         data = data.merge({:source_id => source.id})
         docs = DataCatalog::Download.all({
@@ -194,63 +185,65 @@ module DataCatalog
           })
         end
       else
-
-
-
+        id_matches = docs.map { |x| x.id }.join(" ")
+        error("Cannot find unique Download with source_id : #{source.id}" +
+          " and format : #{data[:format]}", {
+          :error => "#{n} matches: #{id_matches}"
+        })
        end
      end
-
-      # Try to link to an existing organization, first by
-      #
+
+      # Try to link to an existing organization, first by using an URL, then
+      # by name.
       #
-      # Note: modifies data (
+      # Note: modifies data (hence the ! in the method name)
       def link_to_existing_organization!(data, organization_id_key)
-
-        raise "Could not find :organization key" unless
+        organization_data = data.delete(:organization)
+        raise "Could not find :organization key" unless organization_data
+        org, attempts = lookup_organization(organization_data)
+        if org
+          data[organization_id_key] = org.id
+        else
+          puts "- Could not find organization (to link) with " +
+            attempts.join(' or ')
+        end
+        true # return value not important
+      end
 
+      def lookup_organization(organization_data)
         keys = [:url, :home_url, :name]
-        unless
+        unless organization_data.any? { |key, value| keys.include?(key) }
           raise "Need #{keys.join(' or ')} to lookup an organization"
         end
-
         attempts = []
         organization = nil
         keys.each do |key|
-          value =
+          value = organization_data[key]
           organization = if value
             attempts << "#{key} : #{value}"
             find_organization_by(key, value)
           end
           break if organization
         end
-
-        if organization
-          data[organization_id_key] = organization.id
-        else
-          puts "- Could not find organization with #{attempts.join(' or ')}"
-        end
-        true # return value not important
+        [organization, attempts]
       end
-
-      protected
 
-      # Important: do not modify data
+      # Important: do not modify the `data` parameter
       def clean_source_data(data)
         data = data.reject { |k, v| LOOKUP_KEYS.include?(k) }
-
         frequency = Frequency.new(data[:frequency])
         data[:frequency] = "other" unless frequency.valid?
         data
       end
-
+
       def error(text, object)
         report("Error", text, object)
       end
-
+
       def warning(text, object)
         report("Warning", text, object)
       end
-
+
       def report(type, text, object)
         puts "- #{type} : #{text}"
         puts " Uploading Report to API."
@@ -263,7 +256,7 @@ module DataCatalog
         rescue DataCatalog::BadRequest => e
           raise Error, "Could not upload Report to API: #{e.errors.inspect}"
         end
-
+
       end
     end
   end
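The main behavioral change in `pusher.rb` for 0.3.1 is the extraction of `lookup_organization`, which tries `:url`, then `:home_url`, then `:name`, and returns both the match and the lookups it attempted. A standalone sketch of that fallback (the `finder` lambda stands in for `find_organization_by`, which queries the National Data Catalog API in the real code):

```ruby
def lookup_organization(organization_data, finder)
  keys = [:url, :home_url, :name]
  attempts = []
  organization = nil
  keys.each do |key|
    value = organization_data[key]
    organization = if value
      attempts << "#{key} : #{value}"   # record what was tried, for later reporting
      finder.call(key, value)
    end
    break if organization
  end
  [organization, attempts]
end

# Example: an organization that can only be matched by name.
finder = lambda { |key, value| value == "Sunlight Foundation" ? :found : nil }
org, attempts = lookup_organization(
  { :url => "http://example.gov", :name => "Sunlight Foundation" }, finder)
org       # => :found
attempts  # => ["url : http://example.gov", "name : Sunlight Foundation"]
```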
data/lib/tasks.rb
CHANGED
data/lib/utility.rb
CHANGED
@@ -85,7 +85,7 @@ module DataCatalog
        end
      end
    end
-
+
    def self.remove_fetch_options(options={})
      [:max_attempts, :retry_delay, :quiet].each do |opt|
        options.delete(opt)
@@ -101,9 +101,9 @@ module DataCatalog
      puts "Elapsed time [#{label}] %.2f s" % diff
      result
    end
-
+
    # == Parsing ===
-
+
    def self.parse_file(format, file, options={})
      File.open(file) do |f|
        case format
@@ -122,7 +122,7 @@ module DataCatalog
        end
      end
    end
-
+
    def self.parse_uri(format, uri, options={})
      data = fetch(uri, options)
      case format
@@ -139,7 +139,7 @@ module DataCatalog
        raise "Unexpected format : #{format.inspect}"
      end
    end
-
+
    def self.parse_file_or_uri(format, file, uri, options={})
      force_fetch = options.delete(:force_fetch) || false
      if force_fetch || !File.exist?(file)
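`Utility.report_timing` is what `Puller#run` wraps its work in; only its tail is visible in this hunk. Presumably it times a block, prints the elapsed time, and returns the block's value, roughly along these lines (a sketch, not the actual source):

```ruby
def report_timing(label)
  start = Time.now
  result = yield
  diff = Time.now - start
  puts "Elapsed time [#{label}] %.2f s" % diff   # the line visible in the hunk above
  result
end

report_timing("pull") { sleep 0.25; :done }  # prints roughly "Elapsed time [pull] 0.25 s", returns :done
```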
data/natdat_is_hungry.md
CHANGED
@@ -74,7 +74,7 @@ Put the main logic / algorithm / secret recipe / voodoo of your importer in the
       :catalog_name => "...",
       :catalog_url => "http://...",
     }
-
+
 Note that most of these parameters match up with the properties defined for a [Source in the National Data Catalog API](http://github.com/sunlightlabs/datacatalog-api/blob/master/resources/sources.rb). These parameters are just passed along to the API, which will validate the values.
 
 The remaining parameters (`organization` and `downloads`) are handled by the importer framework:
@@ -84,7 +84,7 @@ The remaining parameters (`organization` and `downloads`) are handled by the imp
 * The downloads array is used to lookup or create the associate download formats for a data source.
 
 You may have noticed the use of `Kronos.parse` above. We highly recommend the use of the [kronos library](http://github.com/djsun/kronos) for the parsing of dates.
-
+
 **organization parameter**
 
 `@handler.organization()` expects a hash parameter of this shape:
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: datacatalog-importer
 version: !ruby/object:Gem::Version
-  hash:
+  hash: 17
   prerelease: false
   segments:
   - 0
   - 3
-  - 0
-  version: 0.3.0
+  - 1
+  version: 0.3.1
 platform: ruby
 authors:
 - David James
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-08-
+date: 2010-08-30 00:00:00 -04:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency