datacatalog-importer 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +2 -2
- data/VERSION +1 -1
- data/datacatalog-importer.gemspec +2 -2
- data/example/lib/puller.rb +1 -1
- data/lib/handler.rb +4 -4
- data/lib/puller.rb +2 -2
- data/lib/pusher.rb +55 -62
- data/lib/tasks.rb +1 -1
- data/lib/utility.rb +5 -5
- data/natdat_is_hungry.md +2 -2
- data/spec/spec_helper.rb +1 -1
- metadata +4 -4
data/README.md
CHANGED
@@ -69,7 +69,7 @@ Put the main logic / algorithm / voodoo of your importer in the `run` method. Th
|
|
69
69
|
:catalog_name => "...",
|
70
70
|
:catalog_url => "http://...",
|
71
71
|
}
|
72
|
-
|
72
|
+
|
73
73
|
Note that most of these parameters match up with the properties defined for a [Source in the National Data Catalog API](http://github.com/sunlightlabs/datacatalog-api/blob/master/resources/sources.rb). These parameters are just passed along to the API, which will validate the values.
|
74
74
|
|
75
75
|
The remaining parameters (`organization` and `downloads`) are handled by the importer framework:
|
@@ -79,7 +79,7 @@ The remaining parameters (`organization` and `downloads`) are handled by the imp
|
|
79
79
|
* The downloads array is used to look up or create the associated download formats for a data source.
|
80
80
|
|
81
81
|
You may have noticed the use of `Kronos.parse` above. We highly recommend the use of the [kronos library](http://github.com/djsun/kronos) for the parsing of dates.
|
82
|
-
|
82
|
+
|
83
83
|
### organization parameter
|
84
84
|
|
85
85
|
`@handler.organization()` expects a hash parameter of this shape:
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.0
|
1
|
+
0.3.1
|
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{datacatalog-importer}
|
8
|
-
s.version = "0.3.0"
|
8
|
+
s.version = "0.3.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["David James"]
|
12
|
-
s.date = %q{2010-08-
|
12
|
+
s.date = %q{2010-08-30}
|
13
13
|
s.description = %q{This framework makes it easier to write importers for the National Data Catalog.}
|
14
14
|
s.email = %q{djames@sunlightfoundation.com}
|
15
15
|
s.extra_rdoc_files = [
|
data/example/lib/puller.rb
CHANGED
data/lib/handler.rb
CHANGED
@@ -4,7 +4,7 @@ module DataCatalog
|
|
4
4
|
module ImporterFramework
|
5
5
|
class Handler
|
6
6
|
include Shared
|
7
|
-
|
7
|
+
|
8
8
|
def initialize(options)
|
9
9
|
@options = options
|
10
10
|
@counter = {}
|
@@ -13,15 +13,15 @@ module DataCatalog
|
|
13
13
|
@counter[resource] = 1
|
14
14
|
end
|
15
15
|
end
|
16
|
-
|
16
|
+
|
17
17
|
def source(data)
|
18
18
|
write_data(:source, data)
|
19
19
|
end
|
20
|
-
|
20
|
+
|
21
21
|
def organization(data)
|
22
22
|
write_data(:organization, data)
|
23
23
|
end
|
24
|
-
|
24
|
+
|
25
25
|
def write_data(resource, data)
|
26
26
|
file = folder(resource) + ("/%08i.yml" % @counter[resource])
|
27
27
|
Utility.write_yaml(file, data)
|
data/lib/puller.rb
CHANGED
@@ -3,14 +3,14 @@ module DataCatalog
|
|
3
3
|
class Puller
|
4
4
|
|
5
5
|
REQUIRED = %w(cache_folder puller)
|
6
|
-
|
6
|
+
|
7
7
|
def initialize(options)
|
8
8
|
REQUIRED.each do |r|
|
9
9
|
raise Error, "option :#{r} is required" unless options[r.intern]
|
10
10
|
end
|
11
11
|
@options = options
|
12
12
|
end
|
13
|
-
|
13
|
+
|
14
14
|
def run
|
15
15
|
Utility.report_timing "pull" do
|
16
16
|
handler = Handler.new(@options)
|
data/lib/pusher.rb
CHANGED
@@ -7,9 +7,9 @@ module DataCatalog
|
|
7
7
|
module ImporterFramework
|
8
8
|
class Pusher
|
9
9
|
include Shared
|
10
|
-
|
10
|
+
|
11
11
|
REQUIRED = %w(api_key base_uri cache_folder)
|
12
|
-
|
12
|
+
|
13
13
|
# These keys should not be passed along directly; they need to be
|
14
14
|
# examined so that real ID's can be passed along instead.
|
15
15
|
LOOKUP_KEYS = [:organization, :downloads]
|
@@ -30,7 +30,7 @@ module DataCatalog
|
|
30
30
|
push_sources
|
31
31
|
end
|
32
32
|
end
|
33
|
-
|
33
|
+
|
34
34
|
protected
|
35
35
|
|
36
36
|
def setup_api
|
@@ -44,7 +44,7 @@ module DataCatalog
|
|
44
44
|
create_or_update_organization(data)
|
45
45
|
end
|
46
46
|
end
|
47
|
-
|
47
|
+
|
48
48
|
def push_sources
|
49
49
|
read_data(:source) do |data|
|
50
50
|
link_to_existing_organization!(data, :organization_id)
|
@@ -59,8 +59,6 @@ module DataCatalog
|
|
59
59
|
end
|
60
60
|
end
|
61
61
|
end
|
62
|
-
|
63
|
-
# ---
|
64
62
|
|
65
63
|
def read_data(resource)
|
66
64
|
folder = folder(resource)
|
@@ -77,60 +75,53 @@ module DataCatalog
|
|
77
75
|
yield data
|
78
76
|
end
|
79
77
|
end
|
80
|
-
|
81
|
-
# ---
|
82
|
-
|
78
|
+
|
83
79
|
def create_or_update_organization(data)
|
84
80
|
url, name = data[:url], data[:name]
|
85
81
|
raise "#{name} has blank URL" if url.blank?
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
puts "Creating Organization: #{name}"
|
82
|
+
org, attempts = lookup_organization({ :url => url, :name => name })
|
83
|
+
if org
|
84
|
+
puts "Updating Organization: #{name}"
|
90
85
|
begin
|
91
|
-
DataCatalog::Organization.
|
86
|
+
DataCatalog::Organization.update(org.id, data)
|
92
87
|
rescue DataCatalog::BadRequest => e
|
93
|
-
error("Cannot
|
88
|
+
error("Cannot update Organization with id : #{org.id}", {
|
94
89
|
:params => data,
|
95
90
|
:errors => e.errors,
|
96
91
|
})
|
97
92
|
end
|
98
93
|
else
|
99
|
-
|
100
|
-
warning("Cannot find unique Source with url : #{url}", {
|
101
|
-
:warning => "#{n} matches: " + docs.map { |x| x.id }.join(" "),
|
102
|
-
:workaround => "Using #{docs[0].id}"
|
103
|
-
})
|
104
|
-
end
|
105
|
-
puts "Updating Organization: #{name}"
|
94
|
+
puts "Creating Organization: #{name}"
|
106
95
|
begin
|
107
|
-
DataCatalog::Organization.
|
96
|
+
DataCatalog::Organization.create(data)
|
108
97
|
rescue DataCatalog::BadRequest => e
|
109
|
-
error("Cannot
|
98
|
+
error("Cannot create Organization", {
|
110
99
|
:params => data,
|
111
100
|
:errors => e.errors,
|
112
101
|
})
|
113
102
|
end
|
114
103
|
end
|
115
104
|
end
|
116
|
-
|
117
|
-
def find_organization_by(field,
|
118
|
-
docs = DataCatalog::Organization.all(field =>
|
105
|
+
|
106
|
+
def find_organization_by(field, value)
|
107
|
+
docs = DataCatalog::Organization.all(field => value)
|
119
108
|
n = docs.length
|
120
109
|
if n == 0
|
121
110
|
nil
|
122
111
|
else
|
123
112
|
if n > 1
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
113
|
+
id_matches = docs.map { |x| x.id }.join(" ")
|
114
|
+
warning("Cannot find unique Organization with #{field} : " +
|
115
|
+
"#{value}", {
|
116
|
+
:warning => "#{n} matches: #{id_matches}",
|
117
|
+
:workaround => "Using #{docs[0].id}"
|
118
|
+
})
|
128
119
|
end
|
129
120
|
docs[0]
|
130
121
|
end
|
131
122
|
end
|
132
|
-
|
133
|
-
# Important: do not modify data
|
123
|
+
|
124
|
+
# Important: do not modify the `data` parameter
|
134
125
|
def create_or_update_source(data)
|
135
126
|
data = clean_source_data(data)
|
136
127
|
docs = DataCatalog::Source.all(:url => data[:url])
|
@@ -163,8 +154,8 @@ module DataCatalog
|
|
163
154
|
})
|
164
155
|
end
|
165
156
|
end
|
166
|
-
|
167
|
-
#
|
157
|
+
|
158
|
+
# Note: it is important to not modify the 'data' parameter.
|
168
159
|
def create_or_update_download(source, data)
|
169
160
|
data = data.merge({:source_id => source.id})
|
170
161
|
docs = DataCatalog::Download.all({
|
@@ -194,63 +185,65 @@ module DataCatalog
|
|
194
185
|
})
|
195
186
|
end
|
196
187
|
else
|
197
|
-
|
198
|
-
|
199
|
-
|
188
|
+
id_matches = docs.map { |x| x.id }.join(" ")
|
189
|
+
error("Cannot find unique Download with source_id : #{source.id}" +
|
190
|
+
" and format : #{data[:format]}", {
|
191
|
+
:error => "#{n} matches: #{id_matches}"
|
192
|
+
})
|
200
193
|
end
|
201
194
|
end
|
202
|
-
|
203
|
-
# Try to link to an existing organization, first by
|
204
|
-
#
|
195
|
+
|
196
|
+
# Try to link to an existing organization, first by using an URL, then
|
197
|
+
# by name.
|
205
198
|
#
|
206
|
-
# Note: modifies data (
|
199
|
+
# Note: modifies data (hence the ! in the method name)
|
207
200
|
def link_to_existing_organization!(data, organization_id_key)
|
208
|
-
|
209
|
-
raise "Could not find :organization key" unless
|
201
|
+
organization_data = data.delete(:organization)
|
202
|
+
raise "Could not find :organization key" unless organization_data
|
203
|
+
org, attempts = lookup_organization(organization_data)
|
204
|
+
if org
|
205
|
+
data[organization_id_key] = org.id
|
206
|
+
else
|
207
|
+
puts "- Could not find organization (to link) with " +
|
208
|
+
attempts.join(' or ')
|
209
|
+
end
|
210
|
+
true # return value not important
|
211
|
+
end
|
210
212
|
|
213
|
+
def lookup_organization(organization_data)
|
211
214
|
keys = [:url, :home_url, :name]
|
212
|
-
unless
|
215
|
+
unless organization_data.any? { |key, value| keys.include?(key) }
|
213
216
|
raise "Need #{keys.join(' or ')} to lookup an organization"
|
214
217
|
end
|
215
|
-
|
216
218
|
attempts = []
|
217
219
|
organization = nil
|
218
220
|
keys.each do |key|
|
219
|
-
value =
|
221
|
+
value = organization_data[key]
|
220
222
|
organization = if value
|
221
223
|
attempts << "#{key} : #{value}"
|
222
224
|
find_organization_by(key, value)
|
223
225
|
end
|
224
226
|
break if organization
|
225
227
|
end
|
226
|
-
|
227
|
-
if organization
|
228
|
-
data[organization_id_key] = organization.id
|
229
|
-
else
|
230
|
-
puts "- Could not find organization with #{attempts.join(' or ')}"
|
231
|
-
end
|
232
|
-
true # return value not important
|
228
|
+
[organization, attempts]
|
233
229
|
end
|
234
|
-
|
235
|
-
protected
|
236
230
|
|
237
|
-
# Important: do not modify data
|
231
|
+
# Important: do not modify the `data` parameter
|
238
232
|
def clean_source_data(data)
|
239
233
|
data = data.reject { |k, v| LOOKUP_KEYS.include?(k) }
|
240
|
-
|
241
234
|
frequency = Frequency.new(data[:frequency])
|
242
235
|
data[:frequency] = "other" unless frequency.valid?
|
243
236
|
data
|
244
237
|
end
|
245
|
-
|
238
|
+
|
246
239
|
def error(text, object)
|
247
240
|
report("Error", text, object)
|
248
241
|
end
|
249
|
-
|
242
|
+
|
250
243
|
def warning(text, object)
|
251
244
|
report("Warning", text, object)
|
252
245
|
end
|
253
|
-
|
246
|
+
|
254
247
|
def report(type, text, object)
|
255
248
|
puts "- #{type} : #{text}"
|
256
249
|
puts " Uploading Report to API."
|
@@ -263,7 +256,7 @@ module DataCatalog
|
|
263
256
|
rescue DataCatalog::BadRequest => e
|
264
257
|
raise Error, "Could not upload Report to API: #{e.errors.inspect}"
|
265
258
|
end
|
266
|
-
|
259
|
+
|
267
260
|
end
|
268
261
|
end
|
269
262
|
end
|
data/lib/tasks.rb
CHANGED
data/lib/utility.rb
CHANGED
@@ -85,7 +85,7 @@ module DataCatalog
|
|
85
85
|
end
|
86
86
|
end
|
87
87
|
end
|
88
|
-
|
88
|
+
|
89
89
|
def self.remove_fetch_options(options={})
|
90
90
|
[:max_attempts, :retry_delay, :quiet].each do |opt|
|
91
91
|
options.delete(opt)
|
@@ -101,9 +101,9 @@ module DataCatalog
|
|
101
101
|
puts "Elapsed time [#{label}] %.2f s" % diff
|
102
102
|
result
|
103
103
|
end
|
104
|
-
|
104
|
+
|
105
105
|
# == Parsing ===
|
106
|
-
|
106
|
+
|
107
107
|
def self.parse_file(format, file, options={})
|
108
108
|
File.open(file) do |f|
|
109
109
|
case format
|
@@ -122,7 +122,7 @@ module DataCatalog
|
|
122
122
|
end
|
123
123
|
end
|
124
124
|
end
|
125
|
-
|
125
|
+
|
126
126
|
def self.parse_uri(format, uri, options={})
|
127
127
|
data = fetch(uri, options)
|
128
128
|
case format
|
@@ -139,7 +139,7 @@ module DataCatalog
|
|
139
139
|
raise "Unexpected format : #{format.inspect}"
|
140
140
|
end
|
141
141
|
end
|
142
|
-
|
142
|
+
|
143
143
|
def self.parse_file_or_uri(format, file, uri, options={})
|
144
144
|
force_fetch = options.delete(:force_fetch) || false
|
145
145
|
if force_fetch || !File.exist?(file)
|
data/natdat_is_hungry.md
CHANGED
@@ -74,7 +74,7 @@ Put the main logic / algorithm / secret recipe / voodoo of your importer in the
|
|
74
74
|
:catalog_name => "...",
|
75
75
|
:catalog_url => "http://...",
|
76
76
|
}
|
77
|
-
|
77
|
+
|
78
78
|
Note that most of these parameters match up with the properties defined for a [Source in the National Data Catalog API](http://github.com/sunlightlabs/datacatalog-api/blob/master/resources/sources.rb). These parameters are just passed along to the API, which will validate the values.
|
79
79
|
|
80
80
|
The remaining parameters (`organization` and `downloads`) are handled by the importer framework:
|
@@ -84,7 +84,7 @@ The remaining parameters (`organization` and `downloads`) are handled by the imp
|
|
84
84
|
* The downloads array is used to look up or create the associated download formats for a data source.
|
85
85
|
|
86
86
|
You may have noticed the use of `Kronos.parse` above. We highly recommend the use of the [kronos library](http://github.com/djsun/kronos) for the parsing of dates.
|
87
|
-
|
87
|
+
|
88
88
|
**organization parameter**
|
89
89
|
|
90
90
|
`@handler.organization()` expects a hash parameter of this shape:
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datacatalog-importer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 17
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 3
|
9
|
-
- 0
|
10
|
-
version: 0.3.0
|
9
|
+
- 1
|
10
|
+
version: 0.3.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- David James
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-08-
|
18
|
+
date: 2010-08-30 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|