datacatalog-importer 0.1.19 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/Rakefile +3 -3
- data/VERSION +1 -1
- data/datacatalog-importer.gemspec +16 -11
- data/example/README.md +5 -0
- data/example/config.example.yml +3 -0
- data/example/lib/puller.rb +71 -0
- data/example/rakefile.rb +22 -0
- data/lib/datacatalog-importer.rb +1 -0
- data/lib/handler.rb +33 -0
- data/lib/puller.rb +5 -46
- data/lib/shared.rb +2 -0
- data/lib/tasks.rb +1 -1
- data/lib/utility.rb +1 -1
- data/spec/spec_helper.rb +1 -0
- metadata +26 -10
data/.gitignore
CHANGED
data/Rakefile
CHANGED
@@ -8,10 +8,10 @@ begin
|
|
8
8
|
gem.summary = %Q{A framework to write National Data Catalog importers}
|
9
9
|
gem.description = %Q{This framework makes it easier to write importers for the National Data Catalog.}
|
10
10
|
gem.email = "djames@sunlightfoundation.com"
|
11
|
-
gem.homepage = "http://github.com/
|
11
|
+
gem.homepage = "http://github.com/sunlightlabs/datacatalog-importer"
|
12
12
|
gem.authors = ["David James"]
|
13
|
-
gem.add_dependency "nokogiri", ">= 1.4.
|
14
|
-
gem.add_dependency "datacatalog", ">= 0.4.
|
13
|
+
gem.add_dependency "nokogiri", ">= 1.4.2"
|
14
|
+
gem.add_dependency "datacatalog", ">= 0.4.15"
|
15
15
|
gem.add_development_dependency "rspec", ">= 1.2.9"
|
16
16
|
# gem is a Gem::Specification...
|
17
17
|
# see http://www.rubygems.org/read/chapter/20 for additional settings
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.19
|
1
|
+
0.2.0
|
data/datacatalog-importer.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{datacatalog-importer}
|
8
|
-
s.version = "0.1.19"
|
8
|
+
s.version = "0.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["David James"]
|
12
|
-
s.date = %q{2010-
|
12
|
+
s.date = %q{2010-07-08}
|
13
13
|
s.description = %q{This framework makes it easier to write importers for the National Data Catalog.}
|
14
14
|
s.email = %q{djames@sunlightfoundation.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -24,7 +24,12 @@ Gem::Specification.new do |s|
|
|
24
24
|
"Rakefile",
|
25
25
|
"VERSION",
|
26
26
|
"datacatalog-importer.gemspec",
|
27
|
+
"example/README.md",
|
28
|
+
"example/config.example.yml",
|
29
|
+
"example/lib/puller.rb",
|
30
|
+
"example/rakefile.rb",
|
27
31
|
"lib/datacatalog-importer.rb",
|
32
|
+
"lib/handler.rb",
|
28
33
|
"lib/importer.rb",
|
29
34
|
"lib/puller.rb",
|
30
35
|
"lib/pusher.rb",
|
@@ -36,10 +41,10 @@ Gem::Specification.new do |s|
|
|
36
41
|
"spec/spec_helper.rb",
|
37
42
|
"spec/utility_spec.rb"
|
38
43
|
]
|
39
|
-
s.homepage = %q{http://github.com/
|
44
|
+
s.homepage = %q{http://github.com/sunlightlabs/datacatalog-importer}
|
40
45
|
s.rdoc_options = ["--charset=UTF-8"]
|
41
46
|
s.require_paths = ["lib"]
|
42
|
-
s.rubygems_version = %q{1.3.
|
47
|
+
s.rubygems_version = %q{1.3.7}
|
43
48
|
s.summary = %q{A framework to write National Data Catalog importers}
|
44
49
|
s.test_files = [
|
45
50
|
"spec/spec_helper.rb",
|
@@ -50,18 +55,18 @@ Gem::Specification.new do |s|
|
|
50
55
|
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
51
56
|
s.specification_version = 3
|
52
57
|
|
53
|
-
if Gem::Version.new(Gem::
|
54
|
-
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.
|
55
|
-
s.add_runtime_dependency(%q<datacatalog>, [">= 0.4.
|
58
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
59
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.2"])
|
60
|
+
s.add_runtime_dependency(%q<datacatalog>, [">= 0.4.15"])
|
56
61
|
s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
|
57
62
|
else
|
58
|
-
s.add_dependency(%q<nokogiri>, [">= 1.4.
|
59
|
-
s.add_dependency(%q<datacatalog>, [">= 0.4.
|
63
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.2"])
|
64
|
+
s.add_dependency(%q<datacatalog>, [">= 0.4.15"])
|
60
65
|
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
61
66
|
end
|
62
67
|
else
|
63
|
-
s.add_dependency(%q<nokogiri>, [">= 1.4.
|
64
|
-
s.add_dependency(%q<datacatalog>, [">= 0.4.
|
68
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.2"])
|
69
|
+
s.add_dependency(%q<datacatalog>, [">= 0.4.15"])
|
65
70
|
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
66
71
|
end
|
67
72
|
end
|
data/example/README.md
ADDED
data/example/config.example.yml
ADDED
data/example/lib/puller.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
class Puller
|
2
|
+
|
3
|
+
ORGS = [
|
4
|
+
{
|
5
|
+
:name => "Budget Office",
|
6
|
+
:url => "http://example.gov/orgs/budget-office",
|
7
|
+
:description => "Prepares the executive budget..."
|
8
|
+
},
|
9
|
+
{
|
10
|
+
:name => "Environmental Agency",
|
11
|
+
:url => "http://example.gov/orgs/environmental-agency",
|
12
|
+
:description => "Tracks environmental compliance..."
|
13
|
+
},
|
14
|
+
{
|
15
|
+
:name => "Inspector General",
|
16
|
+
:url => "http://example.gov/orgs/inspector-general",
|
17
|
+
:description => "Inspects..."
|
18
|
+
}
|
19
|
+
]
|
20
|
+
|
21
|
+
SOURCES = [
|
22
|
+
{
|
23
|
+
:title => "School District Performance",
|
24
|
+
:url => "http://example.gov/data-sets/209",
|
25
|
+
:description => "Comparative school performance...",
|
26
|
+
:frequency => "annual",
|
27
|
+
:source_type => "dataset",
|
28
|
+
},
|
29
|
+
{
|
30
|
+
:title => "Economic Development",
|
31
|
+
:url => "http://example.gov/data-sets/210",
|
32
|
+
:description => "Economic indicators for...",
|
33
|
+
:frequency => "monthly",
|
34
|
+
:source_type => "dataset",
|
35
|
+
},
|
36
|
+
{
|
37
|
+
:title => "Superfund Projects",
|
38
|
+
:url => "http://example.gov/apis/5",
|
39
|
+
:description => "API for environmental cleanup...",
|
40
|
+
:frequency => "monthly",
|
41
|
+
:source_type => "api",
|
42
|
+
},
|
43
|
+
]
|
44
|
+
|
45
|
+
def initialize(handler)
|
46
|
+
@handler = handler
|
47
|
+
end
|
48
|
+
|
49
|
+
def run
|
50
|
+
common = {
|
51
|
+
:catalog_name => "Example Catalog",
|
52
|
+
:catalog_url => "http://example.gov",
|
53
|
+
}
|
54
|
+
ORGS.each do |o|
|
55
|
+
@handler.organization(
|
56
|
+
o.merge(common).merge({
|
57
|
+
:org_type => "governmental",
|
58
|
+
})
|
59
|
+
)
|
60
|
+
end
|
61
|
+
SOURCES.each do |s|
|
62
|
+
@handler.source(
|
63
|
+
s.merge(common).merge({
|
64
|
+
:license => "public domain",
|
65
|
+
:license_url => "http://example.gov/license",
|
66
|
+
})
|
67
|
+
)
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
end
|
data/example/rakefile.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'yaml'
|
3
|
+
require File.dirname(__FILE__) + '/../lib/datacatalog-importer'
|
4
|
+
require File.dirname(__FILE__) + '/lib/puller'
|
5
|
+
|
6
|
+
def setup
|
7
|
+
config_file = File.dirname(__FILE__) + '/config.yml'
|
8
|
+
config = YAML.load_file(config_file)
|
9
|
+
env = ENV['IMPORTER_ENV']
|
10
|
+
raise "IMPORTER_ENV undefined" unless env
|
11
|
+
raise "IMPORTER_ENV invalid" unless config[env]
|
12
|
+
DataCatalog::ImporterFramework::Tasks.new({
|
13
|
+
:api_key => config[env]['api_key'],
|
14
|
+
:base_uri => config[env]['base_uri'],
|
15
|
+
:cache_folder => File.dirname(__FILE__) + '/cache/parsed',
|
16
|
+
:name => "Example Catalog",
|
17
|
+
:uri => "http://example.datacatalog.gov",
|
18
|
+
:puller => Puller,
|
19
|
+
})
|
20
|
+
end
|
21
|
+
|
22
|
+
setup
|
data/lib/datacatalog-importer.rb
CHANGED
data/lib/handler.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/shared'
|
2
|
+
|
3
|
+
module DataCatalog
|
4
|
+
module ImporterFramework
|
5
|
+
class Handler
|
6
|
+
include Shared
|
7
|
+
|
8
|
+
def initialize(options)
|
9
|
+
@options = options
|
10
|
+
@counter = {}
|
11
|
+
[:source, :organization].each do |resource|
|
12
|
+
FileUtils.mkdir_p(folder(resource))
|
13
|
+
@counter[resource] = 1
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
def source(data)
|
18
|
+
write_data(:source, data)
|
19
|
+
end
|
20
|
+
|
21
|
+
def organization(data)
|
22
|
+
write_data(:organization, data)
|
23
|
+
end
|
24
|
+
|
25
|
+
def write_data(resource, data)
|
26
|
+
file = folder(resource) + ("/%08i.yml" % @counter[resource])
|
27
|
+
Utility.write_yaml(file, data)
|
28
|
+
@counter[resource] += 1
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
data/lib/puller.rb
CHANGED
@@ -1,64 +1,23 @@
|
|
1
|
-
require File.dirname(__FILE__) + '/shared'
|
2
|
-
|
3
1
|
module DataCatalog
|
4
2
|
module ImporterFramework
|
5
3
|
class Puller
|
6
|
-
include Shared
|
7
4
|
|
8
|
-
REQUIRED = %w(cache_folder
|
5
|
+
REQUIRED = %w(cache_folder puller)
|
9
6
|
|
10
7
|
def initialize(options)
|
11
8
|
REQUIRED.each do |r|
|
12
9
|
raise Error, "option :#{r} is required" unless options[r.intern]
|
13
10
|
end
|
14
11
|
@options = options
|
15
|
-
@counter = {
|
16
|
-
:source => 1,
|
17
|
-
:organization => 1,
|
18
|
-
}
|
19
12
|
end
|
20
13
|
|
21
14
|
def run
|
22
|
-
Utility.report_timing "pull
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
pull_resource(:organization)
|
15
|
+
Utility.report_timing "pull" do
|
16
|
+
handler = Handler.new(@options)
|
17
|
+
puller = @options[:puller].new(handler)
|
18
|
+
puller.run
|
27
19
|
end
|
28
20
|
end
|
29
|
-
|
30
|
-
protected
|
31
|
-
|
32
|
-
# Note on HTTP Throttling
|
33
|
-
#
|
34
|
-
# It might make sense to throttle HTTP calls in
|
35
|
-
# * pull_organizations
|
36
|
-
# * pull_sources
|
37
|
-
#
|
38
|
-
# However, doing a simple sleep(TIME_DELAY) is too blunt.
|
39
|
-
# It makes sense when an HTTP call is made; however, it does
|
40
|
-
# not make sense when the importer uses a local cache.
|
41
|
-
#
|
42
|
-
# An alternative is to wrap HTTP calls in this Importer library.
|
43
|
-
# It could add a little bit of delay to HTTP calls that are made
|
44
|
-
# too rapidly.
|
45
|
-
#
|
46
|
-
def pull_resource(resource)
|
47
|
-
unless importer_class = @options[:pullers][resource]
|
48
|
-
raise Error, "options[:pullers][:#{resource}] is required"
|
49
|
-
end
|
50
|
-
importer = importer_class.new
|
51
|
-
FileUtils.mkdir_p(folder(resource))
|
52
|
-
while (data = importer.fetch) do
|
53
|
-
write_data(resource, data)
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
def write_data(resource, data)
|
58
|
-
file = folder(resource) + ("/%08i.yml" % @counter[resource])
|
59
|
-
Utility.write_yaml(file, data)
|
60
|
-
@counter[resource] += 1
|
61
|
-
end
|
62
21
|
|
63
22
|
end
|
64
23
|
end
|
data/lib/shared.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
module DataCatalog
|
2
2
|
module ImporterFramework
|
3
3
|
module Shared
|
4
|
+
|
4
5
|
def folder(resource)
|
5
6
|
unless @options
|
6
7
|
raise Error, "@options is undefined"
|
@@ -10,6 +11,7 @@ module DataCatalog
|
|
10
11
|
end
|
11
12
|
File.join(@options[:cache_folder], resource.to_s)
|
12
13
|
end
|
14
|
+
|
13
15
|
end
|
14
16
|
end
|
15
17
|
end
|
data/lib/tasks.rb
CHANGED
data/lib/utility.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datacatalog-importer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 23
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
|
-
- 1
|
8
|
-
- 19
|
9
|
-
version: 0.1.19
|
8
|
+
- 2
|
9
|
+
- 0
|
10
|
+
version: 0.2.0
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- David James
|
@@ -14,44 +15,50 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2010-
|
18
|
+
date: 2010-07-08 00:00:00 -04:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
21
22
|
name: nokogiri
|
22
23
|
prerelease: false
|
23
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
24
26
|
requirements:
|
25
27
|
- - ">="
|
26
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
27
30
|
segments:
|
28
31
|
- 1
|
29
32
|
- 4
|
30
|
-
-
|
31
|
-
version: 1.4.
|
33
|
+
- 2
|
34
|
+
version: 1.4.2
|
32
35
|
type: :runtime
|
33
36
|
version_requirements: *id001
|
34
37
|
- !ruby/object:Gem::Dependency
|
35
38
|
name: datacatalog
|
36
39
|
prerelease: false
|
37
40
|
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
38
42
|
requirements:
|
39
43
|
- - ">="
|
40
44
|
- !ruby/object:Gem::Version
|
45
|
+
hash: 17
|
41
46
|
segments:
|
42
47
|
- 0
|
43
48
|
- 4
|
44
|
-
-
|
45
|
-
version: 0.4.
|
49
|
+
- 15
|
50
|
+
version: 0.4.15
|
46
51
|
type: :runtime
|
47
52
|
version_requirements: *id002
|
48
53
|
- !ruby/object:Gem::Dependency
|
49
54
|
name: rspec
|
50
55
|
prerelease: false
|
51
56
|
requirement: &id003 !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
52
58
|
requirements:
|
53
59
|
- - ">="
|
54
60
|
- !ruby/object:Gem::Version
|
61
|
+
hash: 13
|
55
62
|
segments:
|
56
63
|
- 1
|
57
64
|
- 2
|
@@ -76,7 +83,12 @@ files:
|
|
76
83
|
- Rakefile
|
77
84
|
- VERSION
|
78
85
|
- datacatalog-importer.gemspec
|
86
|
+
- example/README.md
|
87
|
+
- example/config.example.yml
|
88
|
+
- example/lib/puller.rb
|
89
|
+
- example/rakefile.rb
|
79
90
|
- lib/datacatalog-importer.rb
|
91
|
+
- lib/handler.rb
|
80
92
|
- lib/importer.rb
|
81
93
|
- lib/puller.rb
|
82
94
|
- lib/pusher.rb
|
@@ -88,7 +100,7 @@ files:
|
|
88
100
|
- spec/spec_helper.rb
|
89
101
|
- spec/utility_spec.rb
|
90
102
|
has_rdoc: true
|
91
|
-
homepage: http://github.com/
|
103
|
+
homepage: http://github.com/sunlightlabs/datacatalog-importer
|
92
104
|
licenses: []
|
93
105
|
|
94
106
|
post_install_message:
|
@@ -97,23 +109,27 @@ rdoc_options:
|
|
97
109
|
require_paths:
|
98
110
|
- lib
|
99
111
|
required_ruby_version: !ruby/object:Gem::Requirement
|
112
|
+
none: false
|
100
113
|
requirements:
|
101
114
|
- - ">="
|
102
115
|
- !ruby/object:Gem::Version
|
116
|
+
hash: 3
|
103
117
|
segments:
|
104
118
|
- 0
|
105
119
|
version: "0"
|
106
120
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
107
122
|
requirements:
|
108
123
|
- - ">="
|
109
124
|
- !ruby/object:Gem::Version
|
125
|
+
hash: 3
|
110
126
|
segments:
|
111
127
|
- 0
|
112
128
|
version: "0"
|
113
129
|
requirements: []
|
114
130
|
|
115
131
|
rubyforge_project:
|
116
|
-
rubygems_version: 1.3.
|
132
|
+
rubygems_version: 1.3.7
|
117
133
|
signing_key:
|
118
134
|
specification_version: 3
|
119
135
|
summary: A framework to write National Data Catalog importers
|