datacatalog-importer 0.1.19 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/Rakefile +3 -3
- data/VERSION +1 -1
- data/datacatalog-importer.gemspec +16 -11
- data/example/README.md +5 -0
- data/example/config.example.yml +3 -0
- data/example/lib/puller.rb +71 -0
- data/example/rakefile.rb +22 -0
- data/lib/datacatalog-importer.rb +1 -0
- data/lib/handler.rb +33 -0
- data/lib/puller.rb +5 -46
- data/lib/shared.rb +2 -0
- data/lib/tasks.rb +1 -1
- data/lib/utility.rb +1 -1
- data/spec/spec_helper.rb +1 -0
- metadata +26 -10
data/.gitignore
CHANGED
data/Rakefile
CHANGED
@@ -8,10 +8,10 @@ begin
|
|
8
8
|
gem.summary = %Q{A framework to write National Data Catalog importers}
|
9
9
|
gem.description = %Q{This framework makes it easier to write importers for the National Data Catalog.}
|
10
10
|
gem.email = "djames@sunlightfoundation.com"
|
11
|
-
gem.homepage = "http://github.com/
|
11
|
+
gem.homepage = "http://github.com/sunlightlabs/datacatalog-importer"
|
12
12
|
gem.authors = ["David James"]
|
13
|
-
gem.add_dependency "nokogiri", ">= 1.4.
|
14
|
-
gem.add_dependency "datacatalog", ">= 0.4.
|
13
|
+
gem.add_dependency "nokogiri", ">= 1.4.2"
|
14
|
+
gem.add_dependency "datacatalog", ">= 0.4.15"
|
15
15
|
gem.add_development_dependency "rspec", ">= 1.2.9"
|
16
16
|
# gem is a Gem::Specification...
|
17
17
|
# see http://www.rubygems.org/read/chapter/20 for additional settings
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
data/datacatalog-importer.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{datacatalog-importer}
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["David James"]
|
12
|
-
s.date = %q{2010-
|
12
|
+
s.date = %q{2010-07-08}
|
13
13
|
s.description = %q{This framework makes it easier to write importers for the National Data Catalog.}
|
14
14
|
s.email = %q{djames@sunlightfoundation.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -24,7 +24,12 @@ Gem::Specification.new do |s|
|
|
24
24
|
"Rakefile",
|
25
25
|
"VERSION",
|
26
26
|
"datacatalog-importer.gemspec",
|
27
|
+
"example/README.md",
|
28
|
+
"example/config.example.yml",
|
29
|
+
"example/lib/puller.rb",
|
30
|
+
"example/rakefile.rb",
|
27
31
|
"lib/datacatalog-importer.rb",
|
32
|
+
"lib/handler.rb",
|
28
33
|
"lib/importer.rb",
|
29
34
|
"lib/puller.rb",
|
30
35
|
"lib/pusher.rb",
|
@@ -36,10 +41,10 @@ Gem::Specification.new do |s|
|
|
36
41
|
"spec/spec_helper.rb",
|
37
42
|
"spec/utility_spec.rb"
|
38
43
|
]
|
39
|
-
s.homepage = %q{http://github.com/
|
44
|
+
s.homepage = %q{http://github.com/sunlightlabs/datacatalog-importer}
|
40
45
|
s.rdoc_options = ["--charset=UTF-8"]
|
41
46
|
s.require_paths = ["lib"]
|
42
|
-
s.rubygems_version = %q{1.3.
|
47
|
+
s.rubygems_version = %q{1.3.7}
|
43
48
|
s.summary = %q{A framework to write National Data Catalog importers}
|
44
49
|
s.test_files = [
|
45
50
|
"spec/spec_helper.rb",
|
@@ -50,18 +55,18 @@ Gem::Specification.new do |s|
|
|
50
55
|
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
51
56
|
s.specification_version = 3
|
52
57
|
|
53
|
-
if Gem::Version.new(Gem::
|
54
|
-
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.
|
55
|
-
s.add_runtime_dependency(%q<datacatalog>, [">= 0.4.
|
58
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
59
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.2"])
|
60
|
+
s.add_runtime_dependency(%q<datacatalog>, [">= 0.4.15"])
|
56
61
|
s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
|
57
62
|
else
|
58
|
-
s.add_dependency(%q<nokogiri>, [">= 1.4.
|
59
|
-
s.add_dependency(%q<datacatalog>, [">= 0.4.
|
63
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.2"])
|
64
|
+
s.add_dependency(%q<datacatalog>, [">= 0.4.15"])
|
60
65
|
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
61
66
|
end
|
62
67
|
else
|
63
|
-
s.add_dependency(%q<nokogiri>, [">= 1.4.
|
64
|
-
s.add_dependency(%q<datacatalog>, [">= 0.4.
|
68
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.2"])
|
69
|
+
s.add_dependency(%q<datacatalog>, [">= 0.4.15"])
|
65
70
|
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
66
71
|
end
|
67
72
|
end
|
data/example/README.md
ADDED
data/example/config.example.yml
ADDED
data/example/lib/puller.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
class Puller
|
2
|
+
|
3
|
+
ORGS = [
|
4
|
+
{
|
5
|
+
:name => "Budget Office",
|
6
|
+
:url => "http://example.gov/orgs/budget-office",
|
7
|
+
:description => "Prepares the executive budget..."
|
8
|
+
},
|
9
|
+
{
|
10
|
+
:name => "Environmental Agency",
|
11
|
+
:url => "http://example.gov/orgs/environmental-agency",
|
12
|
+
:description => "Tracks environmental compliance..."
|
13
|
+
},
|
14
|
+
{
|
15
|
+
:name => "Inspector General",
|
16
|
+
:url => "http://example.gov/orgs/inspector-general",
|
17
|
+
:description => "Inspects..."
|
18
|
+
}
|
19
|
+
]
|
20
|
+
|
21
|
+
SOURCES = [
|
22
|
+
{
|
23
|
+
:title => "School District Performance",
|
24
|
+
:url => "http://example.gov/data-sets/209",
|
25
|
+
:description => "Comparative school performance...",
|
26
|
+
:frequency => "annual",
|
27
|
+
:source_type => "dataset",
|
28
|
+
},
|
29
|
+
{
|
30
|
+
:title => "Economic Development",
|
31
|
+
:url => "http://example.gov/data-sets/210",
|
32
|
+
:description => "Economic indicators for...",
|
33
|
+
:frequency => "monthly",
|
34
|
+
:source_type => "dataset",
|
35
|
+
},
|
36
|
+
{
|
37
|
+
:title => "Superfund Projects",
|
38
|
+
:url => "http://example.gov/apis/5",
|
39
|
+
:description => "API for environmental cleanup...",
|
40
|
+
:frequency => "monthly",
|
41
|
+
:source_type => "api",
|
42
|
+
},
|
43
|
+
]
|
44
|
+
|
45
|
+
def initialize(handler)
|
46
|
+
@handler = handler
|
47
|
+
end
|
48
|
+
|
49
|
+
def run
|
50
|
+
common = {
|
51
|
+
:catalog_name => "Example Catalog",
|
52
|
+
:catalog_url => "http://example.gov",
|
53
|
+
}
|
54
|
+
ORGS.each do |o|
|
55
|
+
@handler.organization(
|
56
|
+
o.merge(common).merge({
|
57
|
+
:org_type => "governmental",
|
58
|
+
})
|
59
|
+
)
|
60
|
+
end
|
61
|
+
SOURCES.each do |s|
|
62
|
+
@handler.source(
|
63
|
+
s.merge(common).merge({
|
64
|
+
:license => "public domain",
|
65
|
+
:license_url => "http://example.gov/license",
|
66
|
+
})
|
67
|
+
)
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
end
|
data/example/rakefile.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'yaml'
|
3
|
+
require File.dirname(__FILE__) + '/../lib/datacatalog-importer'
|
4
|
+
require File.dirname(__FILE__) + '/lib/puller'
|
5
|
+
|
6
|
+
def setup
|
7
|
+
config_file = File.dirname(__FILE__) + '/config.yml'
|
8
|
+
config = YAML.load_file(config_file)
|
9
|
+
env = ENV['IMPORTER_ENV']
|
10
|
+
raise "IMPORTER_ENV undefined" unless env
|
11
|
+
raise "IMPORTER_ENV invalid" unless config[env]
|
12
|
+
DataCatalog::ImporterFramework::Tasks.new({
|
13
|
+
:api_key => config[env]['api_key'],
|
14
|
+
:base_uri => config[env]['base_uri'],
|
15
|
+
:cache_folder => File.dirname(__FILE__) + '/cache/parsed',
|
16
|
+
:name => "Example Catalog",
|
17
|
+
:uri => "http://example.datacatalog.gov",
|
18
|
+
:puller => Puller,
|
19
|
+
})
|
20
|
+
end
|
21
|
+
|
22
|
+
setup
|
data/lib/datacatalog-importer.rb
CHANGED
data/lib/handler.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/shared'
|
2
|
+
|
3
|
+
module DataCatalog
|
4
|
+
module ImporterFramework
|
5
|
+
class Handler
|
6
|
+
include Shared
|
7
|
+
|
8
|
+
def initialize(options)
|
9
|
+
@options = options
|
10
|
+
@counter = {}
|
11
|
+
[:source, :organization].each do |resource|
|
12
|
+
FileUtils.mkdir_p(folder(resource))
|
13
|
+
@counter[resource] = 1
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
def source(data)
|
18
|
+
write_data(:source, data)
|
19
|
+
end
|
20
|
+
|
21
|
+
def organization(data)
|
22
|
+
write_data(:organization, data)
|
23
|
+
end
|
24
|
+
|
25
|
+
def write_data(resource, data)
|
26
|
+
file = folder(resource) + ("/%08i.yml" % @counter[resource])
|
27
|
+
Utility.write_yaml(file, data)
|
28
|
+
@counter[resource] += 1
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
data/lib/puller.rb
CHANGED
@@ -1,64 +1,23 @@
|
|
1
|
-
require File.dirname(__FILE__) + '/shared'
|
2
|
-
|
3
1
|
module DataCatalog
|
4
2
|
module ImporterFramework
|
5
3
|
class Puller
|
6
|
-
include Shared
|
7
4
|
|
8
|
-
REQUIRED = %w(cache_folder
|
5
|
+
REQUIRED = %w(cache_folder puller)
|
9
6
|
|
10
7
|
def initialize(options)
|
11
8
|
REQUIRED.each do |r|
|
12
9
|
raise Error, "option :#{r} is required" unless options[r.intern]
|
13
10
|
end
|
14
11
|
@options = options
|
15
|
-
@counter = {
|
16
|
-
:source => 1,
|
17
|
-
:organization => 1,
|
18
|
-
}
|
19
12
|
end
|
20
13
|
|
21
14
|
def run
|
22
|
-
Utility.report_timing "pull
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
pull_resource(:organization)
|
15
|
+
Utility.report_timing "pull" do
|
16
|
+
handler = Handler.new(@options)
|
17
|
+
puller = @options[:puller].new(handler)
|
18
|
+
puller.run
|
27
19
|
end
|
28
20
|
end
|
29
|
-
|
30
|
-
protected
|
31
|
-
|
32
|
-
# Note on HTTP Throttling
|
33
|
-
#
|
34
|
-
# It might make sense to throttle HTTP calls in
|
35
|
-
# * pull_organizations
|
36
|
-
# * pull_sources
|
37
|
-
#
|
38
|
-
# However, doing a simple sleep(TIME_DELAY) is too blunt.
|
39
|
-
# It makes sense when an HTTP call is made; however, it does
|
40
|
-
# not make sense when the importer uses a local cache.
|
41
|
-
#
|
42
|
-
# An alternative is to wrap HTTP calls in this Importer library.
|
43
|
-
# It could add a little bit of delay to HTTP calls that are made
|
44
|
-
# too rapidly.
|
45
|
-
#
|
46
|
-
def pull_resource(resource)
|
47
|
-
unless importer_class = @options[:pullers][resource]
|
48
|
-
raise Error, "options[:pullers][:#{resource}] is required"
|
49
|
-
end
|
50
|
-
importer = importer_class.new
|
51
|
-
FileUtils.mkdir_p(folder(resource))
|
52
|
-
while (data = importer.fetch) do
|
53
|
-
write_data(resource, data)
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
def write_data(resource, data)
|
58
|
-
file = folder(resource) + ("/%08i.yml" % @counter[resource])
|
59
|
-
Utility.write_yaml(file, data)
|
60
|
-
@counter[resource] += 1
|
61
|
-
end
|
62
21
|
|
63
22
|
end
|
64
23
|
end
|
data/lib/shared.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
module DataCatalog
|
2
2
|
module ImporterFramework
|
3
3
|
module Shared
|
4
|
+
|
4
5
|
def folder(resource)
|
5
6
|
unless @options
|
6
7
|
raise Error, "@options is undefined"
|
@@ -10,6 +11,7 @@ module DataCatalog
|
|
10
11
|
end
|
11
12
|
File.join(@options[:cache_folder], resource.to_s)
|
12
13
|
end
|
14
|
+
|
13
15
|
end
|
14
16
|
end
|
15
17
|
end
|
data/lib/tasks.rb
CHANGED
data/lib/utility.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datacatalog-importer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 23
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
8
|
+
- 2
|
9
|
+
- 0
|
10
|
+
version: 0.2.0
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- David James
|
@@ -14,44 +15,50 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2010-
|
18
|
+
date: 2010-07-08 00:00:00 -04:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
21
22
|
name: nokogiri
|
22
23
|
prerelease: false
|
23
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
24
26
|
requirements:
|
25
27
|
- - ">="
|
26
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
27
30
|
segments:
|
28
31
|
- 1
|
29
32
|
- 4
|
30
|
-
-
|
31
|
-
version: 1.4.
|
33
|
+
- 2
|
34
|
+
version: 1.4.2
|
32
35
|
type: :runtime
|
33
36
|
version_requirements: *id001
|
34
37
|
- !ruby/object:Gem::Dependency
|
35
38
|
name: datacatalog
|
36
39
|
prerelease: false
|
37
40
|
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
38
42
|
requirements:
|
39
43
|
- - ">="
|
40
44
|
- !ruby/object:Gem::Version
|
45
|
+
hash: 17
|
41
46
|
segments:
|
42
47
|
- 0
|
43
48
|
- 4
|
44
|
-
-
|
45
|
-
version: 0.4.
|
49
|
+
- 15
|
50
|
+
version: 0.4.15
|
46
51
|
type: :runtime
|
47
52
|
version_requirements: *id002
|
48
53
|
- !ruby/object:Gem::Dependency
|
49
54
|
name: rspec
|
50
55
|
prerelease: false
|
51
56
|
requirement: &id003 !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
52
58
|
requirements:
|
53
59
|
- - ">="
|
54
60
|
- !ruby/object:Gem::Version
|
61
|
+
hash: 13
|
55
62
|
segments:
|
56
63
|
- 1
|
57
64
|
- 2
|
@@ -76,7 +83,12 @@ files:
|
|
76
83
|
- Rakefile
|
77
84
|
- VERSION
|
78
85
|
- datacatalog-importer.gemspec
|
86
|
+
- example/README.md
|
87
|
+
- example/config.example.yml
|
88
|
+
- example/lib/puller.rb
|
89
|
+
- example/rakefile.rb
|
79
90
|
- lib/datacatalog-importer.rb
|
91
|
+
- lib/handler.rb
|
80
92
|
- lib/importer.rb
|
81
93
|
- lib/puller.rb
|
82
94
|
- lib/pusher.rb
|
@@ -88,7 +100,7 @@ files:
|
|
88
100
|
- spec/spec_helper.rb
|
89
101
|
- spec/utility_spec.rb
|
90
102
|
has_rdoc: true
|
91
|
-
homepage: http://github.com/
|
103
|
+
homepage: http://github.com/sunlightlabs/datacatalog-importer
|
92
104
|
licenses: []
|
93
105
|
|
94
106
|
post_install_message:
|
@@ -97,23 +109,27 @@ rdoc_options:
|
|
97
109
|
require_paths:
|
98
110
|
- lib
|
99
111
|
required_ruby_version: !ruby/object:Gem::Requirement
|
112
|
+
none: false
|
100
113
|
requirements:
|
101
114
|
- - ">="
|
102
115
|
- !ruby/object:Gem::Version
|
116
|
+
hash: 3
|
103
117
|
segments:
|
104
118
|
- 0
|
105
119
|
version: "0"
|
106
120
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
107
122
|
requirements:
|
108
123
|
- - ">="
|
109
124
|
- !ruby/object:Gem::Version
|
125
|
+
hash: 3
|
110
126
|
segments:
|
111
127
|
- 0
|
112
128
|
version: "0"
|
113
129
|
requirements: []
|
114
130
|
|
115
131
|
rubyforge_project:
|
116
|
-
rubygems_version: 1.3.
|
132
|
+
rubygems_version: 1.3.7
|
117
133
|
signing_key:
|
118
134
|
specification_version: 3
|
119
135
|
summary: A framework to write National Data Catalog importers
|