datacatalog-importer 0.1.19 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -17,5 +17,7 @@ tmtags
17
17
  coverage
18
18
  rdoc
19
19
  pkg
20
+ example/cache
21
+ example/config.yml
20
22
 
21
23
  ## PROJECT::SPECIFIC
data/Rakefile CHANGED
@@ -8,10 +8,10 @@ begin
8
8
  gem.summary = %Q{A framework to write National Data Catalog importers}
9
9
  gem.description = %Q{This framework makes it easier to write importers for the National Data Catalog.}
10
10
  gem.email = "djames@sunlightfoundation.com"
11
- gem.homepage = "http://github.com/djsun/datacatalog-importer"
11
+ gem.homepage = "http://github.com/sunlightlabs/datacatalog-importer"
12
12
  gem.authors = ["David James"]
13
- gem.add_dependency "nokogiri", ">= 1.4.1"
14
- gem.add_dependency "datacatalog", ">= 0.4.14"
13
+ gem.add_dependency "nokogiri", ">= 1.4.2"
14
+ gem.add_dependency "datacatalog", ">= 0.4.15"
15
15
  gem.add_development_dependency "rspec", ">= 1.2.9"
16
16
  # gem is a Gem::Specification...
17
17
  # see http://www.rubygems.org/read/chapter/20 for additional settings
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.19
1
+ 0.2.0
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{datacatalog-importer}
8
- s.version = "0.1.19"
8
+ s.version = "0.2.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["David James"]
12
- s.date = %q{2010-05-12}
12
+ s.date = %q{2010-07-08}
13
13
  s.description = %q{This framework makes it easier to write importers for the National Data Catalog.}
14
14
  s.email = %q{djames@sunlightfoundation.com}
15
15
  s.extra_rdoc_files = [
@@ -24,7 +24,12 @@ Gem::Specification.new do |s|
24
24
  "Rakefile",
25
25
  "VERSION",
26
26
  "datacatalog-importer.gemspec",
27
+ "example/README.md",
28
+ "example/config.example.yml",
29
+ "example/lib/puller.rb",
30
+ "example/rakefile.rb",
27
31
  "lib/datacatalog-importer.rb",
32
+ "lib/handler.rb",
28
33
  "lib/importer.rb",
29
34
  "lib/puller.rb",
30
35
  "lib/pusher.rb",
@@ -36,10 +41,10 @@ Gem::Specification.new do |s|
36
41
  "spec/spec_helper.rb",
37
42
  "spec/utility_spec.rb"
38
43
  ]
39
- s.homepage = %q{http://github.com/djsun/datacatalog-importer}
44
+ s.homepage = %q{http://github.com/sunlightlabs/datacatalog-importer}
40
45
  s.rdoc_options = ["--charset=UTF-8"]
41
46
  s.require_paths = ["lib"]
42
- s.rubygems_version = %q{1.3.6}
47
+ s.rubygems_version = %q{1.3.7}
43
48
  s.summary = %q{A framework to write National Data Catalog importers}
44
49
  s.test_files = [
45
50
  "spec/spec_helper.rb",
@@ -50,18 +55,18 @@ Gem::Specification.new do |s|
50
55
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
51
56
  s.specification_version = 3
52
57
 
53
- if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
54
- s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
55
- s.add_runtime_dependency(%q<datacatalog>, [">= 0.4.14"])
58
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
59
+ s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.2"])
60
+ s.add_runtime_dependency(%q<datacatalog>, [">= 0.4.15"])
56
61
  s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
57
62
  else
58
- s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
59
- s.add_dependency(%q<datacatalog>, [">= 0.4.14"])
63
+ s.add_dependency(%q<nokogiri>, [">= 1.4.2"])
64
+ s.add_dependency(%q<datacatalog>, [">= 0.4.15"])
60
65
  s.add_dependency(%q<rspec>, [">= 1.2.9"])
61
66
  end
62
67
  else
63
- s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
64
- s.add_dependency(%q<datacatalog>, [">= 0.4.14"])
68
+ s.add_dependency(%q<nokogiri>, [">= 1.4.2"])
69
+ s.add_dependency(%q<datacatalog>, [">= 0.4.15"])
65
70
  s.add_dependency(%q<rspec>, [">= 1.2.9"])
66
71
  end
67
72
  end
data/example/README.md ADDED
@@ -0,0 +1,5 @@
1
+ # README
2
+
3
+ This is an example of how to write an importer that depends on the National Data Catalog importer framework.
4
+
5
+ Please note that it is a bare-bones example. It simply demonstrates how to set up your rakefile and shows just the basic moving parts.
@@ -0,0 +1,3 @@
1
+ local:
2
+ api_key: "0000000000000000000000000000000000000000"
3
+ base_uri: "http://localhost:3000"
@@ -0,0 +1,71 @@
1
+ class Puller
2
+
3
+ ORGS = [
4
+ {
5
+ :name => "Budget Office",
6
+ :url => "http://example.gov/orgs/budget-office",
7
+ :description => "Prepares the executive budget..."
8
+ },
9
+ {
10
+ :name => "Environmental Agency",
11
+ :url => "http://example.gov/orgs/environmental-agency",
12
+ :description => "Tracks environmental compliance..."
13
+ },
14
+ {
15
+ :name => "Inspector General",
16
+ :url => "http://example.gov/orgs/inspector-general",
17
+ :description => "Inspects..."
18
+ }
19
+ ]
20
+
21
+ SOURCES = [
22
+ {
23
+ :title => "School District Performance",
24
+ :url => "http://example.gov/data-sets/209",
25
+ :description => "Comparative school performance...",
26
+ :frequency => "annual",
27
+ :source_type => "dataset",
28
+ },
29
+ {
30
+ :title => "Economic Development",
31
+ :url => "http://example.gov/data-sets/210",
32
+ :description => "Economic indicators for...",
33
+ :frequency => "monthly",
34
+ :source_type => "dataset",
35
+ },
36
+ {
37
+ :title => "Superfund Projects",
38
+ :url => "http://example.gov/apis/5",
39
+ :description => "API for environmental cleanup...",
40
+ :frequency => "monthly",
41
+ :source_type => "api",
42
+ },
43
+ ]
44
+
45
+ def initialize(handler)
46
+ @handler = handler
47
+ end
48
+
49
+ def run
50
+ common = {
51
+ :catalog_name => "Example Catalog",
52
+ :catalog_url => "http://example.gov",
53
+ }
54
+ ORGS.each do |o|
55
+ @handler.organization(
56
+ o.merge(common).merge({
57
+ :org_type => "governmental",
58
+ })
59
+ )
60
+ end
61
+ SOURCES.each do |s|
62
+ @handler.source(
63
+ s.merge(common).merge({
64
+ :license => "public domain",
65
+ :license_url => "http://example.gov/license",
66
+ })
67
+ )
68
+ end
69
+ end
70
+
71
+ end
@@ -0,0 +1,22 @@
1
+ require 'rubygems'
2
+ require 'yaml'
3
+ require File.dirname(__FILE__) + '/../lib/datacatalog-importer'
4
+ require File.dirname(__FILE__) + '/lib/puller'
5
+
6
+ def setup
7
+ config_file = File.dirname(__FILE__) + '/config.yml'
8
+ config = YAML.load_file(config_file)
9
+ env = ENV['IMPORTER_ENV']
10
+ raise "IMPORTER_ENV undefined" unless env
11
+ raise "IMPORTER_ENV invalid" unless config[env]
12
+ DataCatalog::ImporterFramework::Tasks.new({
13
+ :api_key => config[env]['api_key'],
14
+ :base_uri => config[env]['base_uri'],
15
+ :cache_folder => File.dirname(__FILE__) + '/cache/parsed',
16
+ :name => "Example Catalog",
17
+ :uri => "http://example.datacatalog.gov",
18
+ :puller => Puller,
19
+ })
20
+ end
21
+
22
+ setup
@@ -6,6 +6,7 @@
6
6
  # strange conflicts.
7
7
  require 'generator.rb'
8
8
 
9
+ require File.dirname(__FILE__) + '/handler'
9
10
  require File.dirname(__FILE__) + '/importer'
10
11
  require File.dirname(__FILE__) + '/puller'
11
12
  require File.dirname(__FILE__) + '/pusher'
data/lib/handler.rb ADDED
@@ -0,0 +1,33 @@
1
+ require File.dirname(__FILE__) + '/shared'
2
+
3
+ module DataCatalog
4
+ module ImporterFramework
5
+ class Handler
6
+ include Shared
7
+
8
+ def initialize(options)
9
+ @options = options
10
+ @counter = {}
11
+ [:source, :organization].each do |resource|
12
+ FileUtils.mkdir_p(folder(resource))
13
+ @counter[resource] = 1
14
+ end
15
+ end
16
+
17
+ def source(data)
18
+ write_data(:source, data)
19
+ end
20
+
21
+ def organization(data)
22
+ write_data(:organization, data)
23
+ end
24
+
25
+ def write_data(resource, data)
26
+ file = folder(resource) + ("/%08i.yml" % @counter[resource])
27
+ Utility.write_yaml(file, data)
28
+ @counter[resource] += 1
29
+ end
30
+
31
+ end
32
+ end
33
+ end
data/lib/puller.rb CHANGED
@@ -1,64 +1,23 @@
1
- require File.dirname(__FILE__) + '/shared'
2
-
3
1
  module DataCatalog
4
2
  module ImporterFramework
5
3
  class Puller
6
- include Shared
7
4
 
8
- REQUIRED = %w(cache_folder pullers)
5
+ REQUIRED = %w(cache_folder puller)
9
6
 
10
7
  def initialize(options)
11
8
  REQUIRED.each do |r|
12
9
  raise Error, "option :#{r} is required" unless options[r.intern]
13
10
  end
14
11
  @options = options
15
- @counter = {
16
- :source => 1,
17
- :organization => 1,
18
- }
19
12
  end
20
13
 
21
14
  def run
22
- Utility.report_timing "pull source" do
23
- pull_resource(:source)
24
- end
25
- Utility.report_timing "pull organization" do
26
- pull_resource(:organization)
15
+ Utility.report_timing "pull" do
16
+ handler = Handler.new(@options)
17
+ puller = @options[:puller].new(handler)
18
+ puller.run
27
19
  end
28
20
  end
29
-
30
- protected
31
-
32
- # Note on HTTP Throttling
33
- #
34
- # It might make sense to throttle HTTP calls in
35
- # * pull_organizations
36
- # * pull_sources
37
- #
38
- # However, doing a simple sleep(TIME_DELAY) is too blunt.
39
- # It makes sense when an HTTP call is made; however, it does
40
- # not make sense when the importer uses a local cache.
41
- #
42
- # An alternative is to wrap HTTP calls in this Importer library.
43
- # It could add a little bit of delay to HTTP calls that are made
44
- # too rapidly.
45
- #
46
- def pull_resource(resource)
47
- unless importer_class = @options[:pullers][resource]
48
- raise Error, "options[:pullers][:#{resource}] is required"
49
- end
50
- importer = importer_class.new
51
- FileUtils.mkdir_p(folder(resource))
52
- while (data = importer.fetch) do
53
- write_data(resource, data)
54
- end
55
- end
56
-
57
- def write_data(resource, data)
58
- file = folder(resource) + ("/%08i.yml" % @counter[resource])
59
- Utility.write_yaml(file, data)
60
- @counter[resource] += 1
61
- end
62
21
 
63
22
  end
64
23
  end
data/lib/shared.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  module DataCatalog
2
2
  module ImporterFramework
3
3
  module Shared
4
+
4
5
  def folder(resource)
5
6
  unless @options
6
7
  raise Error, "@options is undefined"
@@ -10,6 +11,7 @@ module DataCatalog
10
11
  end
11
12
  File.join(@options[:cache_folder], resource.to_s)
12
13
  end
14
+
13
15
  end
14
16
  end
15
17
  end
data/lib/tasks.rb CHANGED
@@ -14,7 +14,7 @@ module DataCatalog
14
14
  puts "Pulling data from the #{options[:name]}..."
15
15
  puller = Puller.new({
16
16
  :cache_folder => options[:cache_folder],
17
- :pullers => options[:pullers],
17
+ :puller => options[:puller],
18
18
  })
19
19
  puller.run
20
20
  end
data/lib/utility.rb CHANGED
@@ -56,7 +56,7 @@ module DataCatalog
56
56
 
57
57
  def self.headers
58
58
  {
59
- "UserAgent" => "National Data Catalog Importer/0.1.19",
59
+ "UserAgent" => "National Data Catalog Importer/0.2.0",
60
60
  }
61
61
  end
62
62
 
data/spec/spec_helper.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  $LOAD_PATH.unshift(File.dirname(__FILE__))
2
2
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ require 'rubygems'
3
4
  require 'datacatalog-importer'
4
5
  require 'spec'
5
6
  require 'spec/autorun'
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datacatalog-importer
3
3
  version: !ruby/object:Gem::Version
4
+ hash: 23
4
5
  prerelease: false
5
6
  segments:
6
7
  - 0
7
- - 1
8
- - 19
9
- version: 0.1.19
8
+ - 2
9
+ - 0
10
+ version: 0.2.0
10
11
  platform: ruby
11
12
  authors:
12
13
  - David James
@@ -14,44 +15,50 @@ autorequire:
14
15
  bindir: bin
15
16
  cert_chain: []
16
17
 
17
- date: 2010-05-12 00:00:00 -04:00
18
+ date: 2010-07-08 00:00:00 -04:00
18
19
  default_executable:
19
20
  dependencies:
20
21
  - !ruby/object:Gem::Dependency
21
22
  name: nokogiri
22
23
  prerelease: false
23
24
  requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
24
26
  requirements:
25
27
  - - ">="
26
28
  - !ruby/object:Gem::Version
29
+ hash: 3
27
30
  segments:
28
31
  - 1
29
32
  - 4
30
- - 1
31
- version: 1.4.1
33
+ - 2
34
+ version: 1.4.2
32
35
  type: :runtime
33
36
  version_requirements: *id001
34
37
  - !ruby/object:Gem::Dependency
35
38
  name: datacatalog
36
39
  prerelease: false
37
40
  requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
38
42
  requirements:
39
43
  - - ">="
40
44
  - !ruby/object:Gem::Version
45
+ hash: 17
41
46
  segments:
42
47
  - 0
43
48
  - 4
44
- - 14
45
- version: 0.4.14
49
+ - 15
50
+ version: 0.4.15
46
51
  type: :runtime
47
52
  version_requirements: *id002
48
53
  - !ruby/object:Gem::Dependency
49
54
  name: rspec
50
55
  prerelease: false
51
56
  requirement: &id003 !ruby/object:Gem::Requirement
57
+ none: false
52
58
  requirements:
53
59
  - - ">="
54
60
  - !ruby/object:Gem::Version
61
+ hash: 13
55
62
  segments:
56
63
  - 1
57
64
  - 2
@@ -76,7 +83,12 @@ files:
76
83
  - Rakefile
77
84
  - VERSION
78
85
  - datacatalog-importer.gemspec
86
+ - example/README.md
87
+ - example/config.example.yml
88
+ - example/lib/puller.rb
89
+ - example/rakefile.rb
79
90
  - lib/datacatalog-importer.rb
91
+ - lib/handler.rb
80
92
  - lib/importer.rb
81
93
  - lib/puller.rb
82
94
  - lib/pusher.rb
@@ -88,7 +100,7 @@ files:
88
100
  - spec/spec_helper.rb
89
101
  - spec/utility_spec.rb
90
102
  has_rdoc: true
91
- homepage: http://github.com/djsun/datacatalog-importer
103
+ homepage: http://github.com/sunlightlabs/datacatalog-importer
92
104
  licenses: []
93
105
 
94
106
  post_install_message:
@@ -97,23 +109,27 @@ rdoc_options:
97
109
  require_paths:
98
110
  - lib
99
111
  required_ruby_version: !ruby/object:Gem::Requirement
112
+ none: false
100
113
  requirements:
101
114
  - - ">="
102
115
  - !ruby/object:Gem::Version
116
+ hash: 3
103
117
  segments:
104
118
  - 0
105
119
  version: "0"
106
120
  required_rubygems_version: !ruby/object:Gem::Requirement
121
+ none: false
107
122
  requirements:
108
123
  - - ">="
109
124
  - !ruby/object:Gem::Version
125
+ hash: 3
110
126
  segments:
111
127
  - 0
112
128
  version: "0"
113
129
  requirements: []
114
130
 
115
131
  rubyforge_project:
116
- rubygems_version: 1.3.6
132
+ rubygems_version: 1.3.7
117
133
  signing_key:
118
134
  specification_version: 3
119
135
  summary: A framework to write National Data Catalog importers