datacatalog-importer 0.1.13 → 0.1.14

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.13
1
+ 0.1.14
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{datacatalog-importer}
8
- s.version = "0.1.13"
8
+ s.version = "0.1.14"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["David James"]
12
- s.date = %q{2010-04-30}
12
+ s.date = %q{2010-05-04}
13
13
  s.description = %q{This framework makes it easier to write importers for the National Data Catalog.}
14
14
  s.email = %q{djames@sunlightfoundation.com}
15
15
  s.extra_rdoc_files = [
data/lib/importer.rb CHANGED
@@ -1,7 +1,5 @@
1
1
  module DataCatalog
2
2
  module ImporterFramework
3
-
4
3
  class Error < RuntimeError; end
5
-
6
4
  end
7
5
  end
data/lib/puller.rb CHANGED
@@ -3,7 +3,7 @@ require File.dirname(__FILE__) + '/shared'
3
3
  module DataCatalog
4
4
  module ImporterFramework
5
5
  class Puller
6
- include DataCatalog::ImporterFramework::Shared
6
+ include Shared
7
7
 
8
8
  REQUIRED = %w(cache_folder pullers)
9
9
 
data/lib/pusher.rb CHANGED
@@ -6,7 +6,7 @@ require File.dirname(__FILE__) + '/shared'
6
6
  module DataCatalog
7
7
  module ImporterFramework
8
8
  class Pusher
9
- include DataCatalog::ImporterFramework::Shared
9
+ include Shared
10
10
 
11
11
  REQUIRED = %w(api_key base_uri cache_folder)
12
12
 
data/lib/shared.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  module DataCatalog
2
2
  module ImporterFramework
3
3
  module Shared
4
-
5
4
  def folder(resource)
6
5
  unless @options
7
6
  raise Error, "@options is undefined"
@@ -11,7 +10,6 @@ module DataCatalog
11
10
  end
12
11
  File.join(@options[:cache_folder], resource.to_s)
13
12
  end
14
-
15
13
  end
16
14
  end
17
15
  end
@@ -1,7 +1,6 @@
1
1
  require 'yaml'
2
2
 
3
3
  class Hash
4
-
5
4
  def to_yaml(opts = {})
6
5
  YAML::quick_emit(object_id, opts) do |out|
7
6
  out.map(taguri, to_yaml_style) do |map|
@@ -10,13 +9,10 @@ class Hash
10
9
  end
11
10
  end
12
11
  end
13
-
14
12
  end
15
13
 
16
14
  class Symbol
17
-
18
15
  def <=>(other)
19
16
  self.to_s <=> other.to_s
20
17
  end
21
-
22
18
  end
data/lib/tasks.rb CHANGED
@@ -12,7 +12,7 @@ module DataCatalog
12
12
  desc "Pull data from the #{options[:name]}"
13
13
  task :pull do
14
14
  puts "Pulling data from the #{options[:name]}..."
15
- puller = DataCatalog::ImporterFramework::Puller.new({
15
+ puller = Puller.new({
16
16
  :cache_folder => options[:cache_folder],
17
17
  :pullers => options[:pullers],
18
18
  })
@@ -22,7 +22,7 @@ module DataCatalog
22
22
  desc "Push data to the Data Catalog API"
23
23
  task :push do
24
24
  desc "Pushing data to the Data Catalog API..."
25
- pusher = DataCatalog::ImporterFramework::Pusher.new({
25
+ pusher = Pusher.new({
26
26
  :api_key => options[:api_key],
27
27
  :base_uri => options[:base_uri],
28
28
  :cache_folder => options[:cache_folder],
data/lib/utility.rb CHANGED
@@ -3,150 +3,160 @@ require 'nokogiri'
3
3
  require 'open-uri'
4
4
 
5
5
  module DataCatalog
6
-
7
- class Utility
6
+ module ImporterFramework
7
+ class Utility
8
8
 
9
- # == URLs ==
9
+ # == URLs ==
10
10
 
11
- def self.absolute_url(page_url, url)
12
- Utility.plain_string(URI.parse(page_url).merge(url).to_s)
13
- end
11
+ def self.absolute_url(page_url, url)
12
+ Utility.plain_string(URI.parse(page_url).merge(url).to_s)
13
+ end
14
14
 
15
- # == Cleaning ==
15
+ # == Cleaning ==
16
16
 
17
- def self.single_line_clean(s)
18
- plain_string(
19
- s.gsub(/[\r\n\t]/, " ").gsub(/[\x80-\xFF]/, "").squeeze(" ").strip)
20
- end
17
+ def self.single_line_clean(s)
18
+ plain_string(
19
+ s.gsub(/[\r\n\t]/, " ").gsub(/[\x80-\xFF]/, "").squeeze(" ").strip)
20
+ end
21
21
 
22
- def self.multi_line_clean(s)
23
- plain_string(
24
- s.gsub(/[\x80-\xFF]/, "").squeeze(" ").strip)
25
- end
22
+ def self.multi_line_clean(s)
23
+ plain_string(
24
+ s.gsub(/[\x80-\xFF]/, "").squeeze(" ").strip)
25
+ end
26
26
 
27
- # ActiveSupport 2.3.5 adds @_rails_html_safe aggressively.
28
- # This method removes it so you can output clean YAML.
29
- def self.plain_string(s)
30
- if s.instance_variable_defined?(:@_rails_html_safe)
31
- s.send(:remove_instance_variable, :@_rails_html_safe)
27
+ # ActiveSupport 2.3.5 adds @_rails_html_safe aggressively.
28
+ # This method removes it so you can output clean YAML.
29
+ def self.plain_string(s)
30
+ if s.instance_variable_defined?(:@_rails_html_safe)
31
+ s.send(:remove_instance_variable, :@_rails_html_safe)
32
+ end
33
+ s
32
34
  end
33
- s
34
- end
35
35
 
36
- # == API ===
36
+ # == API ===
37
37
 
38
- def self.setup_api(api_key, base_uri)
39
- DataCatalog.api_key = api_key
40
- DataCatalog.base_uri = base_uri
41
- end
38
+ def self.setup_api(api_key, base_uri)
39
+ DataCatalog.api_key = api_key
40
+ DataCatalog.base_uri = base_uri
41
+ end
42
42
 
43
- def self.headers
44
- {
45
- "UserAgent" => "National Data Catalog Importer/0.1.6",
46
- }
47
- end
43
+ def self.headers
44
+ {
45
+ "UserAgent" => "National Data Catalog Importer/0.1.6",
46
+ }
47
+ end
48
48
 
49
- # == Various ==
50
-
51
- def self.fetch(uri)
52
- puts "Fetching #{uri}..."
53
- io = open(uri, headers)
54
- io.read
55
- end
49
+ # == Various ==
50
+
51
+ def self.fetch(uri, max_attempts=3)
52
+ attempts = 0
53
+ loop do
54
+ begin
55
+ puts "Fetching #{uri}..."
56
+ io = open(uri, headers)
57
+ return io.read
58
+ rescue SocketError
59
+ attempts += 1
60
+ puts " Attempt ##{attempts} failed."
61
+ break if attempts >= max_attempts
62
+ end
63
+ end
64
+ end
56
65
 
57
- def self.report_timing(label)
58
- puts "Starting: [#{label}]"
59
- t0 = Time.now
60
- result = yield
61
- t1 = Time.now
62
- diff = t1 - t0
63
- puts "Elapsed time [#{label}] %.2f s" % diff
64
- result
65
- end
66
+ def self.report_timing(label)
67
+ puts "Starting: [#{label}]"
68
+ t0 = Time.now
69
+ result = yield
70
+ t1 = Time.now
71
+ diff = t1 - t0
72
+ puts "Elapsed time [#{label}] %.2f s" % diff
73
+ result
74
+ end
66
75
 
67
- # == CSV ==
68
-
69
- # { :headers => true } is a common option
70
- def self.parse_csv_from_file(filename, options={})
71
- extra_header_rows = options.delete(:extra_header_rows) || 0
72
- File.open(filename) do |f|
73
- extra_header_rows.times { f.gets } # ignore these rows
74
- FasterCSV.parse(f, options)
76
+ # == CSV ==
77
+
78
+ # { :headers => true } is a common option
79
+ def self.parse_csv_from_file(filename, options={})
80
+ extra_header_rows = options.delete(:extra_header_rows) || 0
81
+ File.open(filename) do |f|
82
+ extra_header_rows.times { f.gets } # ignore these rows
83
+ FasterCSV.parse(f, options)
84
+ end
75
85
  end
76
- end
77
86
 
78
- def self.parse_csv_from_uri(uri, options={})
79
- puts "Fetching #{uri}..."
80
- data = open(uri, headers)
81
- puts data.inspect
82
- FasterCSV.parse(data, options)
83
- end
87
+ def self.parse_csv_from_uri(uri, options={})
88
+ puts "Fetching #{uri}..."
89
+ data = open(uri, headers)
90
+ puts data.inspect
91
+ FasterCSV.parse(data, options)
92
+ end
84
93
 
85
- def self.parse_csv_from_file_or_uri(uri, file, options={})
86
- force_fetch = options.delete(:force_fetch) || false
87
- if force_fetch || !File.exist?(file)
88
- document = fetch(uri)
89
- File.open(file, "w") { |f| f.write(document) }
94
+ def self.parse_csv_from_file_or_uri(uri, file, options={})
95
+ force_fetch = options.delete(:force_fetch) || false
96
+ if force_fetch || !File.exist?(file)
97
+ document = fetch(uri)
98
+ File.open(file, "w") { |f| f.write(document) }
99
+ end
100
+ parse_csv_from_file(file, options)
90
101
  end
91
- parse_csv_from_file(file, options)
92
- end
93
102
 
94
- # == HTML ==
103
+ # == HTML ==
95
104
 
96
- def self.parse_html_from_file(filename)
97
- File.open(filename) do |f|
98
- Nokogiri::HTML::Document.parse(f)
105
+ def self.parse_html_from_file(filename)
106
+ File.open(filename) do |f|
107
+ Nokogiri::HTML::Document.parse(f)
108
+ end
99
109
  end
100
- end
101
110
 
102
- def self.parse_html_from_uri(uri)
103
- puts "Fetching #{uri}..."
104
- open(uri, headers) do |io|
105
- Nokogiri::HTML::Document.parse(io)
111
+ def self.parse_html_from_uri(uri)
112
+ puts "Fetching #{uri}..."
113
+ open(uri, headers) do |io|
114
+ Nokogiri::HTML::Document.parse(io)
115
+ end
106
116
  end
107
- end
108
117
 
109
- def self.parse_html_from_file_or_uri(uri, file, options={})
110
- if options[:force_fetch] || !File.exist?(file)
111
- document = parse_html_from_uri(uri)
112
- File.open(file, "w") { |f| f.write(document) }
118
+ def self.parse_html_from_file_or_uri(uri, file, options={})
119
+ if options[:force_fetch] || !File.exist?(file)
120
+ document = parse_html_from_uri(uri)
121
+ File.open(file, "w") { |f| f.write(document) }
122
+ end
123
+ parse_html_from_file(file) # Why always parse the file? See Note 001, below.
113
124
  end
114
- parse_html_from_file(file) # Why always parse the file? See Note 001, below.
115
- end
116
125
 
117
- # == XML
126
+ # == XML
118
127
 
119
- def self.parse_xml_from_file(filename)
120
- File.open(filename) do |f|
121
- Nokogiri::XML::Document.parse(f)
128
+ def self.parse_xml_from_file(filename)
129
+ File.open(filename) do |f|
130
+ Nokogiri::XML::Document.parse(f)
131
+ end
122
132
  end
123
- end
124
133
 
125
- def self.parse_xml_from_uri(uri)
126
- puts "Fetching #{uri}..."
127
- Nokogiri::XML(open(uri))
128
- end
134
+ def self.parse_xml_from_uri(uri)
135
+ puts "Fetching #{uri}..."
136
+ Nokogiri::XML(open(uri))
137
+ end
129
138
 
130
- def self.parse_xml_from_file_or_uri(uri, file, options={})
131
- if options[:force_fetch] || !File.exist?(file)
132
- document = parse_xml_from_uri(uri)
133
- File.open(file, "w") { |f| f.write(document) }
134
- end
135
- parse_xml_from_file(file) # Why always parse the file? See Note 001, below.
136
- end
139
+ def self.parse_xml_from_file_or_uri(uri, file, options={})
140
+ if options[:force_fetch] || !File.exist?(file)
141
+ document = parse_xml_from_uri(uri)
142
+ File.open(file, "w") { |f| f.write(document) }
143
+ end
144
+ parse_xml_from_file(file) # Why always parse the file? See Note 001, below.
145
+ end
137
146
 
138
- # == YAML
147
+ # == YAML
139
148
 
140
- # To load YAML use: YAML::load_file(filename)
149
+ # To load YAML use: YAML::load_file(filename)
141
150
 
142
- def self.write_yaml(filename, contents)
143
- File.open(filename, "w") do |f|
144
- YAML::dump(contents, f)
151
+ def self.write_yaml(filename, contents)
152
+ File.open(filename, "w") do |f|
153
+ YAML::dump(contents, f)
154
+ end
145
155
  end
156
+
146
157
  end
147
158
 
148
159
  end
149
-
150
160
  end
151
161
 
152
162
  # == Note 001 ==
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 13
9
- version: 0.1.13
8
+ - 14
9
+ version: 0.1.14
10
10
  platform: ruby
11
11
  authors:
12
12
  - David James
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-04-30 00:00:00 -04:00
17
+ date: 2010-05-04 00:00:00 -04:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency