datacatalog-importer 0.1.13 → 0.1.14

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.13
1
+ 0.1.14
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{datacatalog-importer}
8
- s.version = "0.1.13"
8
+ s.version = "0.1.14"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["David James"]
12
- s.date = %q{2010-04-30}
12
+ s.date = %q{2010-05-04}
13
13
  s.description = %q{This framework makes it easier to write importers for the National Data Catalog.}
14
14
  s.email = %q{djames@sunlightfoundation.com}
15
15
  s.extra_rdoc_files = [
data/lib/importer.rb CHANGED
@@ -1,7 +1,5 @@
1
1
  module DataCatalog
2
2
  module ImporterFramework
3
-
4
3
  class Error < RuntimeError; end
5
-
6
4
  end
7
5
  end
data/lib/puller.rb CHANGED
@@ -3,7 +3,7 @@ require File.dirname(__FILE__) + '/shared'
3
3
  module DataCatalog
4
4
  module ImporterFramework
5
5
  class Puller
6
- include DataCatalog::ImporterFramework::Shared
6
+ include Shared
7
7
 
8
8
  REQUIRED = %w(cache_folder pullers)
9
9
 
data/lib/pusher.rb CHANGED
@@ -6,7 +6,7 @@ require File.dirname(__FILE__) + '/shared'
6
6
  module DataCatalog
7
7
  module ImporterFramework
8
8
  class Pusher
9
- include DataCatalog::ImporterFramework::Shared
9
+ include Shared
10
10
 
11
11
  REQUIRED = %w(api_key base_uri cache_folder)
12
12
 
data/lib/shared.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  module DataCatalog
2
2
  module ImporterFramework
3
3
  module Shared
4
-
5
4
  def folder(resource)
6
5
  unless @options
7
6
  raise Error, "@options is undefined"
@@ -11,7 +10,6 @@ module DataCatalog
11
10
  end
12
11
  File.join(@options[:cache_folder], resource.to_s)
13
12
  end
14
-
15
13
  end
16
14
  end
17
15
  end
@@ -1,7 +1,6 @@
1
1
  require 'yaml'
2
2
 
3
3
  class Hash
4
-
5
4
  def to_yaml(opts = {})
6
5
  YAML::quick_emit(object_id, opts) do |out|
7
6
  out.map(taguri, to_yaml_style) do |map|
@@ -10,13 +9,10 @@ class Hash
10
9
  end
11
10
  end
12
11
  end
13
-
14
12
  end
15
13
 
16
14
  class Symbol
17
-
18
15
  def <=>(other)
19
16
  self.to_s <=> other.to_s
20
17
  end
21
-
22
18
  end
data/lib/tasks.rb CHANGED
@@ -12,7 +12,7 @@ module DataCatalog
12
12
  desc "Pull data from the #{options[:name]}"
13
13
  task :pull do
14
14
  puts "Pulling data from the #{options[:name]}..."
15
- puller = DataCatalog::ImporterFramework::Puller.new({
15
+ puller = Puller.new({
16
16
  :cache_folder => options[:cache_folder],
17
17
  :pullers => options[:pullers],
18
18
  })
@@ -22,7 +22,7 @@ module DataCatalog
22
22
  desc "Push data to the Data Catalog API"
23
23
  task :push do
24
24
  desc "Pushing data to the Data Catalog API..."
25
- pusher = DataCatalog::ImporterFramework::Pusher.new({
25
+ pusher = Pusher.new({
26
26
  :api_key => options[:api_key],
27
27
  :base_uri => options[:base_uri],
28
28
  :cache_folder => options[:cache_folder],
data/lib/utility.rb CHANGED
@@ -3,150 +3,160 @@ require 'nokogiri'
3
3
  require 'open-uri'
4
4
 
5
5
  module DataCatalog
6
-
7
- class Utility
6
+ module ImporterFramework
7
+ class Utility
8
8
 
9
- # == URLs ==
9
+ # == URLs ==
10
10
 
11
- def self.absolute_url(page_url, url)
12
- Utility.plain_string(URI.parse(page_url).merge(url).to_s)
13
- end
11
+ def self.absolute_url(page_url, url)
12
+ Utility.plain_string(URI.parse(page_url).merge(url).to_s)
13
+ end
14
14
 
15
- # == Cleaning ==
15
+ # == Cleaning ==
16
16
 
17
- def self.single_line_clean(s)
18
- plain_string(
19
- s.gsub(/[\r\n\t]/, " ").gsub(/[\x80-\xFF]/, "").squeeze(" ").strip)
20
- end
17
+ def self.single_line_clean(s)
18
+ plain_string(
19
+ s.gsub(/[\r\n\t]/, " ").gsub(/[\x80-\xFF]/, "").squeeze(" ").strip)
20
+ end
21
21
 
22
- def self.multi_line_clean(s)
23
- plain_string(
24
- s.gsub(/[\x80-\xFF]/, "").squeeze(" ").strip)
25
- end
22
+ def self.multi_line_clean(s)
23
+ plain_string(
24
+ s.gsub(/[\x80-\xFF]/, "").squeeze(" ").strip)
25
+ end
26
26
 
27
- # ActiveSupport 2.3.5 adds @_rails_html_safe aggressively.
28
- # This method removes it so you can output clean YAML.
29
- def self.plain_string(s)
30
- if s.instance_variable_defined?(:@_rails_html_safe)
31
- s.send(:remove_instance_variable, :@_rails_html_safe)
27
+ # ActiveSupport 2.3.5 adds @_rails_html_safe aggressively.
28
+ # This method removes it so you can output clean YAML.
29
+ def self.plain_string(s)
30
+ if s.instance_variable_defined?(:@_rails_html_safe)
31
+ s.send(:remove_instance_variable, :@_rails_html_safe)
32
+ end
33
+ s
32
34
  end
33
- s
34
- end
35
35
 
36
- # == API ===
36
+ # == API ===
37
37
 
38
- def self.setup_api(api_key, base_uri)
39
- DataCatalog.api_key = api_key
40
- DataCatalog.base_uri = base_uri
41
- end
38
+ def self.setup_api(api_key, base_uri)
39
+ DataCatalog.api_key = api_key
40
+ DataCatalog.base_uri = base_uri
41
+ end
42
42
 
43
- def self.headers
44
- {
45
- "UserAgent" => "National Data Catalog Importer/0.1.6",
46
- }
47
- end
43
+ def self.headers
44
+ {
45
+ "UserAgent" => "National Data Catalog Importer/0.1.6",
46
+ }
47
+ end
48
48
 
49
- # == Various ==
50
-
51
- def self.fetch(uri)
52
- puts "Fetching #{uri}..."
53
- io = open(uri, headers)
54
- io.read
55
- end
49
+ # == Various ==
50
+
51
+ def self.fetch(uri, max_attempts=3)
52
+ attempts = 0
53
+ loop do
54
+ begin
55
+ puts "Fetching #{uri}..."
56
+ io = open(uri, headers)
57
+ return io.read
58
+ rescue SocketError
59
+ attempts += 1
60
+ puts " Attempt ##{attempts} failed."
61
+ break if attempts >= max_attempts
62
+ end
63
+ end
64
+ end
56
65
 
57
- def self.report_timing(label)
58
- puts "Starting: [#{label}]"
59
- t0 = Time.now
60
- result = yield
61
- t1 = Time.now
62
- diff = t1 - t0
63
- puts "Elapsed time [#{label}] %.2f s" % diff
64
- result
65
- end
66
+ def self.report_timing(label)
67
+ puts "Starting: [#{label}]"
68
+ t0 = Time.now
69
+ result = yield
70
+ t1 = Time.now
71
+ diff = t1 - t0
72
+ puts "Elapsed time [#{label}] %.2f s" % diff
73
+ result
74
+ end
66
75
 
67
- # == CSV ==
68
-
69
- # { :headers => true } is a common option
70
- def self.parse_csv_from_file(filename, options={})
71
- extra_header_rows = options.delete(:extra_header_rows) || 0
72
- File.open(filename) do |f|
73
- extra_header_rows.times { f.gets } # ignore these rows
74
- FasterCSV.parse(f, options)
76
+ # == CSV ==
77
+
78
+ # { :headers => true } is a common option
79
+ def self.parse_csv_from_file(filename, options={})
80
+ extra_header_rows = options.delete(:extra_header_rows) || 0
81
+ File.open(filename) do |f|
82
+ extra_header_rows.times { f.gets } # ignore these rows
83
+ FasterCSV.parse(f, options)
84
+ end
75
85
  end
76
- end
77
86
 
78
- def self.parse_csv_from_uri(uri, options={})
79
- puts "Fetching #{uri}..."
80
- data = open(uri, headers)
81
- puts data.inspect
82
- FasterCSV.parse(data, options)
83
- end
87
+ def self.parse_csv_from_uri(uri, options={})
88
+ puts "Fetching #{uri}..."
89
+ data = open(uri, headers)
90
+ puts data.inspect
91
+ FasterCSV.parse(data, options)
92
+ end
84
93
 
85
- def self.parse_csv_from_file_or_uri(uri, file, options={})
86
- force_fetch = options.delete(:force_fetch) || false
87
- if force_fetch || !File.exist?(file)
88
- document = fetch(uri)
89
- File.open(file, "w") { |f| f.write(document) }
94
+ def self.parse_csv_from_file_or_uri(uri, file, options={})
95
+ force_fetch = options.delete(:force_fetch) || false
96
+ if force_fetch || !File.exist?(file)
97
+ document = fetch(uri)
98
+ File.open(file, "w") { |f| f.write(document) }
99
+ end
100
+ parse_csv_from_file(file, options)
90
101
  end
91
- parse_csv_from_file(file, options)
92
- end
93
102
 
94
- # == HTML ==
103
+ # == HTML ==
95
104
 
96
- def self.parse_html_from_file(filename)
97
- File.open(filename) do |f|
98
- Nokogiri::HTML::Document.parse(f)
105
+ def self.parse_html_from_file(filename)
106
+ File.open(filename) do |f|
107
+ Nokogiri::HTML::Document.parse(f)
108
+ end
99
109
  end
100
- end
101
110
 
102
- def self.parse_html_from_uri(uri)
103
- puts "Fetching #{uri}..."
104
- open(uri, headers) do |io|
105
- Nokogiri::HTML::Document.parse(io)
111
+ def self.parse_html_from_uri(uri)
112
+ puts "Fetching #{uri}..."
113
+ open(uri, headers) do |io|
114
+ Nokogiri::HTML::Document.parse(io)
115
+ end
106
116
  end
107
- end
108
117
 
109
- def self.parse_html_from_file_or_uri(uri, file, options={})
110
- if options[:force_fetch] || !File.exist?(file)
111
- document = parse_html_from_uri(uri)
112
- File.open(file, "w") { |f| f.write(document) }
118
+ def self.parse_html_from_file_or_uri(uri, file, options={})
119
+ if options[:force_fetch] || !File.exist?(file)
120
+ document = parse_html_from_uri(uri)
121
+ File.open(file, "w") { |f| f.write(document) }
122
+ end
123
+ parse_html_from_file(file) # Why always parse the file? See Note 001, below.
113
124
  end
114
- parse_html_from_file(file) # Why always parse the file? See Note 001, below.
115
- end
116
125
 
117
- # == XML
126
+ # == XML
118
127
 
119
- def self.parse_xml_from_file(filename)
120
- File.open(filename) do |f|
121
- Nokogiri::XML::Document.parse(f)
128
+ def self.parse_xml_from_file(filename)
129
+ File.open(filename) do |f|
130
+ Nokogiri::XML::Document.parse(f)
131
+ end
122
132
  end
123
- end
124
133
 
125
- def self.parse_xml_from_uri(uri)
126
- puts "Fetching #{uri}..."
127
- Nokogiri::XML(open(uri))
128
- end
134
+ def self.parse_xml_from_uri(uri)
135
+ puts "Fetching #{uri}..."
136
+ Nokogiri::XML(open(uri))
137
+ end
129
138
 
130
- def self.parse_xml_from_file_or_uri(uri, file, options={})
131
- if options[:force_fetch] || !File.exist?(file)
132
- document = parse_xml_from_uri(uri)
133
- File.open(file, "w") { |f| f.write(document) }
134
- end
135
- parse_xml_from_file(file) # Why always parse the file? See Note 001, below.
136
- end
139
+ def self.parse_xml_from_file_or_uri(uri, file, options={})
140
+ if options[:force_fetch] || !File.exist?(file)
141
+ document = parse_xml_from_uri(uri)
142
+ File.open(file, "w") { |f| f.write(document) }
143
+ end
144
+ parse_xml_from_file(file) # Why always parse the file? See Note 001, below.
145
+ end
137
146
 
138
- # == YAML
147
+ # == YAML
139
148
 
140
- # To load YAML use: YAML::load_file(filename)
149
+ # To load YAML use: YAML::load_file(filename)
141
150
 
142
- def self.write_yaml(filename, contents)
143
- File.open(filename, "w") do |f|
144
- YAML::dump(contents, f)
151
+ def self.write_yaml(filename, contents)
152
+ File.open(filename, "w") do |f|
153
+ YAML::dump(contents, f)
154
+ end
145
155
  end
156
+
146
157
  end
147
158
 
148
159
  end
149
-
150
160
  end
151
161
 
152
162
  # == Note 001 ==
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 13
9
- version: 0.1.13
8
+ - 14
9
+ version: 0.1.14
10
10
  platform: ruby
11
11
  authors:
12
12
  - David James
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-04-30 00:00:00 -04:00
17
+ date: 2010-05-04 00:00:00 -04:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency