datacatalog-importer 0.1.13 → 0.1.14
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/datacatalog-importer.gemspec +2 -2
- data/lib/importer.rb +0 -2
- data/lib/puller.rb +1 -1
- data/lib/pusher.rb +1 -1
- data/lib/shared.rb +0 -2
- data/lib/sort_yaml_hash.rb +0 -4
- data/lib/tasks.rb +2 -2
- data/lib/utility.rb +118 -108
- metadata +3 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.13
|
1
|
+
0.1.14
|
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{datacatalog-importer}
|
8
|
-
s.version = "0.1.13"
|
8
|
+
s.version = "0.1.14"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["David James"]
|
12
|
-
s.date = %q{2010-04
|
12
|
+
s.date = %q{2010-05-04}
|
13
13
|
s.description = %q{This framework makes it easier to write importers for the National Data Catalog.}
|
14
14
|
s.email = %q{djames@sunlightfoundation.com}
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/importer.rb
CHANGED
data/lib/puller.rb
CHANGED
data/lib/pusher.rb
CHANGED
data/lib/shared.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
module DataCatalog
|
2
2
|
module ImporterFramework
|
3
3
|
module Shared
|
4
|
-
|
5
4
|
def folder(resource)
|
6
5
|
unless @options
|
7
6
|
raise Error, "@options is undefined"
|
@@ -11,7 +10,6 @@ module DataCatalog
|
|
11
10
|
end
|
12
11
|
File.join(@options[:cache_folder], resource.to_s)
|
13
12
|
end
|
14
|
-
|
15
13
|
end
|
16
14
|
end
|
17
15
|
end
|
data/lib/sort_yaml_hash.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
require 'yaml'
|
2
2
|
|
3
3
|
class Hash
|
4
|
-
|
5
4
|
def to_yaml(opts = {})
|
6
5
|
YAML::quick_emit(object_id, opts) do |out|
|
7
6
|
out.map(taguri, to_yaml_style) do |map|
|
@@ -10,13 +9,10 @@ class Hash
|
|
10
9
|
end
|
11
10
|
end
|
12
11
|
end
|
13
|
-
|
14
12
|
end
|
15
13
|
|
16
14
|
class Symbol
|
17
|
-
|
18
15
|
def <=>(other)
|
19
16
|
self.to_s <=> other.to_s
|
20
17
|
end
|
21
|
-
|
22
18
|
end
|
data/lib/tasks.rb
CHANGED
@@ -12,7 +12,7 @@ module DataCatalog
|
|
12
12
|
desc "Pull data from the #{options[:name]}"
|
13
13
|
task :pull do
|
14
14
|
puts "Pulling data from the #{options[:name]}..."
|
15
|
-
puller =
|
15
|
+
puller = Puller.new({
|
16
16
|
:cache_folder => options[:cache_folder],
|
17
17
|
:pullers => options[:pullers],
|
18
18
|
})
|
@@ -22,7 +22,7 @@ module DataCatalog
|
|
22
22
|
desc "Push data to the Data Catalog API"
|
23
23
|
task :push do
|
24
24
|
desc "Pushing data to the Data Catalog API..."
|
25
|
-
pusher =
|
25
|
+
pusher = Pusher.new({
|
26
26
|
:api_key => options[:api_key],
|
27
27
|
:base_uri => options[:base_uri],
|
28
28
|
:cache_folder => options[:cache_folder],
|
data/lib/utility.rb
CHANGED
@@ -3,150 +3,160 @@ require 'nokogiri'
|
|
3
3
|
require 'open-uri'
|
4
4
|
|
5
5
|
module DataCatalog
|
6
|
-
|
7
|
-
|
6
|
+
module ImporterFramework
|
7
|
+
class Utility
|
8
8
|
|
9
|
-
|
9
|
+
# == URLs ==
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
def self.absolute_url(page_url, url)
|
12
|
+
Utility.plain_string(URI.parse(page_url).merge(url).to_s)
|
13
|
+
end
|
14
14
|
|
15
|
-
|
15
|
+
# == Cleaning ==
|
16
16
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
17
|
+
def self.single_line_clean(s)
|
18
|
+
plain_string(
|
19
|
+
s.gsub(/[\r\n\t]/, " ").gsub(/[\x80-\xFF]/, "").squeeze(" ").strip)
|
20
|
+
end
|
21
21
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
22
|
+
def self.multi_line_clean(s)
|
23
|
+
plain_string(
|
24
|
+
s.gsub(/[\x80-\xFF]/, "").squeeze(" ").strip)
|
25
|
+
end
|
26
26
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
27
|
+
# ActiveSupport 2.3.5 adds @_rails_html_safe aggressively.
|
28
|
+
# This method removes it so you can output clean YAML.
|
29
|
+
def self.plain_string(s)
|
30
|
+
if s.instance_variable_defined?(:@_rails_html_safe)
|
31
|
+
s.send(:remove_instance_variable, :@_rails_html_safe)
|
32
|
+
end
|
33
|
+
s
|
32
34
|
end
|
33
|
-
s
|
34
|
-
end
|
35
35
|
|
36
|
-
|
36
|
+
# == API ===
|
37
37
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
38
|
+
def self.setup_api(api_key, base_uri)
|
39
|
+
DataCatalog.api_key = api_key
|
40
|
+
DataCatalog.base_uri = base_uri
|
41
|
+
end
|
42
42
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
43
|
+
def self.headers
|
44
|
+
{
|
45
|
+
"UserAgent" => "National Data Catalog Importer/0.1.6",
|
46
|
+
}
|
47
|
+
end
|
48
48
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
49
|
+
# == Various ==
|
50
|
+
|
51
|
+
def self.fetch(uri, max_attempts=3)
|
52
|
+
attempts = 0
|
53
|
+
loop do
|
54
|
+
begin
|
55
|
+
puts "Fetching #{uri}..."
|
56
|
+
io = open(uri, headers)
|
57
|
+
return io.read
|
58
|
+
rescue SocketError
|
59
|
+
attempts += 1
|
60
|
+
puts " Attempt ##{attempts} failed."
|
61
|
+
break if attempts >= max_attempts
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
56
65
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
+
def self.report_timing(label)
|
67
|
+
puts "Starting: [#{label}]"
|
68
|
+
t0 = Time.now
|
69
|
+
result = yield
|
70
|
+
t1 = Time.now
|
71
|
+
diff = t1 - t0
|
72
|
+
puts "Elapsed time [#{label}] %.2f s" % diff
|
73
|
+
result
|
74
|
+
end
|
66
75
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
76
|
+
# == CSV ==
|
77
|
+
|
78
|
+
# { :headers => true } is a common option
|
79
|
+
def self.parse_csv_from_file(filename, options={})
|
80
|
+
extra_header_rows = options.delete(:extra_header_rows) || 0
|
81
|
+
File.open(filename) do |f|
|
82
|
+
extra_header_rows.times { f.gets } # ignore these rows
|
83
|
+
FasterCSV.parse(f, options)
|
84
|
+
end
|
75
85
|
end
|
76
|
-
end
|
77
86
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
87
|
+
def self.parse_csv_from_uri(uri, options={})
|
88
|
+
puts "Fetching #{uri}..."
|
89
|
+
data = open(uri, headers)
|
90
|
+
puts data.inspect
|
91
|
+
FasterCSV.parse(data, options)
|
92
|
+
end
|
84
93
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
94
|
+
def self.parse_csv_from_file_or_uri(uri, file, options={})
|
95
|
+
force_fetch = options.delete(:force_fetch) || false
|
96
|
+
if force_fetch || !File.exist?(file)
|
97
|
+
document = fetch(uri)
|
98
|
+
File.open(file, "w") { |f| f.write(document) }
|
99
|
+
end
|
100
|
+
parse_csv_from_file(file, options)
|
90
101
|
end
|
91
|
-
parse_csv_from_file(file, options)
|
92
|
-
end
|
93
102
|
|
94
|
-
|
103
|
+
# == HTML ==
|
95
104
|
|
96
|
-
|
97
|
-
|
98
|
-
|
105
|
+
def self.parse_html_from_file(filename)
|
106
|
+
File.open(filename) do |f|
|
107
|
+
Nokogiri::HTML::Document.parse(f)
|
108
|
+
end
|
99
109
|
end
|
100
|
-
end
|
101
110
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
111
|
+
def self.parse_html_from_uri(uri)
|
112
|
+
puts "Fetching #{uri}..."
|
113
|
+
open(uri, headers) do |io|
|
114
|
+
Nokogiri::HTML::Document.parse(io)
|
115
|
+
end
|
106
116
|
end
|
107
|
-
end
|
108
117
|
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
118
|
+
def self.parse_html_from_file_or_uri(uri, file, options={})
|
119
|
+
if options[:force_fetch] || !File.exist?(file)
|
120
|
+
document = parse_html_from_uri(uri)
|
121
|
+
File.open(file, "w") { |f| f.write(document) }
|
122
|
+
end
|
123
|
+
parse_html_from_file(file) # Why always parse the file? See Note 001, below.
|
113
124
|
end
|
114
|
-
parse_html_from_file(file) # Why always parse the file? See Note 001, below.
|
115
|
-
end
|
116
125
|
|
117
|
-
|
126
|
+
# == XML
|
118
127
|
|
119
|
-
|
120
|
-
|
121
|
-
|
128
|
+
def self.parse_xml_from_file(filename)
|
129
|
+
File.open(filename) do |f|
|
130
|
+
Nokogiri::XML::Document.parse(f)
|
131
|
+
end
|
122
132
|
end
|
123
|
-
end
|
124
133
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
134
|
+
def self.parse_xml_from_uri(uri)
|
135
|
+
puts "Fetching #{uri}..."
|
136
|
+
Nokogiri::XML(open(uri))
|
137
|
+
end
|
129
138
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
139
|
+
def self.parse_xml_from_file_or_uri(uri, file, options={})
|
140
|
+
if options[:force_fetch] || !File.exist?(file)
|
141
|
+
document = parse_xml_from_uri(uri)
|
142
|
+
File.open(file, "w") { |f| f.write(document) }
|
143
|
+
end
|
144
|
+
parse_xml_from_file(file) # Why always parse the file? See Note 001, below.
|
145
|
+
end
|
137
146
|
|
138
|
-
|
147
|
+
# == YAML
|
139
148
|
|
140
|
-
|
149
|
+
# To load YAML use: YAML::load_file(filename)
|
141
150
|
|
142
|
-
|
143
|
-
|
144
|
-
|
151
|
+
def self.write_yaml(filename, contents)
|
152
|
+
File.open(filename, "w") do |f|
|
153
|
+
YAML::dump(contents, f)
|
154
|
+
end
|
145
155
|
end
|
156
|
+
|
146
157
|
end
|
147
158
|
|
148
159
|
end
|
149
|
-
|
150
160
|
end
|
151
161
|
|
152
162
|
# == Note 001 ==
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 13
|
9
|
-
version: 0.1.13
|
8
|
+
- 14
|
9
|
+
version: 0.1.14
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- David James
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-04
|
17
|
+
date: 2010-05-04 00:00:00 -04:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|