datacatalog-importer 0.1.13 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/datacatalog-importer.gemspec +2 -2
- data/lib/importer.rb +0 -2
- data/lib/puller.rb +1 -1
- data/lib/pusher.rb +1 -1
- data/lib/shared.rb +0 -2
- data/lib/sort_yaml_hash.rb +0 -4
- data/lib/tasks.rb +2 -2
- data/lib/utility.rb +118 -108
- metadata +3 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.13
|
1
|
+
0.1.14
|
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{datacatalog-importer}
|
8
|
-
s.version = "0.1.13"
|
8
|
+
s.version = "0.1.14"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["David James"]
|
12
|
-
s.date = %q{2010-04
|
12
|
+
s.date = %q{2010-05-04}
|
13
13
|
s.description = %q{This framework makes it easier to write importers for the National Data Catalog.}
|
14
14
|
s.email = %q{djames@sunlightfoundation.com}
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/importer.rb
CHANGED
data/lib/puller.rb
CHANGED
data/lib/pusher.rb
CHANGED
data/lib/shared.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
module DataCatalog
|
2
2
|
module ImporterFramework
|
3
3
|
module Shared
|
4
|
-
|
5
4
|
def folder(resource)
|
6
5
|
unless @options
|
7
6
|
raise Error, "@options is undefined"
|
@@ -11,7 +10,6 @@ module DataCatalog
|
|
11
10
|
end
|
12
11
|
File.join(@options[:cache_folder], resource.to_s)
|
13
12
|
end
|
14
|
-
|
15
13
|
end
|
16
14
|
end
|
17
15
|
end
|
data/lib/sort_yaml_hash.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
require 'yaml'
|
2
2
|
|
3
3
|
class Hash
|
4
|
-
|
5
4
|
def to_yaml(opts = {})
|
6
5
|
YAML::quick_emit(object_id, opts) do |out|
|
7
6
|
out.map(taguri, to_yaml_style) do |map|
|
@@ -10,13 +9,10 @@ class Hash
|
|
10
9
|
end
|
11
10
|
end
|
12
11
|
end
|
13
|
-
|
14
12
|
end
|
15
13
|
|
16
14
|
class Symbol
|
17
|
-
|
18
15
|
def <=>(other)
|
19
16
|
self.to_s <=> other.to_s
|
20
17
|
end
|
21
|
-
|
22
18
|
end
|
data/lib/tasks.rb
CHANGED
@@ -12,7 +12,7 @@ module DataCatalog
|
|
12
12
|
desc "Pull data from the #{options[:name]}"
|
13
13
|
task :pull do
|
14
14
|
puts "Pulling data from the #{options[:name]}..."
|
15
|
-
puller =
|
15
|
+
puller = Puller.new({
|
16
16
|
:cache_folder => options[:cache_folder],
|
17
17
|
:pullers => options[:pullers],
|
18
18
|
})
|
@@ -22,7 +22,7 @@ module DataCatalog
|
|
22
22
|
desc "Push data to the Data Catalog API"
|
23
23
|
task :push do
|
24
24
|
desc "Pushing data to the Data Catalog API..."
|
25
|
-
pusher =
|
25
|
+
pusher = Pusher.new({
|
26
26
|
:api_key => options[:api_key],
|
27
27
|
:base_uri => options[:base_uri],
|
28
28
|
:cache_folder => options[:cache_folder],
|
data/lib/utility.rb
CHANGED
@@ -3,150 +3,160 @@ require 'nokogiri'
|
|
3
3
|
require 'open-uri'
|
4
4
|
|
5
5
|
module DataCatalog
|
6
|
-
|
7
|
-
|
6
|
+
module ImporterFramework
|
7
|
+
class Utility
|
8
8
|
|
9
|
-
|
9
|
+
# == URLs ==
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
def self.absolute_url(page_url, url)
|
12
|
+
Utility.plain_string(URI.parse(page_url).merge(url).to_s)
|
13
|
+
end
|
14
14
|
|
15
|
-
|
15
|
+
# == Cleaning ==
|
16
16
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
17
|
+
def self.single_line_clean(s)
|
18
|
+
plain_string(
|
19
|
+
s.gsub(/[\r\n\t]/, " ").gsub(/[\x80-\xFF]/, "").squeeze(" ").strip)
|
20
|
+
end
|
21
21
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
22
|
+
def self.multi_line_clean(s)
|
23
|
+
plain_string(
|
24
|
+
s.gsub(/[\x80-\xFF]/, "").squeeze(" ").strip)
|
25
|
+
end
|
26
26
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
27
|
+
# ActiveSupport 2.3.5 adds @_rails_html_safe aggressively.
|
28
|
+
# This method removes it so you can output clean YAML.
|
29
|
+
def self.plain_string(s)
|
30
|
+
if s.instance_variable_defined?(:@_rails_html_safe)
|
31
|
+
s.send(:remove_instance_variable, :@_rails_html_safe)
|
32
|
+
end
|
33
|
+
s
|
32
34
|
end
|
33
|
-
s
|
34
|
-
end
|
35
35
|
|
36
|
-
|
36
|
+
# == API ===
|
37
37
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
38
|
+
def self.setup_api(api_key, base_uri)
|
39
|
+
DataCatalog.api_key = api_key
|
40
|
+
DataCatalog.base_uri = base_uri
|
41
|
+
end
|
42
42
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
43
|
+
def self.headers
|
44
|
+
{
|
45
|
+
"UserAgent" => "National Data Catalog Importer/0.1.6",
|
46
|
+
}
|
47
|
+
end
|
48
48
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
49
|
+
# == Various ==
|
50
|
+
|
51
|
+
def self.fetch(uri, max_attempts=3)
|
52
|
+
attempts = 0
|
53
|
+
loop do
|
54
|
+
begin
|
55
|
+
puts "Fetching #{uri}..."
|
56
|
+
io = open(uri, headers)
|
57
|
+
return io.read
|
58
|
+
rescue SocketError
|
59
|
+
attempts += 1
|
60
|
+
puts " Attempt ##{attempts} failed."
|
61
|
+
break if attempts >= max_attempts
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
56
65
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
+
def self.report_timing(label)
|
67
|
+
puts "Starting: [#{label}]"
|
68
|
+
t0 = Time.now
|
69
|
+
result = yield
|
70
|
+
t1 = Time.now
|
71
|
+
diff = t1 - t0
|
72
|
+
puts "Elapsed time [#{label}] %.2f s" % diff
|
73
|
+
result
|
74
|
+
end
|
66
75
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
76
|
+
# == CSV ==
|
77
|
+
|
78
|
+
# { :headers => true } is a common option
|
79
|
+
def self.parse_csv_from_file(filename, options={})
|
80
|
+
extra_header_rows = options.delete(:extra_header_rows) || 0
|
81
|
+
File.open(filename) do |f|
|
82
|
+
extra_header_rows.times { f.gets } # ignore these rows
|
83
|
+
FasterCSV.parse(f, options)
|
84
|
+
end
|
75
85
|
end
|
76
|
-
end
|
77
86
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
87
|
+
def self.parse_csv_from_uri(uri, options={})
|
88
|
+
puts "Fetching #{uri}..."
|
89
|
+
data = open(uri, headers)
|
90
|
+
puts data.inspect
|
91
|
+
FasterCSV.parse(data, options)
|
92
|
+
end
|
84
93
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
94
|
+
def self.parse_csv_from_file_or_uri(uri, file, options={})
|
95
|
+
force_fetch = options.delete(:force_fetch) || false
|
96
|
+
if force_fetch || !File.exist?(file)
|
97
|
+
document = fetch(uri)
|
98
|
+
File.open(file, "w") { |f| f.write(document) }
|
99
|
+
end
|
100
|
+
parse_csv_from_file(file, options)
|
90
101
|
end
|
91
|
-
parse_csv_from_file(file, options)
|
92
|
-
end
|
93
102
|
|
94
|
-
|
103
|
+
# == HTML ==
|
95
104
|
|
96
|
-
|
97
|
-
|
98
|
-
|
105
|
+
def self.parse_html_from_file(filename)
|
106
|
+
File.open(filename) do |f|
|
107
|
+
Nokogiri::HTML::Document.parse(f)
|
108
|
+
end
|
99
109
|
end
|
100
|
-
end
|
101
110
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
111
|
+
def self.parse_html_from_uri(uri)
|
112
|
+
puts "Fetching #{uri}..."
|
113
|
+
open(uri, headers) do |io|
|
114
|
+
Nokogiri::HTML::Document.parse(io)
|
115
|
+
end
|
106
116
|
end
|
107
|
-
end
|
108
117
|
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
118
|
+
def self.parse_html_from_file_or_uri(uri, file, options={})
|
119
|
+
if options[:force_fetch] || !File.exist?(file)
|
120
|
+
document = parse_html_from_uri(uri)
|
121
|
+
File.open(file, "w") { |f| f.write(document) }
|
122
|
+
end
|
123
|
+
parse_html_from_file(file) # Why always parse the file? See Note 001, below.
|
113
124
|
end
|
114
|
-
parse_html_from_file(file) # Why always parse the file? See Note 001, below.
|
115
|
-
end
|
116
125
|
|
117
|
-
|
126
|
+
# == XML
|
118
127
|
|
119
|
-
|
120
|
-
|
121
|
-
|
128
|
+
def self.parse_xml_from_file(filename)
|
129
|
+
File.open(filename) do |f|
|
130
|
+
Nokogiri::XML::Document.parse(f)
|
131
|
+
end
|
122
132
|
end
|
123
|
-
end
|
124
133
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
134
|
+
def self.parse_xml_from_uri(uri)
|
135
|
+
puts "Fetching #{uri}..."
|
136
|
+
Nokogiri::XML(open(uri))
|
137
|
+
end
|
129
138
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
139
|
+
def self.parse_xml_from_file_or_uri(uri, file, options={})
|
140
|
+
if options[:force_fetch] || !File.exist?(file)
|
141
|
+
document = parse_xml_from_uri(uri)
|
142
|
+
File.open(file, "w") { |f| f.write(document) }
|
143
|
+
end
|
144
|
+
parse_xml_from_file(file) # Why always parse the file? See Note 001, below.
|
145
|
+
end
|
137
146
|
|
138
|
-
|
147
|
+
# == YAML
|
139
148
|
|
140
|
-
|
149
|
+
# To load YAML use: YAML::load_file(filename)
|
141
150
|
|
142
|
-
|
143
|
-
|
144
|
-
|
151
|
+
def self.write_yaml(filename, contents)
|
152
|
+
File.open(filename, "w") do |f|
|
153
|
+
YAML::dump(contents, f)
|
154
|
+
end
|
145
155
|
end
|
156
|
+
|
146
157
|
end
|
147
158
|
|
148
159
|
end
|
149
|
-
|
150
160
|
end
|
151
161
|
|
152
162
|
# == Note 001 ==
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 13
|
9
|
-
version: 0.1.13
|
8
|
+
- 14
|
9
|
+
version: 0.1.14
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- David James
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-04
|
17
|
+
date: 2010-05-04 00:00:00 -04:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|