datacatalog-importer 0.1.10 → 0.1.11
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/datacatalog-importer.gemspec +2 -2
- data/lib/utility.rb +76 -39
- metadata +3 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.10
|
1
|
+
0.1.11
|
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{datacatalog-importer}
|
8
|
-
s.version = "0.1.10"
|
8
|
+
s.version = "0.1.11"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["David James"]
|
12
|
-
s.date = %q{2010-
|
12
|
+
s.date = %q{2010-04-05}
|
13
13
|
s.description = %q{This framework makes it easier to write importers for the National Data Catalog.}
|
14
14
|
s.email = %q{djames@sunlightfoundation.com}
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/utility.rb
CHANGED
@@ -6,9 +6,13 @@ module DataCatalog
|
|
6
6
|
|
7
7
|
class Utility
|
8
8
|
|
9
|
+
# == URLs ==
|
10
|
+
|
9
11
|
def self.absolute_url(page_url, url)
|
10
12
|
Utility.plain_string(URI.parse(page_url).merge(url).to_s)
|
11
13
|
end
|
14
|
+
|
15
|
+
# == Cleaning ==
|
12
16
|
|
13
17
|
def self.single_line_clean(s)
|
14
18
|
plain_string(
|
@@ -20,10 +24,20 @@ module DataCatalog
|
|
20
24
|
s.gsub(/[\x80-\xFF]/, "").squeeze(" ").strip)
|
21
25
|
end
|
22
26
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
+
# ActiveSupport 2.3.5 adds @_rails_html_safe aggressively.
|
28
|
+
# This method removes it so you can output clean YAML.
|
29
|
+
def self.plain_string(s)
|
30
|
+
if s.instance_variable_defined?(:@_rails_html_safe)
|
31
|
+
s.send(:remove_instance_variable, :@_rails_html_safe)
|
32
|
+
end
|
33
|
+
s
|
34
|
+
end
|
35
|
+
|
36
|
+
# == API ===
|
37
|
+
|
38
|
+
def self.setup_api(api_key, base_uri)
|
39
|
+
DataCatalog.api_key = api_key
|
40
|
+
DataCatalog.base_uri = base_uri
|
27
41
|
end
|
28
42
|
|
29
43
|
def self.headers
|
@@ -31,7 +45,27 @@ module DataCatalog
|
|
31
45
|
"UserAgent" => "National Data Catalog Importer/0.1.6",
|
32
46
|
}
|
33
47
|
end
|
48
|
+
|
49
|
+
# == Various ==
|
34
50
|
|
51
|
+
def self.fetch(uri)
|
52
|
+
puts "Fetching #{uri}..."
|
53
|
+
io = open(uri, headers)
|
54
|
+
io.read
|
55
|
+
end
|
56
|
+
|
57
|
+
def self.report_timing(label)
|
58
|
+
puts "Starting: [#{label}]"
|
59
|
+
t0 = Time.now
|
60
|
+
result = yield
|
61
|
+
t1 = Time.now
|
62
|
+
diff = t1 - t0
|
63
|
+
puts "Elapsed time [#{label}] %.2f s" % diff
|
64
|
+
result
|
65
|
+
end
|
66
|
+
|
67
|
+
# == CSV ==
|
68
|
+
|
35
69
|
# { :headers => true } is a common option
|
36
70
|
def self.parse_csv_from_file(filename, options={})
|
37
71
|
extra_header_rows = options.delete(:extra_header_rows) || 0
|
@@ -41,12 +75,13 @@ module DataCatalog
|
|
41
75
|
end
|
42
76
|
end
|
43
77
|
|
44
|
-
def self.
|
45
|
-
|
46
|
-
|
47
|
-
|
78
|
+
def self.parse_csv_from_uri(uri, options={})
|
79
|
+
puts "Fetching #{uri}..."
|
80
|
+
data = open(uri, headers)
|
81
|
+
puts data.inspect
|
82
|
+
FasterCSV.parse(data, options)
|
48
83
|
end
|
49
|
-
|
84
|
+
|
50
85
|
def self.parse_csv_from_file_or_uri(uri, file, options={})
|
51
86
|
force_fetch = options.delete(:force_fetch) || false
|
52
87
|
if force_fetch || !File.exist?(file)
|
@@ -56,19 +91,12 @@ module DataCatalog
|
|
56
91
|
parse_csv_from_file(file, options)
|
57
92
|
end
|
58
93
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
94
|
+
# == HTML ==
|
95
|
+
|
96
|
+
def self.parse_html_from_file(filename)
|
97
|
+
File.open(filename) do |f|
|
98
|
+
Nokogiri::HTML::Document.parse(f)
|
63
99
|
end
|
64
|
-
parse_html_from_file(file) # Why always parse the file? See Note 001, below.
|
65
|
-
end
|
66
|
-
|
67
|
-
def self.parse_csv_from_uri(uri, options={})
|
68
|
-
puts "Fetching #{uri}..."
|
69
|
-
data = open(uri, headers)
|
70
|
-
puts data.inspect
|
71
|
-
FasterCSV.parse(data, options)
|
72
100
|
end
|
73
101
|
|
74
102
|
def self.parse_html_from_uri(uri)
|
@@ -78,30 +106,39 @@ module DataCatalog
|
|
78
106
|
end
|
79
107
|
end
|
80
108
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
s.send(:remove_instance_variable, :@_rails_html_safe)
|
109
|
+
def self.parse_html_from_file_or_uri(uri, file, options={})
|
110
|
+
if options[:force_fetch] || !File.exist?(file)
|
111
|
+
document = parse_html_from_uri(uri)
|
112
|
+
File.open(file, "w") { |f| f.write(document) }
|
86
113
|
end
|
87
|
-
|
114
|
+
parse_html_from_file(file) # Why always parse the file? See Note 001, below.
|
88
115
|
end
|
116
|
+
|
117
|
+
# == XML
|
89
118
|
|
90
|
-
def self.report_timing(label)
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
t1 = Time.now
|
95
|
-
diff = t1 - t0
|
96
|
-
puts "Elapsed time [#{label}] %.2f s" % diff
|
97
|
-
result
|
119
|
+
def self.parse_xml_from_file(filename)
|
120
|
+
File.open(filename) do |f|
|
121
|
+
Nokogiri::XML::Document.parse(f)
|
122
|
+
end
|
98
123
|
end
|
99
124
|
|
100
|
-
def self.setup_api(api_key, base_uri)
|
101
|
-
|
102
|
-
|
125
|
+
def self.parse_xml_from_uri(uri)
|
126
|
+
puts "Fetching #{uri}..."
|
127
|
+
Nokogiri::XML(open(uri))
|
103
128
|
end
|
104
|
-
|
129
|
+
|
130
|
+
def self.parse_xml_from_file_or_uri(uri, file, options={})
|
131
|
+
if options[:force_fetch] || !File.exist?(file)
|
132
|
+
document = parse_xml_from_uri(uri)
|
133
|
+
File.open(file, "w") { |f| f.write(document) }
|
134
|
+
end
|
135
|
+
parse_xml_from_file(file) # Why always parse the file? See Note 001, below.
|
136
|
+
end
|
137
|
+
|
138
|
+
# == YAML
|
139
|
+
|
140
|
+
# To load YAML use: YAML::load_file(filename)
|
141
|
+
|
105
142
|
def self.write_yaml(filename, contents)
|
106
143
|
File.open(filename, "w") do |f|
|
107
144
|
YAML::dump(contents, f)
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 10
|
9
|
-
version: 0.1.10
|
8
|
+
- 11
|
9
|
+
version: 0.1.11
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- David James
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-
|
17
|
+
date: 2010-04-05 00:00:00 -04:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|