datacatalog-importer 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +6 -6
- data/VERSION +1 -1
- data/datacatalog-importer.gemspec +6 -4
- data/lib/utility.rb +35 -10
- data/spec/test.json +1 -0
- data/spec/test.xml +4 -0
- data/spec/utility_spec.rb +65 -13
- metadata +9 -7
data/Rakefile
CHANGED
@@ -12,7 +12,7 @@ begin
     gem.authors = ["David James"]
     gem.add_dependency "nokogiri", ">= 1.4.2"
     gem.add_dependency "datacatalog", ">= 0.4.15"
-    gem.add_development_dependency "rspec", ">= 1.
+    gem.add_development_dependency "rspec", ">= 1.3.0"
     # gem is a Gem::Specification...
     # see http://www.rubygems.org/read/chapter/20 for additional settings
   end
@@ -31,11 +31,11 @@ end
 # rdoc.rdoc_files.include('lib/**/*.rb')
 # end
 #
-
-
-
-
-
+require 'spec/rake/spectask'
+Spec::Rake::SpecTask.new(:spec) do |spec|
+  spec.libs << 'lib' << 'spec'
+  spec.spec_files = FileList['spec/**/*_spec.rb']
+end
 #
 # Spec::Rake::SpecTask.new(:rcov) do |spec|
 #   spec.libs << 'lib' << 'spec'
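With the RSpec task now defined in the Rakefile rather than left commented out, the suite can be run with `rake spec`, which picks up every file matching spec/**/*_spec.rb. As a small illustrative aside (not part of this diff), jeweler-style Rakefiles commonly also point the default task at it:

    # Hypothetical addition, shown only to illustrate how the new :spec task is typically wired.
    task :default => :spec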
data/VERSION
CHANGED
@@ -1 +1 @@
-0.2.1
+0.2.2
data/datacatalog-importer.gemspec
CHANGED
@@ -5,7 +5,7 @@
 
 Gem::Specification.new do |s|
   s.name = %q{datacatalog-importer}
-  s.version = "0.2.1"
+  s.version = "0.2.2"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["David James"]
@@ -40,6 +40,8 @@ Gem::Specification.new do |s|
     "natdat_is_hungry.md",
     "spec/spec.opts",
     "spec/spec_helper.rb",
+    "spec/test.json",
+    "spec/test.xml",
     "spec/utility_spec.rb"
   ]
   s.homepage = %q{http://github.com/sunlightlabs/datacatalog-importer}
@@ -59,16 +61,16 @@ Gem::Specification.new do |s|
     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
       s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.2"])
       s.add_runtime_dependency(%q<datacatalog>, [">= 0.4.15"])
-      s.add_development_dependency(%q<rspec>, [">= 1.
+      s.add_development_dependency(%q<rspec>, [">= 1.3.0"])
     else
       s.add_dependency(%q<nokogiri>, [">= 1.4.2"])
       s.add_dependency(%q<datacatalog>, [">= 0.4.15"])
-      s.add_dependency(%q<rspec>, [">= 1.
+      s.add_dependency(%q<rspec>, [">= 1.3.0"])
     end
   else
     s.add_dependency(%q<nokogiri>, [">= 1.4.2"])
     s.add_dependency(%q<datacatalog>, [">= 0.4.15"])
-    s.add_dependency(%q<rspec>, [">= 1.
+    s.add_dependency(%q<rspec>, [">= 1.3.0"])
   end
 end
 
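The development dependency on rspec is now pinned to ">= 1.3.0" in all three branches of the generated gemspec. For illustration only (standard RubyGems API, not code from this package), such a constraint is evaluated like this:

    require 'rubygems'

    requirement = Gem::Requirement.new(">= 1.3.0")
    requirement.satisfied_by?(Gem::Version.new("1.3.2"))  # => true
    requirement.satisfied_by?(Gem::Version.new("1.2.9"))  # => false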
data/lib/utility.rb
CHANGED
@@ -1,4 +1,5 @@
 require 'fastercsv'
+require 'json'
 require 'nokogiri'
 require 'open-uri'
 
@@ -11,7 +12,7 @@ module DataCatalog
     def self.absolute_url(base_url, url)
       plain_string(URI.parse(base_url).merge(url).to_s)
     end
-
+
     def self.normalize_url(url)
       uri = URI.parse(url).normalize
       unless uri.scheme
@@ -56,7 +57,7 @@ module DataCatalog
 
     def self.headers
       {
-        "UserAgent" => "National Data Catalog Importer/0.2.1",
+        "UserAgent" => "National Data Catalog Importer/0.2.2",
       }
     end
 
@@ -107,14 +108,14 @@ module DataCatalog
     end
 
     def self.parse_csv_from_uri(uri, options={})
-      data = fetch(uri)
+      data = fetch(uri, options)
       FasterCSV.parse(data, options)
     end
 
     def self.parse_csv_from_file_or_uri(uri, file, options={})
       force_fetch = options.delete(:force_fetch) || false
       if force_fetch || !File.exist?(file)
-        document = fetch(uri)
+        document = fetch(uri, options)
         File.open(file, "w") { |f| f.write(document) }
       end
       parse_csv_from_file(file, options)
@@ -128,8 +129,8 @@ module DataCatalog
       end
     end
 
-    def self.parse_html_from_uri(uri)
-      data = fetch(uri)
+    def self.parse_html_from_uri(uri, options={})
+      data = fetch(uri, options)
       Nokogiri::HTML::Document.parse(data)
     end
 
@@ -138,7 +139,30 @@ module DataCatalog
         document = parse_html_from_uri(uri)
         File.open(file, "w") { |f| f.write(document) }
       end
-
+      # Why always parse the file? See Note 001, below.
+      parse_html_from_file(file)
+    end
+
+    # == JSON
+
+    def self.parse_json_from_file(filename)
+      File.open(filename) do |f|
+        JSON.parse(f.read)
+      end
+    end
+
+    def self.parse_json_from_uri(uri, options={})
+      data = fetch(uri, options)
+      JSON.parse(data)
+    end
+
+    def self.parse_json_from_file_or_uri(uri, file, options={})
+      if options[:force_fetch] || !File.exist?(file)
+        document = parse_json_from_uri(uri)
+        File.open(file, "w") { |f| f.write(document) }
+      end
+      # Why always parse the file? See Note 001, below.
+      parse_json_from_file(file)
     end
 
     # == XML
@@ -149,8 +173,8 @@ module DataCatalog
       end
     end
 
-    def self.parse_xml_from_uri(uri)
-      data = fetch(uri)
+    def self.parse_xml_from_uri(uri, options={})
+      data = fetch(uri, options)
       Nokogiri::XML::Document.parse(data)
     end
 
@@ -159,7 +183,8 @@ module DataCatalog
         document = parse_xml_from_uri(uri)
         File.open(file, "w") { |f| f.write(document) }
       end
-
+      # Why always parse the file? See Note 001, below.
+      parse_xml_from_file(file)
     end
 
     # == YAML
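The new JSON helpers follow the pattern of the existing HTML and XML ones: parse_json_from_uri fetches and parses (forwarding options such as :quiet to fetch), while parse_json_from_file_or_uri writes the response to a cache file and then always parses that file. A minimal usage sketch in Ruby; the U shorthand mirrors the specs, and the URL and cache path are placeholders, not values from the package:

    # U is assumed to be an alias for the utility module shown above, as in the specs.
    fresh = U.parse_json_from_uri("http://example.org/stations.json", :quiet => true)
    fresh["stations"]   # => the parsed array from the response body

    # Cached variant: fetches only when the file is missing (or :force_fetch is set),
    # writes the response to disk, then parses the cached file on every call.
    cached = U.parse_json_from_file_or_uri(
      "http://example.org/stations.json",   # hypothetical remote source
      "cache/stations.json",                # hypothetical local cache file
      :force_fetch => false
    )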
data/spec/test.json
ADDED
@@ -0,0 +1 @@
+{ "stations" : ["Metro Center","Dupont Circle"] }
data/spec/test.xml
ADDED
data/spec/utility_spec.rb
CHANGED
@@ -20,14 +20,14 @@ describe "Utility" do
         "http://sunlightlabs.com/"
     end
   end
-
+
   describe "absolute_url" do
     it "should work" do
       U.absolute_url("http://sunlightlabs.com", "/contact").should ==
         "http://sunlightlabs.com/contact"
     end
   end
-
+
   describe "single_line_clean" do
     it "should clean up leading and trailing whitespace" do
       U.single_line_clean("\t \ttext\t\t ").should == "text"
@@ -41,19 +41,18 @@ describe "Utility" do
       U.single_line_clean("sunlight\nlabs").should == "sunlight labs"
     end
   end
-
+
   describe "multi_line_clean" do
     it "should remove leading and trailing newlines" do
       input = "\nline 1\nline 2\nline 3\n"
       U.multi_line_clean(input).should == "line 1\nline 2\nline 3"
     end
   end
-
+
   describe "fetch" do
     before do
       @readable = Object.new
       @readable.stub(:read).and_return("result")
-
      @sleep_count = 0
      U.stub(:sleep).and_return {
        @sleep_count += 1
@@ -64,7 +63,7 @@ describe "Utility" do
      U.stub(:open).and_return(@readable)
      U.fetch("fake", :quiet => true).should == "result"
    end
-
+
    it "bad fetches below retry limit are ok" do
      @count = 0
      U.stub(:open).and_return {
@@ -75,8 +74,8 @@ describe "Utility" do
          @readable
        end
      }
-      U.fetch("fake", :max_attempts => 3, :quiet => true)
-      @sleep_count.should == 2
+      result = U.fetch("fake", :max_attempts => 3, :quiet => true)
+      result.should == "result" && @sleep_count.should == 2
    end
 
    it "bad fetches above retry limit give nil" do
@@ -89,10 +88,9 @@ describe "Utility" do
          @readable
        end
      }
-      U.fetch("fake", :max_attempts => 2, :quiet => true)
-      @sleep_count.should == 1
+      result = U.fetch("fake", :max_attempts => 2, :quiet => true)
+      result.should == nil && @sleep_count.should == 1
    end
-
  end
 
  describe "standardize_name" do
@@ -100,11 +98,65 @@ describe "Utility" do
      U.standardize_name("City Administrator, Office of").should ==
        "Office of City Administrator"
    end
-
+
    it "two commas" do
      U.standardize_name("Children, Youth & Families, Department of").should ==
        "Department of Children, Youth & Families"
    end
  end
-
+
+  # == XML
+
+  describe "parse_xml_from_file" do
+    it "should work" do
+      file = File.dirname(__FILE__) + '/test.xml'
+      parsed = U.parse_xml_from_file(file)
+      result = parsed.xpath('.//stations/station').map(&:content)
+      result.should == ["Metro Center", "Dupont Circle"]
+    end
+  end
+
+  describe "parse_xml_from_uri" do
+    before do
+      @readable = Object.new
+      @readable.stub(:read).and_return(%(
+        <stations>
+          <station>Metro Center</station>
+          <station>Dupont Circle</station>
+        </stations>
+      ))
+    end
+
+    it "should work" do
+      U.stub(:open).and_return(@readable)
+      parsed = U.parse_xml_from_uri("fake", :quiet => true)
+      result = parsed.xpath('.//stations/station').map(&:content)
+      result.should == ["Metro Center", "Dupont Circle"]
+    end
+  end
+
+  # == JSON
+
+  describe "parse_json_from_file" do
+    it "should work" do
+      file = File.dirname(__FILE__) + '/test.json'
+      U.parse_json_from_file(file).should ==
+        { "stations" => ["Metro Center","Dupont Circle"] }
+    end
+  end
+
+  describe "parse_json_from_uri" do
+    before do
+      @readable = Object.new
+      @readable.stub(:read).and_return(
+        %({"stations":["Metro Center","Dupont Circle"]}))
+    end
+
+    it "should work" do
+      U.stub(:open).and_return(@readable)
+      result = U.parse_json_from_uri("fake", :quiet => true)
+      result.should == { "stations" => ["Metro Center","Dupont Circle"] }
+    end
+  end
+
 end
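The reworked fetch specs now assert on the return value as well as the sleep count, which pins down the retry contract: open the URI, return the body on success, sleep and retry while attempts remain, and return nil once :max_attempts failures have occurred. A behaviourally equivalent sketch of that contract follows; the gem's actual fetch is unchanged by this release and not shown in the diff, so the default attempt count, rescue class, delay, logging, and use of the headers hash are assumptions:

    # Sketch only: consistent with the specs above, not the package's real implementation.
    def self.fetch(uri, options = {})
      max_attempts = options[:max_attempts] || 3   # assumed default
      attempts = 0
      begin
        open(uri, headers).read                    # headers is the UserAgent hash defined in utility.rb
      rescue StandardError => e
        attempts += 1
        puts "fetch failed (#{e.class}); retrying #{uri}" unless options[:quiet]
        if attempts < max_attempts
          sleep(5)                                 # the specs stub sleep and count these calls
          retry
        end
        nil                                        # give up once the attempt limit is reached
      end
    end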
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: datacatalog-importer
 version: !ruby/object:Gem::Version
-  hash:
+  hash: 19
   prerelease: false
   segments:
   - 0
   - 2
-  - 1
-  version: 0.2.1
+  - 2
+  version: 0.2.2
 platform: ruby
 authors:
 - David James
@@ -58,12 +58,12 @@ dependencies:
     requirements:
     - - ">="
      - !ruby/object:Gem::Version
-        hash:
+        hash: 27
        segments:
        - 1
-        -
-        -
-        version: 1.
+        - 3
+        - 0
+        version: 1.3.0
  type: :development
  version_requirements: *id003
 description: This framework makes it easier to write importers for the National Data Catalog.
@@ -99,6 +99,8 @@ files:
 - natdat_is_hungry.md
 - spec/spec.opts
 - spec/spec_helper.rb
+- spec/test.json
+- spec/test.xml
 - spec/utility_spec.rb
 has_rdoc: true
 homepage: http://github.com/sunlightlabs/datacatalog-importer