datacatalog-importer 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +6 -6
- data/VERSION +1 -1
- data/datacatalog-importer.gemspec +6 -4
- data/lib/utility.rb +35 -10
- data/spec/test.json +1 -0
- data/spec/test.xml +4 -0
- data/spec/utility_spec.rb +65 -13
- metadata +9 -7
data/Rakefile
CHANGED
@@ -12,7 +12,7 @@ begin
|
|
12
12
|
gem.authors = ["David James"]
|
13
13
|
gem.add_dependency "nokogiri", ">= 1.4.2"
|
14
14
|
gem.add_dependency "datacatalog", ">= 0.4.15"
|
15
|
-
gem.add_development_dependency "rspec", ">= 1.
|
15
|
+
gem.add_development_dependency "rspec", ">= 1.3.0"
|
16
16
|
# gem is a Gem::Specification...
|
17
17
|
# see http://www.rubygems.org/read/chapter/20 for additional settings
|
18
18
|
end
|
@@ -31,11 +31,11 @@ end
|
|
31
31
|
# rdoc.rdoc_files.include('lib/**/*.rb')
|
32
32
|
# end
|
33
33
|
#
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
34
|
+
require 'spec/rake/spectask'
|
35
|
+
Spec::Rake::SpecTask.new(:spec) do |spec|
|
36
|
+
spec.libs << 'lib' << 'spec'
|
37
|
+
spec.spec_files = FileList['spec/**/*_spec.rb']
|
38
|
+
end
|
39
39
|
#
|
40
40
|
# Spec::Rake::SpecTask.new(:rcov) do |spec|
|
41
41
|
# spec.libs << 'lib' << 'spec'
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.1
|
1
|
+
0.2.2
|
data/datacatalog-importer.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{datacatalog-importer}
|
8
|
-
s.version = "0.2.1"
|
8
|
+
s.version = "0.2.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["David James"]
|
@@ -40,6 +40,8 @@ Gem::Specification.new do |s|
|
|
40
40
|
"natdat_is_hungry.md",
|
41
41
|
"spec/spec.opts",
|
42
42
|
"spec/spec_helper.rb",
|
43
|
+
"spec/test.json",
|
44
|
+
"spec/test.xml",
|
43
45
|
"spec/utility_spec.rb"
|
44
46
|
]
|
45
47
|
s.homepage = %q{http://github.com/sunlightlabs/datacatalog-importer}
|
@@ -59,16 +61,16 @@ Gem::Specification.new do |s|
|
|
59
61
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
60
62
|
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.2"])
|
61
63
|
s.add_runtime_dependency(%q<datacatalog>, [">= 0.4.15"])
|
62
|
-
s.add_development_dependency(%q<rspec>, [">= 1.
|
64
|
+
s.add_development_dependency(%q<rspec>, [">= 1.3.0"])
|
63
65
|
else
|
64
66
|
s.add_dependency(%q<nokogiri>, [">= 1.4.2"])
|
65
67
|
s.add_dependency(%q<datacatalog>, [">= 0.4.15"])
|
66
|
-
s.add_dependency(%q<rspec>, [">= 1.
|
68
|
+
s.add_dependency(%q<rspec>, [">= 1.3.0"])
|
67
69
|
end
|
68
70
|
else
|
69
71
|
s.add_dependency(%q<nokogiri>, [">= 1.4.2"])
|
70
72
|
s.add_dependency(%q<datacatalog>, [">= 0.4.15"])
|
71
|
-
s.add_dependency(%q<rspec>, [">= 1.
|
73
|
+
s.add_dependency(%q<rspec>, [">= 1.3.0"])
|
72
74
|
end
|
73
75
|
end
|
74
76
|
|
data/lib/utility.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'fastercsv'
|
2
|
+
require 'json'
|
2
3
|
require 'nokogiri'
|
3
4
|
require 'open-uri'
|
4
5
|
|
@@ -11,7 +12,7 @@ module DataCatalog
|
|
11
12
|
def self.absolute_url(base_url, url)
|
12
13
|
plain_string(URI.parse(base_url).merge(url).to_s)
|
13
14
|
end
|
14
|
-
|
15
|
+
|
15
16
|
def self.normalize_url(url)
|
16
17
|
uri = URI.parse(url).normalize
|
17
18
|
unless uri.scheme
|
@@ -56,7 +57,7 @@ module DataCatalog
|
|
56
57
|
|
57
58
|
def self.headers
|
58
59
|
{
|
59
|
-
"UserAgent" => "National Data Catalog Importer/0.2.1",
|
60
|
+
"UserAgent" => "National Data Catalog Importer/0.2.2",
|
60
61
|
}
|
61
62
|
end
|
62
63
|
|
@@ -107,14 +108,14 @@ module DataCatalog
|
|
107
108
|
end
|
108
109
|
|
109
110
|
def self.parse_csv_from_uri(uri, options={})
|
110
|
-
data = fetch(uri)
|
111
|
+
data = fetch(uri, options)
|
111
112
|
FasterCSV.parse(data, options)
|
112
113
|
end
|
113
114
|
|
114
115
|
def self.parse_csv_from_file_or_uri(uri, file, options={})
|
115
116
|
force_fetch = options.delete(:force_fetch) || false
|
116
117
|
if force_fetch || !File.exist?(file)
|
117
|
-
document = fetch(uri)
|
118
|
+
document = fetch(uri, options)
|
118
119
|
File.open(file, "w") { |f| f.write(document) }
|
119
120
|
end
|
120
121
|
parse_csv_from_file(file, options)
|
@@ -128,8 +129,8 @@ module DataCatalog
|
|
128
129
|
end
|
129
130
|
end
|
130
131
|
|
131
|
-
def self.parse_html_from_uri(uri)
|
132
|
-
data = fetch(uri)
|
132
|
+
def self.parse_html_from_uri(uri, options={})
|
133
|
+
data = fetch(uri, options)
|
133
134
|
Nokogiri::HTML::Document.parse(data)
|
134
135
|
end
|
135
136
|
|
@@ -138,7 +139,30 @@ module DataCatalog
|
|
138
139
|
document = parse_html_from_uri(uri)
|
139
140
|
File.open(file, "w") { |f| f.write(document) }
|
140
141
|
end
|
141
|
-
|
142
|
+
# Why always parse the file? See Note 001, below.
|
143
|
+
parse_html_from_file(file)
|
144
|
+
end
|
145
|
+
|
146
|
+
# == JSON
|
147
|
+
|
148
|
+
def self.parse_json_from_file(filename)
|
149
|
+
File.open(filename) do |f|
|
150
|
+
JSON.parse(f.read)
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
def self.parse_json_from_uri(uri, options={})
|
155
|
+
data = fetch(uri, options)
|
156
|
+
JSON.parse(data)
|
157
|
+
end
|
158
|
+
|
159
|
+
def self.parse_json_from_file_or_uri(uri, file, options={})
|
160
|
+
if options[:force_fetch] || !File.exist?(file)
|
161
|
+
document = parse_json_from_uri(uri)
|
162
|
+
File.open(file, "w") { |f| f.write(document) }
|
163
|
+
end
|
164
|
+
# Why always parse the file? See Note 001, below.
|
165
|
+
parse_json_from_file(file)
|
142
166
|
end
|
143
167
|
|
144
168
|
# == XML
|
@@ -149,8 +173,8 @@ module DataCatalog
|
|
149
173
|
end
|
150
174
|
end
|
151
175
|
|
152
|
-
def self.parse_xml_from_uri(uri)
|
153
|
-
data = fetch(uri)
|
176
|
+
def self.parse_xml_from_uri(uri, options={})
|
177
|
+
data = fetch(uri, options)
|
154
178
|
Nokogiri::XML::Document.parse(data)
|
155
179
|
end
|
156
180
|
|
@@ -159,7 +183,8 @@ module DataCatalog
|
|
159
183
|
document = parse_xml_from_uri(uri)
|
160
184
|
File.open(file, "w") { |f| f.write(document) }
|
161
185
|
end
|
162
|
-
|
186
|
+
# Why always parse the file? See Note 001, below.
|
187
|
+
parse_xml_from_file(file)
|
163
188
|
end
|
164
189
|
|
165
190
|
# == YAML
|
data/spec/test.json
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
{ "stations" : ["Metro Center","Dupont Circle"] }
|
data/spec/test.xml
ADDED
data/spec/utility_spec.rb
CHANGED
@@ -20,14 +20,14 @@ describe "Utility" do
|
|
20
20
|
"http://sunlightlabs.com/"
|
21
21
|
end
|
22
22
|
end
|
23
|
-
|
23
|
+
|
24
24
|
describe "absolute_url" do
|
25
25
|
it "should work" do
|
26
26
|
U.absolute_url("http://sunlightlabs.com", "/contact").should ==
|
27
27
|
"http://sunlightlabs.com/contact"
|
28
28
|
end
|
29
29
|
end
|
30
|
-
|
30
|
+
|
31
31
|
describe "single_line_clean" do
|
32
32
|
it "should clean up leading and trailing whitespace" do
|
33
33
|
U.single_line_clean("\t \ttext\t\t ").should == "text"
|
@@ -41,19 +41,18 @@ describe "Utility" do
|
|
41
41
|
U.single_line_clean("sunlight\nlabs").should == "sunlight labs"
|
42
42
|
end
|
43
43
|
end
|
44
|
-
|
44
|
+
|
45
45
|
describe "multi_line_clean" do
|
46
46
|
it "should remove leading and trailing newlines" do
|
47
47
|
input = "\nline 1\nline 2\nline 3\n"
|
48
48
|
U.multi_line_clean(input).should == "line 1\nline 2\nline 3"
|
49
49
|
end
|
50
50
|
end
|
51
|
-
|
51
|
+
|
52
52
|
describe "fetch" do
|
53
53
|
before do
|
54
54
|
@readable = Object.new
|
55
55
|
@readable.stub(:read).and_return("result")
|
56
|
-
|
57
56
|
@sleep_count = 0
|
58
57
|
U.stub(:sleep).and_return {
|
59
58
|
@sleep_count += 1
|
@@ -64,7 +63,7 @@ describe "Utility" do
|
|
64
63
|
U.stub(:open).and_return(@readable)
|
65
64
|
U.fetch("fake", :quiet => true).should == "result"
|
66
65
|
end
|
67
|
-
|
66
|
+
|
68
67
|
it "bad fetches below retry limit are ok" do
|
69
68
|
@count = 0
|
70
69
|
U.stub(:open).and_return {
|
@@ -75,8 +74,8 @@ describe "Utility" do
|
|
75
74
|
@readable
|
76
75
|
end
|
77
76
|
}
|
78
|
-
U.fetch("fake", :max_attempts => 3, :quiet => true)
|
79
|
-
@sleep_count.should == 2
|
77
|
+
result = U.fetch("fake", :max_attempts => 3, :quiet => true)
|
78
|
+
result.should == "result" && @sleep_count.should == 2
|
80
79
|
end
|
81
80
|
|
82
81
|
it "bad fetches above retry limit give nil" do
|
@@ -89,10 +88,9 @@ describe "Utility" do
|
|
89
88
|
@readable
|
90
89
|
end
|
91
90
|
}
|
92
|
-
U.fetch("fake", :max_attempts => 2, :quiet => true)
|
93
|
-
@sleep_count.should == 1
|
91
|
+
result = U.fetch("fake", :max_attempts => 2, :quiet => true)
|
92
|
+
result.should == nil && @sleep_count.should == 1
|
94
93
|
end
|
95
|
-
|
96
94
|
end
|
97
95
|
|
98
96
|
describe "standardize_name" do
|
@@ -100,11 +98,65 @@ describe "Utility" do
|
|
100
98
|
U.standardize_name("City Administrator, Office of").should ==
|
101
99
|
"Office of City Administrator"
|
102
100
|
end
|
103
|
-
|
101
|
+
|
104
102
|
it "two commas" do
|
105
103
|
U.standardize_name("Children, Youth & Families, Department of").should ==
|
106
104
|
"Department of Children, Youth & Families"
|
107
105
|
end
|
108
106
|
end
|
109
|
-
|
107
|
+
|
108
|
+
# == XML
|
109
|
+
|
110
|
+
describe "parse_xml_from_file" do
|
111
|
+
it "should work" do
|
112
|
+
file = File.dirname(__FILE__) + '/test.xml'
|
113
|
+
parsed = U.parse_xml_from_file(file)
|
114
|
+
result = parsed.xpath('.//stations/station').map(&:content)
|
115
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
describe "parse_xml_from_uri" do
|
120
|
+
before do
|
121
|
+
@readable = Object.new
|
122
|
+
@readable.stub(:read).and_return(%(
|
123
|
+
<stations>
|
124
|
+
<station>Metro Center</station>
|
125
|
+
<station>Dupont Circle</station>
|
126
|
+
</stations>
|
127
|
+
))
|
128
|
+
end
|
129
|
+
|
130
|
+
it "should work" do
|
131
|
+
U.stub(:open).and_return(@readable)
|
132
|
+
parsed = U.parse_xml_from_uri("fake", :quiet => true)
|
133
|
+
result = parsed.xpath('.//stations/station').map(&:content)
|
134
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
# == JSON
|
139
|
+
|
140
|
+
describe "parse_json_from_file" do
|
141
|
+
it "should work" do
|
142
|
+
file = File.dirname(__FILE__) + '/test.json'
|
143
|
+
U.parse_json_from_file(file).should ==
|
144
|
+
{ "stations" => ["Metro Center","Dupont Circle"] }
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
describe "parse_json_from_uri" do
|
149
|
+
before do
|
150
|
+
@readable = Object.new
|
151
|
+
@readable.stub(:read).and_return(
|
152
|
+
%({"stations":["Metro Center","Dupont Circle"]}))
|
153
|
+
end
|
154
|
+
|
155
|
+
it "should work" do
|
156
|
+
U.stub(:open).and_return(@readable)
|
157
|
+
result = U.parse_json_from_uri("fake", :quiet => true)
|
158
|
+
result.should == { "stations" => ["Metro Center","Dupont Circle"] }
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
110
162
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datacatalog-importer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
- 1
|
10
|
-
version: 0.2.1
|
9
|
+
- 2
|
10
|
+
version: 0.2.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- David James
|
@@ -58,12 +58,12 @@ dependencies:
|
|
58
58
|
requirements:
|
59
59
|
- - ">="
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
hash:
|
61
|
+
hash: 27
|
62
62
|
segments:
|
63
63
|
- 1
|
64
|
-
-
|
65
|
-
-
|
66
|
-
version: 1.
|
64
|
+
- 3
|
65
|
+
- 0
|
66
|
+
version: 1.3.0
|
67
67
|
type: :development
|
68
68
|
version_requirements: *id003
|
69
69
|
description: This framework makes it easier to write importers for the National Data Catalog.
|
@@ -99,6 +99,8 @@ files:
|
|
99
99
|
- natdat_is_hungry.md
|
100
100
|
- spec/spec.opts
|
101
101
|
- spec/spec_helper.rb
|
102
|
+
- spec/test.json
|
103
|
+
- spec/test.xml
|
102
104
|
- spec/utility_spec.rb
|
103
105
|
has_rdoc: true
|
104
106
|
homepage: http://github.com/sunlightlabs/datacatalog-importer
|