remote_table 0.2.11 → 0.2.12
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/remote_table.rb +2 -0
- data/lib/remote_table/file.rb +5 -13
- data/lib/remote_table/file/csv.rb +1 -1
- data/lib/remote_table/file/html.rb +36 -0
- data/remote_table.gemspec +6 -2
- data/test/remote_table_test.rb +205 -189
- metadata +18 -3
data/Rakefile
CHANGED
@@ -14,8 +14,8 @@ begin
|
|
14
14
|
gem.add_dependency 'fastercsv', '>=1.5.0'
|
15
15
|
gem.add_dependency 'activesupport', '>=2.3.4'
|
16
16
|
gem.add_dependency 'slither', '>=0.99.3'
|
17
|
+
gem.add_dependency 'nokogiri', '>=1.4.1'
|
17
18
|
gem.require_path = "lib"
|
18
|
-
gem.files.include %w(lib/remote_table) unless gem.files.empty? # seems to fail once it's in the wild
|
19
19
|
gem.rdoc_options << '--line-numbers' << '--inline-source'
|
20
20
|
gem.requirements << 'curl'
|
21
21
|
gem.rubyforge_project = "remotetable"
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.11
|
1
|
+
0.2.12
|
data/lib/remote_table.rb
CHANGED
@@ -15,6 +15,7 @@ end if ActiveSupport::VERSION::MAJOR == 3
|
|
15
15
|
require 'fastercsv'
|
16
16
|
require 'slither'
|
17
17
|
require 'roo'
|
18
|
+
require 'nokogiri'
|
18
19
|
require 'remote_table/transform'
|
19
20
|
require 'remote_table/request'
|
20
21
|
require 'remote_table/package'
|
@@ -24,6 +25,7 @@ require 'remote_table/file/fixed_width'
|
|
24
25
|
require 'remote_table/file/roo_spreadsheet'
|
25
26
|
require 'remote_table/file/ods'
|
26
27
|
require 'remote_table/file/xls'
|
28
|
+
require 'remote_table/file/html'
|
27
29
|
|
28
30
|
class RemoteTable
|
29
31
|
attr_accessor :request, :package, :file, :transform
|
data/lib/remote_table/file.rb
CHANGED
@@ -4,6 +4,8 @@ class RemoteTable
|
|
4
4
|
attr_accessor :encoding
|
5
5
|
attr_accessor :path
|
6
6
|
attr_accessor :keep_blank_rows
|
7
|
+
attr_accessor :row_xpath
|
8
|
+
attr_accessor :column_xpath
|
7
9
|
|
8
10
|
def initialize(bus)
|
9
11
|
@filename = bus[:filename]
|
@@ -19,22 +21,11 @@ class RemoteTable
|
|
19
21
|
@schema_name = bus[:schema_name]
|
20
22
|
@trap = bus[:trap]
|
21
23
|
@encoding = bus[:encoding] || 'UTF-8'
|
24
|
+
@row_xpath = bus[:row_xpath]
|
25
|
+
@column_xpath = bus[:column_xpath]
|
22
26
|
extend "RemoteTable::#{format.to_s.camelcase}".constantize
|
23
27
|
end
|
24
28
|
|
25
|
-
class << self
|
26
|
-
# http://santanatechnotes.blogspot.com/2005/12/matching-iso-8859-1-strings-with-ruby.html
|
27
|
-
def convert_to_utf8(str, encoding)
|
28
|
-
if encoding == 'UTF-8' or encoding == 'UTF8'
|
29
|
-
str.toutf8 # just in case
|
30
|
-
else
|
31
|
-
@_iconv ||= Hash.new
|
32
|
-
@_iconv[encoding] ||= Iconv.new 'UTF-8', encoding
|
33
|
-
@_iconv[encoding].iconv(str).toutf8
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
29
|
def tabulate(path)
|
39
30
|
define_fixed_width_schema! if format == :fixed_width and schema.is_a?(Array) # TODO move to generic subclass callback
|
40
31
|
self.path = path
|
@@ -98,6 +89,7 @@ class RemoteTable
|
|
98
89
|
extname = ::File.extname(filename).gsub('.', '')
|
99
90
|
return :csv if extname.blank?
|
100
91
|
format = [ :xls, :ods ].detect { |i| i == extname.to_sym }
|
92
|
+
format = :html if extname =~ /\Ahtm/
|
101
93
|
format = :csv if format.blank?
|
102
94
|
format
|
103
95
|
end
|
data/lib/remote_table/file/csv.rb
CHANGED
@@ -35,7 +35,7 @@ class RemoteTable
|
|
35
35
|
private
|
36
36
|
|
37
37
|
def fastercsv_options
|
38
|
-
fastercsv_options = { :skip_blanks => !keep_blank_rows
|
38
|
+
fastercsv_options = { :skip_blanks => !keep_blank_rows }
|
39
39
|
if headers == false
|
40
40
|
fastercsv_options.merge!(:headers => nil)
|
41
41
|
else
|
data/lib/remote_table/file/html.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
class RemoteTable
|
2
|
+
module Html
|
3
|
+
def each_row(&block)
|
4
|
+
backup_file!
|
5
|
+
convert_file_to_utf8!
|
6
|
+
html_headers = (headers.is_a?(Array)) ? headers : nil
|
7
|
+
Nokogiri::HTML(unescaped_html_without_soft_hyphens, nil, 'UTF-8').xpath(row_xpath).each do |row|
|
8
|
+
values = row.xpath(column_xpath).map { |td| td.content.gsub(/\s+/, ' ').strip }
|
9
|
+
if html_headers.nil?
|
10
|
+
html_headers = values
|
11
|
+
next
|
12
|
+
end
|
13
|
+
hash = zip html_headers, values
|
14
|
+
yield hash if keep_blank_rows or hash.any? { |k, v| v.present? }
|
15
|
+
end
|
16
|
+
ensure
|
17
|
+
restore_file!
|
18
|
+
end
|
19
|
+
|
20
|
+
private
|
21
|
+
|
22
|
+
# http://snippets.dzone.com/posts/show/406
|
23
|
+
def zip(keys, values)
|
24
|
+
hash = Hash.new
|
25
|
+
keys.zip(values) { |k,v| hash[k]=v }
|
26
|
+
hash
|
27
|
+
end
|
28
|
+
|
29
|
+
# should we be doing this in ruby?
|
30
|
+
def unescaped_html_without_soft_hyphens
|
31
|
+
str = CGI.unescapeHTML IO.read(path)
|
32
|
+
str.gsub! /&shy;/, ''
|
33
|
+
str
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
data/remote_table.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{remote_table}
|
8
|
-
s.version = "0.2.11"
|
8
|
+
s.version = "0.2.12"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
|
12
|
-
s.date = %q{2010-04-
|
12
|
+
s.date = %q{2010-04-22}
|
13
13
|
s.description = %q{Remotely open and parse Excel XLS, ODS, CSV and fixed-width tables.}
|
14
14
|
s.email = %q{seamus@abshere.net}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -28,6 +28,7 @@ Gem::Specification.new do |s|
|
|
28
28
|
"lib/remote_table/file.rb",
|
29
29
|
"lib/remote_table/file/csv.rb",
|
30
30
|
"lib/remote_table/file/fixed_width.rb",
|
31
|
+
"lib/remote_table/file/html.rb",
|
31
32
|
"lib/remote_table/file/ods.rb",
|
32
33
|
"lib/remote_table/file/roo_spreadsheet.rb",
|
33
34
|
"lib/remote_table/file/xls.rb",
|
@@ -59,17 +60,20 @@ Gem::Specification.new do |s|
|
|
59
60
|
s.add_runtime_dependency(%q<fastercsv>, [">= 1.5.0"])
|
60
61
|
s.add_runtime_dependency(%q<activesupport>, [">= 2.3.4"])
|
61
62
|
s.add_runtime_dependency(%q<slither>, [">= 0.99.3"])
|
63
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
62
64
|
else
|
63
65
|
s.add_dependency(%q<roo>, ["= 1.3.11"])
|
64
66
|
s.add_dependency(%q<fastercsv>, [">= 1.5.0"])
|
65
67
|
s.add_dependency(%q<activesupport>, [">= 2.3.4"])
|
66
68
|
s.add_dependency(%q<slither>, [">= 0.99.3"])
|
69
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
67
70
|
end
|
68
71
|
else
|
69
72
|
s.add_dependency(%q<roo>, ["= 1.3.11"])
|
70
73
|
s.add_dependency(%q<fastercsv>, [">= 1.5.0"])
|
71
74
|
s.add_dependency(%q<activesupport>, [">= 2.3.4"])
|
72
75
|
s.add_dependency(%q<slither>, [">= 0.99.3"])
|
76
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
73
77
|
end
|
74
78
|
end
|
75
79
|
|
data/test/remote_table_test.rb
CHANGED
@@ -50,230 +50,246 @@ class RemoteTableTest < Test::Unit::TestCase
|
|
50
50
|
]
|
51
51
|
end
|
52
52
|
|
53
|
-
|
54
|
-
t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls')
|
55
|
-
assert_equal 'ACURA', t.rows.first['Manufacturer']
|
56
|
-
assert_equal 'NSX', t.rows.first['carline name']
|
57
|
-
assert_equal 'VOLVO', t.rows.last['Manufacturer']
|
58
|
-
assert_equal 'V70 XC AWD', t.rows.last['carline name']
|
53
|
+
if ENV['NEW'] == 'true'
|
59
54
|
end
|
60
55
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
56
|
+
if ENV['OLD'] == 'true'
|
57
|
+
should "read an HTML table made with frontpage" do
|
58
|
+
t = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-E.htm",
|
59
|
+
:encoding => 'US-ASCII',
|
60
|
+
:row_xpath => '//table/tr[2]/td/table/tr',
|
61
|
+
:column_xpath => 'td'
|
62
|
+
assert_equal 'E110', t.rows.first['Designator']
|
63
|
+
assert_equal 'EMBRAER', t.rows.first['Manufacturer']
|
64
|
+
assert_equal 'EZKC', t.rows.last['Designator']
|
65
|
+
assert_equal 'EZ King Cobra', t.rows.last['Model']
|
66
|
+
end
|
67
|
+
|
68
|
+
should "open an XLS inside a zip file" do
|
69
|
+
t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls')
|
70
|
+
assert_equal 'ACURA', t.rows.first['Manufacturer']
|
71
|
+
assert_equal 'NSX', t.rows.first['carline name']
|
72
|
+
assert_equal 'VOLVO', t.rows.last['Manufacturer']
|
73
|
+
assert_equal 'V70 XC AWD', t.rows.last['carline name']
|
74
|
+
end
|
75
|
+
|
76
|
+
should "not have indifferent string/symbol hash access" do
|
77
|
+
t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls')
|
78
|
+
assert_equal 'ACURA', t.rows.first['Manufacturer']
|
79
|
+
assert_equal nil, t.rows.first[:Manufacturer]
|
80
|
+
end
|
66
81
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
82
|
+
should "hash rows without paying attention to order" do
|
83
|
+
x = ActiveSupport::OrderedHash.new
|
84
|
+
x[:a] = 1
|
85
|
+
x[:b] = 2
|
71
86
|
|
72
|
-
|
73
|
-
|
74
|
-
|
87
|
+
y = ActiveSupport::OrderedHash.new
|
88
|
+
y[:b] = 2
|
89
|
+
y[:a] = 1
|
75
90
|
|
76
|
-
|
77
|
-
|
78
|
-
|
91
|
+
assert Marshal.dump(x) != Marshal.dump(y)
|
92
|
+
assert RemoteTable::Transform.row_hash(x) == RemoteTable::Transform.row_hash(y)
|
93
|
+
end
|
79
94
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
95
|
+
should "open a Google Docs url (as a CSV)" do
|
96
|
+
t = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA')
|
97
|
+
assert_equal 'Gulf Coast', t.rows.first['PAD district name']
|
98
|
+
assert_equal 'AL', t.rows.first['State']
|
99
|
+
assert_equal 'Rocky Mountain', t.rows.last['PAD district name']
|
100
|
+
assert_equal 'WY', t.rows.last['State']
|
101
|
+
end
|
87
102
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
103
|
+
should "open a Google Docs url as a CSV without headers" do
|
104
|
+
t = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA', :skip => 1, :headers => false)
|
105
|
+
assert_equal 'AL', t.rows.first[0]
|
106
|
+
assert_equal 'Gulf Coast', t.rows.first[4]
|
107
|
+
assert_equal 'WY', t.rows.last[0]
|
108
|
+
assert_equal 'Rocky Mountain', t.rows.last[4]
|
109
|
+
end
|
95
110
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
111
|
+
should "send form data, follow redirects and use a filename glob" do
|
112
|
+
url = 'http://www.transtats.bts.gov/DownLoad_Table.asp?Table_ID=293&Has_Group=3&Is_Zipped=0'
|
113
|
+
form_data = 'UserTableName=T_100_Segment__All_Carriers&DBShortName=Air_Carriers&RawDataTable=T_T100_SEGMENT_ALL_CARRIER&sqlstr=+SELECT+DEPARTURES_SCHEDULED%2CDEPARTURES_PERFORMED%2CPAYLOAD%2CSEATS%2CPASSENGERS%2CFREIGHT%2CMAIL%2CDISTANCE%2CRAMP_TO_RAMP%2CAIR_TIME%2CUNIQUE_CARRIER%2CAIRLINE_ID%2CUNIQUE_CARRIER_NAME%2CUNIQUE_CARRIER_ENTITY%2CREGION%2CCARRIER%2CCARRIER_NAME%2CCARRIER_GROUP%2CCARRIER_GROUP_NEW%2CORIGIN%2CORIGIN_CITY_NAME%2CORIGIN_CITY_NUM%2CORIGIN_STATE_ABR%2CORIGIN_STATE_FIPS%2CORIGIN_STATE_NM%2CORIGIN_COUNTRY%2CORIGIN_COUNTRY_NAME%2CORIGIN_WAC%2CDEST%2CDEST_CITY_NAME%2CDEST_CITY_NUM%2CDEST_STATE_ABR%2CDEST_STATE_FIPS%2CDEST_STATE_NM%2CDEST_COUNTRY%2CDEST_COUNTRY_NAME%2CDEST_WAC%2CAIRCRAFT_GROUP%2CAIRCRAFT_TYPE%2CAIRCRAFT_CONFIG%2CYEAR%2CQUARTER%2CMONTH%2CDISTANCE_GROUP%2CCLASS%2CDATA_SOURCE+FROM++T_T100_SEGMENT_ALL_CARRIER+WHERE+Month+%3D1+AND+YEAR%3D2008&varlist=DEPARTURES_SCHEDULED%2CDEPARTURES_PERFORMED%2CPAYLOAD%2CSEATS%2CPASSENGERS%2CFREIGHT%2CMAIL%2CDISTANCE%2CRAMP_TO_RAMP%2CAIR_TIME%2CUNIQUE_CARRIER%2CAIRLINE_ID%2CUNIQUE_CARRIER_NAME%2CUNIQUE_CARRIER_ENTITY%2CREGION%2CCARRIER%2CCARRIER_NAME%2CCARRIER_GROUP%2CCARRIER_GROUP_NEW%2CORIGIN%2CORIGIN_CITY_NAME%2CORIGIN_CITY_NUM%2CORIGIN_STATE_ABR%2CORIGIN_STATE_FIPS%2CORIGIN_STATE_NM%2CORIGIN_COUNTRY%2CORIGIN_COUNTRY_NAME%2CORIGIN_WAC%2CDEST%2CDEST_CITY_NAME%2CDEST_CITY_NUM%2CDEST_STATE_ABR%2CDEST_STATE_FIPS%2CDEST_STATE_NM%2CDEST_COUNTRY%2CDEST_COUNTRY_NAME%2CDEST_WAC%2CAIRCRAFT_GROUP%2CAIRCRAFT_TYPE%2CAIRCRAFT_CONFIG%2CYEAR%2CQUARTER%2CMONTH%2CDISTANCE_GROUP%2CCLASS%2CDATA_SOURCE&grouplist=&suml=&sumRegion=&filter1=title%3D&filter2=title%3D&geo=All%A0&time=January&timename=Month&GEOGRAPHY=All&XYEAR=2008&FREQUENCY=1&AllVars=All&VarName=DEPARTURES_SCHEDULED&VarDesc=DepScheduled&VarType=Num&VarName=DEPARTURES_PERFORMED&VarDesc=DepPerformed&VarType=Num&VarName=PAYLOAD&VarDesc=Payload&VarType=Num&VarName=SEATS&VarDesc=Seats&VarType=Num&VarName=PASSENGERS&VarDesc=Passengers&VarType=Num&VarName=FREIGHT&Var
Desc=Freight&VarType=Num&VarName=MAIL&VarDesc=Mail&VarType=Num&VarName=DISTANCE&VarDesc=Distance&VarType=Num&VarName=RAMP_TO_RAMP&VarDesc=RampToRamp&VarType=Num&VarName=AIR_TIME&VarDesc=AirTime&VarType=Num&VarName=UNIQUE_CARRIER&VarDesc=UniqueCarrier&VarType=Char&VarName=AIRLINE_ID&VarDesc=AirlineID&VarType=Num&VarName=UNIQUE_CARRIER_NAME&VarDesc=UniqueCarrierName&VarType=Char&VarName=UNIQUE_CARRIER_ENTITY&VarDesc=UniqCarrierEntity&VarType=Char&VarName=REGION&VarDesc=CarrierRegion&VarType=Char&VarName=CARRIER&VarDesc=Carrier&VarType=Char&VarName=CARRIER_NAME&VarDesc=CarrierName&VarType=Char&VarName=CARRIER_GROUP&VarDesc=CarrierGroup&VarType=Num&VarName=CARRIER_GROUP_NEW&VarDesc=CarrierGroupNew&VarType=Num&VarName=ORIGIN&VarDesc=Origin&VarType=Char&VarName=ORIGIN_CITY_NAME&VarDesc=OriginCityName&VarType=Char&VarName=ORIGIN_CITY_NUM&VarDesc=OriginCityNum&VarType=Num&VarName=ORIGIN_STATE_ABR&VarDesc=OriginState&VarType=Char&VarName=ORIGIN_STATE_FIPS&VarDesc=OriginStateFips&VarType=Char&VarName=ORIGIN_STATE_NM&VarDesc=OriginStateName&VarType=Char&VarName=ORIGIN_COUNTRY&VarDesc=OriginCountry&VarType=Char&VarName=ORIGIN_COUNTRY_NAME&VarDesc=OriginCountryName&VarType=Char&VarName=ORIGIN_WAC&VarDesc=OriginWac&VarType=Num&VarName=DEST&VarDesc=Dest&VarType=Char&VarName=DEST_CITY_NAME&VarDesc=DestCityName&VarType=Char&VarName=DEST_CITY_NUM&VarDesc=DestCityNum&VarType=Num&VarName=DEST_STATE_ABR&VarDesc=DestState&VarType=Char&VarName=DEST_STATE_FIPS&VarDesc=DestStateFips&VarType=Char&VarName=DEST_STATE_NM&VarDesc=DestStateName&VarType=Char&VarName=DEST_COUNTRY&VarDesc=DestCountry&VarType=Char&VarName=DEST_COUNTRY_NAME&VarDesc=DestCountryName&VarType=Char&VarName=DEST_WAC&VarDesc=DestWac&VarType=Num&VarName=AIRCRAFT_GROUP&VarDesc=AircraftGroup&VarType=Num&VarName=AIRCRAFT_TYPE&VarDesc=AircraftType&VarType=Char&VarName=AIRCRAFT_CONFIG&VarDesc=AircraftConfig&VarType=Num&VarName=YEAR&VarDesc=Year&VarType=Num&VarName=QUARTER&VarDesc=Quarter&VarType=Num&VarName=MONTH&VarDesc=Month&Var
Type=Num&VarName=DISTANCE_GROUP&VarDesc=DistanceGroup&VarType=Num&VarName=CLASS&VarDesc=Class&VarType=Char&VarName=DATA_SOURCE&VarDesc=DataSource&VarType=Char'
|
114
|
+
t = RemoteTable.new :url => url, :form_data => form_data, :compression => :zip, :glob => '/*.csv'
|
115
|
+
assert_equal 'United States of America', t.rows.first['DEST_COUNTRY_NAME']
|
116
|
+
end
|
102
117
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
118
|
+
should "take the last of values if the header is duplicated" do
|
119
|
+
t = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tujrgUOwDSLWb-P4KCt1qBg')
|
120
|
+
assert_equal '2', t.rows.first['dup_header']
|
121
|
+
end
|
107
122
|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
123
|
+
should "respect field order in CSVs without headers" do
|
124
|
+
t = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA', :skip => 1, :headers => false)
|
125
|
+
last_k = -1
|
126
|
+
saw_string = false
|
127
|
+
t.rows.each do |row|
|
128
|
+
row.each do |k, v|
|
129
|
+
if k.is_a?(Fixnum) and last_k.is_a?(Fixnum)
|
130
|
+
assert !saw_string
|
131
|
+
assert k > last_k
|
132
|
+
end
|
133
|
+
last_k = k
|
134
|
+
saw_string = k.is_a?(String)
|
117
135
|
end
|
118
|
-
last_k = k
|
119
|
-
saw_string = k.is_a?(String)
|
120
136
|
end
|
121
137
|
end
|
122
|
-
end
|
123
138
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
139
|
+
%w{ csv ods xls }.each do |format|
|
140
|
+
eval %{
|
141
|
+
should "read #{format}" do
|
142
|
+
t = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.#{format}')
|
143
|
+
# no blank headers
|
144
|
+
assert t.rows.all? { |row| row.keys.all?(&:present?) }
|
145
|
+
# correct values
|
146
|
+
t.rows.each_with_index do |row, index|
|
147
|
+
assert_equal row.except('row_hash'), @test2_rows[index]
|
148
|
+
end
|
133
149
|
end
|
134
|
-
end
|
135
150
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
151
|
+
should "read #{format}, keeping blank rows" do
|
152
|
+
t = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.#{format}', :keep_blank_rows => true)
|
153
|
+
# no blank headers
|
154
|
+
assert t.rows.all? { |row| row.keys.all?(&:present?) }
|
155
|
+
# correct values
|
156
|
+
t.rows.each_with_index do |row, index|
|
157
|
+
assert_equal row.except('row_hash'), @test2_rows_with_blanks[index]
|
158
|
+
end
|
143
159
|
end
|
144
|
-
|
145
|
-
|
146
|
-
end
|
160
|
+
}
|
161
|
+
end
|
147
162
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
163
|
+
should "read fixed width correctly" do
|
164
|
+
t = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
|
165
|
+
:format => :fixed_width,
|
166
|
+
:skip => 1,
|
167
|
+
:schema => [[ 'header4', 10, { :type => :string } ],
|
168
|
+
[ 'spacer', 1 ],
|
169
|
+
[ 'header5', 10, { :type => :string } ],
|
170
|
+
[ 'spacer', 12 ],
|
171
|
+
[ 'header6', 10, { :type => :string } ]])
|
157
172
|
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
173
|
+
# no blank headers
|
174
|
+
assert t.rows.all? { |row| row.keys.all?(&:present?) }
|
175
|
+
# correct values
|
176
|
+
t.rows.each_with_index do |row, index|
|
177
|
+
assert_equal row.except('row_hash'), @test2_rows[index]
|
178
|
+
end
|
163
179
|
end
|
164
|
-
end
|
165
180
|
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
181
|
+
should "read fixed width correctly, keeping blank rows" do
|
182
|
+
t = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
|
183
|
+
:format => :fixed_width,
|
184
|
+
:keep_blank_rows => true,
|
185
|
+
:skip => 1,
|
186
|
+
:schema => [[ 'header4', 10, { :type => :string } ],
|
187
|
+
[ 'spacer', 1 ],
|
188
|
+
[ 'header5', 10, { :type => :string } ],
|
189
|
+
[ 'spacer', 12 ],
|
190
|
+
[ 'header6', 10, { :type => :string } ]])
|
176
191
|
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
192
|
+
# no blank headers
|
193
|
+
assert t.rows.all? { |row| row.keys.all?(&:present?) }
|
194
|
+
# correct values
|
195
|
+
t.rows.each_with_index do |row, index|
|
196
|
+
assert_equal row.except('row_hash'), @test2_rows_with_blanks[index]
|
197
|
+
end
|
182
198
|
end
|
183
|
-
end
|
184
199
|
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
200
|
+
should "have the same row hash across formats" do
|
201
|
+
csv = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.csv')
|
202
|
+
ods = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.ods')
|
203
|
+
xls = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.xls')
|
204
|
+
fixed_width = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.fixed_width.txt',
|
205
|
+
:format => :fixed_width,
|
206
|
+
:skip => 1,
|
207
|
+
:schema => [[ 'header1', 10, { :type => :string } ],
|
208
|
+
[ 'spacer', 1 ],
|
209
|
+
[ 'header2', 10, { :type => :string } ],
|
210
|
+
[ 'spacer', 12 ],
|
211
|
+
[ 'header3', 10, { :type => :string } ]])
|
197
212
|
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
213
|
+
csv2 = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.csv')
|
214
|
+
ods2 = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.ods')
|
215
|
+
xls2 = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.xls')
|
216
|
+
fixed_width2 = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.fixed_width.txt',
|
217
|
+
:format => :fixed_width,
|
218
|
+
:skip => 1,
|
219
|
+
:schema => [[ 'spacer', 11 ],
|
220
|
+
[ 'header2', 10, { :type => :string } ],
|
221
|
+
[ 'spacer', 1 ],
|
222
|
+
[ 'header3', 10, { :type => :string } ],
|
223
|
+
[ 'spacer', 1 ],
|
224
|
+
[ 'header1', 10, { :type => :string } ]])
|
210
225
|
|
211
226
|
|
212
|
-
|
227
|
+
reference = csv.rows[0]['row_hash']
|
213
228
|
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
229
|
+
# same row hashes
|
230
|
+
assert_equal reference, ods.rows[0]['row_hash']
|
231
|
+
assert_equal reference, xls.rows[0]['row_hash']
|
232
|
+
assert_equal reference, fixed_width.rows[0]['row_hash']
|
233
|
+
# same row hashes with different order
|
234
|
+
assert_equal reference, csv2.rows[0]['row_hash']
|
235
|
+
assert_equal reference, ods2.rows[0]['row_hash']
|
236
|
+
assert_equal reference, xls2.rows[0]['row_hash']
|
237
|
+
assert_equal reference, fixed_width2.rows[0]['row_hash']
|
238
|
+
end
|
224
239
|
|
225
|
-
|
226
|
-
|
240
|
+
should "open an ODS" do
|
241
|
+
t = RemoteTable.new(:url => 'http://www.worldmapper.org/data/opendoc/2_worldmapper_data.ods', :sheet => 'Data', :keep_blank_rows => true)
|
227
242
|
|
228
|
-
|
229
|
-
|
230
|
-
|
243
|
+
assert_equal 'Central Africa', t.rows[5]['name']
|
244
|
+
assert_equal 99, t.rows[5]['MAP DATA population (millions) 2002'].to_i
|
245
|
+
end
|
231
246
|
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
247
|
+
should "open a CSV inside a zip file" do
|
248
|
+
t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv')
|
249
|
+
assert_equal 'ACURA', t.rows.first['Manufacturer']
|
250
|
+
assert_equal 'NSX', t.rows.first['carline name']
|
251
|
+
assert_equal 'TOYOTA', t.rows.last['Manufacturer']
|
252
|
+
assert_equal 'RAV4 SOFT TOP 4WD', t.rows.last['carline name']
|
253
|
+
end
|
239
254
|
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
+
should "open a fixed-width file with an inline schema inside a zip file" do
|
256
|
+
t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
|
257
|
+
:filename => 'Gd6-dsc.txt',
|
258
|
+
:format => :fixed_width,
|
259
|
+
:crop => 21..26, # inclusive
|
260
|
+
:cut => '2-',
|
261
|
+
:select => lambda { |row| /\A[A-Z]/.match row['code'] },
|
262
|
+
:schema => [[ 'code', 2, { :type => :string } ],
|
263
|
+
[ 'spacer', 2 ],
|
264
|
+
[ 'name', 52, { :type => :string } ]])
|
265
|
+
assert_equal 'regular grade gasoline (octane number of 87)', t.rows.first['name']
|
266
|
+
assert_equal 'R', t.rows.first['code']
|
267
|
+
assert_equal 'electricity', t.rows.last['name']
|
268
|
+
assert_equal 'El', t.rows.last['code']
|
269
|
+
end
|
255
270
|
|
256
|
-
|
257
|
-
|
258
|
-
|
271
|
+
should "open an XLS with a parser" do
|
272
|
+
ma_1990_01 = {"month"=>1, "cost"=>"54.0", "locatable"=>"Massachusetts (State)", "year"=>1990}
|
273
|
+
ga_1990_01 = {"month"=>1, "cost"=>"50.7", "locatable"=>"Georgia (State)", "year"=>1990}
|
259
274
|
|
260
|
-
|
261
|
-
|
275
|
+
t = RemoteTable.new(:url => 'http://tonto.eia.doe.gov/dnav/pet/xls/PET_PRI_RESID_A_EPPR_PTA_CPGAL_M.xls',
|
276
|
+
:transform => { :class => FuelOilParser })
|
262
277
|
|
263
|
-
|
264
|
-
|
265
|
-
|
278
|
+
assert t.rows.include?(ma_1990_01)
|
279
|
+
assert t.rows.include?(ga_1990_01)
|
280
|
+
end
|
266
281
|
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
282
|
+
should "provide a row_hash on demand" do
|
283
|
+
t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
|
284
|
+
:filename => 'Gd6-dsc.txt',
|
285
|
+
:format => :fixed_width,
|
286
|
+
:crop => 21..26, # inclusive
|
287
|
+
:cut => '2-',
|
288
|
+
:select => lambda { |row| /\A[A-Z]/.match row['code'] },
|
289
|
+
:schema => [[ 'code', 2, { :type => :string } ],
|
290
|
+
[ 'spacer', 2 ],
|
291
|
+
[ 'name', 52, { :type => :string } ]])
|
292
|
+
assert_equal 'a8a5d7f17b56772723c657eb62b0f238', t.rows.first['row_hash']
|
293
|
+
end
|
278
294
|
end
|
279
295
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 2
|
8
|
-
- 11
|
9
|
-
version: 0.2.11
|
8
|
+
- 12
|
9
|
+
version: 0.2.12
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Seamus Abshere
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-04-
|
18
|
+
date: 2010-04-22 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -74,6 +74,20 @@ dependencies:
|
|
74
74
|
version: 0.99.3
|
75
75
|
type: :runtime
|
76
76
|
version_requirements: *id004
|
77
|
+
- !ruby/object:Gem::Dependency
|
78
|
+
name: nokogiri
|
79
|
+
prerelease: false
|
80
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
segments:
|
85
|
+
- 1
|
86
|
+
- 4
|
87
|
+
- 1
|
88
|
+
version: 1.4.1
|
89
|
+
type: :runtime
|
90
|
+
version_requirements: *id005
|
77
91
|
description: Remotely open and parse Excel XLS, ODS, CSV and fixed-width tables.
|
78
92
|
email: seamus@abshere.net
|
79
93
|
executables: []
|
@@ -95,6 +109,7 @@ files:
|
|
95
109
|
- lib/remote_table/file.rb
|
96
110
|
- lib/remote_table/file/csv.rb
|
97
111
|
- lib/remote_table/file/fixed_width.rb
|
112
|
+
- lib/remote_table/file/html.rb
|
98
113
|
- lib/remote_table/file/ods.rb
|
99
114
|
- lib/remote_table/file/roo_spreadsheet.rb
|
100
115
|
- lib/remote_table/file/xls.rb
|