remote_table 0.2.11 → 0.2.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/remote_table.rb +2 -0
- data/lib/remote_table/file.rb +5 -13
- data/lib/remote_table/file/csv.rb +1 -1
- data/lib/remote_table/file/html.rb +36 -0
- data/remote_table.gemspec +6 -2
- data/test/remote_table_test.rb +205 -189
- metadata +18 -3
data/Rakefile
CHANGED
@@ -14,8 +14,8 @@ begin
|
|
14
14
|
gem.add_dependency 'fastercsv', '>=1.5.0'
|
15
15
|
gem.add_dependency 'activesupport', '>=2.3.4'
|
16
16
|
gem.add_dependency 'slither', '>=0.99.3'
|
17
|
+
gem.add_dependency 'nokogiri', '>=1.4.1'
|
17
18
|
gem.require_path = "lib"
|
18
|
-
gem.files.include %w(lib/remote_table) unless gem.files.empty? # seems to fail once it's in the wild
|
19
19
|
gem.rdoc_options << '--line-numbers' << '--inline-source'
|
20
20
|
gem.requirements << 'curl'
|
21
21
|
gem.rubyforge_project = "remotetable"
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.11
|
1
|
+
0.2.12
|
data/lib/remote_table.rb
CHANGED
@@ -15,6 +15,7 @@ end if ActiveSupport::VERSION::MAJOR == 3
|
|
15
15
|
require 'fastercsv'
|
16
16
|
require 'slither'
|
17
17
|
require 'roo'
|
18
|
+
require 'nokogiri'
|
18
19
|
require 'remote_table/transform'
|
19
20
|
require 'remote_table/request'
|
20
21
|
require 'remote_table/package'
|
@@ -24,6 +25,7 @@ require 'remote_table/file/fixed_width'
|
|
24
25
|
require 'remote_table/file/roo_spreadsheet'
|
25
26
|
require 'remote_table/file/ods'
|
26
27
|
require 'remote_table/file/xls'
|
28
|
+
require 'remote_table/file/html'
|
27
29
|
|
28
30
|
class RemoteTable
|
29
31
|
attr_accessor :request, :package, :file, :transform
|
data/lib/remote_table/file.rb
CHANGED
@@ -4,6 +4,8 @@ class RemoteTable
|
|
4
4
|
attr_accessor :encoding
|
5
5
|
attr_accessor :path
|
6
6
|
attr_accessor :keep_blank_rows
|
7
|
+
attr_accessor :row_xpath
|
8
|
+
attr_accessor :column_xpath
|
7
9
|
|
8
10
|
def initialize(bus)
|
9
11
|
@filename = bus[:filename]
|
@@ -19,22 +21,11 @@ class RemoteTable
|
|
19
21
|
@schema_name = bus[:schema_name]
|
20
22
|
@trap = bus[:trap]
|
21
23
|
@encoding = bus[:encoding] || 'UTF-8'
|
24
|
+
@row_xpath = bus[:row_xpath]
|
25
|
+
@column_xpath = bus[:column_xpath]
|
22
26
|
extend "RemoteTable::#{format.to_s.camelcase}".constantize
|
23
27
|
end
|
24
28
|
|
25
|
-
class << self
|
26
|
-
# http://santanatechnotes.blogspot.com/2005/12/matching-iso-8859-1-strings-with-ruby.html
|
27
|
-
def convert_to_utf8(str, encoding)
|
28
|
-
if encoding == 'UTF-8' or encoding == 'UTF8'
|
29
|
-
str.toutf8 # just in case
|
30
|
-
else
|
31
|
-
@_iconv ||= Hash.new
|
32
|
-
@_iconv[encoding] ||= Iconv.new 'UTF-8', encoding
|
33
|
-
@_iconv[encoding].iconv(str).toutf8
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
29
|
def tabulate(path)
|
39
30
|
define_fixed_width_schema! if format == :fixed_width and schema.is_a?(Array) # TODO move to generic subclass callback
|
40
31
|
self.path = path
|
@@ -98,6 +89,7 @@ class RemoteTable
|
|
98
89
|
extname = ::File.extname(filename).gsub('.', '')
|
99
90
|
return :csv if extname.blank?
|
100
91
|
format = [ :xls, :ods ].detect { |i| i == extname.to_sym }
|
92
|
+
format = :html if extname =~ /\Ahtm/
|
101
93
|
format = :csv if format.blank?
|
102
94
|
format
|
103
95
|
end
|
data/lib/remote_table/file/csv.rb
CHANGED
@@ -35,7 +35,7 @@ class RemoteTable
|
|
35
35
|
private
|
36
36
|
|
37
37
|
def fastercsv_options
|
38
|
-
fastercsv_options = { :skip_blanks => !keep_blank_rows
|
38
|
+
fastercsv_options = { :skip_blanks => !keep_blank_rows }
|
39
39
|
if headers == false
|
40
40
|
fastercsv_options.merge!(:headers => nil)
|
41
41
|
else
|
data/lib/remote_table/file/html.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
class RemoteTable
|
2
|
+
module Html
|
3
|
+
def each_row(&block)
|
4
|
+
backup_file!
|
5
|
+
convert_file_to_utf8!
|
6
|
+
html_headers = (headers.is_a?(Array)) ? headers : nil
|
7
|
+
Nokogiri::HTML(unescaped_html_without_soft_hyphens, nil, 'UTF-8').xpath(row_xpath).each do |row|
|
8
|
+
values = row.xpath(column_xpath).map { |td| td.content.gsub(/\s+/, ' ').strip }
|
9
|
+
if html_headers.nil?
|
10
|
+
html_headers = values
|
11
|
+
next
|
12
|
+
end
|
13
|
+
hash = zip html_headers, values
|
14
|
+
yield hash if keep_blank_rows or hash.any? { |k, v| v.present? }
|
15
|
+
end
|
16
|
+
ensure
|
17
|
+
restore_file!
|
18
|
+
end
|
19
|
+
|
20
|
+
private
|
21
|
+
|
22
|
+
# http://snippets.dzone.com/posts/show/406
|
23
|
+
def zip(keys, values)
|
24
|
+
hash = Hash.new
|
25
|
+
keys.zip(values) { |k,v| hash[k]=v }
|
26
|
+
hash
|
27
|
+
end
|
28
|
+
|
29
|
+
# should we be doing this in ruby?
|
30
|
+
def unescaped_html_without_soft_hyphens
|
31
|
+
str = CGI.unescapeHTML IO.read(path)
|
32
|
+
str.gsub! /&shy;/, ''
|
33
|
+
str
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
data/remote_table.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{remote_table}
|
8
|
-
s.version = "0.2.11"
|
8
|
+
s.version = "0.2.12"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
|
12
|
-
s.date = %q{2010-04-
|
12
|
+
s.date = %q{2010-04-22}
|
13
13
|
s.description = %q{Remotely open and parse Excel XLS, ODS, CSV and fixed-width tables.}
|
14
14
|
s.email = %q{seamus@abshere.net}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -28,6 +28,7 @@ Gem::Specification.new do |s|
|
|
28
28
|
"lib/remote_table/file.rb",
|
29
29
|
"lib/remote_table/file/csv.rb",
|
30
30
|
"lib/remote_table/file/fixed_width.rb",
|
31
|
+
"lib/remote_table/file/html.rb",
|
31
32
|
"lib/remote_table/file/ods.rb",
|
32
33
|
"lib/remote_table/file/roo_spreadsheet.rb",
|
33
34
|
"lib/remote_table/file/xls.rb",
|
@@ -59,17 +60,20 @@ Gem::Specification.new do |s|
|
|
59
60
|
s.add_runtime_dependency(%q<fastercsv>, [">= 1.5.0"])
|
60
61
|
s.add_runtime_dependency(%q<activesupport>, [">= 2.3.4"])
|
61
62
|
s.add_runtime_dependency(%q<slither>, [">= 0.99.3"])
|
63
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
62
64
|
else
|
63
65
|
s.add_dependency(%q<roo>, ["= 1.3.11"])
|
64
66
|
s.add_dependency(%q<fastercsv>, [">= 1.5.0"])
|
65
67
|
s.add_dependency(%q<activesupport>, [">= 2.3.4"])
|
66
68
|
s.add_dependency(%q<slither>, [">= 0.99.3"])
|
69
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
67
70
|
end
|
68
71
|
else
|
69
72
|
s.add_dependency(%q<roo>, ["= 1.3.11"])
|
70
73
|
s.add_dependency(%q<fastercsv>, [">= 1.5.0"])
|
71
74
|
s.add_dependency(%q<activesupport>, [">= 2.3.4"])
|
72
75
|
s.add_dependency(%q<slither>, [">= 0.99.3"])
|
76
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
73
77
|
end
|
74
78
|
end
|
75
79
|
|
data/test/remote_table_test.rb
CHANGED
@@ -50,230 +50,246 @@ class RemoteTableTest < Test::Unit::TestCase
|
|
50
50
|
]
|
51
51
|
end
|
52
52
|
|
53
|
-
|
54
|
-
t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls')
|
55
|
-
assert_equal 'ACURA', t.rows.first['Manufacturer']
|
56
|
-
assert_equal 'NSX', t.rows.first['carline name']
|
57
|
-
assert_equal 'VOLVO', t.rows.last['Manufacturer']
|
58
|
-
assert_equal 'V70 XC AWD', t.rows.last['carline name']
|
53
|
+
if ENV['NEW'] == 'true'
|
59
54
|
end
|
60
55
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
56
|
+
if ENV['OLD'] == 'true'
|
57
|
+
should "read an HTML table made with frontpage" do
|
58
|
+
t = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-E.htm",
|
59
|
+
:encoding => 'US-ASCII',
|
60
|
+
:row_xpath => '//table/tr[2]/td/table/tr',
|
61
|
+
:column_xpath => 'td'
|
62
|
+
assert_equal 'E110', t.rows.first['Designator']
|
63
|
+
assert_equal 'EMBRAER', t.rows.first['Manufacturer']
|
64
|
+
assert_equal 'EZKC', t.rows.last['Designator']
|
65
|
+
assert_equal 'EZ King Cobra', t.rows.last['Model']
|
66
|
+
end
|
67
|
+
|
68
|
+
should "open an XLS inside a zip file" do
|
69
|
+
t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls')
|
70
|
+
assert_equal 'ACURA', t.rows.first['Manufacturer']
|
71
|
+
assert_equal 'NSX', t.rows.first['carline name']
|
72
|
+
assert_equal 'VOLVO', t.rows.last['Manufacturer']
|
73
|
+
assert_equal 'V70 XC AWD', t.rows.last['carline name']
|
74
|
+
end
|
75
|
+
|
76
|
+
should "not have indifferent string/symbol hash access" do
|
77
|
+
t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls')
|
78
|
+
assert_equal 'ACURA', t.rows.first['Manufacturer']
|
79
|
+
assert_equal nil, t.rows.first[:Manufacturer]
|
80
|
+
end
|
66
81
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
82
|
+
should "hash rows without paying attention to order" do
|
83
|
+
x = ActiveSupport::OrderedHash.new
|
84
|
+
x[:a] = 1
|
85
|
+
x[:b] = 2
|
71
86
|
|
72
|
-
|
73
|
-
|
74
|
-
|
87
|
+
y = ActiveSupport::OrderedHash.new
|
88
|
+
y[:b] = 2
|
89
|
+
y[:a] = 1
|
75
90
|
|
76
|
-
|
77
|
-
|
78
|
-
|
91
|
+
assert Marshal.dump(x) != Marshal.dump(y)
|
92
|
+
assert RemoteTable::Transform.row_hash(x) == RemoteTable::Transform.row_hash(y)
|
93
|
+
end
|
79
94
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
95
|
+
should "open a Google Docs url (as a CSV)" do
|
96
|
+
t = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA')
|
97
|
+
assert_equal 'Gulf Coast', t.rows.first['PAD district name']
|
98
|
+
assert_equal 'AL', t.rows.first['State']
|
99
|
+
assert_equal 'Rocky Mountain', t.rows.last['PAD district name']
|
100
|
+
assert_equal 'WY', t.rows.last['State']
|
101
|
+
end
|
87
102
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
103
|
+
should "open a Google Docs url as a CSV without headers" do
|
104
|
+
t = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA', :skip => 1, :headers => false)
|
105
|
+
assert_equal 'AL', t.rows.first[0]
|
106
|
+
assert_equal 'Gulf Coast', t.rows.first[4]
|
107
|
+
assert_equal 'WY', t.rows.last[0]
|
108
|
+
assert_equal 'Rocky Mountain', t.rows.last[4]
|
109
|
+
end
|
95
110
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
111
|
+
should "send form data, follow redirects and use a filename glob" do
|
112
|
+
url = 'http://www.transtats.bts.gov/DownLoad_Table.asp?Table_ID=293&Has_Group=3&Is_Zipped=0'
|
113
|
+
form_data = 'UserTableName=T_100_Segment__All_Carriers&DBShortName=Air_Carriers&RawDataTable=T_T100_SEGMENT_ALL_CARRIER&sqlstr=+SELECT+DEPARTURES_SCHEDULED%2CDEPARTURES_PERFORMED%2CPAYLOAD%2CSEATS%2CPASSENGERS%2CFREIGHT%2CMAIL%2CDISTANCE%2CRAMP_TO_RAMP%2CAIR_TIME%2CUNIQUE_CARRIER%2CAIRLINE_ID%2CUNIQUE_CARRIER_NAME%2CUNIQUE_CARRIER_ENTITY%2CREGION%2CCARRIER%2CCARRIER_NAME%2CCARRIER_GROUP%2CCARRIER_GROUP_NEW%2CORIGIN%2CORIGIN_CITY_NAME%2CORIGIN_CITY_NUM%2CORIGIN_STATE_ABR%2CORIGIN_STATE_FIPS%2CORIGIN_STATE_NM%2CORIGIN_COUNTRY%2CORIGIN_COUNTRY_NAME%2CORIGIN_WAC%2CDEST%2CDEST_CITY_NAME%2CDEST_CITY_NUM%2CDEST_STATE_ABR%2CDEST_STATE_FIPS%2CDEST_STATE_NM%2CDEST_COUNTRY%2CDEST_COUNTRY_NAME%2CDEST_WAC%2CAIRCRAFT_GROUP%2CAIRCRAFT_TYPE%2CAIRCRAFT_CONFIG%2CYEAR%2CQUARTER%2CMONTH%2CDISTANCE_GROUP%2CCLASS%2CDATA_SOURCE+FROM++T_T100_SEGMENT_ALL_CARRIER+WHERE+Month+%3D1+AND+YEAR%3D2008&varlist=DEPARTURES_SCHEDULED%2CDEPARTURES_PERFORMED%2CPAYLOAD%2CSEATS%2CPASSENGERS%2CFREIGHT%2CMAIL%2CDISTANCE%2CRAMP_TO_RAMP%2CAIR_TIME%2CUNIQUE_CARRIER%2CAIRLINE_ID%2CUNIQUE_CARRIER_NAME%2CUNIQUE_CARRIER_ENTITY%2CREGION%2CCARRIER%2CCARRIER_NAME%2CCARRIER_GROUP%2CCARRIER_GROUP_NEW%2CORIGIN%2CORIGIN_CITY_NAME%2CORIGIN_CITY_NUM%2CORIGIN_STATE_ABR%2CORIGIN_STATE_FIPS%2CORIGIN_STATE_NM%2CORIGIN_COUNTRY%2CORIGIN_COUNTRY_NAME%2CORIGIN_WAC%2CDEST%2CDEST_CITY_NAME%2CDEST_CITY_NUM%2CDEST_STATE_ABR%2CDEST_STATE_FIPS%2CDEST_STATE_NM%2CDEST_COUNTRY%2CDEST_COUNTRY_NAME%2CDEST_WAC%2CAIRCRAFT_GROUP%2CAIRCRAFT_TYPE%2CAIRCRAFT_CONFIG%2CYEAR%2CQUARTER%2CMONTH%2CDISTANCE_GROUP%2CCLASS%2CDATA_SOURCE&grouplist=&suml=&sumRegion=&filter1=title%3D&filter2=title%3D&geo=All%A0&time=January&timename=Month&GEOGRAPHY=All&XYEAR=2008&FREQUENCY=1&AllVars=All&VarName=DEPARTURES_SCHEDULED&VarDesc=DepScheduled&VarType=Num&VarName=DEPARTURES_PERFORMED&VarDesc=DepPerformed&VarType=Num&VarName=PAYLOAD&VarDesc=Payload&VarType=Num&VarName=SEATS&VarDesc=Seats&VarType=Num&VarName=PASSENGERS&VarDesc=Passengers&VarType=Num&VarName=FREIGHT&Var
Desc=Freight&VarType=Num&VarName=MAIL&VarDesc=Mail&VarType=Num&VarName=DISTANCE&VarDesc=Distance&VarType=Num&VarName=RAMP_TO_RAMP&VarDesc=RampToRamp&VarType=Num&VarName=AIR_TIME&VarDesc=AirTime&VarType=Num&VarName=UNIQUE_CARRIER&VarDesc=UniqueCarrier&VarType=Char&VarName=AIRLINE_ID&VarDesc=AirlineID&VarType=Num&VarName=UNIQUE_CARRIER_NAME&VarDesc=UniqueCarrierName&VarType=Char&VarName=UNIQUE_CARRIER_ENTITY&VarDesc=UniqCarrierEntity&VarType=Char&VarName=REGION&VarDesc=CarrierRegion&VarType=Char&VarName=CARRIER&VarDesc=Carrier&VarType=Char&VarName=CARRIER_NAME&VarDesc=CarrierName&VarType=Char&VarName=CARRIER_GROUP&VarDesc=CarrierGroup&VarType=Num&VarName=CARRIER_GROUP_NEW&VarDesc=CarrierGroupNew&VarType=Num&VarName=ORIGIN&VarDesc=Origin&VarType=Char&VarName=ORIGIN_CITY_NAME&VarDesc=OriginCityName&VarType=Char&VarName=ORIGIN_CITY_NUM&VarDesc=OriginCityNum&VarType=Num&VarName=ORIGIN_STATE_ABR&VarDesc=OriginState&VarType=Char&VarName=ORIGIN_STATE_FIPS&VarDesc=OriginStateFips&VarType=Char&VarName=ORIGIN_STATE_NM&VarDesc=OriginStateName&VarType=Char&VarName=ORIGIN_COUNTRY&VarDesc=OriginCountry&VarType=Char&VarName=ORIGIN_COUNTRY_NAME&VarDesc=OriginCountryName&VarType=Char&VarName=ORIGIN_WAC&VarDesc=OriginWac&VarType=Num&VarName=DEST&VarDesc=Dest&VarType=Char&VarName=DEST_CITY_NAME&VarDesc=DestCityName&VarType=Char&VarName=DEST_CITY_NUM&VarDesc=DestCityNum&VarType=Num&VarName=DEST_STATE_ABR&VarDesc=DestState&VarType=Char&VarName=DEST_STATE_FIPS&VarDesc=DestStateFips&VarType=Char&VarName=DEST_STATE_NM&VarDesc=DestStateName&VarType=Char&VarName=DEST_COUNTRY&VarDesc=DestCountry&VarType=Char&VarName=DEST_COUNTRY_NAME&VarDesc=DestCountryName&VarType=Char&VarName=DEST_WAC&VarDesc=DestWac&VarType=Num&VarName=AIRCRAFT_GROUP&VarDesc=AircraftGroup&VarType=Num&VarName=AIRCRAFT_TYPE&VarDesc=AircraftType&VarType=Char&VarName=AIRCRAFT_CONFIG&VarDesc=AircraftConfig&VarType=Num&VarName=YEAR&VarDesc=Year&VarType=Num&VarName=QUARTER&VarDesc=Quarter&VarType=Num&VarName=MONTH&VarDesc=Month&Var
Type=Num&VarName=DISTANCE_GROUP&VarDesc=DistanceGroup&VarType=Num&VarName=CLASS&VarDesc=Class&VarType=Char&VarName=DATA_SOURCE&VarDesc=DataSource&VarType=Char'
|
114
|
+
t = RemoteTable.new :url => url, :form_data => form_data, :compression => :zip, :glob => '/*.csv'
|
115
|
+
assert_equal 'United States of America', t.rows.first['DEST_COUNTRY_NAME']
|
116
|
+
end
|
102
117
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
118
|
+
should "take the last of values if the header is duplicated" do
|
119
|
+
t = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tujrgUOwDSLWb-P4KCt1qBg')
|
120
|
+
assert_equal '2', t.rows.first['dup_header']
|
121
|
+
end
|
107
122
|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
123
|
+
should "respect field order in CSVs without headers" do
|
124
|
+
t = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA', :skip => 1, :headers => false)
|
125
|
+
last_k = -1
|
126
|
+
saw_string = false
|
127
|
+
t.rows.each do |row|
|
128
|
+
row.each do |k, v|
|
129
|
+
if k.is_a?(Fixnum) and last_k.is_a?(Fixnum)
|
130
|
+
assert !saw_string
|
131
|
+
assert k > last_k
|
132
|
+
end
|
133
|
+
last_k = k
|
134
|
+
saw_string = k.is_a?(String)
|
117
135
|
end
|
118
|
-
last_k = k
|
119
|
-
saw_string = k.is_a?(String)
|
120
136
|
end
|
121
137
|
end
|
122
|
-
end
|
123
138
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
139
|
+
%w{ csv ods xls }.each do |format|
|
140
|
+
eval %{
|
141
|
+
should "read #{format}" do
|
142
|
+
t = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.#{format}')
|
143
|
+
# no blank headers
|
144
|
+
assert t.rows.all? { |row| row.keys.all?(&:present?) }
|
145
|
+
# correct values
|
146
|
+
t.rows.each_with_index do |row, index|
|
147
|
+
assert_equal row.except('row_hash'), @test2_rows[index]
|
148
|
+
end
|
133
149
|
end
|
134
|
-
end
|
135
150
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
151
|
+
should "read #{format}, keeping blank rows" do
|
152
|
+
t = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.#{format}', :keep_blank_rows => true)
|
153
|
+
# no blank headers
|
154
|
+
assert t.rows.all? { |row| row.keys.all?(&:present?) }
|
155
|
+
# correct values
|
156
|
+
t.rows.each_with_index do |row, index|
|
157
|
+
assert_equal row.except('row_hash'), @test2_rows_with_blanks[index]
|
158
|
+
end
|
143
159
|
end
|
144
|
-
|
145
|
-
|
146
|
-
end
|
160
|
+
}
|
161
|
+
end
|
147
162
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
163
|
+
should "read fixed width correctly" do
|
164
|
+
t = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
|
165
|
+
:format => :fixed_width,
|
166
|
+
:skip => 1,
|
167
|
+
:schema => [[ 'header4', 10, { :type => :string } ],
|
168
|
+
[ 'spacer', 1 ],
|
169
|
+
[ 'header5', 10, { :type => :string } ],
|
170
|
+
[ 'spacer', 12 ],
|
171
|
+
[ 'header6', 10, { :type => :string } ]])
|
157
172
|
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
173
|
+
# no blank headers
|
174
|
+
assert t.rows.all? { |row| row.keys.all?(&:present?) }
|
175
|
+
# correct values
|
176
|
+
t.rows.each_with_index do |row, index|
|
177
|
+
assert_equal row.except('row_hash'), @test2_rows[index]
|
178
|
+
end
|
163
179
|
end
|
164
|
-
end
|
165
180
|
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
181
|
+
should "read fixed width correctly, keeping blank rows" do
|
182
|
+
t = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
|
183
|
+
:format => :fixed_width,
|
184
|
+
:keep_blank_rows => true,
|
185
|
+
:skip => 1,
|
186
|
+
:schema => [[ 'header4', 10, { :type => :string } ],
|
187
|
+
[ 'spacer', 1 ],
|
188
|
+
[ 'header5', 10, { :type => :string } ],
|
189
|
+
[ 'spacer', 12 ],
|
190
|
+
[ 'header6', 10, { :type => :string } ]])
|
176
191
|
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
192
|
+
# no blank headers
|
193
|
+
assert t.rows.all? { |row| row.keys.all?(&:present?) }
|
194
|
+
# correct values
|
195
|
+
t.rows.each_with_index do |row, index|
|
196
|
+
assert_equal row.except('row_hash'), @test2_rows_with_blanks[index]
|
197
|
+
end
|
182
198
|
end
|
183
|
-
end
|
184
199
|
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
200
|
+
should "have the same row hash across formats" do
|
201
|
+
csv = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.csv')
|
202
|
+
ods = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.ods')
|
203
|
+
xls = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.xls')
|
204
|
+
fixed_width = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.fixed_width.txt',
|
205
|
+
:format => :fixed_width,
|
206
|
+
:skip => 1,
|
207
|
+
:schema => [[ 'header1', 10, { :type => :string } ],
|
208
|
+
[ 'spacer', 1 ],
|
209
|
+
[ 'header2', 10, { :type => :string } ],
|
210
|
+
[ 'spacer', 12 ],
|
211
|
+
[ 'header3', 10, { :type => :string } ]])
|
197
212
|
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
213
|
+
csv2 = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.csv')
|
214
|
+
ods2 = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.ods')
|
215
|
+
xls2 = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.xls')
|
216
|
+
fixed_width2 = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.fixed_width.txt',
|
217
|
+
:format => :fixed_width,
|
218
|
+
:skip => 1,
|
219
|
+
:schema => [[ 'spacer', 11 ],
|
220
|
+
[ 'header2', 10, { :type => :string } ],
|
221
|
+
[ 'spacer', 1 ],
|
222
|
+
[ 'header3', 10, { :type => :string } ],
|
223
|
+
[ 'spacer', 1 ],
|
224
|
+
[ 'header1', 10, { :type => :string } ]])
|
210
225
|
|
211
226
|
|
212
|
-
|
227
|
+
reference = csv.rows[0]['row_hash']
|
213
228
|
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
229
|
+
# same row hashes
|
230
|
+
assert_equal reference, ods.rows[0]['row_hash']
|
231
|
+
assert_equal reference, xls.rows[0]['row_hash']
|
232
|
+
assert_equal reference, fixed_width.rows[0]['row_hash']
|
233
|
+
# same row hashes with different order
|
234
|
+
assert_equal reference, csv2.rows[0]['row_hash']
|
235
|
+
assert_equal reference, ods2.rows[0]['row_hash']
|
236
|
+
assert_equal reference, xls2.rows[0]['row_hash']
|
237
|
+
assert_equal reference, fixed_width2.rows[0]['row_hash']
|
238
|
+
end
|
224
239
|
|
225
|
-
|
226
|
-
|
240
|
+
should "open an ODS" do
|
241
|
+
t = RemoteTable.new(:url => 'http://www.worldmapper.org/data/opendoc/2_worldmapper_data.ods', :sheet => 'Data', :keep_blank_rows => true)
|
227
242
|
|
228
|
-
|
229
|
-
|
230
|
-
|
243
|
+
assert_equal 'Central Africa', t.rows[5]['name']
|
244
|
+
assert_equal 99, t.rows[5]['MAP DATA population (millions) 2002'].to_i
|
245
|
+
end
|
231
246
|
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
247
|
+
should "open a CSV inside a zip file" do
|
248
|
+
t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv')
|
249
|
+
assert_equal 'ACURA', t.rows.first['Manufacturer']
|
250
|
+
assert_equal 'NSX', t.rows.first['carline name']
|
251
|
+
assert_equal 'TOYOTA', t.rows.last['Manufacturer']
|
252
|
+
assert_equal 'RAV4 SOFT TOP 4WD', t.rows.last['carline name']
|
253
|
+
end
|
239
254
|
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
+
should "open a fixed-width file with an inline schema inside a zip file" do
|
256
|
+
t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
|
257
|
+
:filename => 'Gd6-dsc.txt',
|
258
|
+
:format => :fixed_width,
|
259
|
+
:crop => 21..26, # inclusive
|
260
|
+
:cut => '2-',
|
261
|
+
:select => lambda { |row| /\A[A-Z]/.match row['code'] },
|
262
|
+
:schema => [[ 'code', 2, { :type => :string } ],
|
263
|
+
[ 'spacer', 2 ],
|
264
|
+
[ 'name', 52, { :type => :string } ]])
|
265
|
+
assert_equal 'regular grade gasoline (octane number of 87)', t.rows.first['name']
|
266
|
+
assert_equal 'R', t.rows.first['code']
|
267
|
+
assert_equal 'electricity', t.rows.last['name']
|
268
|
+
assert_equal 'El', t.rows.last['code']
|
269
|
+
end
|
255
270
|
|
256
|
-
|
257
|
-
|
258
|
-
|
271
|
+
should "open an XLS with a parser" do
|
272
|
+
ma_1990_01 = {"month"=>1, "cost"=>"54.0", "locatable"=>"Massachusetts (State)", "year"=>1990}
|
273
|
+
ga_1990_01 = {"month"=>1, "cost"=>"50.7", "locatable"=>"Georgia (State)", "year"=>1990}
|
259
274
|
|
260
|
-
|
261
|
-
|
275
|
+
t = RemoteTable.new(:url => 'http://tonto.eia.doe.gov/dnav/pet/xls/PET_PRI_RESID_A_EPPR_PTA_CPGAL_M.xls',
|
276
|
+
:transform => { :class => FuelOilParser })
|
262
277
|
|
263
|
-
|
264
|
-
|
265
|
-
|
278
|
+
assert t.rows.include?(ma_1990_01)
|
279
|
+
assert t.rows.include?(ga_1990_01)
|
280
|
+
end
|
266
281
|
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
282
|
+
should "provide a row_hash on demand" do
|
283
|
+
t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
|
284
|
+
:filename => 'Gd6-dsc.txt',
|
285
|
+
:format => :fixed_width,
|
286
|
+
:crop => 21..26, # inclusive
|
287
|
+
:cut => '2-',
|
288
|
+
:select => lambda { |row| /\A[A-Z]/.match row['code'] },
|
289
|
+
:schema => [[ 'code', 2, { :type => :string } ],
|
290
|
+
[ 'spacer', 2 ],
|
291
|
+
[ 'name', 52, { :type => :string } ]])
|
292
|
+
assert_equal 'a8a5d7f17b56772723c657eb62b0f238', t.rows.first['row_hash']
|
293
|
+
end
|
278
294
|
end
|
279
295
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 2
|
8
|
-
- 11
|
9
|
-
version: 0.2.11
|
8
|
+
- 12
|
9
|
+
version: 0.2.12
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Seamus Abshere
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-04-
|
18
|
+
date: 2010-04-22 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -74,6 +74,20 @@ dependencies:
|
|
74
74
|
version: 0.99.3
|
75
75
|
type: :runtime
|
76
76
|
version_requirements: *id004
|
77
|
+
- !ruby/object:Gem::Dependency
|
78
|
+
name: nokogiri
|
79
|
+
prerelease: false
|
80
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
segments:
|
85
|
+
- 1
|
86
|
+
- 4
|
87
|
+
- 1
|
88
|
+
version: 1.4.1
|
89
|
+
type: :runtime
|
90
|
+
version_requirements: *id005
|
77
91
|
description: Remotely open and parse Excel XLS, ODS, CSV and fixed-width tables.
|
78
92
|
email: seamus@abshere.net
|
79
93
|
executables: []
|
@@ -95,6 +109,7 @@ files:
|
|
95
109
|
- lib/remote_table/file.rb
|
96
110
|
- lib/remote_table/file/csv.rb
|
97
111
|
- lib/remote_table/file/fixed_width.rb
|
112
|
+
- lib/remote_table/file/html.rb
|
98
113
|
- lib/remote_table/file/ods.rb
|
99
114
|
- lib/remote_table/file/roo_spreadsheet.rb
|
100
115
|
- lib/remote_table/file/xls.rb
|