remote_table 0.2.32 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. data/CHANGELOG +5 -0
  2. data/Gemfile +4 -0
  3. data/Gemfile.lock +65 -0
  4. data/LICENSE +1 -1
  5. data/README.rdoc +21 -7
  6. data/Rakefile +12 -61
  7. data/lib/remote_table/cleaner.rb +19 -0
  8. data/lib/remote_table/executor.rb +29 -0
  9. data/lib/remote_table/format/delimited.rb +62 -0
  10. data/lib/remote_table/format/excel.rb +10 -0
  11. data/lib/remote_table/format/excelx.rb +10 -0
  12. data/lib/remote_table/format/fixed_width.rb +47 -0
  13. data/lib/remote_table/format/html.rb +43 -0
  14. data/lib/remote_table/format/mixins/rooable.rb +47 -0
  15. data/lib/remote_table/format/mixins/textual.rb +34 -0
  16. data/lib/remote_table/format/open_office.rb +10 -0
  17. data/lib/remote_table/format.rb +35 -0
  18. data/lib/remote_table/hasher.rb +25 -0
  19. data/lib/remote_table/local_file.rb +92 -0
  20. data/lib/remote_table/properties.rb +209 -0
  21. data/lib/remote_table/transformer.rb +17 -0
  22. data/lib/remote_table/version.rb +3 -0
  23. data/lib/remote_table.rb +91 -99
  24. data/remote_table.gemspec +32 -77
  25. data/test/{test_helper.rb → helper.rb} +9 -2
  26. data/test/test_big.rb +61 -0
  27. data/test/test_errata.rb +46 -0
  28. data/test/test_old_syntax.rb +229 -0
  29. data/test/test_old_transform.rb +49 -0
  30. data/test/test_remote_table.rb +13 -0
  31. metadata +176 -53
  32. data/VERSION +0 -1
  33. data/lib/remote_table/file/csv.rb +0 -49
  34. data/lib/remote_table/file/fixed_width.rb +0 -19
  35. data/lib/remote_table/file/html.rb +0 -37
  36. data/lib/remote_table/file/ods.rb +0 -11
  37. data/lib/remote_table/file/roo_spreadsheet.rb +0 -44
  38. data/lib/remote_table/file/xls.rb +0 -11
  39. data/lib/remote_table/file/xlsx.rb +0 -11
  40. data/lib/remote_table/file.rb +0 -100
  41. data/lib/remote_table/package.rb +0 -89
  42. data/lib/remote_table/request.rb +0 -44
  43. data/lib/remote_table/transform.rb +0 -58
  44. data/test/remote_table_test.rb +0 -386
@@ -1,89 +0,0 @@
1
- class RemoteTable
2
- class Package
3
- attr_accessor :url, :compression, :packing, :filename, :glob
4
-
5
- def initialize(bus)
6
- @url = bus[:url] or raise "need url"
7
- @compression = bus[:compression] || compression_from_basename
8
- @packing = bus[:packing] || packing_from_basename_and_compression
9
- @filename = bus[:filename] || filename_from_basename_and_compression_and_packing
10
- @glob = bus[:glob]
11
- add_hints!(bus)
12
- end
13
-
14
- def add_hints!(hash)
15
- hash[:filename] = filename unless hash.has_key?(:filename)
16
- end
17
-
18
- def stage(path)
19
- decompress(path)
20
- unpack(path)
21
- identify(path)
22
- file_path(path)
23
- end
24
-
25
- private
26
-
27
- def decompress(path)
28
- return unless compression
29
- cmd, args = case compression
30
- when :zip, :exe
31
- ["unzip", "-d #{Escape.shell_single_word ::File.dirname(path)}"]
32
- when :bz2
33
- 'bunzip2'
34
- when :gz
35
- 'gunzip'
36
- end
37
- move_and_process path, compression, cmd, args
38
- end
39
-
40
- def unpack(path)
41
- return unless packing
42
- cmd, args = case packing
43
- when :tar
44
- ['tar -xf', "-C #{::File.dirname(path)}"]
45
- end
46
- move_and_process path, packing, cmd, args
47
- end
48
-
49
- def move_and_process(path, extname, cmd, args)
50
- new_path = "#{path}.#{extname}"
51
- FileUtils.mv path, new_path
52
- RemoteTable.backtick_with_reporting "#{cmd} #{Escape.shell_single_word new_path} #{args}"
53
- end
54
-
55
- # ex. A: 2007-01.csv.gz (compression not capable of storing multiple files)
56
- # ex. B: 2007-01.tar.gz (packing)
57
- # ex. C: 2007-01.zip (compression capable of storing multiple files)
58
- def identify(path)
59
- if glob.present?
60
- FileUtils.mv Dir[::File.dirname(path) + glob].first, file_path(path)
61
- elsif !packing and [ nil, :bz2, :gz ].include?(compression)
62
- FileUtils.mv path, file_path(path)
63
- end
64
- end
65
-
66
- def file_path(path)
67
- ::File.join(::File.dirname(path), filename)
68
- end
69
-
70
- def basename_parts
71
- ::File.basename(URI.parse(url).path).split('.').map(&:to_sym)
72
- end
73
-
74
- def compression_from_basename
75
- [ :zip, :exe, :bz2, :gz ].detect { |i| i == basename_parts.last }
76
- end
77
-
78
- def packing_from_basename_and_compression
79
- [ :tar ].detect { |i| i == ((basename_parts.last == compression) ? basename_parts[-2] : basename_parts.last) }
80
- end
81
-
82
- def filename_from_basename_and_compression_and_packing
83
- ary = basename_parts
84
- ary.pop if ary.last == compression
85
- ary.pop if ary.last == packing
86
- ary.join('.')
87
- end
88
- end
89
- end
@@ -1,44 +0,0 @@
1
- class RemoteTable
2
- class Request
3
- attr_accessor :parsed_url, :post_data, :username, :password
4
- attr_accessor :form_data
5
-
6
- # TODO: support HTTP basic auth
7
- def initialize(bus)
8
- raise(ArgumentError, "RemoteTable needs :url option") unless bus[:url].present?
9
- @parsed_url = URI.parse bus[:url]
10
- if @parsed_url.host == 'spreadsheets.google.com'
11
- if bus[:format].blank? or bus[:format].to_s == 'csv'
12
- @parsed_url.query = 'output=csv&' + @parsed_url.query.sub(/\&?output=.*?(\&|\z)/, '\1')
13
- end
14
- end
15
- @form_data = bus[:form_data]
16
- end
17
-
18
- def download
19
- path = ::File.join staging_dir_path, 'REMOTE_TABLE_PACKAGE'
20
- if parsed_url.scheme == 'file'
21
- ::FileUtils.cp parsed_url.path, path
22
- else
23
- RemoteTable.backtick_with_reporting %{
24
- curl
25
- --header "Expect: "
26
- --location
27
- #{"--data #{Escape.shell_single_word form_data}" if form_data.present?}
28
- #{Escape.shell_single_word parsed_url.to_s}
29
- --output #{Escape.shell_single_word path}
30
- 2>&1
31
- }
32
- end
33
- path
34
- end
35
-
36
- def staging_dir_path
37
- return @_staging_dir_path if @_staging_dir_path
38
- @_staging_dir_path = ::File.join Dir.tmpdir, 'remote_table_gem', rand.to_s
39
- FileUtils.mkdir_p @_staging_dir_path
40
- RemoteTable.remove_at_exit @_staging_dir_path
41
- @_staging_dir_path
42
- end
43
- end
44
- end
@@ -1,58 +0,0 @@
1
- class RemoteTable
2
- class Transform
3
- attr_accessor :select, :reject, :transform_class, :transform_options, :transform, :raw_table
4
- attr_accessor :errata
5
-
6
- def initialize(bus)
7
- if transform_params = bus.delete(:transform)
8
- @transform_class = transform_params.delete(:class)
9
- @transform_options = transform_params
10
- @transform = @transform_class.new(@transform_options)
11
- @transform.add_hints!(bus)
12
- end
13
- @select = bus[:select]
14
- @reject = bus[:reject]
15
- @errata = bus[:errata]
16
- end
17
-
18
- # the null transformation
19
- def apply(raw_table)
20
- self.raw_table = raw_table
21
- self
22
- end
23
-
24
- # - convert it to a plain hash for whatever ruby version you're on
25
- # - dump it
26
- # - digest it
27
- def self.row_hash(row)
28
- plain_hsh = if RUBY_VERSION >= '1.9'
29
- row.keys.sort.inject(::Hash.new) do |memo, key|
30
- value = row[key]
31
- key = key.to_s.toutf8
32
- value = value.to_s.toutf8 if value.respond_to? :to_s
33
- memo[key] = value
34
- memo
35
- end
36
- else
37
- ::Hash.new.replace(row)
38
- end
39
- ::Digest::MD5.hexdigest ::Marshal.dump(plain_hsh)
40
- end
41
-
42
- def each_row(&block)
43
- raw_table.each_row do |row|
44
- row['row_hash'] = self.class.row_hash(row)
45
- virtual_rows = transform ? transform.apply(row) : row # allow transform.apply(row) to return multiple rows
46
- Array.wrap(virtual_rows).each do |virtual_row|
47
- if errata
48
- next if errata.rejects? virtual_row
49
- errata.correct! virtual_row
50
- end
51
- next if select and !select.call(virtual_row)
52
- next if reject and reject.call(virtual_row)
53
- yield virtual_row
54
- end
55
- end
56
- end
57
- end
58
- end
@@ -1,386 +0,0 @@
1
- require 'test_helper'
2
-
3
- class FuelOilParser
4
- def initialize(options = {})
5
- # nothing
6
- end
7
- def add_hints!(bus)
8
- bus[:sheet] = 'Data 1'
9
- bus[:skip] = 2
10
- bus[:select] = lambda { |row| row['year'] > 1989 }
11
- end
12
- def apply(row)
13
- virtual_rows = []
14
- row.keys.grep(/(.+) Residual Fuel Oil/) do |location_column_name|
15
- first_part = $1
16
- next if (cost = row[location_column_name]).blank? or (date = row['Date']).blank?
17
- if first_part.start_with?('U.S.')
18
- locatable = "united_states (Country)"
19
- elsif first_part.include?('PADD')
20
- /\(PADD (.*)\)/.match(first_part)
21
- padd_part = $1
22
- next if padd_part == '1' # skip PADD 1 because we always prefer subdistricts
23
- locatable = "#{padd_part} (PetroleumAdministrationForDefenseDistrict)"
24
- else
25
- locatable = "#{first_part} (State)"
26
- end
27
- date = Time.parse(date)
28
- virtual_rows << {
29
- 'locatable' => locatable,
30
- 'cost' => cost,
31
- 'year' => date.year,
32
- 'month' => date.month
33
- }
34
- end
35
- virtual_rows
36
- end
37
- end
38
-
39
- class AircraftGuru
40
- def is_a_dc_plane?(row)
41
- row['Designator'] =~ /^DC\d/i
42
- end
43
-
44
- # def is_a_crj_900?(row)
45
- # row['Designator'].downcase == 'crj9'
46
- # end
47
-
48
- def is_a_g159?(row)
49
- row['Designator'] =~ /^G159$/
50
- end
51
-
52
- def is_a_galx?(row)
53
- row['Designator'] =~ /^GALX$/
54
- end
55
-
56
- def method_missing(method_id, *args, &block)
57
- if method_id.to_s =~ /\Ais_n?o?t?_?attributed_to_([^\?]+)/
58
- manufacturer_name = $1
59
- manufacturer_regexp = Regexp.new(manufacturer_name.gsub('_', ' ?'), Regexp::IGNORECASE)
60
- matches = manufacturer_regexp.match(args.first['Manufacturer']) # row['Manufacturer'] =~ /mcdonnell douglas/i
61
- method_id.to_s.include?('not_attributed') ? matches.nil? : !matches.nil?
62
- else
63
- super
64
- end
65
- end
66
- end
67
-
68
- class RemoteTableTest < Test::Unit::TestCase
69
- def setup
70
- @test2_rows_with_blanks = [
71
- { 'header4' => '', 'header5' => '', 'header6' => '' },
72
- { 'header4' => '1 at 4', 'header5' => '1 at 5', 'header6' => '1 at 6' },
73
- { 'header4' => '', 'header5' => '', 'header6' => '' },
74
- { 'header4' => '2 at 4', 'header5' => '2 at 5', 'header6' => '2 at 6' },
75
- ]
76
- @test2_rows = [
77
- { 'header4' => '1 at 4', 'header5' => '1 at 5', 'header6' => '1 at 6' },
78
- { 'header4' => '2 at 4', 'header5' => '2 at 5', 'header6' => '2 at 6' },
79
- ]
80
- end
81
-
82
- if ENV['ALL'] == 'true' or ENV['SLOW'] == 'true'
83
- should "open an XLS inside a zip file" do
84
- t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls')
85
- assert_equal 'ACURA', t.rows.first['Manufacturer']
86
- assert_equal 'NSX', t.rows.first['carline name']
87
- assert_equal 'VOLVO', t.rows.last['Manufacturer']
88
- assert_equal 'V70 XC AWD', t.rows.last['carline name']
89
- end
90
-
91
- should "not have indifferent string/symbol hash access" do
92
- t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls')
93
- assert_equal 'ACURA', t.rows.first['Manufacturer']
94
- assert_equal nil, t.rows.first[:Manufacturer]
95
- end
96
-
97
- should "open a CSV inside a zip file" do
98
- t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv')
99
- assert_equal 'ACURA', t.rows.first['Manufacturer']
100
- assert_equal 'NSX', t.rows.first['carline name']
101
- assert_equal 'TOYOTA', t.rows.last['Manufacturer']
102
- assert_equal 'RAV4 SOFT TOP 4WD', t.rows.last['carline name']
103
- end
104
-
105
- should "open a fixed-width file with an inline schema inside a zip file" do
106
- t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
107
- :filename => 'Gd6-dsc.txt',
108
- :format => :fixed_width,
109
- :crop => 21..26, # inclusive
110
- :cut => '2-',
111
- :select => lambda { |row| /\A[A-Z]/.match row['code'] },
112
- :schema => [[ 'code', 2, { :type => :string } ],
113
- [ 'spacer', 2 ],
114
- [ 'name', 52, { :type => :string } ]])
115
- assert_equal 'regular grade gasoline (octane number of 87)', t.rows.first['name']
116
- assert_equal 'R', t.rows.first['code']
117
- assert_equal 'electricity', t.rows.last['name']
118
- assert_equal 'El', t.rows.last['code']
119
- end
120
-
121
- should "send form data, follow redirects and use a filename glob" do
122
- url = 'http://www.transtats.bts.gov/DownLoad_Table.asp?Table_ID=293&Has_Group=3&Is_Zipped=0'
123
- form_data = 'UserTableName=T_100_Segment__All_Carriers&DBShortName=Air_Carriers&RawDataTable=T_T100_SEGMENT_ALL_CARRIER&sqlstr=+SELECT+DEPARTURES_SCHEDULED%2CDEPARTURES_PERFORMED%2CPAYLOAD%2CSEATS%2CPASSENGERS%2CFREIGHT%2CMAIL%2CDISTANCE%2CRAMP_TO_RAMP%2CAIR_TIME%2CUNIQUE_CARRIER%2CAIRLINE_ID%2CUNIQUE_CARRIER_NAME%2CUNIQUE_CARRIER_ENTITY%2CREGION%2CCARRIER%2CCARRIER_NAME%2CCARRIER_GROUP%2CCARRIER_GROUP_NEW%2CORIGIN%2CORIGIN_CITY_NAME%2CORIGIN_CITY_NUM%2CORIGIN_STATE_ABR%2CORIGIN_STATE_FIPS%2CORIGIN_STATE_NM%2CORIGIN_COUNTRY%2CORIGIN_COUNTRY_NAME%2CORIGIN_WAC%2CDEST%2CDEST_CITY_NAME%2CDEST_CITY_NUM%2CDEST_STATE_ABR%2CDEST_STATE_FIPS%2CDEST_STATE_NM%2CDEST_COUNTRY%2CDEST_COUNTRY_NAME%2CDEST_WAC%2CAIRCRAFT_GROUP%2CAIRCRAFT_TYPE%2CAIRCRAFT_CONFIG%2CYEAR%2CQUARTER%2CMONTH%2CDISTANCE_GROUP%2CCLASS%2CDATA_SOURCE+FROM++T_T100_SEGMENT_ALL_CARRIER+WHERE+Month+%3D1+AND+YEAR%3D2008&varlist=DEPARTURES_SCHEDULED%2CDEPARTURES_PERFORMED%2CPAYLOAD%2CSEATS%2CPASSENGERS%2CFREIGHT%2CMAIL%2CDISTANCE%2CRAMP_TO_RAMP%2CAIR_TIME%2CUNIQUE_CARRIER%2CAIRLINE_ID%2CUNIQUE_CARRIER_NAME%2CUNIQUE_CARRIER_ENTITY%2CREGION%2CCARRIER%2CCARRIER_NAME%2CCARRIER_GROUP%2CCARRIER_GROUP_NEW%2CORIGIN%2CORIGIN_CITY_NAME%2CORIGIN_CITY_NUM%2CORIGIN_STATE_ABR%2CORIGIN_STATE_FIPS%2CORIGIN_STATE_NM%2CORIGIN_COUNTRY%2CORIGIN_COUNTRY_NAME%2CORIGIN_WAC%2CDEST%2CDEST_CITY_NAME%2CDEST_CITY_NUM%2CDEST_STATE_ABR%2CDEST_STATE_FIPS%2CDEST_STATE_NM%2CDEST_COUNTRY%2CDEST_COUNTRY_NAME%2CDEST_WAC%2CAIRCRAFT_GROUP%2CAIRCRAFT_TYPE%2CAIRCRAFT_CONFIG%2CYEAR%2CQUARTER%2CMONTH%2CDISTANCE_GROUP%2CCLASS%2CDATA_SOURCE&grouplist=&suml=&sumRegion=&filter1=title%3D&filter2=title%3D&geo=All%A0&time=January&timename=Month&GEOGRAPHY=All&XYEAR=2008&FREQUENCY=1&AllVars=All&VarName=DEPARTURES_SCHEDULED&VarDesc=DepScheduled&VarType=Num&VarName=DEPARTURES_PERFORMED&VarDesc=DepPerformed&VarType=Num&VarName=PAYLOAD&VarDesc=Payload&VarType=Num&VarName=SEATS&VarDesc=Seats&VarType=Num&VarName=PASSENGERS&VarDesc=Passengers&VarType=Num&VarName=FREIGHT&V
arDesc=Freight&VarType=Num&VarName=MAIL&VarDesc=Mail&VarType=Num&VarName=DISTANCE&VarDesc=Distance&VarType=Num&VarName=RAMP_TO_RAMP&VarDesc=RampToRamp&VarType=Num&VarName=AIR_TIME&VarDesc=AirTime&VarType=Num&VarName=UNIQUE_CARRIER&VarDesc=UniqueCarrier&VarType=Char&VarName=AIRLINE_ID&VarDesc=AirlineID&VarType=Num&VarName=UNIQUE_CARRIER_NAME&VarDesc=UniqueCarrierName&VarType=Char&VarName=UNIQUE_CARRIER_ENTITY&VarDesc=UniqCarrierEntity&VarType=Char&VarName=REGION&VarDesc=CarrierRegion&VarType=Char&VarName=CARRIER&VarDesc=Carrier&VarType=Char&VarName=CARRIER_NAME&VarDesc=CarrierName&VarType=Char&VarName=CARRIER_GROUP&VarDesc=CarrierGroup&VarType=Num&VarName=CARRIER_GROUP_NEW&VarDesc=CarrierGroupNew&VarType=Num&VarName=ORIGIN&VarDesc=Origin&VarType=Char&VarName=ORIGIN_CITY_NAME&VarDesc=OriginCityName&VarType=Char&VarName=ORIGIN_CITY_NUM&VarDesc=OriginCityNum&VarType=Num&VarName=ORIGIN_STATE_ABR&VarDesc=OriginState&VarType=Char&VarName=ORIGIN_STATE_FIPS&VarDesc=OriginStateFips&VarType=Char&VarName=ORIGIN_STATE_NM&VarDesc=OriginStateName&VarType=Char&VarName=ORIGIN_COUNTRY&VarDesc=OriginCountry&VarType=Char&VarName=ORIGIN_COUNTRY_NAME&VarDesc=OriginCountryName&VarType=Char&VarName=ORIGIN_WAC&VarDesc=OriginWac&VarType=Num&VarName=DEST&VarDesc=Dest&VarType=Char&VarName=DEST_CITY_NAME&VarDesc=DestCityName&VarType=Char&VarName=DEST_CITY_NUM&VarDesc=DestCityNum&VarType=Num&VarName=DEST_STATE_ABR&VarDesc=DestState&VarType=Char&VarName=DEST_STATE_FIPS&VarDesc=DestStateFips&VarType=Char&VarName=DEST_STATE_NM&VarDesc=DestStateName&VarType=Char&VarName=DEST_COUNTRY&VarDesc=DestCountry&VarType=Char&VarName=DEST_COUNTRY_NAME&VarDesc=DestCountryName&VarType=Char&VarName=DEST_WAC&VarDesc=DestWac&VarType=Num&VarName=AIRCRAFT_GROUP&VarDesc=AircraftGroup&VarType=Num&VarName=AIRCRAFT_TYPE&VarDesc=AircraftType&VarType=Char&VarName=AIRCRAFT_CONFIG&VarDesc=AircraftConfig&VarType=Num&VarName=YEAR&VarDesc=Year&VarType=Num&VarName=QUARTER&VarDesc=Quarter&VarType=Num&VarName=MONTH&VarDesc=Month&V
arType=Num&VarName=DISTANCE_GROUP&VarDesc=DistanceGroup&VarType=Num&VarName=CLASS&VarDesc=Class&VarType=Char&VarName=DATA_SOURCE&VarDesc=DataSource&VarType=Char'
124
- t = RemoteTable.new :url => url, :form_data => form_data, :compression => :zip, :glob => '/*.csv'
125
- assert_equal 'United States of America', t.rows.first['DEST_COUNTRY_NAME']
126
- end
127
- end
128
-
129
- if ENV['ALL'] == 'true' or ENV['NEW'] == 'true'
130
- end
131
-
132
- if ENV['ALL'] == 'true' or ENV['FAST'] == 'true'
133
- should "open an XLSX like an array (numbered columns)" do
134
- t = RemoteTable.new(:url => 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :headers => false)
135
- assert_equal "Secure encryption of all data", t.rows[5][0]
136
- end
137
-
138
- should "open an XLSX with custom headers" do
139
- t = RemoteTable.new(:url => 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :headers => %w{foo bar baz})
140
- assert_equal "Secure encryption of all data", t.rows[5]['foo']
141
- end
142
-
143
- should "open an XLSX" do
144
- t = RemoteTable.new(:url => 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx')
145
- assert_equal "Secure encryption of all data", t.rows[5]["Requirements"]
146
- end
147
-
148
- should "work on filenames with spaces, using globbing" do
149
- t = RemoteTable.new :url => 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :glob => '/*.csv'
150
- assert_equal 'ASTON MARTIN', t.rows.first['MFR']
151
- end
152
-
153
- should "work on filenames with spaces" do
154
- t = RemoteTable.new :url => 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv'
155
- assert_equal 'ASTON MARTIN', t.rows.first['MFR']
156
- end
157
-
158
- should "ignore UTF-8 byte order marks" do
159
- t = RemoteTable.new :url => 'http://www.freebase.com/type/exporttypeinstances/base/horses/horse_breed?page=0&filter_mode=type&filter_view=table&show%01p%3D%2Ftype%2Fobject%2Fname%01index=0&show%01p%3D%2Fcommon%2Ftopic%2Fimage%01index=1&show%01p%3D%2Fcommon%2Ftopic%2Farticle%01index=2&sort%01p%3D%2Ftype%2Fobject%2Ftype%01p%3Dlink%01p%3D%2Ftype%2Flink%2Ftimestamp%01index=false&=&exporttype=csv-8'
160
- assert_equal 'Tawleed', t.rows.first['name']
161
- end
162
-
163
- should "be able to apply errata files" do
164
- t = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-G.htm",
165
- :encoding => 'windows-1252',
166
- :row_xpath => '//table/tr[2]/td/table/tr',
167
- :column_xpath => 'td',
168
- :errata => Errata.new(:table => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw'),
169
- :responder => AircraftGuru.new)
170
- g1 = t.rows.detect { |row| row['Model'] =~ /Gulfstream I/ }
171
- assert g1
172
- assert_equal 'GULFSTREAM AEROSPACE', g1['Manufacturer']
173
- assert_equal 'Gulfstream I', g1['Model']
174
- end
175
-
176
- # this will die with an error about libcurl if your curl doesn't support ssl
177
- should "connect using HTTPS if available" do
178
- t = RemoteTable.new(:url => 'https://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA')
179
- assert_equal 'Gulf Coast', t.rows.first['PAD district name']
180
- assert_equal 'AL', t.rows.first['State']
181
- assert_equal 'Rocky Mountain', t.rows.last['PAD district name']
182
- assert_equal 'WY', t.rows.last['State']
183
- end
184
-
185
- should "read an HTML table made with frontpage" do
186
- t = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-E.htm",
187
- :encoding => 'US-ASCII',
188
- :row_xpath => '//table/tr[2]/td/table/tr',
189
- :column_xpath => 'td'
190
- assert_equal 'E110', t.rows.first['Designator']
191
- assert_equal 'EMBRAER', t.rows.first['Manufacturer']
192
- assert_equal 'EZKC', t.rows.last['Designator']
193
- assert_equal 'EZ King Cobra', t.rows.last['Model']
194
- end
195
-
196
- should "hash rows without paying attention to order" do
197
- x = ActiveSupport::OrderedHash.new
198
- x[:a] = 1
199
- x[:b] = 2
200
-
201
- y = ActiveSupport::OrderedHash.new
202
- y[:b] = 2
203
- y[:a] = 1
204
-
205
- assert_not_equal Marshal.dump(x), Marshal.dump(y)
206
- assert_equal RemoteTable::Transform.row_hash(x), RemoteTable::Transform.row_hash(y)
207
- end
208
-
209
- should "open a Google Docs url (as a CSV)" do
210
- t = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA')
211
- assert_equal 'Gulf Coast', t.rows.first['PAD district name']
212
- assert_equal 'AL', t.rows.first['State']
213
- assert_equal 'Rocky Mountain', t.rows.last['PAD district name']
214
- assert_equal 'WY', t.rows.last['State']
215
- end
216
-
217
- should "open a Google Docs url (as a CSV, with sheet options)" do
218
- t = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA&single=true&gid=0')
219
- assert_equal 'Gulf Coast', t.rows.first['PAD district name']
220
- assert_equal 'AL', t.rows.first['State']
221
- assert_equal 'Rocky Mountain', t.rows.last['PAD district name']
222
- assert_equal 'WY', t.rows.last['State']
223
- end
224
-
225
- should "open a Google Docs url as a CSV without headers" do
226
- t = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA', :skip => 1, :headers => false)
227
- assert_equal 'AL', t.rows.first[0]
228
- assert_equal 'Gulf Coast', t.rows.first[4]
229
- assert_equal 'WY', t.rows.last[0]
230
- assert_equal 'Rocky Mountain', t.rows.last[4]
231
- end
232
-
233
- should "take the last of values if the header is duplicated" do
234
- t = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tujrgUOwDSLWb-P4KCt1qBg')
235
- assert_equal '2', t.rows.first['dup_header']
236
- end
237
-
238
- should "respect field order in CSVs without headers" do
239
- t = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA', :skip => 1, :headers => false)
240
- last_k = -1
241
- saw_string = false
242
- t.rows.each do |row|
243
- row.each do |k, v|
244
- if k.is_a?(Fixnum) and last_k.is_a?(Fixnum)
245
- assert !saw_string
246
- assert k > last_k
247
- end
248
- last_k = k
249
- saw_string = k.is_a?(String)
250
- end
251
- end
252
- end
253
-
254
- %w{ csv ods xls }.each do |format|
255
- eval %{
256
- should "read #{format}" do
257
- t = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.#{format}')
258
- # no blank headers
259
- assert t.rows.all? { |row| row.keys.all?(&:present?) }
260
- # correct values
261
- t.rows.each_with_index do |row, index|
262
- assert_equal row.except('row_hash'), @test2_rows[index]
263
- end
264
- end
265
-
266
- should "read #{format}, keeping blank rows" do
267
- t = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.#{format}', :keep_blank_rows => true)
268
- # no blank headers
269
- assert t.rows.all? { |row| row.keys.all?(&:present?) }
270
- # correct values
271
- t.rows.each_with_index do |row, index|
272
- assert_equal row.except('row_hash'), @test2_rows_with_blanks[index]
273
- end
274
- end
275
- }
276
- end
277
-
278
- should "read fixed width correctly" do
279
- t = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
280
- :format => :fixed_width,
281
- :skip => 1,
282
- :schema => [[ 'header4', 10, { :type => :string } ],
283
- [ 'spacer', 1 ],
284
- [ 'header5', 10, { :type => :string } ],
285
- [ 'spacer', 12 ],
286
- [ 'header6', 10, { :type => :string } ]])
287
-
288
- # no blank headers
289
- assert t.rows.all? { |row| row.keys.all?(&:present?) }
290
- # correct values
291
- t.rows.each_with_index do |row, index|
292
- assert_equal row.except('row_hash'), @test2_rows[index]
293
- end
294
- end
295
-
296
- should "read fixed width correctly, keeping blank rows" do
297
- t = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
298
- :format => :fixed_width,
299
- :keep_blank_rows => true,
300
- :skip => 1,
301
- :schema => [[ 'header4', 10, { :type => :string } ],
302
- [ 'spacer', 1 ],
303
- [ 'header5', 10, { :type => :string } ],
304
- [ 'spacer', 12 ],
305
- [ 'header6', 10, { :type => :string } ]])
306
-
307
- # no blank headers
308
- assert t.rows.all? { |row| row.keys.all?(&:present?) }
309
- # correct values
310
- t.rows.each_with_index do |row, index|
311
- assert_equal row.except('row_hash'), @test2_rows_with_blanks[index]
312
- end
313
- end
314
-
315
- should "have the same row hash across formats" do
316
- csv = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.csv')
317
- ods = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.ods')
318
- xls = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.xls')
319
- fixed_width = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.fixed_width.txt',
320
- :format => :fixed_width,
321
- :skip => 1,
322
- :schema => [[ 'header1', 10, { :type => :string } ],
323
- [ 'spacer', 1 ],
324
- [ 'header2', 10, { :type => :string } ],
325
- [ 'spacer', 12 ],
326
- [ 'header3', 10, { :type => :string } ]])
327
-
328
- csv2 = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.csv')
329
- ods2 = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.ods')
330
- xls2 = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.xls')
331
- fixed_width2 = RemoteTable.new(:url => 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.fixed_width.txt',
332
- :format => :fixed_width,
333
- :skip => 1,
334
- :schema => [[ 'spacer', 11 ],
335
- [ 'header2', 10, { :type => :string } ],
336
- [ 'spacer', 1 ],
337
- [ 'header3', 10, { :type => :string } ],
338
- [ 'spacer', 1 ],
339
- [ 'header1', 10, { :type => :string } ]])
340
-
341
-
342
- reference = csv.rows[0]['row_hash']
343
-
344
- # same row hashes
345
- assert_equal reference, ods.rows[0]['row_hash']
346
- assert_equal reference, xls.rows[0]['row_hash']
347
- assert_equal reference, fixed_width.rows[0]['row_hash']
348
- # same row hashes with different order
349
- assert_equal reference, csv2.rows[0]['row_hash']
350
- assert_equal reference, ods2.rows[0]['row_hash']
351
- assert_equal reference, xls2.rows[0]['row_hash']
352
- assert_equal reference, fixed_width2.rows[0]['row_hash']
353
- end
354
-
355
- should "open an ODS" do
356
- t = RemoteTable.new(:url => 'http://www.worldmapper.org/data/opendoc/2_worldmapper_data.ods', :sheet => 'Data', :keep_blank_rows => true)
357
-
358
- assert_equal 'Central Africa', t.rows[5]['name']
359
- assert_equal 99, t.rows[5]['MAP DATA population (millions) 2002'].to_i
360
- end
361
-
362
- should "open an XLS with a parser" do
363
- ma_1990_01 = {"month"=>1, "cost"=>"54.0", "locatable"=>"Massachusetts (State)", "year"=>1990}
364
- ga_1990_01 = {"month"=>1, "cost"=>"50.7", "locatable"=>"Georgia (State)", "year"=>1990}
365
-
366
- t = RemoteTable.new(:url => 'http://tonto.eia.doe.gov/dnav/pet/xls/PET_PRI_RESID_A_EPPR_PTA_CPGAL_M.xls',
367
- :transform => { :class => FuelOilParser })
368
-
369
- assert t.rows.include?(ma_1990_01)
370
- assert t.rows.include?(ga_1990_01)
371
- end
372
-
373
- # should "provide a row_hash on demand" do
374
- # t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
375
- # :filename => 'Gd6-dsc.txt',
376
- # :format => :fixed_width,
377
- # :crop => 21..26, # inclusive
378
- # :cut => '2-',
379
- # :select => lambda { |row| /\A[A-Z]/.match row['code'] },
380
- # :schema => [[ 'code', 2, { :type => :string } ],
381
- # [ 'spacer', 2 ],
382
- # [ 'name', 52, { :type => :string } ]])
383
- # assert_equal 'a8a5d7f17b56772723c657eb62b0f238', t.rows.first['row_hash']
384
- # end
385
- end
386
- end