remote_table 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/CHANGELOG +19 -0
- data/Gemfile +7 -1
- data/README.markdown +440 -0
- data/Rakefile +6 -14
- data/lib/remote_table.rb +27 -38
- data/lib/remote_table/{properties.rb → config.rb} +39 -43
- data/lib/remote_table/format.rb +24 -27
- data/lib/remote_table/format/delimited.rb +17 -21
- data/lib/remote_table/format/fixed_width.rb +9 -9
- data/lib/remote_table/format/html.rb +0 -2
- data/lib/remote_table/format/mixins/processed_by_nokogiri.rb +13 -12
- data/lib/remote_table/format/mixins/processed_by_roo.rb +17 -13
- data/lib/remote_table/format/mixins/textual.rb +13 -13
- data/lib/remote_table/format/open_office.rb +3 -0
- data/lib/remote_table/format/xml.rb +0 -2
- data/lib/remote_table/format/yaml.rb +14 -0
- data/lib/remote_table/local_file.rb +69 -7
- data/lib/remote_table/transformer.rb +7 -4
- data/lib/remote_table/version.rb +1 -1
- data/remote_table.gemspec +5 -13
- data/test/fixtures/data.yml +4 -0
- data/test/helper.rb +8 -9
- data/test/test_big.rb +43 -53
- data/test/test_errata.rb +27 -25
- data/test/test_old_syntax.rb +193 -191
- data/test/test_old_transform.rb +12 -10
- data/test/test_remote_table.rb +57 -47
- metadata +48 -64
- data/.document +0 -5
- data/README.rdoc +0 -167
- data/lib/remote_table/utils.rb +0 -157
data/README.rdoc
DELETED
@@ -1,167 +0,0 @@
|
|
1
|
-
=remote_table
|
2
|
-
|
3
|
-
Open local or remote XLSX, XLS, ODS, CSV and fixed-width files.
|
4
|
-
|
5
|
-
==Real-life usage
|
6
|
-
|
7
|
-
Used by http://data.brighterplanet.com and the data_miner gem (http://github.com/seamusabshere/data_miner)
|
8
|
-
|
9
|
-
==Requirements
|
10
|
-
|
11
|
-
* POSIX operating system (not windows)
|
12
|
-
* curl, iconv, perl, cat, cut, tail, etc. accessible from /usr/local/bin:/usr/bin:/bin
|
13
|
-
|
14
|
-
As this library matures, those should go away.
|
15
|
-
|
16
|
-
==Example
|
17
|
-
|
18
|
-
?> t = RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv'
|
19
|
-
=> #<RemoteTable:0x359da50 [...]>
|
20
|
-
?> t[0]
|
21
|
-
=> {"cyl"=>"6", "eng dscr"=>"DOHC VTEC", "trans dscr"=>"2MODE CLKUP", "trans"=>"Auto(L4)", "cmb"=>"20", "2pv"=>nil, "carline name"=>"NSX", "displ"=>"3.0", "ucmb"=>"23.5311", "hpv"=>nil, "4pv"=>nil, "Class"=>"TWO SEATERS", "Manufacturer"=>"ACURA", "fl"=>"P", "2lv"=>nil, "G"=>nil, "hlv"=>nil, "drv"=>"R", "cty"=>"18", "ucty"=>"19.8733", "S"=>nil, "4lv"=>nil, "fcost"=>"1050", "T"=>nil, "hwy"=>"24", "uhwy"=>"30.3612"}
|
22
|
-
|
23
|
-
More examples:
|
24
|
-
|
25
|
-
RemoteTable.new "file://#{f.path}", :quote_char => %{'}, :headers => nil
|
26
|
-
|
27
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.csv'
|
28
|
-
|
29
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.ods'
|
30
|
-
|
31
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.xls'
|
32
|
-
|
33
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.csv'
|
34
|
-
|
35
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.ods'
|
36
|
-
|
37
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.xls'
|
38
|
-
|
39
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.#{format}'
|
40
|
-
|
41
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.#{format}', :keep_blank_rows => true
|
42
|
-
|
43
|
-
RemoteTable.new 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA&single=true&gid=0'
|
44
|
-
|
45
|
-
RemoteTable.new 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA'
|
46
|
-
|
47
|
-
RemoteTable.new 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA', :skip => 1, :headers => false
|
48
|
-
|
49
|
-
RemoteTable.new 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw'
|
50
|
-
|
51
|
-
RemoteTable.new 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :headers => %w{ col1 col2 col3 }
|
52
|
-
|
53
|
-
RemoteTable.new 'http://spreadsheets.google.com/pub?key=tujrgUOwDSLWb-P4KCt1qBg'
|
54
|
-
|
55
|
-
RemoteTable.new 'http://tonto.eia.doe.gov/dnav/pet/xls/PET_PRI_RESID_A_EPPR_PTA_CPGAL_M.xls', :transform => { :class => FuelOilParser }
|
56
|
-
|
57
|
-
RemoteTable.new 'http://www.freebase.com/type/exporttypeinstances/base/horses/horse_breed?page=0&filter_mode=type&filter_view=table&show%01p%3D%2Ftype%2Fobject%2Fname%01index=0&show%01p%3D%2Fcommon%2Ftopic%2Fimage%01index=1&show%01p%3D%2Fcommon%2Ftopic%2Farticle%01index=2&sort%01p%3D%2Ftype%2Fobject%2Ftype%01p%3Dlink%01p%3D%2Ftype%2Flink%2Ftimestamp%01index=false&=&exporttype=csv-8'
|
58
|
-
|
59
|
-
RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls'
|
60
|
-
|
61
|
-
RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv'
|
62
|
-
|
63
|
-
RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :glob => '/*.csv'
|
64
|
-
|
65
|
-
RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv'
|
66
|
-
|
67
|
-
RemoteTable.new 'http://www.worldmapper.org/data/opendoc/2_worldmapper_data.ods', :sheet => 'Data', :keep_blank_rows => true
|
68
|
-
|
69
|
-
RemoteTable.new 'https://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA'
|
70
|
-
|
71
|
-
RemoteTable.new 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx'
|
72
|
-
|
73
|
-
RemoteTable.new 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :headers => %w{foo bar baz}
|
74
|
-
|
75
|
-
RemoteTable.new 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :headers => false
|
76
|
-
|
77
|
-
RemoteTable.new 'http://www.transtats.bts.gov/DownLoad_Table.asp?Table_ID=293&Has_Group=3&Is_Zipped=0', :form_data => 'UserTableName=T_100_Segment__All_Carriers&[...]', :compression => :zip, :glob => '/*.csv'
|
78
|
-
|
79
|
-
RemoteTable.new "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-E.htm",
|
80
|
-
:encoding => 'US-ASCII',
|
81
|
-
:row_xpath => '//table/tr[2]/td/table/tr',
|
82
|
-
:column_xpath => 'td'
|
83
|
-
|
84
|
-
RemoteTable.new "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-G.htm",
|
85
|
-
:encoding => 'windows-1252',
|
86
|
-
:row_xpath => '//table/tr[2]/td/table/tr',
|
87
|
-
:column_xpath => 'td',
|
88
|
-
:errata => Errata.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
|
89
|
-
:responder => AircraftGuru.new)
|
90
|
-
|
91
|
-
RemoteTable.new "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-G.htm",
|
92
|
-
:encoding => 'windows-1252',
|
93
|
-
:row_xpath => '//table/tr[2]/td/table/tr',
|
94
|
-
:column_xpath => 'td',
|
95
|
-
:errata => { :url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
|
96
|
-
:responder => AircraftGuru.new }
|
97
|
-
|
98
|
-
RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
|
99
|
-
:filename => 'Gd6-dsc.txt',
|
100
|
-
:format => :fixed_width,
|
101
|
-
:crop => 21..26, # inclusive
|
102
|
-
:cut => '2-',
|
103
|
-
:select => lambda { |row| /\A[A-Z]/.match row['code'] },
|
104
|
-
:schema => [[ 'code', 2, { :type => :string } ],
|
105
|
-
[ 'spacer', 2 ],
|
106
|
-
[ 'name', 52, { :type => :string } ]]
|
107
|
-
|
108
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
|
109
|
-
:format => :fixed_width,
|
110
|
-
:skip => 1,
|
111
|
-
:schema => [[ 'header4', 10, { :type => :string } ],
|
112
|
-
[ 'spacer', 1 ],
|
113
|
-
[ 'header5', 10, { :type => :string } ],
|
114
|
-
[ 'spacer', 12 ],
|
115
|
-
[ 'header6', 10, { :type => :string } ]]
|
116
|
-
|
117
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
|
118
|
-
:format => :fixed_width,
|
119
|
-
:keep_blank_rows => true,
|
120
|
-
:skip => 1,
|
121
|
-
:schema => [[ 'header4', 10, { :type => :string } ],
|
122
|
-
[ 'spacer', 1 ],
|
123
|
-
[ 'header5', 10, { :type => :string } ],
|
124
|
-
[ 'spacer', 12 ],
|
125
|
-
[ 'header6', 10, { :type => :string } ]]
|
126
|
-
|
127
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.fixed_width.txt',
|
128
|
-
:format => :fixed_width,
|
129
|
-
:skip => 1,
|
130
|
-
:schema => [[ 'header1', 10, { :type => :string } ],
|
131
|
-
[ 'spacer', 1 ],
|
132
|
-
[ 'header2', 10, { :type => :string } ],
|
133
|
-
[ 'spacer', 12 ],
|
134
|
-
[ 'header3', 10, { :type => :string } ]]
|
135
|
-
|
136
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.fixed_width.txt',
|
137
|
-
:format => :fixed_width,
|
138
|
-
:skip => 1,
|
139
|
-
:schema => [[ 'spacer', 11 ],
|
140
|
-
[ 'header2', 10, { :type => :string } ],
|
141
|
-
[ 'spacer', 1 ],
|
142
|
-
[ 'header3', 10, { :type => :string } ],
|
143
|
-
[ 'spacer', 1 ],
|
144
|
-
[ 'header1', 10, { :type => :string } ]]
|
145
|
-
|
146
|
-
==Helpful hints
|
147
|
-
|
148
|
-
* ASCII-8BIT is the same as BINARY
|
149
|
-
* ISO-8859-1 is the same as Latin1
|
150
|
-
|
151
|
-
==Custom parsers
|
152
|
-
|
153
|
-
See the test file and also data_miner examples of custom parsers.
|
154
|
-
|
155
|
-
==Wishlist
|
156
|
-
|
157
|
-
* The new parser syntax (aka transformer) hasn't been defined yet... only the old-style syntax is available
|
158
|
-
* We currently call curl (and a lot of other utilities) using a shell. Is there a safer way to do this?
|
159
|
-
|
160
|
-
==Authors
|
161
|
-
|
162
|
-
* Seamus Abshere <seamus@abshere.net>
|
163
|
-
* Andy Rossmeissl <andy@rossmeissl.net>
|
164
|
-
|
165
|
-
== Copyright
|
166
|
-
|
167
|
-
Copyright (c) 2011 Brighter Planet. See LICENSE for details.
|
data/lib/remote_table/utils.rb
DELETED
@@ -1,157 +0,0 @@
|
|
1
|
-
require 'fileutils'
|
2
|
-
require 'posix/spawn'
|
3
|
-
require 'tmpdir'
|
4
|
-
|
5
|
-
class RemoteTable
|
6
|
-
class SpawnError < ::RuntimeError; end
|
7
|
-
|
8
|
-
module Utils
|
9
|
-
def self.tmp_path(ancestor)
|
10
|
-
basename = ::File.basename(ancestor).sub(/remote_table-[0-9]+-/, '')
|
11
|
-
::Kernel.srand
|
12
|
-
::File.join ::Dir.tmpdir, "remote_table-#{::Kernel.rand(1e11)}-#{basename}"
|
13
|
-
end
|
14
|
-
|
15
|
-
def self.spawn(*argv)
|
16
|
-
options = argv.extract_options!
|
17
|
-
if options[:in] or options[:out]
|
18
|
-
# capture these now because posix/spawn is known to bork them
|
19
|
-
in_out = options.slice(:in, :out).map { |k, v| ":#{k} => #{v.path}" }.join(', ')
|
20
|
-
# --
|
21
|
-
pid = ::POSIX::Spawn.spawn *argv, options
|
22
|
-
::Process.waitpid pid
|
23
|
-
raise SpawnError, "[remote_table] spawn #{argv.join(' ')} (#{in_out}) failed with exit status #{$?.exitstatus}" unless $?.success?
|
24
|
-
else
|
25
|
-
child = ::POSIX::Spawn::Child.new *argv
|
26
|
-
raise SpawnError, "[remote_table] spawn #{argv.join(' ')}) failed with #{child.err}" unless child.success?
|
27
|
-
end
|
28
|
-
nil
|
29
|
-
end
|
30
|
-
|
31
|
-
def self.in_place(*args)
|
32
|
-
options = args.extract_options!
|
33
|
-
input = args.shift
|
34
|
-
argv = args
|
35
|
-
output = tmp_path input
|
36
|
-
::File.open(input, 'r') do |f0|
|
37
|
-
::File.open(output, 'wb') do |f1|
|
38
|
-
spawn *argv, :in => f0, :out => f1
|
39
|
-
end
|
40
|
-
end
|
41
|
-
::FileUtils.mv output, input
|
42
|
-
nil
|
43
|
-
rescue SpawnError => e
|
44
|
-
if options[:ignore_error]
|
45
|
-
$stderr.puts "#{e.inspect} (ignoring error...)"
|
46
|
-
::FileUtils.mv output, input
|
47
|
-
else
|
48
|
-
raise e
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
def self.download(uri, form_data = nil)
|
53
|
-
output = tmp_path uri.path
|
54
|
-
|
55
|
-
if uri.scheme == 'file'
|
56
|
-
$stderr.puts "[remote_table] Getting #{uri.path} from the local file system" if ::ENV['REMOTE_TABLE_VERBOSE'] == 'true'
|
57
|
-
::FileUtils.cp uri.path, output
|
58
|
-
return output
|
59
|
-
end
|
60
|
-
|
61
|
-
argv = [ 'curl', '--location', '--show-error', '--silent', '--compressed', '--header', 'Expect: ' ]
|
62
|
-
if form_data
|
63
|
-
argv += [ '--data', form_data ]
|
64
|
-
end
|
65
|
-
argv += [ uri.to_s, '--output', output ]
|
66
|
-
|
67
|
-
# sabshere 7/20/11 make web requests move more slowly so you don't get accused of DOS
|
68
|
-
if ::ENV.has_key?('REMOTE_TABLE_DELAY_BETWEEN_REQUESTS')
|
69
|
-
::Kernel.sleep ::ENV['REMOTE_TABLE_DELAY_BETWEEN_REQUESTS'].to_i
|
70
|
-
end
|
71
|
-
|
72
|
-
$stderr.puts "[remote_table] Downloading #{uri.to_s}" if ::ENV['REMOTE_TABLE_VERBOSE'] == 'true'
|
73
|
-
spawn *argv
|
74
|
-
output
|
75
|
-
end
|
76
|
-
|
77
|
-
def self.decompress(input, compression)
|
78
|
-
case compression
|
79
|
-
when :zip, :exe
|
80
|
-
Utils.unzip input
|
81
|
-
when :bz2
|
82
|
-
Utils.bunzip2 input
|
83
|
-
when :gz
|
84
|
-
Utils.gunzip input
|
85
|
-
else
|
86
|
-
raise ::ArgumentError, "[remote_table] Unrecognized compression #{compression}"
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
|
-
def self.unpack(input, packing)
|
91
|
-
case packing
|
92
|
-
when :tar
|
93
|
-
Utils.untar input
|
94
|
-
else
|
95
|
-
raise ::ArgumentError, "[remote_table] Unrecognized packing #{packing}"
|
96
|
-
end
|
97
|
-
end
|
98
|
-
|
99
|
-
def self.pick(input, options = {})
|
100
|
-
options = options.symbolize_keys
|
101
|
-
if (options[:filename] or options[:glob]) and not ::File.directory?(input)
|
102
|
-
raise ::RuntimeError, "[remote_table] Expecting #{input} to be a directory"
|
103
|
-
end
|
104
|
-
if filename = options[:filename]
|
105
|
-
src = ::File.join input, filename
|
106
|
-
raise(::RuntimeError, "[remote_table] Expecting #{src} to be a file") unless ::File.file?(src)
|
107
|
-
output = tmp_path src
|
108
|
-
::FileUtils.mv src, output
|
109
|
-
::FileUtils.rm_rf input if ::File.dirname(input).start_with?(::Dir.tmpdir)
|
110
|
-
elsif glob = options[:glob]
|
111
|
-
src = ::Dir[input+glob].first
|
112
|
-
raise(::RuntimeError, "[remote_table] Expecting #{glob} to find a file in #{input}") unless src and ::File.file?(src)
|
113
|
-
output = tmp_path src
|
114
|
-
::FileUtils.mv src, output
|
115
|
-
::FileUtils.rm_rf input if ::File.dirname(input).start_with?(::Dir.tmpdir)
|
116
|
-
else
|
117
|
-
output = tmp_path input
|
118
|
-
::FileUtils.mv input, output
|
119
|
-
end
|
120
|
-
output
|
121
|
-
end
|
122
|
-
|
123
|
-
def self.gunzip(input)
|
124
|
-
output = tmp_path input
|
125
|
-
::File.open(output, 'wb') do |f|
|
126
|
-
spawn 'gunzip', '--stdout', input, :out => f
|
127
|
-
end
|
128
|
-
::FileUtils.rm_f input
|
129
|
-
output
|
130
|
-
end
|
131
|
-
|
132
|
-
def self.bunzip2(input)
|
133
|
-
output = tmp_path input
|
134
|
-
::File.open(output, 'wb') do |f|
|
135
|
-
spawn 'bunzip2', '--stdout', input, :out => f
|
136
|
-
end
|
137
|
-
::FileUtils.rm_f input
|
138
|
-
output
|
139
|
-
end
|
140
|
-
|
141
|
-
def self.untar(input)
|
142
|
-
dest_dir = tmp_path input
|
143
|
-
::FileUtils.mkdir dest_dir
|
144
|
-
spawn 'tar', '-xf', input, '-C', dest_dir
|
145
|
-
::FileUtils.rm_f input
|
146
|
-
dest_dir
|
147
|
-
end
|
148
|
-
|
149
|
-
def self.unzip(input)
|
150
|
-
dest_dir = tmp_path input
|
151
|
-
::FileUtils.mkdir dest_dir
|
152
|
-
spawn 'unzip', '-qq', '-n', input, '-d', dest_dir
|
153
|
-
::FileUtils.rm_f input
|
154
|
-
dest_dir
|
155
|
-
end
|
156
|
-
end
|
157
|
-
end
|