remote_table 1.3.0 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/CHANGELOG +19 -0
- data/Gemfile +7 -1
- data/README.markdown +440 -0
- data/Rakefile +6 -14
- data/lib/remote_table.rb +27 -38
- data/lib/remote_table/{properties.rb → config.rb} +39 -43
- data/lib/remote_table/format.rb +24 -27
- data/lib/remote_table/format/delimited.rb +17 -21
- data/lib/remote_table/format/fixed_width.rb +9 -9
- data/lib/remote_table/format/html.rb +0 -2
- data/lib/remote_table/format/mixins/processed_by_nokogiri.rb +13 -12
- data/lib/remote_table/format/mixins/processed_by_roo.rb +17 -13
- data/lib/remote_table/format/mixins/textual.rb +13 -13
- data/lib/remote_table/format/open_office.rb +3 -0
- data/lib/remote_table/format/xml.rb +0 -2
- data/lib/remote_table/format/yaml.rb +14 -0
- data/lib/remote_table/local_file.rb +69 -7
- data/lib/remote_table/transformer.rb +7 -4
- data/lib/remote_table/version.rb +1 -1
- data/remote_table.gemspec +5 -13
- data/test/fixtures/data.yml +4 -0
- data/test/helper.rb +8 -9
- data/test/test_big.rb +43 -53
- data/test/test_errata.rb +27 -25
- data/test/test_old_syntax.rb +193 -191
- data/test/test_old_transform.rb +12 -10
- data/test/test_remote_table.rb +57 -47
- metadata +48 -64
- data/.document +0 -5
- data/README.rdoc +0 -167
- data/lib/remote_table/utils.rb +0 -157
data/README.rdoc
DELETED
@@ -1,167 +0,0 @@
|
|
1
|
-
=remote_table
|
2
|
-
|
3
|
-
Open local or remote XLSX, XLS, ODS, CSV and fixed-width files.
|
4
|
-
|
5
|
-
==Real-life usage
|
6
|
-
|
7
|
-
Used by http://data.brighterplanet.com and the data_miner gem (http://github.com/seamusabshere/data_miner)
|
8
|
-
|
9
|
-
==Requirements
|
10
|
-
|
11
|
-
* POSIX operating system (not windows)
|
12
|
-
* curl, iconv, perl, cat, cut, tail, etc. accessible from /usr/local/bin:/usr/bin:/bin
|
13
|
-
|
14
|
-
As this library matures, those should go away.
|
15
|
-
|
16
|
-
==Example
|
17
|
-
|
18
|
-
?> t = RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv'
|
19
|
-
=> #<RemoteTable:0x359da50 [...]>
|
20
|
-
?> t[0]
|
21
|
-
=> {"cyl"=>"6", "eng dscr"=>"DOHC VTEC", "trans dscr"=>"2MODE CLKUP", "trans"=>"Auto(L4)", "cmb"=>"20", "2pv"=>nil, "carline name"=>"NSX", "displ"=>"3.0", "ucmb"=>"23.5311", "hpv"=>nil, "4pv"=>nil, "Class"=>"TWO SEATERS", "Manufacturer"=>"ACURA", "fl"=>"P", "2lv"=>nil, "G"=>nil, "hlv"=>nil, "drv"=>"R", "cty"=>"18", "ucty"=>"19.8733", "S"=>nil, "4lv"=>nil, "fcost"=>"1050", "T"=>nil, "hwy"=>"24", "uhwy"=>"30.3612"}
|
22
|
-
|
23
|
-
More examples:
|
24
|
-
|
25
|
-
RemoteTable.new "file://#{f.path}", :quote_char => %{'}, :headers => nil
|
26
|
-
|
27
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.csv'
|
28
|
-
|
29
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.ods'
|
30
|
-
|
31
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.xls'
|
32
|
-
|
33
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.csv'
|
34
|
-
|
35
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.ods'
|
36
|
-
|
37
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.xls'
|
38
|
-
|
39
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.#{format}'
|
40
|
-
|
41
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.#{format}', :keep_blank_rows => true
|
42
|
-
|
43
|
-
RemoteTable.new 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA&single=true&gid=0'
|
44
|
-
|
45
|
-
RemoteTable.new 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA'
|
46
|
-
|
47
|
-
RemoteTable.new 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA', :skip => 1, :headers => false
|
48
|
-
|
49
|
-
RemoteTable.new 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw'
|
50
|
-
|
51
|
-
RemoteTable.new 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :headers => %w{ col1 col2 col3 }
|
52
|
-
|
53
|
-
RemoteTable.new 'http://spreadsheets.google.com/pub?key=tujrgUOwDSLWb-P4KCt1qBg'
|
54
|
-
|
55
|
-
RemoteTable.new 'http://tonto.eia.doe.gov/dnav/pet/xls/PET_PRI_RESID_A_EPPR_PTA_CPGAL_M.xls', :transform => { :class => FuelOilParser }
|
56
|
-
|
57
|
-
RemoteTable.new 'http://www.freebase.com/type/exporttypeinstances/base/horses/horse_breed?page=0&filter_mode=type&filter_view=table&show%01p%3D%2Ftype%2Fobject%2Fname%01index=0&show%01p%3D%2Fcommon%2Ftopic%2Fimage%01index=1&show%01p%3D%2Fcommon%2Ftopic%2Farticle%01index=2&sort%01p%3D%2Ftype%2Fobject%2Ftype%01p%3Dlink%01p%3D%2Ftype%2Flink%2Ftimestamp%01index=false&=&exporttype=csv-8'
|
58
|
-
|
59
|
-
RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls'
|
60
|
-
|
61
|
-
RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv'
|
62
|
-
|
63
|
-
RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :glob => '/*.csv'
|
64
|
-
|
65
|
-
RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv'
|
66
|
-
|
67
|
-
RemoteTable.new 'http://www.worldmapper.org/data/opendoc/2_worldmapper_data.ods', :sheet => 'Data', :keep_blank_rows => true
|
68
|
-
|
69
|
-
RemoteTable.new 'https://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA'
|
70
|
-
|
71
|
-
RemoteTable.new 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx'
|
72
|
-
|
73
|
-
RemoteTable.new 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :headers => %w{foo bar baz}
|
74
|
-
|
75
|
-
RemoteTable.new 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :headers => false
|
76
|
-
|
77
|
-
RemoteTable.new 'http://www.transtats.bts.gov/DownLoad_Table.asp?Table_ID=293&Has_Group=3&Is_Zipped=0', :form_data => 'UserTableName=T_100_Segment__All_Carriers&[...]', :compression => :zip, :glob => '/*.csv'
|
78
|
-
|
79
|
-
RemoteTable.new "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-E.htm",
|
80
|
-
:encoding => 'US-ASCII',
|
81
|
-
:row_xpath => '//table/tr[2]/td/table/tr',
|
82
|
-
:column_xpath => 'td'
|
83
|
-
|
84
|
-
RemoteTable.new "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-G.htm",
|
85
|
-
:encoding => 'windows-1252',
|
86
|
-
:row_xpath => '//table/tr[2]/td/table/tr',
|
87
|
-
:column_xpath => 'td',
|
88
|
-
:errata => Errata.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
|
89
|
-
:responder => AircraftGuru.new)
|
90
|
-
|
91
|
-
RemoteTable.new "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-G.htm",
|
92
|
-
:encoding => 'windows-1252',
|
93
|
-
:row_xpath => '//table/tr[2]/td/table/tr',
|
94
|
-
:column_xpath => 'td',
|
95
|
-
:errata => { :url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
|
96
|
-
:responder => AircraftGuru.new }
|
97
|
-
|
98
|
-
RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
|
99
|
-
:filename => 'Gd6-dsc.txt',
|
100
|
-
:format => :fixed_width,
|
101
|
-
:crop => 21..26, # inclusive
|
102
|
-
:cut => '2-',
|
103
|
-
:select => lambda { |row| /\A[A-Z]/.match row['code'] },
|
104
|
-
:schema => [[ 'code', 2, { :type => :string } ],
|
105
|
-
[ 'spacer', 2 ],
|
106
|
-
[ 'name', 52, { :type => :string } ]]
|
107
|
-
|
108
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
|
109
|
-
:format => :fixed_width,
|
110
|
-
:skip => 1,
|
111
|
-
:schema => [[ 'header4', 10, { :type => :string } ],
|
112
|
-
[ 'spacer', 1 ],
|
113
|
-
[ 'header5', 10, { :type => :string } ],
|
114
|
-
[ 'spacer', 12 ],
|
115
|
-
[ 'header6', 10, { :type => :string } ]]
|
116
|
-
|
117
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
|
118
|
-
:format => :fixed_width,
|
119
|
-
:keep_blank_rows => true,
|
120
|
-
:skip => 1,
|
121
|
-
:schema => [[ 'header4', 10, { :type => :string } ],
|
122
|
-
[ 'spacer', 1 ],
|
123
|
-
[ 'header5', 10, { :type => :string } ],
|
124
|
-
[ 'spacer', 12 ],
|
125
|
-
[ 'header6', 10, { :type => :string } ]]
|
126
|
-
|
127
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.fixed_width.txt',
|
128
|
-
:format => :fixed_width,
|
129
|
-
:skip => 1,
|
130
|
-
:schema => [[ 'header1', 10, { :type => :string } ],
|
131
|
-
[ 'spacer', 1 ],
|
132
|
-
[ 'header2', 10, { :type => :string } ],
|
133
|
-
[ 'spacer', 12 ],
|
134
|
-
[ 'header3', 10, { :type => :string } ]]
|
135
|
-
|
136
|
-
RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.fixed_width.txt',
|
137
|
-
:format => :fixed_width,
|
138
|
-
:skip => 1,
|
139
|
-
:schema => [[ 'spacer', 11 ],
|
140
|
-
[ 'header2', 10, { :type => :string } ],
|
141
|
-
[ 'spacer', 1 ],
|
142
|
-
[ 'header3', 10, { :type => :string } ],
|
143
|
-
[ 'spacer', 1 ],
|
144
|
-
[ 'header1', 10, { :type => :string } ]]
|
145
|
-
|
146
|
-
==Helpful hints
|
147
|
-
|
148
|
-
* ASCII-8BIT is the same as BINARY
|
149
|
-
* ISO-8859-1 is the same as Latin1
|
150
|
-
|
151
|
-
==Custom parsers
|
152
|
-
|
153
|
-
See the test file and also data_miner examples of custom parsers.
|
154
|
-
|
155
|
-
==Wishlist
|
156
|
-
|
157
|
-
* The new parser syntax (aka transformer) hasn't been defined yet... only the old-style syntax is available
|
158
|
-
* We currently call curl (and a lot of other utilities) using a shell. Is there a safer way to do this?
|
159
|
-
|
160
|
-
==Authors
|
161
|
-
|
162
|
-
* Seamus Abshere <seamus@abshere.net>
|
163
|
-
* Andy Rossmeissl <andy@rossmeissl.net>
|
164
|
-
|
165
|
-
== Copyright
|
166
|
-
|
167
|
-
Copyright (c) 2011 Brighter Planet. See LICENSE for details.
|
data/lib/remote_table/utils.rb
DELETED
@@ -1,157 +0,0 @@
|
|
1
|
-
require 'fileutils'
|
2
|
-
require 'posix/spawn'
|
3
|
-
require 'tmpdir'
|
4
|
-
|
5
|
-
class RemoteTable
|
6
|
-
class SpawnError < ::RuntimeError; end
|
7
|
-
|
8
|
-
module Utils
|
9
|
-
def self.tmp_path(ancestor)
|
10
|
-
basename = ::File.basename(ancestor).sub(/remote_table-[0-9]+-/, '')
|
11
|
-
::Kernel.srand
|
12
|
-
::File.join ::Dir.tmpdir, "remote_table-#{::Kernel.rand(1e11)}-#{basename}"
|
13
|
-
end
|
14
|
-
|
15
|
-
def self.spawn(*argv)
|
16
|
-
options = argv.extract_options!
|
17
|
-
if options[:in] or options[:out]
|
18
|
-
# capture these now because posix/spawn is known to bork them
|
19
|
-
in_out = options.slice(:in, :out).map { |k, v| ":#{k} => #{v.path}" }.join(', ')
|
20
|
-
# --
|
21
|
-
pid = ::POSIX::Spawn.spawn *argv, options
|
22
|
-
::Process.waitpid pid
|
23
|
-
raise SpawnError, "[remote_table] spawn #{argv.join(' ')} (#{in_out}) failed with exit status #{$?.exitstatus}" unless $?.success?
|
24
|
-
else
|
25
|
-
child = ::POSIX::Spawn::Child.new *argv
|
26
|
-
raise SpawnError, "[remote_table] spawn #{argv.join(' ')}) failed with #{child.err}" unless child.success?
|
27
|
-
end
|
28
|
-
nil
|
29
|
-
end
|
30
|
-
|
31
|
-
def self.in_place(*args)
|
32
|
-
options = args.extract_options!
|
33
|
-
input = args.shift
|
34
|
-
argv = args
|
35
|
-
output = tmp_path input
|
36
|
-
::File.open(input, 'r') do |f0|
|
37
|
-
::File.open(output, 'wb') do |f1|
|
38
|
-
spawn *argv, :in => f0, :out => f1
|
39
|
-
end
|
40
|
-
end
|
41
|
-
::FileUtils.mv output, input
|
42
|
-
nil
|
43
|
-
rescue SpawnError => e
|
44
|
-
if options[:ignore_error]
|
45
|
-
$stderr.puts "#{e.inspect} (ignoring error...)"
|
46
|
-
::FileUtils.mv output, input
|
47
|
-
else
|
48
|
-
raise e
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
def self.download(uri, form_data = nil)
|
53
|
-
output = tmp_path uri.path
|
54
|
-
|
55
|
-
if uri.scheme == 'file'
|
56
|
-
$stderr.puts "[remote_table] Getting #{uri.path} from the local file system" if ::ENV['REMOTE_TABLE_VERBOSE'] == 'true'
|
57
|
-
::FileUtils.cp uri.path, output
|
58
|
-
return output
|
59
|
-
end
|
60
|
-
|
61
|
-
argv = [ 'curl', '--location', '--show-error', '--silent', '--compressed', '--header', 'Expect: ' ]
|
62
|
-
if form_data
|
63
|
-
argv += [ '--data', form_data ]
|
64
|
-
end
|
65
|
-
argv += [ uri.to_s, '--output', output ]
|
66
|
-
|
67
|
-
# sabshere 7/20/11 make web requests move more slowly so you don't get accused of DOS
|
68
|
-
if ::ENV.has_key?('REMOTE_TABLE_DELAY_BETWEEN_REQUESTS')
|
69
|
-
::Kernel.sleep ::ENV['REMOTE_TABLE_DELAY_BETWEEN_REQUESTS'].to_i
|
70
|
-
end
|
71
|
-
|
72
|
-
$stderr.puts "[remote_table] Downloading #{uri.to_s}" if ::ENV['REMOTE_TABLE_VERBOSE'] == 'true'
|
73
|
-
spawn *argv
|
74
|
-
output
|
75
|
-
end
|
76
|
-
|
77
|
-
def self.decompress(input, compression)
|
78
|
-
case compression
|
79
|
-
when :zip, :exe
|
80
|
-
Utils.unzip input
|
81
|
-
when :bz2
|
82
|
-
Utils.bunzip2 input
|
83
|
-
when :gz
|
84
|
-
Utils.gunzip input
|
85
|
-
else
|
86
|
-
raise ::ArgumentError, "[remote_table] Unrecognized compression #{compression}"
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
|
-
def self.unpack(input, packing)
|
91
|
-
case packing
|
92
|
-
when :tar
|
93
|
-
Utils.untar input
|
94
|
-
else
|
95
|
-
raise ::ArgumentError, "[remote_table] Unrecognized packing #{packing}"
|
96
|
-
end
|
97
|
-
end
|
98
|
-
|
99
|
-
def self.pick(input, options = {})
|
100
|
-
options = options.symbolize_keys
|
101
|
-
if (options[:filename] or options[:glob]) and not ::File.directory?(input)
|
102
|
-
raise ::RuntimeError, "[remote_table] Expecting #{input} to be a directory"
|
103
|
-
end
|
104
|
-
if filename = options[:filename]
|
105
|
-
src = ::File.join input, filename
|
106
|
-
raise(::RuntimeError, "[remote_table] Expecting #{src} to be a file") unless ::File.file?(src)
|
107
|
-
output = tmp_path src
|
108
|
-
::FileUtils.mv src, output
|
109
|
-
::FileUtils.rm_rf input if ::File.dirname(input).start_with?(::Dir.tmpdir)
|
110
|
-
elsif glob = options[:glob]
|
111
|
-
src = ::Dir[input+glob].first
|
112
|
-
raise(::RuntimeError, "[remote_table] Expecting #{glob} to find a file in #{input}") unless src and ::File.file?(src)
|
113
|
-
output = tmp_path src
|
114
|
-
::FileUtils.mv src, output
|
115
|
-
::FileUtils.rm_rf input if ::File.dirname(input).start_with?(::Dir.tmpdir)
|
116
|
-
else
|
117
|
-
output = tmp_path input
|
118
|
-
::FileUtils.mv input, output
|
119
|
-
end
|
120
|
-
output
|
121
|
-
end
|
122
|
-
|
123
|
-
def self.gunzip(input)
|
124
|
-
output = tmp_path input
|
125
|
-
::File.open(output, 'wb') do |f|
|
126
|
-
spawn 'gunzip', '--stdout', input, :out => f
|
127
|
-
end
|
128
|
-
::FileUtils.rm_f input
|
129
|
-
output
|
130
|
-
end
|
131
|
-
|
132
|
-
def self.bunzip2(input)
|
133
|
-
output = tmp_path input
|
134
|
-
::File.open(output, 'wb') do |f|
|
135
|
-
spawn 'bunzip2', '--stdout', input, :out => f
|
136
|
-
end
|
137
|
-
::FileUtils.rm_f input
|
138
|
-
output
|
139
|
-
end
|
140
|
-
|
141
|
-
def self.untar(input)
|
142
|
-
dest_dir = tmp_path input
|
143
|
-
::FileUtils.mkdir dest_dir
|
144
|
-
spawn 'tar', '-xf', input, '-C', dest_dir
|
145
|
-
::FileUtils.rm_f input
|
146
|
-
dest_dir
|
147
|
-
end
|
148
|
-
|
149
|
-
def self.unzip(input)
|
150
|
-
dest_dir = tmp_path input
|
151
|
-
::FileUtils.mkdir dest_dir
|
152
|
-
spawn 'unzip', '-qq', '-n', input, '-d', dest_dir
|
153
|
-
::FileUtils.rm_f input
|
154
|
-
dest_dir
|
155
|
-
end
|
156
|
-
end
|
157
|
-
end
|