remote_table 1.3.0 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc DELETED
@@ -1,167 +0,0 @@
1
- =remote_table
2
-
3
- Open local or remote XLSX, XLS, ODS, CSV and fixed-width files.
4
-
5
- ==Real-life usage
6
-
7
- Used by http://data.brighterplanet.com and the data_miner gem (http://github.com/seamusabshere/data_miner)
8
-
9
- ==Requirements
10
-
11
- * POSIX operating system (not Windows)
12
- * curl, iconv, perl, cat, cut, tail, etc. accessible from /usr/local/bin:/usr/bin:/bin
13
-
14
- As this library matures, those should go away.
15
-
16
- ==Example
17
-
18
- ?> t = RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv'
19
- => #<RemoteTable:0x359da50 [...]>
20
- ?> t[0]
21
- => {"cyl"=>"6", "eng dscr"=>"DOHC VTEC", "trans dscr"=>"2MODE CLKUP", "trans"=>"Auto(L4)", "cmb"=>"20", "2pv"=>nil, "carline name"=>"NSX", "displ"=>"3.0", "ucmb"=>"23.5311", "hpv"=>nil, "4pv"=>nil, "Class"=>"TWO SEATERS", "Manufacturer"=>"ACURA", "fl"=>"P", "2lv"=>nil, "G"=>nil, "hlv"=>nil, "drv"=>"R", "cty"=>"18", "ucty"=>"19.8733", "S"=>nil, "4lv"=>nil, "fcost"=>"1050", "T"=>nil, "hwy"=>"24", "uhwy"=>"30.3612"}
22
-
23
- More examples:
24
-
25
- RemoteTable.new "file://#{f.path}", :quote_char => %{'}, :headers => nil
26
-
27
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.csv'
28
-
29
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.ods'
30
-
31
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.xls'
32
-
33
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.csv'
34
-
35
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.ods'
36
-
37
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.xls'
38
-
39
- RemoteTable.new "http://cloud.github.com/downloads/seamusabshere/remote_table/test2.#{format}"
40
-
41
- RemoteTable.new "http://cloud.github.com/downloads/seamusabshere/remote_table/test2.#{format}", :keep_blank_rows => true
42
-
43
- RemoteTable.new 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA&single=true&gid=0'
44
-
45
- RemoteTable.new 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA'
46
-
47
- RemoteTable.new 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA', :skip => 1, :headers => false
48
-
49
- RemoteTable.new 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw'
50
-
51
- RemoteTable.new 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :headers => %w{ col1 col2 col3 }
52
-
53
- RemoteTable.new 'http://spreadsheets.google.com/pub?key=tujrgUOwDSLWb-P4KCt1qBg'
54
-
55
- RemoteTable.new 'http://tonto.eia.doe.gov/dnav/pet/xls/PET_PRI_RESID_A_EPPR_PTA_CPGAL_M.xls', :transform => { :class => FuelOilParser }
56
-
57
- RemoteTable.new 'http://www.freebase.com/type/exporttypeinstances/base/horses/horse_breed?page=0&filter_mode=type&filter_view=table&show%01p%3D%2Ftype%2Fobject%2Fname%01index=0&show%01p%3D%2Fcommon%2Ftopic%2Fimage%01index=1&show%01p%3D%2Fcommon%2Ftopic%2Farticle%01index=2&sort%01p%3D%2Ftype%2Fobject%2Ftype%01p%3Dlink%01p%3D%2Ftype%2Flink%2Ftimestamp%01index=false&=&exporttype=csv-8'
58
-
59
- RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls'
60
-
61
- RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv'
62
-
63
- RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :glob => '/*.csv'
64
-
65
- RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv'
66
-
67
- RemoteTable.new 'http://www.worldmapper.org/data/opendoc/2_worldmapper_data.ods', :sheet => 'Data', :keep_blank_rows => true
68
-
69
- RemoteTable.new 'https://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA'
70
-
71
- RemoteTable.new 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx'
72
-
73
- RemoteTable.new 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :headers => %w{foo bar baz}
74
-
75
- RemoteTable.new 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :headers => false
76
-
77
- RemoteTable.new 'http://www.transtats.bts.gov/DownLoad_Table.asp?Table_ID=293&Has_Group=3&Is_Zipped=0', :form_data => 'UserTableName=T_100_Segment__All_Carriers&[...]', :compression => :zip, :glob => '/*.csv'
78
-
79
- RemoteTable.new "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-E.htm",
80
- :encoding => 'US-ASCII',
81
- :row_xpath => '//table/tr[2]/td/table/tr',
82
- :column_xpath => 'td'
83
-
84
- RemoteTable.new "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-G.htm",
85
- :encoding => 'windows-1252',
86
- :row_xpath => '//table/tr[2]/td/table/tr',
87
- :column_xpath => 'td',
88
- :errata => Errata.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
89
- :responder => AircraftGuru.new)
90
-
91
- RemoteTable.new "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-G.htm",
92
- :encoding => 'windows-1252',
93
- :row_xpath => '//table/tr[2]/td/table/tr',
94
- :column_xpath => 'td',
95
- :errata => { :url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
96
- :responder => AircraftGuru.new }
97
-
98
- RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
99
- :filename => 'Gd6-dsc.txt',
100
- :format => :fixed_width,
101
- :crop => 21..26, # inclusive
102
- :cut => '2-',
103
- :select => lambda { |row| /\A[A-Z]/.match row['code'] },
104
- :schema => [[ 'code', 2, { :type => :string } ],
105
- [ 'spacer', 2 ],
106
- [ 'name', 52, { :type => :string } ]]
107
-
108
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
109
- :format => :fixed_width,
110
- :skip => 1,
111
- :schema => [[ 'header4', 10, { :type => :string } ],
112
- [ 'spacer', 1 ],
113
- [ 'header5', 10, { :type => :string } ],
114
- [ 'spacer', 12 ],
115
- [ 'header6', 10, { :type => :string } ]]
116
-
117
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
118
- :format => :fixed_width,
119
- :keep_blank_rows => true,
120
- :skip => 1,
121
- :schema => [[ 'header4', 10, { :type => :string } ],
122
- [ 'spacer', 1 ],
123
- [ 'header5', 10, { :type => :string } ],
124
- [ 'spacer', 12 ],
125
- [ 'header6', 10, { :type => :string } ]]
126
-
127
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.fixed_width.txt',
128
- :format => :fixed_width,
129
- :skip => 1,
130
- :schema => [[ 'header1', 10, { :type => :string } ],
131
- [ 'spacer', 1 ],
132
- [ 'header2', 10, { :type => :string } ],
133
- [ 'spacer', 12 ],
134
- [ 'header3', 10, { :type => :string } ]]
135
-
136
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.fixed_width.txt',
137
- :format => :fixed_width,
138
- :skip => 1,
139
- :schema => [[ 'spacer', 11 ],
140
- [ 'header2', 10, { :type => :string } ],
141
- [ 'spacer', 1 ],
142
- [ 'header3', 10, { :type => :string } ],
143
- [ 'spacer', 1 ],
144
- [ 'header1', 10, { :type => :string } ]]
145
-
146
- ==Helpful hints
147
-
148
- * ASCII-8BIT is the same as BINARY
149
- * ISO-8859-1 is the same as Latin1
150
-
151
- ==Custom parsers
152
-
153
- See the test file and also data_miner examples of custom parsers.
154
-
155
- ==Wishlist
156
-
157
- * The new parser syntax (aka transformer) hasn't been defined yet... only the old-style syntax is available
158
- * We currently call curl (and a lot of other utilities) using a shell. Is there a safer way to do this?
159
-
160
- ==Authors
161
-
162
- * Seamus Abshere <seamus@abshere.net>
163
- * Andy Rossmeissl <andy@rossmeissl.net>
164
-
165
- == Copyright
166
-
167
- Copyright (c) 2011 Brighter Planet. See LICENSE for details.
@@ -1,157 +0,0 @@
1
- require 'fileutils'
2
- require 'posix/spawn'
3
- require 'tmpdir'
4
-
5
class RemoteTable
  # Raised by Utils.spawn when an external command does not exit successfully.
  class SpawnError < ::RuntimeError; end

  # File-handling helpers that shell out to external tools (curl, unzip, tar,
  # gunzip, bunzip2) through posix/spawn. Every helper that produces a new
  # file or directory places it at a path generated by Utils.tmp_path.
  module Utils
    # Builds a fresh, randomised path inside Dir.tmpdir named after +ancestor+.
    # Nothing is created on disk.
    #
    # An existing "remote_table-<digits>-" marker in the basename is dropped so
    # that repeated calls do not pile up prefixes.
    #
    # @param ancestor [String] path whose basename is reused
    # @return [String] path of the form "<tmpdir>/remote_table-<random>-<basename>"
    def self.tmp_path(ancestor)
      basename = ::File.basename(ancestor).sub(/remote_table-[0-9]+-/, '')
      # NOTE(review): reseeding on every call is presumably meant to keep
      # forked processes from generating identical names -- confirm.
      ::Kernel.srand
      ::File.join ::Dir.tmpdir, "remote_table-#{::Kernel.rand(1e11)}-#{basename}"
    end

    # Runs an external command and waits for it to finish.
    #
    # A trailing options hash may carry +:in+ and/or +:out+ (objects responding
    # to +path+, e.g. File) to redirect the child's stdin/stdout; in that case
    # the whole hash is forwarded to POSIX::Spawn.spawn. Without either key the
    # command runs through POSIX::Spawn::Child and the options are not used.
    #
    # @param argv [Array] command words, optionally followed by an options hash
    # @return [nil]
    # @raise [SpawnError] if the command is not successful
    def self.spawn(*argv)
      options = argv.extract_options!
      command = argv.join(' ')
      if options[:in] || options[:out]
        # capture these now because posix/spawn is known to bork them
        in_out = options.slice(:in, :out).map { |k, v| ":#{k} => #{v.path}" }.join(', ')
        pid = ::POSIX::Spawn.spawn(*argv, options)
        ::Process.waitpid pid
        status = $?
        unless status.success?
          raise SpawnError, "[remote_table] spawn #{command} (#{in_out}) failed with exit status #{status.exitstatus}"
        end
      else
        child = ::POSIX::Spawn::Child.new(*argv)
        unless child.success?
          raise SpawnError, "[remote_table] spawn #{command} failed with #{child.err}"
        end
      end
      nil
    end

    # Filters a file through an external command and replaces the file with
    # the command's output.
    #
    # The first argument is the file path; the remaining arguments are the
    # command. The file is fed to the command on stdin, stdout is written to a
    # temp path, and the temp file is then moved over the original.
    #
    # With <tt>:ignore_error => true</tt> a failing command is reported on
    # $stderr and whatever output it produced still replaces the input.
    # Otherwise the temp file is removed, the input is left untouched and the
    # error is re-raised.
    #
    # @return [nil]
    # @raise [SpawnError] if the command fails and :ignore_error is not set
    def self.in_place(*args)
      options = args.extract_options!
      input = args.shift
      argv = args
      output = tmp_path input
      ::File.open(input, 'r') do |f0|
        ::File.open(output, 'wb') do |f1|
          spawn(*argv, :in => f0, :out => f1)
        end
      end
      ::FileUtils.mv output, input
      nil
    rescue SpawnError => e
      if options[:ignore_error]
        $stderr.puts "#{e.inspect} (ignoring error...)"
        ::FileUtils.mv output, input
        nil
      else
        # don't leave the partial output behind in the tmpdir
        ::FileUtils.rm_f output
        raise
      end
    end

    # Fetches +uri+ into a fresh temp path.
    #
    # "file" URIs are copied from the local file system; everything else is
    # downloaded with curl. If REMOTE_TABLE_DELAY_BETWEEN_REQUESTS is set, that
    # many seconds are slept before the request. Progress is logged to $stderr
    # when REMOTE_TABLE_VERBOSE is "true".
    #
    # @param uri [#scheme, #path, #to_s] location to fetch
    # @param form_data [String, nil] if given, passed to curl via --data
    # @return [String] path of the fetched copy
    # @raise [SpawnError] if curl fails
    def self.download(uri, form_data = nil)
      output = tmp_path uri.path
      verbose = ::ENV['REMOTE_TABLE_VERBOSE'] == 'true'

      if uri.scheme == 'file'
        $stderr.puts "[remote_table] Getting #{uri.path} from the local file system" if verbose
        ::FileUtils.cp uri.path, output
        return output
      end

      argv = [ 'curl', '--location', '--show-error', '--silent', '--compressed', '--header', 'Expect: ' ]
      argv += [ '--data', form_data ] if form_data
      argv += [ uri.to_s, '--output', output ]

      # sabshere 7/20/11 make web requests move more slowly so you don't get accused of DOS
      if ::ENV.has_key?('REMOTE_TABLE_DELAY_BETWEEN_REQUESTS')
        ::Kernel.sleep ::ENV['REMOTE_TABLE_DELAY_BETWEEN_REQUESTS'].to_i
      end

      $stderr.puts "[remote_table] Downloading #{uri.to_s}" if verbose
      spawn(*argv)
      output
    end

    # Dispatches to the decompressor matching +compression+.
    #
    # @param input [String] path of the compressed file
    # @param compression [Symbol] :zip, :exe, :bz2 or :gz
    # @return [String] path of the decompressed file or directory
    # @raise [ArgumentError] for any other compression
    def self.decompress(input, compression)
      case compression
      when :zip, :exe
        Utils.unzip input
      when :bz2
        Utils.bunzip2 input
      when :gz
        Utils.gunzip input
      else
        raise ::ArgumentError, "[remote_table] Unrecognized compression #{compression}"
      end
    end

    # Dispatches to the unpacker matching +packing+.
    #
    # @param input [String] path of the packed file
    # @param packing [Symbol] only :tar is recognised
    # @return [String] path of the directory holding the unpacked contents
    # @raise [ArgumentError] for any other packing
    def self.unpack(input, packing)
      case packing
      when :tar
        Utils.untar input
      else
        raise ::ArgumentError, "[remote_table] Unrecognized packing #{packing}"
      end
    end

    # Moves the wanted file to a fresh temp path and returns that path.
    #
    # With +:filename+ or +:glob+, +input+ must be a directory: the named file
    # (or the first glob match, where the glob is appended directly to +input+)
    # is moved out, and the directory is then deleted if its parent path starts
    # with Dir.tmpdir. +:filename+ wins when both are given. Without either
    # option +input+ itself is moved.
    #
    # @param input [String] file or directory path
    # @param options [Hash] :filename and/or :glob (string or symbol keys)
    # @return [String] new path of the picked file
    # @raise [RuntimeError] if +input+ is not a directory when one is needed,
    #   or the requested file cannot be found
    def self.pick(input, options = {})
      options = options.symbolize_keys
      filename = options[:filename]
      glob = options[:glob]
      from_directory = filename || glob

      if from_directory && !::File.directory?(input)
        raise ::RuntimeError, "[remote_table] Expecting #{input} to be a directory"
      end

      if filename
        src = ::File.join input, filename
        raise(::RuntimeError, "[remote_table] Expecting #{src} to be a file") unless ::File.file?(src)
      elsif glob
        src = ::Dir[input + glob].first
        raise(::RuntimeError, "[remote_table] Expecting #{glob} to find a file in #{input}") unless src && ::File.file?(src)
      else
        src = input
      end

      output = tmp_path src
      ::FileUtils.mv src, output
      # only clean up the leftover directory when it sits under the tmpdir
      if from_directory && ::File.dirname(input).start_with?(::Dir.tmpdir)
        ::FileUtils.rm_rf input
      end
      output
    end

    # Gunzips +input+ into a fresh temp path and deletes +input+.
    #
    # @param input [String] path of the .gz file
    # @return [String] path of the decompressed file
    # @raise [SpawnError] if gunzip fails
    def self.gunzip(input)
      output = tmp_path input
      ::File.open(output, 'wb') do |f|
        spawn 'gunzip', '--stdout', input, :out => f
      end
      ::FileUtils.rm_f input
      output
    end

    # Bunzips +input+ into a fresh temp path and deletes +input+.
    #
    # @param input [String] path of the .bz2 file
    # @return [String] path of the decompressed file
    # @raise [SpawnError] if bunzip2 fails
    def self.bunzip2(input)
      output = tmp_path input
      ::File.open(output, 'wb') do |f|
        spawn 'bunzip2', '--stdout', input, :out => f
      end
      ::FileUtils.rm_f input
      output
    end

    # Extracts the tar archive +input+ into a new temp directory and deletes
    # +input+.
    #
    # @param input [String] path of the tar file
    # @return [String] path of the directory holding the extracted contents
    # @raise [SpawnError] if tar fails
    def self.untar(input)
      dest_dir = tmp_path input
      ::FileUtils.mkdir dest_dir
      spawn 'tar', '-xf', input, '-C', dest_dir
      ::FileUtils.rm_f input
      dest_dir
    end

    # Extracts the zip archive +input+ into a new temp directory and deletes
    # +input+.
    #
    # @param input [String] path of the zip file
    # @return [String] path of the directory holding the extracted contents
    # @raise [SpawnError] if unzip fails
    def self.unzip(input)
      dest_dir = tmp_path input
      ::FileUtils.mkdir dest_dir
      spawn 'unzip', '-qq', '-n', input, '-d', dest_dir
      ::FileUtils.rm_f input
      dest_dir
    end
  end
end