remote_table 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc DELETED
@@ -1,167 +0,0 @@
1
- =remote_table
2
-
3
- Open local or remote XLSX, XLS, ODS, CSV and fixed-width files.
4
-
5
- ==Real-life usage
6
-
7
- Used by http://data.brighterplanet.com and the data_miner gem (http://github.com/seamusabshere/data_miner)
8
-
9
- ==Requirements
10
-
11
- * POSIX operating system (not Windows)
12
- * curl, iconv, perl, cat, cut, tail, etc. accessible from /usr/local/bin:/usr/bin:/bin
13
-
14
- As this library matures, those should go away.
15
-
16
- ==Example
17
-
18
- ?> t = RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv'
19
- => #<RemoteTable:0x359da50 [...]>
20
- ?> t[0]
21
- => {"cyl"=>"6", "eng dscr"=>"DOHC VTEC", "trans dscr"=>"2MODE CLKUP", "trans"=>"Auto(L4)", "cmb"=>"20", "2pv"=>nil, "carline name"=>"NSX", "displ"=>"3.0", "ucmb"=>"23.5311", "hpv"=>nil, "4pv"=>nil, "Class"=>"TWO SEATERS", "Manufacturer"=>"ACURA", "fl"=>"P", "2lv"=>nil, "G"=>nil, "hlv"=>nil, "drv"=>"R", "cty"=>"18", "ucty"=>"19.8733", "S"=>nil, "4lv"=>nil, "fcost"=>"1050", "T"=>nil, "hwy"=>"24", "uhwy"=>"30.3612"}
22
-
23
- More examples:
24
-
25
- RemoteTable.new "file://#{f.path}", :quote_char => %{'}, :headers => nil
26
-
27
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.csv'
28
-
29
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.ods'
30
-
31
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.xls'
32
-
33
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.csv'
34
-
35
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.ods'
36
-
37
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.xls'
38
-
39
- RemoteTable.new "http://cloud.github.com/downloads/seamusabshere/remote_table/test2.#{format}"
40
-
41
- RemoteTable.new "http://cloud.github.com/downloads/seamusabshere/remote_table/test2.#{format}", :keep_blank_rows => true
42
-
43
- RemoteTable.new 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA&single=true&gid=0'
44
-
45
- RemoteTable.new 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA'
46
-
47
- RemoteTable.new 'http://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA', :skip => 1, :headers => false
48
-
49
- RemoteTable.new 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw'
50
-
51
- RemoteTable.new 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :headers => %w{ col1 col2 col3 }
52
-
53
- RemoteTable.new 'http://spreadsheets.google.com/pub?key=tujrgUOwDSLWb-P4KCt1qBg'
54
-
55
- RemoteTable.new 'http://tonto.eia.doe.gov/dnav/pet/xls/PET_PRI_RESID_A_EPPR_PTA_CPGAL_M.xls', :transform => { :class => FuelOilParser }
56
-
57
- RemoteTable.new 'http://www.freebase.com/type/exporttypeinstances/base/horses/horse_breed?page=0&filter_mode=type&filter_view=table&show%01p%3D%2Ftype%2Fobject%2Fname%01index=0&show%01p%3D%2Fcommon%2Ftopic%2Fimage%01index=1&show%01p%3D%2Fcommon%2Ftopic%2Farticle%01index=2&sort%01p%3D%2Ftype%2Fobject%2Ftype%01p%3Dlink%01p%3D%2Ftype%2Flink%2Ftimestamp%01index=false&=&exporttype=csv-8'
58
-
59
- RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls'
60
-
61
- RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv'
62
-
63
- RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :glob => '/*.csv'
64
-
65
- RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv'
66
-
67
- RemoteTable.new 'http://www.worldmapper.org/data/opendoc/2_worldmapper_data.ods', :sheet => 'Data', :keep_blank_rows => true
68
-
69
- RemoteTable.new 'https://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA'
70
-
71
- RemoteTable.new 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx'
72
-
73
- RemoteTable.new 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :headers => %w{foo bar baz}
74
-
75
- RemoteTable.new 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :headers => false
76
-
77
- RemoteTable.new 'http://www.transtats.bts.gov/DownLoad_Table.asp?Table_ID=293&Has_Group=3&Is_Zipped=0', :form_data => 'UserTableName=T_100_Segment__All_Carriers&[...]', :compression => :zip, :glob => '/*.csv'
78
-
79
- RemoteTable.new "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-E.htm",
80
- :encoding => 'US-ASCII',
81
- :row_xpath => '//table/tr[2]/td/table/tr',
82
- :column_xpath => 'td'
83
-
84
- RemoteTable.new "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-G.htm",
85
- :encoding => 'windows-1252',
86
- :row_xpath => '//table/tr[2]/td/table/tr',
87
- :column_xpath => 'td',
88
- :errata => Errata.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
89
- :responder => AircraftGuru.new)
90
-
91
- RemoteTable.new "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-G.htm",
92
- :encoding => 'windows-1252',
93
- :row_xpath => '//table/tr[2]/td/table/tr',
94
- :column_xpath => 'td',
95
- :errata => { :url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
96
- :responder => AircraftGuru.new }
97
-
98
- RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
99
- :filename => 'Gd6-dsc.txt',
100
- :format => :fixed_width,
101
- :crop => 21..26, # inclusive
102
- :cut => '2-',
103
- :select => lambda { |row| /\A[A-Z]/.match row['code'] },
104
- :schema => [[ 'code', 2, { :type => :string } ],
105
- [ 'spacer', 2 ],
106
- [ 'name', 52, { :type => :string } ]]
107
-
108
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
109
- :format => :fixed_width,
110
- :skip => 1,
111
- :schema => [[ 'header4', 10, { :type => :string } ],
112
- [ 'spacer', 1 ],
113
- [ 'header5', 10, { :type => :string } ],
114
- [ 'spacer', 12 ],
115
- [ 'header6', 10, { :type => :string } ]]
116
-
117
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
118
- :format => :fixed_width,
119
- :keep_blank_rows => true,
120
- :skip => 1,
121
- :schema => [[ 'header4', 10, { :type => :string } ],
122
- [ 'spacer', 1 ],
123
- [ 'header5', 10, { :type => :string } ],
124
- [ 'spacer', 12 ],
125
- [ 'header6', 10, { :type => :string } ]]
126
-
127
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.fixed_width.txt',
128
- :format => :fixed_width,
129
- :skip => 1,
130
- :schema => [[ 'header1', 10, { :type => :string } ],
131
- [ 'spacer', 1 ],
132
- [ 'header2', 10, { :type => :string } ],
133
- [ 'spacer', 12 ],
134
- [ 'header3', 10, { :type => :string } ]]
135
-
136
- RemoteTable.new 'http://cloud.github.com/downloads/seamusabshere/remote_table/remote_table_row_hash_test.alternate_order.fixed_width.txt',
137
- :format => :fixed_width,
138
- :skip => 1,
139
- :schema => [[ 'spacer', 11 ],
140
- [ 'header2', 10, { :type => :string } ],
141
- [ 'spacer', 1 ],
142
- [ 'header3', 10, { :type => :string } ],
143
- [ 'spacer', 1 ],
144
- [ 'header1', 10, { :type => :string } ]]
145
-
146
- ==Helpful hints
147
-
148
- * ASCII-8BIT is the same as BINARY
149
- * ISO-8859-1 is the same as Latin1
150
-
151
- ==Custom parsers
152
-
153
- See the test file and also data_miner examples of custom parsers.
154
-
155
- ==Wishlist
156
-
157
- * The new parser syntax (aka transformer) hasn't been defined yet... only the old-style syntax is available
158
- * We currently call curl (and a lot of other utilities) using a shell. Is there a safer way to do this?
159
-
160
- ==Authors
161
-
162
- * Seamus Abshere <seamus@abshere.net>
163
- * Andy Rossmeissl <andy@rossmeissl.net>
164
-
165
- == Copyright
166
-
167
- Copyright (c) 2011 Brighter Planet. See LICENSE for details.
@@ -1,157 +0,0 @@
1
- require 'fileutils'
2
- require 'posix/spawn'
3
- require 'tmpdir'
4
-
5
- class RemoteTable
6
- class SpawnError < ::RuntimeError; end
7
-
8
- module Utils
9
- def self.tmp_path(ancestor)
10
- basename = ::File.basename(ancestor).sub(/remote_table-[0-9]+-/, '')
11
- ::Kernel.srand
12
- ::File.join ::Dir.tmpdir, "remote_table-#{::Kernel.rand(1e11)}-#{basename}"
13
- end
14
-
15
- def self.spawn(*argv)
16
- options = argv.extract_options!
17
- if options[:in] or options[:out]
18
- # capture these now because posix/spawn is known to bork them
19
- in_out = options.slice(:in, :out).map { |k, v| ":#{k} => #{v.path}" }.join(', ')
20
- # --
21
- pid = ::POSIX::Spawn.spawn *argv, options
22
- ::Process.waitpid pid
23
- raise SpawnError, "[remote_table] spawn #{argv.join(' ')} (#{in_out}) failed with exit status #{$?.exitstatus}" unless $?.success?
24
- else
25
- child = ::POSIX::Spawn::Child.new *argv
26
- raise SpawnError, "[remote_table] spawn #{argv.join(' ')}) failed with #{child.err}" unless child.success?
27
- end
28
- nil
29
- end
30
-
31
- def self.in_place(*args)
32
- options = args.extract_options!
33
- input = args.shift
34
- argv = args
35
- output = tmp_path input
36
- ::File.open(input, 'r') do |f0|
37
- ::File.open(output, 'wb') do |f1|
38
- spawn *argv, :in => f0, :out => f1
39
- end
40
- end
41
- ::FileUtils.mv output, input
42
- nil
43
- rescue SpawnError => e
44
- if options[:ignore_error]
45
- $stderr.puts "#{e.inspect} (ignoring error...)"
46
- ::FileUtils.mv output, input
47
- else
48
- raise e
49
- end
50
- end
51
-
52
- def self.download(uri, form_data = nil)
53
- output = tmp_path uri.path
54
-
55
- if uri.scheme == 'file'
56
- $stderr.puts "[remote_table] Getting #{uri.path} from the local file system" if ::ENV['REMOTE_TABLE_VERBOSE'] == 'true'
57
- ::FileUtils.cp uri.path, output
58
- return output
59
- end
60
-
61
- argv = [ 'curl', '--location', '--show-error', '--silent', '--compressed', '--header', 'Expect: ' ]
62
- if form_data
63
- argv += [ '--data', form_data ]
64
- end
65
- argv += [ uri.to_s, '--output', output ]
66
-
67
- # sabshere 7/20/11 make web requests move more slowly so you don't get accused of DOS
68
- if ::ENV.has_key?('REMOTE_TABLE_DELAY_BETWEEN_REQUESTS')
69
- ::Kernel.sleep ::ENV['REMOTE_TABLE_DELAY_BETWEEN_REQUESTS'].to_i
70
- end
71
-
72
- $stderr.puts "[remote_table] Downloading #{uri.to_s}" if ::ENV['REMOTE_TABLE_VERBOSE'] == 'true'
73
- spawn *argv
74
- output
75
- end
76
-
77
- def self.decompress(input, compression)
78
- case compression
79
- when :zip, :exe
80
- Utils.unzip input
81
- when :bz2
82
- Utils.bunzip2 input
83
- when :gz
84
- Utils.gunzip input
85
- else
86
- raise ::ArgumentError, "[remote_table] Unrecognized compression #{compression}"
87
- end
88
- end
89
-
90
- def self.unpack(input, packing)
91
- case packing
92
- when :tar
93
- Utils.untar input
94
- else
95
- raise ::ArgumentError, "[remote_table] Unrecognized packing #{packing}"
96
- end
97
- end
98
-
99
- def self.pick(input, options = {})
100
- options = options.symbolize_keys
101
- if (options[:filename] or options[:glob]) and not ::File.directory?(input)
102
- raise ::RuntimeError, "[remote_table] Expecting #{input} to be a directory"
103
- end
104
- if filename = options[:filename]
105
- src = ::File.join input, filename
106
- raise(::RuntimeError, "[remote_table] Expecting #{src} to be a file") unless ::File.file?(src)
107
- output = tmp_path src
108
- ::FileUtils.mv src, output
109
- ::FileUtils.rm_rf input if ::File.dirname(input).start_with?(::Dir.tmpdir)
110
- elsif glob = options[:glob]
111
- src = ::Dir[input+glob].first
112
- raise(::RuntimeError, "[remote_table] Expecting #{glob} to find a file in #{input}") unless src and ::File.file?(src)
113
- output = tmp_path src
114
- ::FileUtils.mv src, output
115
- ::FileUtils.rm_rf input if ::File.dirname(input).start_with?(::Dir.tmpdir)
116
- else
117
- output = tmp_path input
118
- ::FileUtils.mv input, output
119
- end
120
- output
121
- end
122
-
123
- def self.gunzip(input)
124
- output = tmp_path input
125
- ::File.open(output, 'wb') do |f|
126
- spawn 'gunzip', '--stdout', input, :out => f
127
- end
128
- ::FileUtils.rm_f input
129
- output
130
- end
131
-
132
- def self.bunzip2(input)
133
- output = tmp_path input
134
- ::File.open(output, 'wb') do |f|
135
- spawn 'bunzip2', '--stdout', input, :out => f
136
- end
137
- ::FileUtils.rm_f input
138
- output
139
- end
140
-
141
- def self.untar(input)
142
- dest_dir = tmp_path input
143
- ::FileUtils.mkdir dest_dir
144
- spawn 'tar', '-xf', input, '-C', dest_dir
145
- ::FileUtils.rm_f input
146
- dest_dir
147
- end
148
-
149
- def self.unzip(input)
150
- dest_dir = tmp_path input
151
- ::FileUtils.mkdir dest_dir
152
- spawn 'unzip', '-qq', '-n', input, '-d', dest_dir
153
- ::FileUtils.rm_f input
154
- dest_dir
155
- end
156
- end
157
- end