remote_table 1.2.4 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +1 -1
- data/lib/remote_table.rb +4 -13
- data/lib/remote_table/format/delimited.rb +20 -21
- data/lib/remote_table/format/fixed_width.rb +2 -1
- data/lib/remote_table/format/mixins/processed_by_nokogiri.rb +7 -7
- data/lib/remote_table/format/mixins/processed_by_roo.rb +30 -20
- data/lib/remote_table/format/mixins/textual.rb +9 -9
- data/lib/remote_table/local_file.rb +18 -80
- data/lib/remote_table/properties.rb +52 -61
- data/lib/remote_table/transformer.rb +3 -4
- data/lib/remote_table/utils.rb +157 -0
- data/lib/remote_table/version.rb +1 -1
- data/remote_table.gemspec +1 -7
- data/test/test_big.rb +2 -2
- data/test/test_old_syntax.rb +20 -20
- data/test/test_remote_table.rb +11 -0
- metadata +28 -94
- data/lib/remote_table/executor.rb +0 -37
data/README.rdoc
CHANGED
@@ -15,7 +15,7 @@ As this library matures, those should go away.
|
|
15
15
|
|
16
16
|
==Example
|
17
17
|
|
18
|
-
?> t = RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip',
|
18
|
+
?> t = RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv'
|
19
19
|
=> #<RemoteTable:0x359da50 [...]>
|
20
20
|
?> t[0]
|
21
21
|
=> {"cyl"=>"6", "eng dscr"=>"DOHC VTEC", "trans dscr"=>"2MODE CLKUP", "trans"=>"Auto(L4)", "cmb"=>"20", "2pv"=>nil, "carline name"=>"NSX", "displ"=>"3.0", "ucmb"=>"23.5311", "hpv"=>nil, "4pv"=>nil, "Class"=>"TWO SEATERS", "Manufacturer"=>"ACURA", "fl"=>"P", "2lv"=>nil, "G"=>nil, "hlv"=>nil, "drv"=>"R", "cty"=>"18", "ucty"=>"19.8733", "S"=>nil, "4lv"=>nil, "fcost"=>"1050", "T"=>nil, "hwy"=>"24", "uhwy"=>"30.3612"}
|
data/lib/remote_table.rb
CHANGED
@@ -28,9 +28,7 @@ class RemoteTable
|
|
28
28
|
autoload :Properties, 'remote_table/properties'
|
29
29
|
autoload :LocalFile, 'remote_table/local_file'
|
30
30
|
autoload :Transformer, 'remote_table/transformer'
|
31
|
-
|
32
|
-
# singletons
|
33
|
-
autoload :Executor, 'remote_table/executor'
|
31
|
+
autoload :Utils, 'remote_table/utils'
|
34
32
|
|
35
33
|
# Legacy
|
36
34
|
class Transform
|
@@ -49,18 +47,17 @@ class RemoteTable
|
|
49
47
|
# RemoteTable.new(url, options = {})
|
50
48
|
#
|
51
49
|
# New syntax:
|
52
|
-
# RemoteTable.new('www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx',
|
50
|
+
# RemoteTable.new('www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :foo => 'bar')
|
53
51
|
# Old syntax:
|
54
52
|
# RemoteTable.new(:url => 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :foo => 'bar')
|
55
53
|
#
|
56
54
|
# See the <tt>Properties</tt> object for the sorts of options you can pass.
|
57
55
|
def initialize(*args)
|
58
|
-
@options = args.last.is_a?(::Hash) ? args.last.
|
59
|
-
@options.stringify_keys!
|
56
|
+
@options = args.last.is_a?(::Hash) ? args.last.symbolize_keys : {}
|
60
57
|
@url = if args.first.is_a? ::String
|
61
58
|
args.first.dup
|
62
59
|
else
|
63
|
-
@options[
|
60
|
+
@options[:url].dup
|
64
61
|
end
|
65
62
|
@url.freeze
|
66
63
|
@options.freeze
|
@@ -112,15 +109,9 @@ class RemoteTable
|
|
112
109
|
# clear the row cache to save memory
|
113
110
|
def free
|
114
111
|
cache.clear
|
115
|
-
::GC.start
|
116
112
|
nil
|
117
113
|
end
|
118
114
|
|
119
|
-
# Used internally to execute stuff in shells.
|
120
|
-
def self.executor
|
121
|
-
Executor.instance
|
122
|
-
end
|
123
|
-
|
124
115
|
# Used internally to access a downloaded copy of the file
|
125
116
|
def local_file
|
126
117
|
@local_file ||= LocalFile.new self
|
@@ -1,10 +1,10 @@
|
|
1
1
|
if RUBY_VERSION >= '1.9'
|
2
2
|
require 'csv'
|
3
|
-
::RemoteTable::
|
3
|
+
::RemoteTable::MyCSV = ::CSV
|
4
4
|
else
|
5
5
|
begin
|
6
6
|
require 'fastercsv'
|
7
|
-
::RemoteTable::
|
7
|
+
::RemoteTable::MyCSV = ::FasterCSV
|
8
8
|
rescue ::LoadError
|
9
9
|
$stderr.puts "[remote_table] You probably need to manually install the fastercsv gem and/or require it in your Gemfile."
|
10
10
|
raise $!
|
@@ -20,8 +20,8 @@ class RemoteTable
|
|
20
20
|
fix_newlines!
|
21
21
|
transliterate_whole_file_to_utf8!
|
22
22
|
skip_rows!
|
23
|
-
|
24
|
-
if row.is_a?(
|
23
|
+
MyCSV.new(t.local_file.encoded_io, fastercsv_options).each do |row|
|
24
|
+
if row.is_a?(MyCSV::Row)
|
25
25
|
hash = row.inject(::ActiveSupport::OrderedHash.new) do |memo, (k, v)|
|
26
26
|
if k.present?
|
27
27
|
memo[k] = v.to_s
|
@@ -35,30 +35,29 @@ class RemoteTable
|
|
35
35
|
end
|
36
36
|
end
|
37
37
|
ensure
|
38
|
-
t.local_file.
|
38
|
+
t.local_file.cleanup
|
39
39
|
end
|
40
40
|
|
41
41
|
private
|
42
42
|
|
43
|
-
FASTERCSV_OPTIONS =
|
44
|
-
unconverted_fields
|
45
|
-
col_sep
|
46
|
-
headers
|
47
|
-
row_sep
|
48
|
-
return_headers
|
49
|
-
header_converters
|
50
|
-
quote_char
|
51
|
-
skip_blanks
|
52
|
-
converters
|
53
|
-
force_quotes
|
54
|
-
|
43
|
+
FASTERCSV_OPTIONS = [
|
44
|
+
:unconverted_fields,
|
45
|
+
:col_sep,
|
46
|
+
:headers,
|
47
|
+
:row_sep,
|
48
|
+
:return_headers,
|
49
|
+
:header_converters,
|
50
|
+
:quote_char,
|
51
|
+
:skip_blanks,
|
52
|
+
:converters,
|
53
|
+
:force_quotes,
|
54
|
+
]
|
55
55
|
|
56
56
|
def fastercsv_options
|
57
57
|
hsh = t.options.slice *FASTERCSV_OPTIONS
|
58
|
-
hsh.merge!
|
59
|
-
hsh.reverse_merge!
|
60
|
-
hsh.reverse_merge!
|
61
|
-
hsh.symbolize_keys
|
58
|
+
hsh.merge! :skip_blanks => !t.properties.keep_blank_rows
|
59
|
+
hsh.reverse_merge! :headers => t.properties.headers
|
60
|
+
hsh.reverse_merge! :col_sep => t.properties.delimiter
|
62
61
|
end
|
63
62
|
end
|
64
63
|
end
|
@@ -19,7 +19,7 @@ class RemoteTable
|
|
19
19
|
yield row if t.properties.keep_blank_rows or row.any? { |k, v| v.present? }
|
20
20
|
end
|
21
21
|
ensure
|
22
|
-
t.local_file.
|
22
|
+
t.local_file.cleanup
|
23
23
|
end
|
24
24
|
|
25
25
|
private
|
@@ -42,6 +42,7 @@ class RemoteTable
|
|
42
42
|
d.rows do |row|
|
43
43
|
row.trap(&everything)
|
44
44
|
t.properties.schema.each do |name, width, options|
|
45
|
+
name = name.to_s
|
45
46
|
if name == 'spacer'
|
46
47
|
row.spacer width
|
47
48
|
else
|
@@ -7,8 +7,9 @@ class RemoteTable
|
|
7
7
|
raise "[remote_table] Need :row_css or :row_xpath in order to process XML or HTML" unless t.properties.row_css or t.properties.row_xpath
|
8
8
|
remove_useless_characters!
|
9
9
|
transliterate_whole_file_to_utf8!
|
10
|
-
|
11
|
-
|
10
|
+
|
11
|
+
headers = t.properties.headers
|
12
|
+
|
12
13
|
xml = nokogiri_class.parse(unescaped_xml_without_soft_hyphens, nil, 'UTF-8')
|
13
14
|
(row_css? ? xml.css(t.properties.row_css) : xml.xpath(t.properties.row_xpath)).each do |row|
|
14
15
|
values = if column_css?
|
@@ -18,22 +19,21 @@ class RemoteTable
|
|
18
19
|
else
|
19
20
|
[row]
|
20
21
|
end.map { |cell| assume_utf8 cell.content.gsub(/\s+/, ' ').strip }
|
21
|
-
if
|
22
|
-
|
23
|
-
first_row = false
|
22
|
+
if headers == :first_row
|
23
|
+
headers = values.select(&:present?)
|
24
24
|
next
|
25
25
|
end
|
26
26
|
output = if t.properties.output_class == ::Array
|
27
27
|
values
|
28
28
|
else
|
29
|
-
zip
|
29
|
+
zip headers, values
|
30
30
|
end
|
31
31
|
if t.properties.keep_blank_rows or values.any?
|
32
32
|
yield output
|
33
33
|
end
|
34
34
|
end
|
35
35
|
ensure
|
36
|
-
t.local_file.
|
36
|
+
t.local_file.cleanup
|
37
37
|
end
|
38
38
|
|
39
39
|
private
|
@@ -5,44 +5,54 @@ class RemoteTable
|
|
5
5
|
def each(&blk)
|
6
6
|
spreadsheet = roo_class.new t.local_file.path, nil, :ignore
|
7
7
|
spreadsheet.default_sheet = t.properties.sheet.is_a?(::Numeric) ? spreadsheet.sheets[t.properties.sheet] : t.properties.sheet
|
8
|
+
|
9
|
+
first_row = if t.properties.crop
|
10
|
+
t.properties.crop.first + 1
|
11
|
+
else
|
12
|
+
t.properties.skip + 1
|
13
|
+
end
|
14
|
+
|
15
|
+
last_row = if t.properties.crop
|
16
|
+
t.properties.crop.last
|
17
|
+
else
|
18
|
+
spreadsheet.last_row
|
19
|
+
end
|
20
|
+
|
8
21
|
if t.properties.output_class == ::Array
|
9
|
-
(first_row..
|
22
|
+
(first_row..last_row).each do |y|
|
10
23
|
output = (1..spreadsheet.last_column).map do |x|
|
11
24
|
assume_utf8 spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
|
12
25
|
end
|
13
26
|
yield output if t.properties.keep_blank_rows or output.any? { |v| v.present? }
|
14
27
|
end
|
15
28
|
else
|
16
|
-
|
29
|
+
headers = {}
|
17
30
|
if t.properties.use_first_row_as_header?
|
18
31
|
(1..spreadsheet.last_column).each do |x|
|
19
|
-
|
20
|
-
|
21
|
-
|
32
|
+
v = spreadsheet.cell(first_row, x)
|
33
|
+
v = spreadsheet.cell(first_row - 1, x) if v.blank? # look up
|
34
|
+
if v.present?
|
35
|
+
v = assume_utf8 v
|
36
|
+
headers[v] = x # 'foobar' is found at column 6
|
37
|
+
end
|
22
38
|
end
|
39
|
+
# "advance the cursor"
|
40
|
+
first_row += 1
|
23
41
|
else
|
24
|
-
|
25
|
-
|
42
|
+
t.properties.headers.each_with_index do |k, i|
|
43
|
+
headers[k] = i + 1
|
26
44
|
end
|
27
45
|
end
|
28
|
-
(first_row
|
29
|
-
output =
|
30
|
-
|
31
|
-
|
32
|
-
end
|
33
|
-
memo
|
46
|
+
(first_row..last_row).each do |y|
|
47
|
+
output = ::ActiveSupport::OrderedHash.new
|
48
|
+
headers.each do |k, x|
|
49
|
+
output[k] = assume_utf8 spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
|
34
50
|
end
|
35
51
|
yield output if t.properties.keep_blank_rows or output.any? { |k, v| v.present? }
|
36
52
|
end
|
37
53
|
end
|
38
54
|
ensure
|
39
|
-
t.local_file.
|
40
|
-
end
|
41
|
-
|
42
|
-
private
|
43
|
-
|
44
|
-
def first_row
|
45
|
-
1 + t.properties.skip
|
55
|
+
t.local_file.cleanup
|
46
56
|
end
|
47
57
|
end
|
48
58
|
end
|
@@ -1,5 +1,4 @@
|
|
1
1
|
require 'fileutils'
|
2
|
-
require 'escape'
|
3
2
|
class RemoteTable
|
4
3
|
class Format
|
5
4
|
module Textual
|
@@ -8,35 +7,36 @@ class RemoteTable
|
|
8
7
|
'\xc2\xad', # soft hyphen, often inserted by MS Office (html: &shy;)
|
9
8
|
]
|
10
9
|
def remove_useless_characters!
|
11
|
-
|
10
|
+
Utils.in_place t.local_file.path, 'perl', '-pe', "s/#{USELESS_CHARACTERS.join '//g; s/'}//g"
|
12
11
|
if t.properties.internal_encoding =~ /windows.?1252/i
|
13
12
|
# soft hyphen again, as I have seen it appear in windows 1252
|
14
|
-
|
13
|
+
Utils.in_place t.local_file.path, 'perl', '-pe', 's/\xad//g'
|
15
14
|
end
|
16
15
|
end
|
17
16
|
|
18
17
|
def transliterate_whole_file_to_utf8!
|
19
|
-
|
20
|
-
t.properties.update
|
18
|
+
Utils.in_place t.local_file.path, 'iconv', '-c', '-f', t.properties.internal_encoding, '-t', t.properties.external_encoding_iconv, :ignore_error => true
|
19
|
+
t.properties.update :encoding => t.properties.external_encoding
|
21
20
|
end
|
22
21
|
|
23
22
|
def fix_newlines!
|
24
|
-
|
23
|
+
Utils.in_place t.local_file.path, 'perl', '-pe', 's/\r\n|\n|\r/\n/g'
|
25
24
|
end
|
26
25
|
|
27
26
|
def skip_rows!
|
28
27
|
return unless t.properties.skip > 0
|
29
|
-
|
28
|
+
Utils.in_place t.local_file.path, 'tail', '-n', "+#{t.properties.skip + 1}"
|
30
29
|
end
|
31
30
|
|
32
31
|
def crop_rows!
|
33
32
|
return unless t.properties.crop
|
34
|
-
|
33
|
+
Utils.in_place t.local_file.path, 'tail', '-n', "+#{t.properties.crop.first}"
|
34
|
+
Utils.in_place t.local_file.path, 'head', '-n', (t.properties.crop.last - t.properties.crop.first + 1).to_s
|
35
35
|
end
|
36
36
|
|
37
37
|
def cut_columns!
|
38
38
|
return unless t.properties.cut
|
39
|
-
|
39
|
+
Utils.in_place t.local_file.path, 'cut', '-c', t.properties.cut.to_s
|
40
40
|
end
|
41
41
|
end
|
42
42
|
end
|
@@ -1,6 +1,5 @@
|
|
1
1
|
require 'fileutils'
|
2
|
-
|
3
|
-
require 'tmpdir'
|
2
|
+
|
4
3
|
class RemoteTable
|
5
4
|
class LocalFile #:nodoc:all
|
6
5
|
|
@@ -11,7 +10,7 @@ class RemoteTable
|
|
11
10
|
end
|
12
11
|
|
13
12
|
def path
|
14
|
-
|
13
|
+
generate unless generated?
|
15
14
|
@path
|
16
15
|
end
|
17
16
|
|
@@ -23,95 +22,34 @@ class RemoteTable
|
|
23
22
|
end
|
24
23
|
end
|
25
24
|
|
26
|
-
def
|
25
|
+
def cleanup
|
27
26
|
if @encoded_io.respond_to?(:closed?) and !@encoded_io.closed?
|
28
27
|
@encoded_io.close
|
29
28
|
end
|
30
|
-
::FileUtils.rm_rf staging_dir_path
|
31
29
|
@encoded_io = nil
|
30
|
+
if @path and ::File.exist?(@path)
|
31
|
+
::FileUtils.rm_f @path
|
32
|
+
end
|
32
33
|
@path = nil
|
33
|
-
@
|
34
|
+
@generated = nil
|
34
35
|
end
|
35
36
|
|
36
37
|
private
|
37
38
|
|
38
|
-
def
|
39
|
-
|
40
|
-
srand # in case this was forked by resque
|
41
|
-
@staging_dir_path = ::File.join ::Dir.tmpdir, 'remote_table_gem', rand.to_s
|
42
|
-
::FileUtils.mkdir_p @staging_dir_path
|
43
|
-
@staging_dir_path
|
44
|
-
end
|
45
|
-
|
46
|
-
def save_locally
|
47
|
-
return if @path.is_a?(::String)
|
48
|
-
@path = ::File.join(staging_dir_path, ::File.basename(t.properties.uri.path))
|
49
|
-
download
|
50
|
-
decompress
|
51
|
-
unpack
|
52
|
-
pick
|
53
|
-
@path
|
54
|
-
end
|
55
|
-
|
56
|
-
def download
|
57
|
-
if t.properties.uri.scheme == 'file'
|
58
|
-
::FileUtils.cp t.properties.uri.path, @path
|
59
|
-
else
|
60
|
-
# sabshere 1/20/11 FIXME: ::RemoteTable.config.curl_bin_path or smth
|
61
|
-
# sabshere 7/20/11 make web requests move more slowly so you don't get accused of DOS
|
62
|
-
sleep t.properties.delay_between_requests if t.properties.delay_between_requests
|
63
|
-
$stderr.puts "[remote_table] Downloading #{t.properties.uri.to_s}"
|
64
|
-
::RemoteTable.executor.backtick_with_reporting %{
|
65
|
-
curl
|
66
|
-
--silent
|
67
|
-
--show-error
|
68
|
-
--location
|
69
|
-
--header "Expect: "
|
70
|
-
#{"--data #{::Escape.shell_single_word t.properties.form_data}" if t.properties.form_data.present?}
|
71
|
-
--output #{::Escape.shell_single_word @path}
|
72
|
-
#{::Escape.shell_single_word t.properties.uri.to_s}
|
73
|
-
2>&1
|
74
|
-
}
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
def decompress
|
79
|
-
return unless t.properties.compression
|
80
|
-
new_path = @path.chomp ".#{t.properties.compression}"
|
81
|
-
raise_on_error = true
|
82
|
-
cmd = case t.properties.compression
|
83
|
-
when 'zip', 'exe'
|
84
|
-
# can't set path yet because there may be multiple files
|
85
|
-
raise_on_error = false
|
86
|
-
"unzip -qq -n #{::Escape.shell_single_word @path} -d #{::File.dirname(@path)}"
|
87
|
-
when 'bz2'
|
88
|
-
@path = new_path
|
89
|
-
"bunzip2 --stdout #{::Escape.shell_single_word @path} > #{::Escape.shell_single_word new_path}"
|
90
|
-
when 'gz'
|
91
|
-
@path = new_path
|
92
|
-
"gunzip --stdout #{::Escape.shell_single_word @path} > #{::Escape.shell_single_word new_path}"
|
93
|
-
end
|
94
|
-
::RemoteTable.executor.backtick_with_reporting cmd, raise_on_error
|
39
|
+
def generated?
|
40
|
+
@generated == true
|
95
41
|
end
|
96
|
-
|
97
|
-
def
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
"tar -xf #{::Escape.shell_single_word @path} -C #{::File.dirname(@path)}"
|
42
|
+
|
43
|
+
def generate
|
44
|
+
tmp_path = Utils.download t.properties.uri, t.properties.form_data
|
45
|
+
if compression = t.properties.compression
|
46
|
+
tmp_path = Utils.decompress tmp_path, compression
|
102
47
|
end
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
# ex. A: 2007-01.csv.gz (compression not capable of storing multiple files)
|
107
|
-
# ex. B: 2007-01.tar.gz (packing)
|
108
|
-
# ex. C: 2007-01.zip (compression capable of storing multiple files)
|
109
|
-
def pick
|
110
|
-
if t.properties.filename.present?
|
111
|
-
@path = ::File.join ::File.dirname(@path), t.properties.filename
|
112
|
-
elsif t.properties.glob.present?
|
113
|
-
@path = ::Dir[::File.dirname(@path)+t.properties.glob].first
|
48
|
+
if packing = t.properties.packing
|
49
|
+
tmp_path = Utils.unpack tmp_path, packing
|
114
50
|
end
|
51
|
+
@path = Utils.pick tmp_path, :filename => t.properties.filename, :glob => t.properties.glob
|
52
|
+
@generated = true
|
115
53
|
end
|
116
54
|
end
|
117
55
|
end
|
@@ -7,22 +7,18 @@ class RemoteTable
|
|
7
7
|
|
8
8
|
def initialize(t)
|
9
9
|
@t = t
|
10
|
-
@current_options = t.options.
|
10
|
+
@current_options = t.options.symbolize_keys
|
11
11
|
end
|
12
12
|
|
13
13
|
def update(options)
|
14
14
|
current_options.update options
|
15
15
|
end
|
16
|
-
|
17
|
-
def delay_between_requests
|
18
|
-
current_options['delay_between_requests'] || (::ENV.has_key?('REMOTE_TABLE_DELAY_BETWEEN_REQUESTS') ? ::ENV['REMOTE_TABLE_DELAY_BETWEEN_REQUESTS'].to_i : nil)
|
19
|
-
end
|
20
|
-
|
16
|
+
|
21
17
|
# The parsed URI of the file to get.
|
22
18
|
def uri
|
23
19
|
return @uri if @uri.is_a?(::URI)
|
24
20
|
@uri = ::URI.parse t.url
|
25
|
-
if @uri.host == 'spreadsheets.google.com'
|
21
|
+
if @uri.host == 'spreadsheets.google.com' or @uri.host == 'docs.google.com'
|
26
22
|
@uri.query = 'output=csv&' + @uri.query.sub(/\&?output=.*?(\&|\z)/, '\1')
|
27
23
|
end
|
28
24
|
@uri
|
@@ -33,19 +29,19 @@ class RemoteTable
|
|
33
29
|
# * call each
|
34
30
|
# Defaults to false.
|
35
31
|
def streaming
|
36
|
-
current_options[
|
32
|
+
current_options[:streaming] || false
|
37
33
|
end
|
38
34
|
|
39
35
|
# Defaults to true.
|
40
36
|
def warn_on_multiple_downloads
|
41
|
-
current_options[
|
37
|
+
current_options[:warn_on_multiple_downloads] != false
|
42
38
|
end
|
43
39
|
|
44
40
|
# The headers specified by the user
|
45
41
|
#
|
46
42
|
# Default: :first_row
|
47
43
|
def headers
|
48
|
-
current_options[
|
44
|
+
current_options[:headers].nil? ? :first_row : current_options[:headers]
|
49
45
|
end
|
50
46
|
|
51
47
|
def use_first_row_as_header?
|
@@ -60,30 +56,30 @@ class RemoteTable
|
|
60
56
|
#
|
61
57
|
# Default: 0
|
62
58
|
def sheet
|
63
|
-
current_options[
|
59
|
+
current_options[:sheet] || 0
|
64
60
|
end
|
65
61
|
|
66
62
|
# Whether to keep blank rows
|
67
63
|
#
|
68
64
|
# Default: false
|
69
65
|
def keep_blank_rows
|
70
|
-
current_options[
|
66
|
+
current_options[:keep_blank_rows] || false
|
71
67
|
end
|
72
68
|
|
73
69
|
# Form data to send in with the download request
|
74
70
|
def form_data
|
75
|
-
current_options[
|
71
|
+
current_options[:form_data]
|
76
72
|
end
|
77
73
|
|
78
74
|
# How many rows to skip
|
79
75
|
#
|
80
76
|
# Default: 0
|
81
77
|
def skip
|
82
|
-
current_options[
|
78
|
+
current_options[:skip] || 0
|
83
79
|
end
|
84
80
|
|
85
81
|
def internal_encoding
|
86
|
-
(current_options[
|
82
|
+
(current_options[:encoding] || 'UTF-8').upcase
|
87
83
|
end
|
88
84
|
|
89
85
|
def external_encoding
|
@@ -98,49 +94,47 @@ class RemoteTable
|
|
98
94
|
#
|
99
95
|
# Default: ","
|
100
96
|
def delimiter
|
101
|
-
current_options[
|
97
|
+
current_options[:delimiter] || ','
|
102
98
|
end
|
103
99
|
|
104
100
|
# The XPath used to find rows
|
105
101
|
def row_xpath
|
106
|
-
current_options[
|
102
|
+
current_options[:row_xpath]
|
107
103
|
end
|
108
104
|
|
109
105
|
# The XPath used to find columns
|
110
106
|
def column_xpath
|
111
|
-
current_options[
|
107
|
+
current_options[:column_xpath]
|
112
108
|
end
|
113
109
|
|
114
110
|
# The CSS selector used to find rows
|
115
111
|
def row_css
|
116
|
-
current_options[
|
112
|
+
current_options[:row_css]
|
117
113
|
end
|
118
114
|
|
119
115
|
# The CSS selector used to find columns
|
120
116
|
def column_css
|
121
|
-
current_options[
|
117
|
+
current_options[:column_css]
|
122
118
|
end
|
123
119
|
|
124
120
|
# The compression type.
|
125
121
|
#
|
126
122
|
# Default: guessed from URI.
|
127
123
|
#
|
128
|
-
# Can be specified as:
|
124
|
+
# Can be specified as: :gz, :zip, :bz2, :exe (treated as :zip)
|
129
125
|
def compression
|
130
|
-
|
131
|
-
current_options[
|
132
|
-
else
|
133
|
-
::File.extname uri.path
|
126
|
+
if current_options.has_key?(:compression)
|
127
|
+
return current_options[:compression]
|
134
128
|
end
|
135
|
-
case
|
129
|
+
case ::File.extname(uri.path).downcase
|
136
130
|
when /gz/, /gunzip/
|
137
|
-
|
131
|
+
:gz
|
138
132
|
when /zip/
|
139
|
-
|
133
|
+
:zip
|
140
134
|
when /bz2/, /bunzip2/
|
141
|
-
|
135
|
+
:bz2
|
142
136
|
when /exe/
|
143
|
-
|
137
|
+
:exe
|
144
138
|
end
|
145
139
|
end
|
146
140
|
|
@@ -148,82 +142,79 @@ class RemoteTable
|
|
148
142
|
#
|
149
143
|
# Default: guessed from URI.
|
150
144
|
#
|
151
|
-
# Can be specified as:
|
145
|
+
# Can be specified as: :tar
|
152
146
|
def packing
|
153
|
-
|
154
|
-
current_options[
|
155
|
-
else
|
156
|
-
::File.extname(uri.path.sub(/\.#{compression}\z/, ''))
|
147
|
+
if current_options.has_key?(:packing)
|
148
|
+
return current_options[:packing]
|
157
149
|
end
|
158
|
-
|
159
|
-
|
160
|
-
'tar'
|
150
|
+
if uri.path =~ %r{\.tar(?:\.|$)}i
|
151
|
+
:tar
|
161
152
|
end
|
162
153
|
end
|
163
154
|
|
164
155
|
# The glob used to pick a file out of an archive.
|
165
156
|
#
|
166
157
|
# Example:
|
167
|
-
# RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip',
|
158
|
+
# RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :glob => '/*.csv'
|
168
159
|
def glob
|
169
|
-
current_options[
|
160
|
+
current_options[:glob]
|
170
161
|
end
|
171
162
|
|
172
163
|
# The filename, which can be used to pick a file out of an archive.
|
173
164
|
#
|
174
165
|
# Example:
|
175
|
-
# RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip',
|
166
|
+
# RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv'
|
176
167
|
def filename
|
177
|
-
current_options[
|
168
|
+
current_options[:filename]
|
178
169
|
end
|
179
170
|
|
180
171
|
# Cut columns up to this character
|
181
172
|
def cut
|
182
|
-
current_options[
|
173
|
+
current_options[:cut]
|
183
174
|
end
|
184
175
|
|
185
176
|
# Crop rows after this line
|
186
177
|
def crop
|
187
|
-
current_options[
|
178
|
+
current_options[:crop]
|
188
179
|
end
|
189
180
|
|
190
181
|
# The fixed-width schema, given as an array
|
191
182
|
#
|
192
183
|
# Example:
|
193
184
|
# RemoteTable.new('http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
|
194
|
-
#
|
195
|
-
#
|
196
|
-
#
|
185
|
+
# :format => :fixed_width,
|
186
|
+
# :skip => 1,
|
187
|
+
# :schema => [[ 'header4', 10, { :type => :string } ],
|
197
188
|
# [ 'spacer', 1 ],
|
198
189
|
# [ 'header5', 10, { :type => :string } ],
|
199
190
|
# [ 'spacer', 12 ],
|
200
191
|
# [ 'header6', 10, { :type => :string } ]])
|
201
192
|
def schema
|
202
|
-
current_options[
|
193
|
+
current_options[:schema]
|
203
194
|
end
|
204
195
|
|
205
196
|
# The name of the fixed-width schema according to FixedWidth
|
206
197
|
def schema_name
|
207
|
-
current_options[
|
198
|
+
current_options[:schema_name]
|
208
199
|
end
|
209
200
|
|
210
201
|
# A proc to call to decide whether to return a row.
|
211
202
|
def select
|
212
|
-
current_options[
|
203
|
+
current_options[:select]
|
213
204
|
end
|
214
205
|
|
215
206
|
# A proc to call to decide whether to reject a row.
|
216
207
|
def reject
|
217
|
-
current_options[
|
208
|
+
current_options[:reject]
|
218
209
|
end
|
219
210
|
|
220
211
|
# A hash of options to create a new Errata instance (see the Errata gem at http://github.com/seamusabshere/errata) to be used on every row.
|
221
212
|
def errata
|
222
|
-
return unless current_options.has_key?
|
223
|
-
@errata ||= if current_options[
|
224
|
-
::Errata.new current_options[
|
213
|
+
return unless current_options.has_key? :errata
|
214
|
+
@errata ||= if current_options[:errata].is_a? ::Hash
|
215
|
+
::Errata.new current_options[:errata]
|
225
216
|
else
|
226
|
-
current_options[
|
217
|
+
current_options[:errata]
|
227
218
|
end
|
228
219
|
end
|
229
220
|
|
@@ -233,15 +224,15 @@ class RemoteTable
|
|
233
224
|
#
|
234
225
|
# Default: guessed from file extension (which is usually the same as the URI, but sometimes not if you pick out a specific file from an archive)
|
235
226
|
#
|
236
|
-
# Can be specified as:
|
227
|
+
# Can be specified as: :xlsx, :xls, :delimited (aka :csv and :tsv), :ods, :fixed_width, :html
|
237
228
|
def format
|
238
|
-
return Format::Delimited if uri.host == 'spreadsheets.google.com'
|
239
|
-
clue = if current_options
|
240
|
-
current_options[
|
229
|
+
return Format::Delimited if uri.host == 'spreadsheets.google.com' or @uri.host == 'docs.google.com'
|
230
|
+
clue = if current_options.has_key?(:format)
|
231
|
+
current_options[:format]
|
241
232
|
else
|
242
|
-
|
233
|
+
t.local_file.path
|
243
234
|
end
|
244
|
-
case clue.downcase
|
235
|
+
case clue.to_s.downcase
|
245
236
|
when /xlsx/, /excelx/
|
246
237
|
Format::Excelx
|
247
238
|
when /xls/, /excel/
|