remote_table 1.2.4 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +1 -1
- data/lib/remote_table.rb +4 -13
- data/lib/remote_table/format/delimited.rb +20 -21
- data/lib/remote_table/format/fixed_width.rb +2 -1
- data/lib/remote_table/format/mixins/processed_by_nokogiri.rb +7 -7
- data/lib/remote_table/format/mixins/processed_by_roo.rb +30 -20
- data/lib/remote_table/format/mixins/textual.rb +9 -9
- data/lib/remote_table/local_file.rb +18 -80
- data/lib/remote_table/properties.rb +52 -61
- data/lib/remote_table/transformer.rb +3 -4
- data/lib/remote_table/utils.rb +157 -0
- data/lib/remote_table/version.rb +1 -1
- data/remote_table.gemspec +1 -7
- data/test/test_big.rb +2 -2
- data/test/test_old_syntax.rb +20 -20
- data/test/test_remote_table.rb +11 -0
- metadata +28 -94
- data/lib/remote_table/executor.rb +0 -37
data/README.rdoc
CHANGED
@@ -15,7 +15,7 @@ As this library matures, those should go away.
|
|
15
15
|
|
16
16
|
==Example
|
17
17
|
|
18
|
-
?> t = RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip',
|
18
|
+
?> t = RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv'
|
19
19
|
=> #<RemoteTable:0x359da50 [...]>
|
20
20
|
?> t[0]
|
21
21
|
=> {"cyl"=>"6", "eng dscr"=>"DOHC VTEC", "trans dscr"=>"2MODE CLKUP", "trans"=>"Auto(L4)", "cmb"=>"20", "2pv"=>nil, "carline name"=>"NSX", "displ"=>"3.0", "ucmb"=>"23.5311", "hpv"=>nil, "4pv"=>nil, "Class"=>"TWO SEATERS", "Manufacturer"=>"ACURA", "fl"=>"P", "2lv"=>nil, "G"=>nil, "hlv"=>nil, "drv"=>"R", "cty"=>"18", "ucty"=>"19.8733", "S"=>nil, "4lv"=>nil, "fcost"=>"1050", "T"=>nil, "hwy"=>"24", "uhwy"=>"30.3612"}
|
data/lib/remote_table.rb
CHANGED
@@ -28,9 +28,7 @@ class RemoteTable
|
|
28
28
|
autoload :Properties, 'remote_table/properties'
|
29
29
|
autoload :LocalFile, 'remote_table/local_file'
|
30
30
|
autoload :Transformer, 'remote_table/transformer'
|
31
|
-
|
32
|
-
# singletons
|
33
|
-
autoload :Executor, 'remote_table/executor'
|
31
|
+
autoload :Utils, 'remote_table/utils'
|
34
32
|
|
35
33
|
# Legacy
|
36
34
|
class Transform
|
@@ -49,18 +47,17 @@ class RemoteTable
|
|
49
47
|
# RemoteTable.new(url, options = {})
|
50
48
|
#
|
51
49
|
# New syntax:
|
52
|
-
# RemoteTable.new('www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx',
|
50
|
+
# RemoteTable.new('www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :foo => 'bar')
|
53
51
|
# Old syntax:
|
54
52
|
# RemoteTable.new(:url => 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :foo => 'bar')
|
55
53
|
#
|
56
54
|
# See the <tt>Properties</tt> object for the sorts of options you can pass.
|
57
55
|
def initialize(*args)
|
58
|
-
@options = args.last.is_a?(::Hash) ? args.last.
|
59
|
-
@options.stringify_keys!
|
56
|
+
@options = args.last.is_a?(::Hash) ? args.last.symbolize_keys : {}
|
60
57
|
@url = if args.first.is_a? ::String
|
61
58
|
args.first.dup
|
62
59
|
else
|
63
|
-
@options[
|
60
|
+
@options[:url].dup
|
64
61
|
end
|
65
62
|
@url.freeze
|
66
63
|
@options.freeze
|
@@ -112,15 +109,9 @@ class RemoteTable
|
|
112
109
|
# clear the row cache to save memory
|
113
110
|
def free
|
114
111
|
cache.clear
|
115
|
-
::GC.start
|
116
112
|
nil
|
117
113
|
end
|
118
114
|
|
119
|
-
# Used internally to execute stuff in shells.
|
120
|
-
def self.executor
|
121
|
-
Executor.instance
|
122
|
-
end
|
123
|
-
|
124
115
|
# Used internally to access a downloaded copy of the file
|
125
116
|
def local_file
|
126
117
|
@local_file ||= LocalFile.new self
|
@@ -1,10 +1,10 @@
|
|
1
1
|
if RUBY_VERSION >= '1.9'
|
2
2
|
require 'csv'
|
3
|
-
::RemoteTable::
|
3
|
+
::RemoteTable::MyCSV = ::CSV
|
4
4
|
else
|
5
5
|
begin
|
6
6
|
require 'fastercsv'
|
7
|
-
::RemoteTable::
|
7
|
+
::RemoteTable::MyCSV = ::FasterCSV
|
8
8
|
rescue ::LoadError
|
9
9
|
$stderr.puts "[remote_table] You probably need to manually install the fastercsv gem and/or require it in your Gemfile."
|
10
10
|
raise $!
|
@@ -20,8 +20,8 @@ class RemoteTable
|
|
20
20
|
fix_newlines!
|
21
21
|
transliterate_whole_file_to_utf8!
|
22
22
|
skip_rows!
|
23
|
-
|
24
|
-
if row.is_a?(
|
23
|
+
MyCSV.new(t.local_file.encoded_io, fastercsv_options).each do |row|
|
24
|
+
if row.is_a?(MyCSV::Row)
|
25
25
|
hash = row.inject(::ActiveSupport::OrderedHash.new) do |memo, (k, v)|
|
26
26
|
if k.present?
|
27
27
|
memo[k] = v.to_s
|
@@ -35,30 +35,29 @@ class RemoteTable
|
|
35
35
|
end
|
36
36
|
end
|
37
37
|
ensure
|
38
|
-
t.local_file.
|
38
|
+
t.local_file.cleanup
|
39
39
|
end
|
40
40
|
|
41
41
|
private
|
42
42
|
|
43
|
-
FASTERCSV_OPTIONS =
|
44
|
-
unconverted_fields
|
45
|
-
col_sep
|
46
|
-
headers
|
47
|
-
row_sep
|
48
|
-
return_headers
|
49
|
-
header_converters
|
50
|
-
quote_char
|
51
|
-
skip_blanks
|
52
|
-
converters
|
53
|
-
force_quotes
|
54
|
-
|
43
|
+
FASTERCSV_OPTIONS = [
|
44
|
+
:unconverted_fields,
|
45
|
+
:col_sep,
|
46
|
+
:headers,
|
47
|
+
:row_sep,
|
48
|
+
:return_headers,
|
49
|
+
:header_converters,
|
50
|
+
:quote_char,
|
51
|
+
:skip_blanks,
|
52
|
+
:converters,
|
53
|
+
:force_quotes,
|
54
|
+
]
|
55
55
|
|
56
56
|
def fastercsv_options
|
57
57
|
hsh = t.options.slice *FASTERCSV_OPTIONS
|
58
|
-
hsh.merge!
|
59
|
-
hsh.reverse_merge!
|
60
|
-
hsh.reverse_merge!
|
61
|
-
hsh.symbolize_keys
|
58
|
+
hsh.merge! :skip_blanks => !t.properties.keep_blank_rows
|
59
|
+
hsh.reverse_merge! :headers => t.properties.headers
|
60
|
+
hsh.reverse_merge! :col_sep => t.properties.delimiter
|
62
61
|
end
|
63
62
|
end
|
64
63
|
end
|
@@ -19,7 +19,7 @@ class RemoteTable
|
|
19
19
|
yield row if t.properties.keep_blank_rows or row.any? { |k, v| v.present? }
|
20
20
|
end
|
21
21
|
ensure
|
22
|
-
t.local_file.
|
22
|
+
t.local_file.cleanup
|
23
23
|
end
|
24
24
|
|
25
25
|
private
|
@@ -42,6 +42,7 @@ class RemoteTable
|
|
42
42
|
d.rows do |row|
|
43
43
|
row.trap(&everything)
|
44
44
|
t.properties.schema.each do |name, width, options|
|
45
|
+
name = name.to_s
|
45
46
|
if name == 'spacer'
|
46
47
|
row.spacer width
|
47
48
|
else
|
@@ -7,8 +7,9 @@ class RemoteTable
|
|
7
7
|
raise "[remote_table] Need :row_css or :row_xpath in order to process XML or HTML" unless t.properties.row_css or t.properties.row_xpath
|
8
8
|
remove_useless_characters!
|
9
9
|
transliterate_whole_file_to_utf8!
|
10
|
-
|
11
|
-
|
10
|
+
|
11
|
+
headers = t.properties.headers
|
12
|
+
|
12
13
|
xml = nokogiri_class.parse(unescaped_xml_without_soft_hyphens, nil, 'UTF-8')
|
13
14
|
(row_css? ? xml.css(t.properties.row_css) : xml.xpath(t.properties.row_xpath)).each do |row|
|
14
15
|
values = if column_css?
|
@@ -18,22 +19,21 @@ class RemoteTable
|
|
18
19
|
else
|
19
20
|
[row]
|
20
21
|
end.map { |cell| assume_utf8 cell.content.gsub(/\s+/, ' ').strip }
|
21
|
-
if
|
22
|
-
|
23
|
-
first_row = false
|
22
|
+
if headers == :first_row
|
23
|
+
headers = values.select(&:present?)
|
24
24
|
next
|
25
25
|
end
|
26
26
|
output = if t.properties.output_class == ::Array
|
27
27
|
values
|
28
28
|
else
|
29
|
-
zip
|
29
|
+
zip headers, values
|
30
30
|
end
|
31
31
|
if t.properties.keep_blank_rows or values.any?
|
32
32
|
yield output
|
33
33
|
end
|
34
34
|
end
|
35
35
|
ensure
|
36
|
-
t.local_file.
|
36
|
+
t.local_file.cleanup
|
37
37
|
end
|
38
38
|
|
39
39
|
private
|
@@ -5,44 +5,54 @@ class RemoteTable
|
|
5
5
|
def each(&blk)
|
6
6
|
spreadsheet = roo_class.new t.local_file.path, nil, :ignore
|
7
7
|
spreadsheet.default_sheet = t.properties.sheet.is_a?(::Numeric) ? spreadsheet.sheets[t.properties.sheet] : t.properties.sheet
|
8
|
+
|
9
|
+
first_row = if t.properties.crop
|
10
|
+
t.properties.crop.first + 1
|
11
|
+
else
|
12
|
+
t.properties.skip + 1
|
13
|
+
end
|
14
|
+
|
15
|
+
last_row = if t.properties.crop
|
16
|
+
t.properties.crop.last
|
17
|
+
else
|
18
|
+
spreadsheet.last_row
|
19
|
+
end
|
20
|
+
|
8
21
|
if t.properties.output_class == ::Array
|
9
|
-
(first_row..
|
22
|
+
(first_row..last_row).each do |y|
|
10
23
|
output = (1..spreadsheet.last_column).map do |x|
|
11
24
|
assume_utf8 spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
|
12
25
|
end
|
13
26
|
yield output if t.properties.keep_blank_rows or output.any? { |v| v.present? }
|
14
27
|
end
|
15
28
|
else
|
16
|
-
|
29
|
+
headers = {}
|
17
30
|
if t.properties.use_first_row_as_header?
|
18
31
|
(1..spreadsheet.last_column).each do |x|
|
19
|
-
|
20
|
-
|
21
|
-
|
32
|
+
v = spreadsheet.cell(first_row, x)
|
33
|
+
v = spreadsheet.cell(first_row - 1, x) if v.blank? # look up
|
34
|
+
if v.present?
|
35
|
+
v = assume_utf8 v
|
36
|
+
headers[v] = x # 'foobar' is found at column 6
|
37
|
+
end
|
22
38
|
end
|
39
|
+
# "advance the cursor"
|
40
|
+
first_row += 1
|
23
41
|
else
|
24
|
-
|
25
|
-
|
42
|
+
t.properties.headers.each_with_index do |k, i|
|
43
|
+
headers[k] = i + 1
|
26
44
|
end
|
27
45
|
end
|
28
|
-
(first_row
|
29
|
-
output =
|
30
|
-
|
31
|
-
|
32
|
-
end
|
33
|
-
memo
|
46
|
+
(first_row..last_row).each do |y|
|
47
|
+
output = ::ActiveSupport::OrderedHash.new
|
48
|
+
headers.each do |k, x|
|
49
|
+
output[k] = assume_utf8 spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
|
34
50
|
end
|
35
51
|
yield output if t.properties.keep_blank_rows or output.any? { |k, v| v.present? }
|
36
52
|
end
|
37
53
|
end
|
38
54
|
ensure
|
39
|
-
t.local_file.
|
40
|
-
end
|
41
|
-
|
42
|
-
private
|
43
|
-
|
44
|
-
def first_row
|
45
|
-
1 + t.properties.skip
|
55
|
+
t.local_file.cleanup
|
46
56
|
end
|
47
57
|
end
|
48
58
|
end
|
@@ -1,5 +1,4 @@
|
|
1
1
|
require 'fileutils'
|
2
|
-
require 'escape'
|
3
2
|
class RemoteTable
|
4
3
|
class Format
|
5
4
|
module Textual
|
@@ -8,35 +7,36 @@ class RemoteTable
|
|
8
7
|
'\xc2\xad', # soft hyphen, often inserted by MS Office (html: &shy;)
|
9
8
|
]
|
10
9
|
def remove_useless_characters!
|
11
|
-
|
10
|
+
Utils.in_place t.local_file.path, 'perl', '-pe', "s/#{USELESS_CHARACTERS.join '//g; s/'}//g"
|
12
11
|
if t.properties.internal_encoding =~ /windows.?1252/i
|
13
12
|
# soft hyphen again, as I have seen it appear in windows 1252
|
14
|
-
|
13
|
+
Utils.in_place t.local_file.path, 'perl', '-pe', 's/\xad//g'
|
15
14
|
end
|
16
15
|
end
|
17
16
|
|
18
17
|
def transliterate_whole_file_to_utf8!
|
19
|
-
|
20
|
-
t.properties.update
|
18
|
+
Utils.in_place t.local_file.path, 'iconv', '-c', '-f', t.properties.internal_encoding, '-t', t.properties.external_encoding_iconv, :ignore_error => true
|
19
|
+
t.properties.update :encoding => t.properties.external_encoding
|
21
20
|
end
|
22
21
|
|
23
22
|
def fix_newlines!
|
24
|
-
|
23
|
+
Utils.in_place t.local_file.path, 'perl', '-pe', 's/\r\n|\n|\r/\n/g'
|
25
24
|
end
|
26
25
|
|
27
26
|
def skip_rows!
|
28
27
|
return unless t.properties.skip > 0
|
29
|
-
|
28
|
+
Utils.in_place t.local_file.path, 'tail', '-n', "+#{t.properties.skip + 1}"
|
30
29
|
end
|
31
30
|
|
32
31
|
def crop_rows!
|
33
32
|
return unless t.properties.crop
|
34
|
-
|
33
|
+
Utils.in_place t.local_file.path, 'tail', '-n', "+#{t.properties.crop.first}"
|
34
|
+
Utils.in_place t.local_file.path, 'head', '-n', (t.properties.crop.last - t.properties.crop.first + 1).to_s
|
35
35
|
end
|
36
36
|
|
37
37
|
def cut_columns!
|
38
38
|
return unless t.properties.cut
|
39
|
-
|
39
|
+
Utils.in_place t.local_file.path, 'cut', '-c', t.properties.cut.to_s
|
40
40
|
end
|
41
41
|
end
|
42
42
|
end
|
@@ -1,6 +1,5 @@
|
|
1
1
|
require 'fileutils'
|
2
|
-
|
3
|
-
require 'tmpdir'
|
2
|
+
|
4
3
|
class RemoteTable
|
5
4
|
class LocalFile #:nodoc:all
|
6
5
|
|
@@ -11,7 +10,7 @@ class RemoteTable
|
|
11
10
|
end
|
12
11
|
|
13
12
|
def path
|
14
|
-
|
13
|
+
generate unless generated?
|
15
14
|
@path
|
16
15
|
end
|
17
16
|
|
@@ -23,95 +22,34 @@ class RemoteTable
|
|
23
22
|
end
|
24
23
|
end
|
25
24
|
|
26
|
-
def
|
25
|
+
def cleanup
|
27
26
|
if @encoded_io.respond_to?(:closed?) and !@encoded_io.closed?
|
28
27
|
@encoded_io.close
|
29
28
|
end
|
30
|
-
::FileUtils.rm_rf staging_dir_path
|
31
29
|
@encoded_io = nil
|
30
|
+
if @path and ::File.exist?(@path)
|
31
|
+
::FileUtils.rm_f @path
|
32
|
+
end
|
32
33
|
@path = nil
|
33
|
-
@
|
34
|
+
@generated = nil
|
34
35
|
end
|
35
36
|
|
36
37
|
private
|
37
38
|
|
38
|
-
def
|
39
|
-
|
40
|
-
srand # in case this was forked by resque
|
41
|
-
@staging_dir_path = ::File.join ::Dir.tmpdir, 'remote_table_gem', rand.to_s
|
42
|
-
::FileUtils.mkdir_p @staging_dir_path
|
43
|
-
@staging_dir_path
|
44
|
-
end
|
45
|
-
|
46
|
-
def save_locally
|
47
|
-
return if @path.is_a?(::String)
|
48
|
-
@path = ::File.join(staging_dir_path, ::File.basename(t.properties.uri.path))
|
49
|
-
download
|
50
|
-
decompress
|
51
|
-
unpack
|
52
|
-
pick
|
53
|
-
@path
|
54
|
-
end
|
55
|
-
|
56
|
-
def download
|
57
|
-
if t.properties.uri.scheme == 'file'
|
58
|
-
::FileUtils.cp t.properties.uri.path, @path
|
59
|
-
else
|
60
|
-
# sabshere 1/20/11 FIXME: ::RemoteTable.config.curl_bin_path or smth
|
61
|
-
# sabshere 7/20/11 make web requests move more slowly so you don't get accused of DOS
|
62
|
-
sleep t.properties.delay_between_requests if t.properties.delay_between_requests
|
63
|
-
$stderr.puts "[remote_table] Downloading #{t.properties.uri.to_s}"
|
64
|
-
::RemoteTable.executor.backtick_with_reporting %{
|
65
|
-
curl
|
66
|
-
--silent
|
67
|
-
--show-error
|
68
|
-
--location
|
69
|
-
--header "Expect: "
|
70
|
-
#{"--data #{::Escape.shell_single_word t.properties.form_data}" if t.properties.form_data.present?}
|
71
|
-
--output #{::Escape.shell_single_word @path}
|
72
|
-
#{::Escape.shell_single_word t.properties.uri.to_s}
|
73
|
-
2>&1
|
74
|
-
}
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
def decompress
|
79
|
-
return unless t.properties.compression
|
80
|
-
new_path = @path.chomp ".#{t.properties.compression}"
|
81
|
-
raise_on_error = true
|
82
|
-
cmd = case t.properties.compression
|
83
|
-
when 'zip', 'exe'
|
84
|
-
# can't set path yet because there may be multiple files
|
85
|
-
raise_on_error = false
|
86
|
-
"unzip -qq -n #{::Escape.shell_single_word @path} -d #{::File.dirname(@path)}"
|
87
|
-
when 'bz2'
|
88
|
-
@path = new_path
|
89
|
-
"bunzip2 --stdout #{::Escape.shell_single_word @path} > #{::Escape.shell_single_word new_path}"
|
90
|
-
when 'gz'
|
91
|
-
@path = new_path
|
92
|
-
"gunzip --stdout #{::Escape.shell_single_word @path} > #{::Escape.shell_single_word new_path}"
|
93
|
-
end
|
94
|
-
::RemoteTable.executor.backtick_with_reporting cmd, raise_on_error
|
39
|
+
def generated?
|
40
|
+
@generated == true
|
95
41
|
end
|
96
|
-
|
97
|
-
def
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
"tar -xf #{::Escape.shell_single_word @path} -C #{::File.dirname(@path)}"
|
42
|
+
|
43
|
+
def generate
|
44
|
+
tmp_path = Utils.download t.properties.uri, t.properties.form_data
|
45
|
+
if compression = t.properties.compression
|
46
|
+
tmp_path = Utils.decompress tmp_path, compression
|
102
47
|
end
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
# ex. A: 2007-01.csv.gz (compression not capable of storing multiple files)
|
107
|
-
# ex. B: 2007-01.tar.gz (packing)
|
108
|
-
# ex. C: 2007-01.zip (compression capable of storing multiple files)
|
109
|
-
def pick
|
110
|
-
if t.properties.filename.present?
|
111
|
-
@path = ::File.join ::File.dirname(@path), t.properties.filename
|
112
|
-
elsif t.properties.glob.present?
|
113
|
-
@path = ::Dir[::File.dirname(@path)+t.properties.glob].first
|
48
|
+
if packing = t.properties.packing
|
49
|
+
tmp_path = Utils.unpack tmp_path, packing
|
114
50
|
end
|
51
|
+
@path = Utils.pick tmp_path, :filename => t.properties.filename, :glob => t.properties.glob
|
52
|
+
@generated = true
|
115
53
|
end
|
116
54
|
end
|
117
55
|
end
|
@@ -7,22 +7,18 @@ class RemoteTable
|
|
7
7
|
|
8
8
|
def initialize(t)
|
9
9
|
@t = t
|
10
|
-
@current_options = t.options.
|
10
|
+
@current_options = t.options.symbolize_keys
|
11
11
|
end
|
12
12
|
|
13
13
|
def update(options)
|
14
14
|
current_options.update options
|
15
15
|
end
|
16
|
-
|
17
|
-
def delay_between_requests
|
18
|
-
current_options['delay_between_requests'] || (::ENV.has_key?('REMOTE_TABLE_DELAY_BETWEEN_REQUESTS') ? ::ENV['REMOTE_TABLE_DELAY_BETWEEN_REQUESTS'].to_i : nil)
|
19
|
-
end
|
20
|
-
|
16
|
+
|
21
17
|
# The parsed URI of the file to get.
|
22
18
|
def uri
|
23
19
|
return @uri if @uri.is_a?(::URI)
|
24
20
|
@uri = ::URI.parse t.url
|
25
|
-
if @uri.host == 'spreadsheets.google.com'
|
21
|
+
if @uri.host == 'spreadsheets.google.com' or @uri.host == 'docs.google.com'
|
26
22
|
@uri.query = 'output=csv&' + @uri.query.sub(/\&?output=.*?(\&|\z)/, '\1')
|
27
23
|
end
|
28
24
|
@uri
|
@@ -33,19 +29,19 @@ class RemoteTable
|
|
33
29
|
# * call each
|
34
30
|
# Defaults to false.
|
35
31
|
def streaming
|
36
|
-
current_options[
|
32
|
+
current_options[:streaming] || false
|
37
33
|
end
|
38
34
|
|
39
35
|
# Defaults to true.
|
40
36
|
def warn_on_multiple_downloads
|
41
|
-
current_options[
|
37
|
+
current_options[:warn_on_multiple_downloads] != false
|
42
38
|
end
|
43
39
|
|
44
40
|
# The headers specified by the user
|
45
41
|
#
|
46
42
|
# Default: :first_row
|
47
43
|
def headers
|
48
|
-
current_options[
|
44
|
+
current_options[:headers].nil? ? :first_row : current_options[:headers]
|
49
45
|
end
|
50
46
|
|
51
47
|
def use_first_row_as_header?
|
@@ -60,30 +56,30 @@ class RemoteTable
|
|
60
56
|
#
|
61
57
|
# Default: 0
|
62
58
|
def sheet
|
63
|
-
current_options[
|
59
|
+
current_options[:sheet] || 0
|
64
60
|
end
|
65
61
|
|
66
62
|
# Whether to keep blank rows
|
67
63
|
#
|
68
64
|
# Default: false
|
69
65
|
def keep_blank_rows
|
70
|
-
current_options[
|
66
|
+
current_options[:keep_blank_rows] || false
|
71
67
|
end
|
72
68
|
|
73
69
|
# Form data to send in with the download request
|
74
70
|
def form_data
|
75
|
-
current_options[
|
71
|
+
current_options[:form_data]
|
76
72
|
end
|
77
73
|
|
78
74
|
# How many rows to skip
|
79
75
|
#
|
80
76
|
# Default: 0
|
81
77
|
def skip
|
82
|
-
current_options[
|
78
|
+
current_options[:skip] || 0
|
83
79
|
end
|
84
80
|
|
85
81
|
def internal_encoding
|
86
|
-
(current_options[
|
82
|
+
(current_options[:encoding] || 'UTF-8').upcase
|
87
83
|
end
|
88
84
|
|
89
85
|
def external_encoding
|
@@ -98,49 +94,47 @@ class RemoteTable
|
|
98
94
|
#
|
99
95
|
# Default: ","
|
100
96
|
def delimiter
|
101
|
-
current_options[
|
97
|
+
current_options[:delimiter] || ','
|
102
98
|
end
|
103
99
|
|
104
100
|
# The XPath used to find rows
|
105
101
|
def row_xpath
|
106
|
-
current_options[
|
102
|
+
current_options[:row_xpath]
|
107
103
|
end
|
108
104
|
|
109
105
|
# The XPath used to find columns
|
110
106
|
def column_xpath
|
111
|
-
current_options[
|
107
|
+
current_options[:column_xpath]
|
112
108
|
end
|
113
109
|
|
114
110
|
# The CSS selector used to find rows
|
115
111
|
def row_css
|
116
|
-
current_options[
|
112
|
+
current_options[:row_css]
|
117
113
|
end
|
118
114
|
|
119
115
|
# The CSS selector used to find columns
|
120
116
|
def column_css
|
121
|
-
current_options[
|
117
|
+
current_options[:column_css]
|
122
118
|
end
|
123
119
|
|
124
120
|
# The compression type.
|
125
121
|
#
|
126
122
|
# Default: guessed from URI.
|
127
123
|
#
|
128
|
-
# Can be specified as:
|
124
|
+
# Can be specified as: :gz, :zip, :bz2, :exe (treated as :zip)
|
129
125
|
def compression
|
130
|
-
|
131
|
-
current_options[
|
132
|
-
else
|
133
|
-
::File.extname uri.path
|
126
|
+
if current_options.has_key?(:compression)
|
127
|
+
return current_options[:compression]
|
134
128
|
end
|
135
|
-
case
|
129
|
+
case ::File.extname(uri.path).downcase
|
136
130
|
when /gz/, /gunzip/
|
137
|
-
|
131
|
+
:gz
|
138
132
|
when /zip/
|
139
|
-
|
133
|
+
:zip
|
140
134
|
when /bz2/, /bunzip2/
|
141
|
-
|
135
|
+
:bz2
|
142
136
|
when /exe/
|
143
|
-
|
137
|
+
:exe
|
144
138
|
end
|
145
139
|
end
|
146
140
|
|
@@ -148,82 +142,79 @@ class RemoteTable
|
|
148
142
|
#
|
149
143
|
# Default: guessed from URI.
|
150
144
|
#
|
151
|
-
# Can be specified as:
|
145
|
+
# Can be specified as: :tar
|
152
146
|
def packing
|
153
|
-
|
154
|
-
current_options[
|
155
|
-
else
|
156
|
-
::File.extname(uri.path.sub(/\.#{compression}\z/, ''))
|
147
|
+
if current_options.has_key?(:packing)
|
148
|
+
return current_options[:packing]
|
157
149
|
end
|
158
|
-
|
159
|
-
|
160
|
-
'tar'
|
150
|
+
if uri.path =~ %r{\.tar(?:\.|$)}i
|
151
|
+
:tar
|
161
152
|
end
|
162
153
|
end
|
163
154
|
|
164
155
|
# The glob used to pick a file out of an archive.
|
165
156
|
#
|
166
157
|
# Example:
|
167
|
-
# RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip',
|
158
|
+
# RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :glob => '/*.csv'
|
168
159
|
def glob
|
169
|
-
current_options[
|
160
|
+
current_options[:glob]
|
170
161
|
end
|
171
162
|
|
172
163
|
# The filename, which can be used to pick a file out of an archive.
|
173
164
|
#
|
174
165
|
# Example:
|
175
|
-
# RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip',
|
166
|
+
# RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv'
|
176
167
|
def filename
|
177
|
-
current_options[
|
168
|
+
current_options[:filename]
|
178
169
|
end
|
179
170
|
|
180
171
|
# Cut columns up to this character
|
181
172
|
def cut
|
182
|
-
current_options[
|
173
|
+
current_options[:cut]
|
183
174
|
end
|
184
175
|
|
185
176
|
# Crop rows after this line
|
186
177
|
def crop
|
187
|
-
current_options[
|
178
|
+
current_options[:crop]
|
188
179
|
end
|
189
180
|
|
190
181
|
# The fixed-width schema, given as an array
|
191
182
|
#
|
192
183
|
# Example:
|
193
184
|
# RemoteTable.new('http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
|
194
|
-
#
|
195
|
-
#
|
196
|
-
#
|
185
|
+
# :format => :fixed_width,
|
186
|
+
# :skip => 1,
|
187
|
+
# :schema => [[ 'header4', 10, { :type => :string } ],
|
197
188
|
# [ 'spacer', 1 ],
|
198
189
|
# [ 'header5', 10, { :type => :string } ],
|
199
190
|
# [ 'spacer', 12 ],
|
200
191
|
# [ 'header6', 10, { :type => :string } ]])
|
201
192
|
def schema
|
202
|
-
current_options[
|
193
|
+
current_options[:schema]
|
203
194
|
end
|
204
195
|
|
205
196
|
# The name of the fixed-width schema according to FixedWidth
|
206
197
|
def schema_name
|
207
|
-
current_options[
|
198
|
+
current_options[:schema_name]
|
208
199
|
end
|
209
200
|
|
210
201
|
# A proc to call to decide whether to return a row.
|
211
202
|
def select
|
212
|
-
current_options[
|
203
|
+
current_options[:select]
|
213
204
|
end
|
214
205
|
|
215
206
|
# A proc to call to decide whether to return a row.
|
216
207
|
def reject
|
217
|
-
current_options[
|
208
|
+
current_options[:reject]
|
218
209
|
end
|
219
210
|
|
220
211
|
# A hash of options to create a new Errata instance (see the Errata gem at http://github.com/seamusabshere/errata) to be used on every row.
|
221
212
|
def errata
|
222
|
-
return unless current_options.has_key?
|
223
|
-
@errata ||= if current_options[
|
224
|
-
::Errata.new current_options[
|
213
|
+
return unless current_options.has_key? :errata
|
214
|
+
@errata ||= if current_options[:errata].is_a? ::Hash
|
215
|
+
::Errata.new current_options[:errata]
|
225
216
|
else
|
226
|
-
current_options[
|
217
|
+
current_options[:errata]
|
227
218
|
end
|
228
219
|
end
|
229
220
|
|
@@ -233,15 +224,15 @@ class RemoteTable
|
|
233
224
|
#
|
234
225
|
# Default: guessed from file extension (which is usually the same as the URI, but sometimes not if you pick out a specific file from an archive)
|
235
226
|
#
|
236
|
-
# Can be specified as:
|
227
|
+
# Can be specified as: :xlsx, :xls, :delimited (aka :csv and :tsv), :ods, :fixed_width, :html
|
237
228
|
def format
|
238
|
-
return Format::Delimited if uri.host == 'spreadsheets.google.com'
|
239
|
-
clue = if current_options
|
240
|
-
current_options[
|
229
|
+
return Format::Delimited if uri.host == 'spreadsheets.google.com' or @uri.host == 'docs.google.com'
|
230
|
+
clue = if current_options.has_key?(:format)
|
231
|
+
current_options[:format]
|
241
232
|
else
|
242
|
-
|
233
|
+
t.local_file.path
|
243
234
|
end
|
244
|
-
case clue.downcase
|
235
|
+
case clue.to_s.downcase
|
245
236
|
when /xlsx/, /excelx/
|
246
237
|
Format::Excelx
|
247
238
|
when /xls/, /excel/
|