remote_table 1.4.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +17 -0
- data/README.markdown +59 -37
- data/lib/remote_table.rb +478 -56
- data/lib/remote_table/delimited.rb +91 -0
- data/lib/remote_table/fixed_width.rb +81 -0
- data/lib/remote_table/html.rb +13 -0
- data/lib/remote_table/{local_file.rb → local_copy.rb} +26 -22
- data/lib/remote_table/ods.rb +17 -0
- data/lib/remote_table/plaintext.rb +67 -0
- data/lib/remote_table/processed_by_nokogiri.rb +76 -0
- data/lib/remote_table/processed_by_roo.rb +97 -0
- data/lib/remote_table/transformer.rb +9 -5
- data/lib/remote_table/version.rb +1 -1
- data/lib/remote_table/xls.rb +11 -0
- data/lib/remote_table/xlsx.rb +11 -0
- data/lib/remote_table/xml.rb +13 -0
- data/lib/remote_table/yaml.rb +14 -0
- data/remote_table.gemspec +2 -2
- data/test/test_big.rb +1 -1
- data/test/test_remote_table.rb +26 -21
- metadata +19 -20
- data/lib/remote_table/config.rb +0 -251
- data/lib/remote_table/format.rb +0 -49
- data/lib/remote_table/format/delimited.rb +0 -60
- data/lib/remote_table/format/excel.rb +0 -10
- data/lib/remote_table/format/excelx.rb +0 -10
- data/lib/remote_table/format/fixed_width.rb +0 -60
- data/lib/remote_table/format/html.rb +0 -12
- data/lib/remote_table/format/mixins/processed_by_nokogiri.rb +0 -70
- data/lib/remote_table/format/mixins/processed_by_roo.rb +0 -63
- data/lib/remote_table/format/mixins/textual.rb +0 -43
- data/lib/remote_table/format/open_office.rb +0 -13
- data/lib/remote_table/format/xml.rb +0 -12
- data/lib/remote_table/format/yaml.rb +0 -14
@@ -3,6 +3,7 @@ class RemoteTable
|
|
3
3
|
attr_reader :t
|
4
4
|
def initialize(t)
|
5
5
|
@t = t
|
6
|
+
@legacy_transformer_mutex = ::Mutex.new
|
6
7
|
end
|
7
8
|
# eventually this will support a different way of specifying a transformer
|
8
9
|
def transform(row)
|
@@ -14,12 +15,15 @@ class RemoteTable
|
|
14
15
|
end
|
15
16
|
def legacy_transformer
|
16
17
|
return @legacy_transformer[0] if @legacy_transformer.is_a?(::Array)
|
17
|
-
|
18
|
-
|
19
|
-
|
18
|
+
@legacy_transformer_mutex.synchronize do
|
19
|
+
return @legacy_transformer[0] if @legacy_transformer.is_a?(::Array)
|
20
|
+
memo = if (transform_settings = t.transform_settings)
|
21
|
+
transform_settings = transform_settings.symbolize_keys
|
22
|
+
transform_settings[:class].new transform_settings.except(:class)
|
23
|
+
end
|
24
|
+
@legacy_transformer = [memo]
|
25
|
+
memo
|
20
26
|
end
|
21
|
-
@legacy_transformer = [memo]
|
22
|
-
memo
|
23
27
|
end
|
24
28
|
end
|
25
29
|
end
|
data/lib/remote_table/version.rb
CHANGED
@@ -0,0 +1,13 @@
|
|
1
|
+
class RemoteTable
|
2
|
+
# Parses XML files using Nokogiri's Nokogiri::XML::Document class.
|
3
|
+
module Xml
|
4
|
+
def self.extended(base)
|
5
|
+
base.extend Plaintext
|
6
|
+
base.extend ProcessedByNokogiri
|
7
|
+
end
|
8
|
+
|
9
|
+
def nokogiri_class
|
10
|
+
::Nokogiri::XML::Document
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
data/remote_table.gemspec
CHANGED
@@ -7,8 +7,8 @@ Gem::Specification.new do |s|
|
|
7
7
|
s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
|
8
8
|
s.email = ["seamus@abshere.net"]
|
9
9
|
s.homepage = "https://github.com/seamusabshere/remote_table"
|
10
|
-
s.summary =
|
11
|
-
s.description = %
|
10
|
+
s.summary = %{Open Google Docs spreadsheets, local or remote XLSX, XLS, ODS, CSV (comma separated), TSV (tab separated), other delimited, fixed-width files.}
|
11
|
+
s.description = %{Open Google Docs spreadsheets, local or remote XLSX, XLS, ODS, CSV (comma separated), TSV (tab separated), other delimited, fixed-width files. Returns an Array of Arrays or Hashes, depending on whether there are headers.}
|
12
12
|
|
13
13
|
s.rubyforge_project = "remotetable"
|
14
14
|
|
data/test/test_big.rb
CHANGED
@@ -31,7 +31,7 @@ describe RemoteTable do
|
|
31
31
|
:format => :fixed_width,
|
32
32
|
:crop => 21..26, # inclusive
|
33
33
|
:cut => '2-',
|
34
|
-
:select =>
|
34
|
+
:select => proc { |row| /\A[A-Z]/.match row['code'] },
|
35
35
|
:schema => [[ 'code', 2, { :type => :string } ],
|
36
36
|
[ 'spacer', 2 ],
|
37
37
|
[ 'name', 52, { :type => :string } ]])
|
data/test/test_remote_table.rb
CHANGED
@@ -4,20 +4,25 @@ require 'tempfile'
|
|
4
4
|
|
5
5
|
describe RemoteTable do
|
6
6
|
it "open an XLSX" do
|
7
|
+
t = RemoteTable.new 'http://www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx'
|
8
|
+
t[5]["Requirements"].must_equal "Secure encryption of all data"
|
9
|
+
end
|
10
|
+
|
11
|
+
it "does its best to download urls without http://" do
|
7
12
|
t = RemoteTable.new 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx'
|
8
13
|
t[5]["Requirements"].must_equal "Secure encryption of all data"
|
9
14
|
end
|
10
|
-
|
15
|
+
|
11
16
|
it "add a row hash to every row" do
|
12
|
-
t = RemoteTable.new(:url => 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx')
|
17
|
+
t = RemoteTable.new(:url => 'http://www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx')
|
13
18
|
t[5].row_hash.must_equal "06d8a738551c17735e2731e25c8d0461"
|
14
19
|
end
|
15
|
-
|
20
|
+
|
16
21
|
it "open a google doc" do
|
17
22
|
t = RemoteTable.new 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw'
|
18
23
|
t[0]['name'].must_equal 'Seamus Abshere'
|
19
24
|
end
|
20
|
-
|
25
|
+
|
21
26
|
it "open a csv with custom headers" do
|
22
27
|
t = RemoteTable.new 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :headers => %w{ col1 col2 col3 }
|
23
28
|
t[0]['col2'].must_equal 'name'
|
@@ -31,29 +36,29 @@ describe RemoteTable do
|
|
31
36
|
t[1]['name'].must_equal 'Derek Kastner'
|
32
37
|
t[1]['city'].must_equal 'Lansing'
|
33
38
|
end
|
34
|
-
|
39
|
+
|
35
40
|
it "return an ordered hash" do
|
36
41
|
t = RemoteTable.new 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw'
|
37
42
|
t[0].class.must_equal ::ActiveSupport::OrderedHash
|
38
43
|
end
|
39
|
-
|
44
|
+
|
40
45
|
it "pass through fastercsv options" do
|
41
46
|
f = Tempfile.new 'pass-through-fastercsv-options'
|
42
47
|
f.write %{3,Title example,Body example with a <a href="">link</a>,test category}
|
43
48
|
f.flush
|
44
|
-
t = RemoteTable.new "file://#{f.path}", :quote_char => %{'}, :headers => nil
|
49
|
+
t = RemoteTable.new "file://#{f.path}", :quote_char => %{'}, :headers => nil # this should really be "headers => false"
|
45
50
|
t[0][2].must_equal %{Body example with a <a href="">link</a>}
|
46
51
|
f.close
|
47
52
|
end
|
48
|
-
|
53
|
+
|
49
54
|
it "open a csv inside a zip file" do
|
50
55
|
t = RemoteTable.new 'http://www.epa.gov/climatechange/emissions/downloads10/2010-Inventory-Annex-Tables.zip',
|
51
56
|
:filename => 'Annex Tables/Annex 3/Table A-93.csv',
|
52
57
|
:skip => 1,
|
53
|
-
:select =>
|
58
|
+
:select => proc { |row| row['Vehicle Age'].strip =~ /^\d+$/ }
|
54
59
|
t[0]['LDGV'].must_equal '9.09%'
|
55
60
|
end
|
56
|
-
|
61
|
+
|
57
62
|
it 'not blow up if each is called twice' do
|
58
63
|
t = RemoteTable.new 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw'
|
59
64
|
count = 0
|
@@ -62,7 +67,7 @@ describe RemoteTable do
|
|
62
67
|
t.each { |row| count += 1}
|
63
68
|
count.must_equal first_run*2
|
64
69
|
end
|
65
|
-
|
70
|
+
|
66
71
|
it 'allow itself to be cleared for save memory' do
|
67
72
|
t = RemoteTable.new 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw'
|
68
73
|
t.to_a
|
@@ -70,23 +75,23 @@ describe RemoteTable do
|
|
70
75
|
t.free
|
71
76
|
t.send(:cache).length.must_equal 0
|
72
77
|
end
|
73
|
-
|
78
|
+
|
74
79
|
# fixes ArgumentError: invalid byte sequence in UTF-8
|
75
80
|
it %{safely strip soft hyphens and read windows-1252 html} do
|
76
81
|
t = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-A.htm", :row_xpath => '//table[2]//table[1]//tr[3]//tr', :column_xpath => 'td', :encoding => 'windows-1252'
|
77
82
|
t.rows.detect { |row| row['Model'] == 'A300B4600' }.wont_equal nil
|
78
83
|
end
|
79
|
-
|
84
|
+
|
80
85
|
it %{transliterate characters from ISO-8859-1} do
|
81
86
|
t = RemoteTable.new :url => 'http://static.brighterplanet.com/science/data/consumables/pets/breed_genders.csv', :encoding => 'ISO-8859-1'
|
82
87
|
t.rows.detect { |row| row['name'] == 'Briquet Griffon Vendéen' }.wont_equal nil
|
83
88
|
end
|
84
|
-
|
89
|
+
|
85
90
|
it %{read xml with css selectors} do
|
86
91
|
t = RemoteTable.new 'http://www.nanonull.com/TimeService/TimeService.asmx/getCityTime?city=Chicago', :format => :xml, :row_css => 'string', :headers => false
|
87
92
|
/(AM|PM)/.match(t[0][0]).wont_equal nil
|
88
93
|
end
|
89
|
-
|
94
|
+
|
90
95
|
it %{optionally stream rows instead of caching them} do
|
91
96
|
t = RemoteTable.new 'http://www.earthtools.org/timezone/40.71417/-74.00639', :format => :xml, :row_xpath => '//timezone/isotime', :headers => false, :streaming => true
|
92
97
|
time1 = t[0][0]
|
@@ -95,7 +100,7 @@ describe RemoteTable do
|
|
95
100
|
time2 = t[0][0]
|
96
101
|
time1.wont_equal time2
|
97
102
|
end
|
98
|
-
|
103
|
+
|
99
104
|
{
|
100
105
|
# IMPOSSIBLE "../support/list-en1-semic-3.office-2011-for-mac-sp1-excel-95.binary.xls" => {:format=>"xls", :encoding=>"binary"},
|
101
106
|
"../support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xlsx" => {:format=>"xlsx"},
|
@@ -106,7 +111,7 @@ describe RemoteTable do
|
|
106
111
|
# TODO "../support/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html" => {:format=>"html" },
|
107
112
|
# TODO "../support/list-en1-semic-3.office-2011-for-mac-sp1.iso-8859-1.html" => {:format=>"html", :encoding=>"iso-8859-1"},
|
108
113
|
# TODO "../support/list-en1-semic-3.neooffice.utf-8.html" => {:format=>"html" },
|
109
|
-
"../support/list-en1-semic-3.neooffice.utf-8.xml" => {:format=>"xml", :row_css=>'Row', :column_css => 'Data', :select =>
|
114
|
+
"../support/list-en1-semic-3.neooffice.utf-8.xml" => {:format=>"xml", :row_css=>'Row', :column_css => 'Data', :select => proc { |row| row[1].to_s =~ /[A-Z]{2}/ }},
|
110
115
|
"../support/list-en1-semic-3.neooffice.iso-8859-1.csv" => {:format=>"csv", :encoding=>"iso-8859-1", :delimiter => ';'},
|
111
116
|
"../support/list-en1-semic-3.original.iso-8859-1.csv" => {:format=>"csv", :encoding=>"iso-8859-1", :delimiter => ';'},
|
112
117
|
"../support/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma" => {:format=>"csv", :encoding=>"MACROMAN"}, # comma because no option in excel
|
@@ -124,22 +129,22 @@ describe RemoteTable do
|
|
124
129
|
b.must_equal a
|
125
130
|
end
|
126
131
|
end
|
127
|
-
|
132
|
+
|
128
133
|
it %{recode as UTF-8 even ISO-8859-1 (or any other encoding)} do
|
129
134
|
t = RemoteTable.new 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';', :encoding => 'ISO-8859-1'
|
130
135
|
t[1][0].must_equal %{ÅLAND ISLANDS}
|
131
136
|
end
|
132
|
-
|
137
|
+
|
133
138
|
it %{parse a big CSV that is not UTF-8} do
|
134
139
|
t = RemoteTable.new 'https://openflights.svn.sourceforge.net/svnroot/openflights/openflights/data/airports.dat', :headers => false#, :encoding => 'UTF-8'
|
135
140
|
t[0][1].must_equal 'Goroka'
|
136
141
|
end
|
137
|
-
|
142
|
+
|
138
143
|
it "read only certain rows of an XLSX" do
|
139
144
|
t = RemoteTable.new 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :crop => 11..16, :headers => false
|
140
145
|
t[0][0].must_equal "Permissioning and access groups for all content"
|
141
146
|
t[4][0].must_equal "Manage Multiple Incentive Programs for Participants"
|
142
|
-
|
147
|
+
|
143
148
|
t = RemoteTable.new 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :crop => 11..16, :headers => %w{ col1 }
|
144
149
|
t[0]['col1'].must_equal "Permissioning and access groups for all content"
|
145
150
|
t[4]['col1'].must_equal "Manage Multiple Incentive Programs for Participants"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: remote_table
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2012-
|
13
|
+
date: 2012-05-08 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: activesupport
|
@@ -124,8 +124,9 @@ dependencies:
|
|
124
124
|
- - ! '>='
|
125
125
|
- !ruby/object:Gem::Version
|
126
126
|
version: '0'
|
127
|
-
description:
|
128
|
-
|
127
|
+
description: Open Google Docs spreadsheets, local or remote XLSX, XLS, ODS, CSV (comma
|
128
|
+
separated), TSV (tab separated), other delimited, fixed-width files. Returns an
|
129
|
+
Array of Arrays or Hashes, depending on whether there are headers.
|
129
130
|
email:
|
130
131
|
- seamus@abshere.net
|
131
132
|
executables: []
|
@@ -140,22 +141,20 @@ files:
|
|
140
141
|
- README.markdown
|
141
142
|
- Rakefile
|
142
143
|
- lib/remote_table.rb
|
143
|
-
- lib/remote_table/
|
144
|
-
- lib/remote_table/
|
145
|
-
- lib/remote_table/
|
146
|
-
- lib/remote_table/
|
147
|
-
- lib/remote_table/
|
148
|
-
- lib/remote_table/
|
149
|
-
- lib/remote_table/
|
150
|
-
- lib/remote_table/
|
151
|
-
- lib/remote_table/format/mixins/processed_by_roo.rb
|
152
|
-
- lib/remote_table/format/mixins/textual.rb
|
153
|
-
- lib/remote_table/format/open_office.rb
|
154
|
-
- lib/remote_table/format/xml.rb
|
155
|
-
- lib/remote_table/format/yaml.rb
|
156
|
-
- lib/remote_table/local_file.rb
|
144
|
+
- lib/remote_table/delimited.rb
|
145
|
+
- lib/remote_table/fixed_width.rb
|
146
|
+
- lib/remote_table/html.rb
|
147
|
+
- lib/remote_table/local_copy.rb
|
148
|
+
- lib/remote_table/ods.rb
|
149
|
+
- lib/remote_table/plaintext.rb
|
150
|
+
- lib/remote_table/processed_by_nokogiri.rb
|
151
|
+
- lib/remote_table/processed_by_roo.rb
|
157
152
|
- lib/remote_table/transformer.rb
|
158
153
|
- lib/remote_table/version.rb
|
154
|
+
- lib/remote_table/xls.rb
|
155
|
+
- lib/remote_table/xlsx.rb
|
156
|
+
- lib/remote_table/xml.rb
|
157
|
+
- lib/remote_table/yaml.rb
|
159
158
|
- remote_table.gemspec
|
160
159
|
- test/fixtures/data.yml
|
161
160
|
- test/helper.rb
|
@@ -201,8 +200,8 @@ rubyforge_project: remotetable
|
|
201
200
|
rubygems_version: 1.8.21
|
202
201
|
signing_key:
|
203
202
|
specification_version: 3
|
204
|
-
summary: Open Google Docs spreadsheets, local or remote XLSX, XLS, ODS, CSV
|
205
|
-
separated), other delimited, fixed-width files.
|
203
|
+
summary: Open Google Docs spreadsheets, local or remote XLSX, XLS, ODS, CSV (comma
|
204
|
+
separated), TSV (tab separated), other delimited, fixed-width files.
|
206
205
|
test_files:
|
207
206
|
- test/fixtures/data.yml
|
208
207
|
- test/helper.rb
|
data/lib/remote_table/config.rb
DELETED
@@ -1,251 +0,0 @@
|
|
1
|
-
require 'uri'
|
2
|
-
class RemoteTable
|
3
|
-
# Represents the config of a RemoteTable, whether they are explicitly set by the user or inferred automatically.
|
4
|
-
class Config
|
5
|
-
attr_reader :t
|
6
|
-
attr_reader :user_specified_options
|
7
|
-
|
8
|
-
def initialize(t, user_specified_options)
|
9
|
-
@t = t
|
10
|
-
@user_specified_options = user_specified_options
|
11
|
-
end
|
12
|
-
|
13
|
-
# The parsed URI of the file to get.
|
14
|
-
def uri
|
15
|
-
return @uri if @uri.is_a?(::URI)
|
16
|
-
@uri = ::URI.parse t.url
|
17
|
-
if @uri.host == 'spreadsheets.google.com' or @uri.host == 'docs.google.com'
|
18
|
-
@uri.query = 'output=csv&' + @uri.query.sub(/\&?output=.*?(\&|\z)/, '\1')
|
19
|
-
end
|
20
|
-
@uri
|
21
|
-
end
|
22
|
-
|
23
|
-
# Whether to stream the rows without caching them. Saves memory, but you have to re-download the file every time you...
|
24
|
-
# * call []
|
25
|
-
# * call each
|
26
|
-
# Defaults to false.
|
27
|
-
def streaming
|
28
|
-
user_specified_options.fetch :streaming, false
|
29
|
-
end
|
30
|
-
|
31
|
-
# Defaults to true.
|
32
|
-
def warn_on_multiple_downloads
|
33
|
-
user_specified_options[:warn_on_multiple_downloads] != false
|
34
|
-
end
|
35
|
-
|
36
|
-
# The headers specified by the user
|
37
|
-
#
|
38
|
-
# Default: :first_row
|
39
|
-
def headers
|
40
|
-
user_specified_options[:headers].nil? ? :first_row : user_specified_options[:headers]
|
41
|
-
end
|
42
|
-
|
43
|
-
def use_first_row_as_header?
|
44
|
-
headers == :first_row
|
45
|
-
end
|
46
|
-
|
47
|
-
def output_class
|
48
|
-
headers == false ? ::Array : ::ActiveSupport::OrderedHash
|
49
|
-
end
|
50
|
-
|
51
|
-
# The sheet specified by the user as a number or a string
|
52
|
-
def sheet
|
53
|
-
user_specified_options[:sheet]
|
54
|
-
end
|
55
|
-
|
56
|
-
# Whether to keep blank rows
|
57
|
-
#
|
58
|
-
# Default: false
|
59
|
-
def keep_blank_rows
|
60
|
-
user_specified_options.fetch :keep_blank_rows, false
|
61
|
-
end
|
62
|
-
|
63
|
-
# Form data to send in with the download request
|
64
|
-
def form_data
|
65
|
-
user_specified_options[:form_data]
|
66
|
-
end
|
67
|
-
|
68
|
-
# How many rows to skip
|
69
|
-
#
|
70
|
-
# Default: 0
|
71
|
-
def skip
|
72
|
-
user_specified_options.fetch :skip, 0
|
73
|
-
end
|
74
|
-
|
75
|
-
def internal_encoding
|
76
|
-
user_specified_options.fetch(:encoding, 'UTF-8').upcase
|
77
|
-
end
|
78
|
-
|
79
|
-
def external_encoding
|
80
|
-
'UTF-8'
|
81
|
-
end
|
82
|
-
|
83
|
-
def external_encoding_iconv
|
84
|
-
'UTF-8//TRANSLIT'
|
85
|
-
end
|
86
|
-
|
87
|
-
# The delimiter
|
88
|
-
#
|
89
|
-
# Default: ","
|
90
|
-
def delimiter
|
91
|
-
user_specified_options.fetch :delimiter, ','
|
92
|
-
end
|
93
|
-
|
94
|
-
# The XPath used to find rows
|
95
|
-
def row_xpath
|
96
|
-
user_specified_options[:row_xpath]
|
97
|
-
end
|
98
|
-
|
99
|
-
# The XPath used to find columns
|
100
|
-
def column_xpath
|
101
|
-
user_specified_options[:column_xpath]
|
102
|
-
end
|
103
|
-
|
104
|
-
# The CSS selector used to find rows
|
105
|
-
def row_css
|
106
|
-
user_specified_options[:row_css]
|
107
|
-
end
|
108
|
-
|
109
|
-
# The CSS selector used to find columns
|
110
|
-
def column_css
|
111
|
-
user_specified_options[:column_css]
|
112
|
-
end
|
113
|
-
|
114
|
-
# The compression type.
|
115
|
-
#
|
116
|
-
# Default: guessed from URI.
|
117
|
-
#
|
118
|
-
# Can be specified as: :gz, :zip, :bz2, :exe (treated as :zip)
|
119
|
-
def compression
|
120
|
-
if user_specified_options.has_key?(:compression)
|
121
|
-
return user_specified_options[:compression]
|
122
|
-
end
|
123
|
-
case ::File.extname(uri.path).downcase
|
124
|
-
when /gz/, /gunzip/
|
125
|
-
:gz
|
126
|
-
when /zip/
|
127
|
-
:zip
|
128
|
-
when /bz2/, /bunzip2/
|
129
|
-
:bz2
|
130
|
-
when /exe/
|
131
|
-
:exe
|
132
|
-
end
|
133
|
-
end
|
134
|
-
|
135
|
-
# The packing type.
|
136
|
-
#
|
137
|
-
# Default: guessed from URI.
|
138
|
-
#
|
139
|
-
# Can be specified as: :tar
|
140
|
-
def packing
|
141
|
-
if user_specified_options.has_key?(:packing)
|
142
|
-
return user_specified_options[:packing]
|
143
|
-
end
|
144
|
-
if uri.path =~ %r{\.tar(?:\.|$)}i
|
145
|
-
:tar
|
146
|
-
end
|
147
|
-
end
|
148
|
-
|
149
|
-
# The glob used to pick a file out of an archive.
|
150
|
-
#
|
151
|
-
# Example:
|
152
|
-
# RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :glob => '/*.csv'
|
153
|
-
def glob
|
154
|
-
user_specified_options[:glob]
|
155
|
-
end
|
156
|
-
|
157
|
-
# The filename, which can be used to pick a file out of an archive.
|
158
|
-
#
|
159
|
-
# Example:
|
160
|
-
# RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv'
|
161
|
-
def filename
|
162
|
-
user_specified_options[:filename]
|
163
|
-
end
|
164
|
-
|
165
|
-
# Cut columns up to this character
|
166
|
-
def cut
|
167
|
-
user_specified_options[:cut]
|
168
|
-
end
|
169
|
-
|
170
|
-
# Crop rows after this line
|
171
|
-
def crop
|
172
|
-
user_specified_options[:crop]
|
173
|
-
end
|
174
|
-
|
175
|
-
# The fixed-width schema, given as an array
|
176
|
-
#
|
177
|
-
# Example:
|
178
|
-
# RemoteTable.new('http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
|
179
|
-
# :format => :fixed_width,
|
180
|
-
# :skip => 1,
|
181
|
-
# :schema => [[ 'header4', 10, { :type => :string } ],
|
182
|
-
# [ 'spacer', 1 ],
|
183
|
-
# [ 'header5', 10, { :type => :string } ],
|
184
|
-
# [ 'spacer', 12 ],
|
185
|
-
# [ 'header6', 10, { :type => :string } ]])
|
186
|
-
def schema
|
187
|
-
user_specified_options[:schema]
|
188
|
-
end
|
189
|
-
|
190
|
-
# The name of the fixed-width schema according to FixedWidth
|
191
|
-
def schema_name
|
192
|
-
user_specified_options[:schema_name]
|
193
|
-
end
|
194
|
-
|
195
|
-
# A proc to call to decide whether to return a row.
|
196
|
-
def select
|
197
|
-
user_specified_options[:select]
|
198
|
-
end
|
199
|
-
|
200
|
-
# A proc to call to decide whether to return a row.
|
201
|
-
def reject
|
202
|
-
user_specified_options[:reject]
|
203
|
-
end
|
204
|
-
|
205
|
-
# A hash of options to create a new Errata instance (see the Errata gem at http://github.com/seamusabshere/errata) to be used on every row.
|
206
|
-
def errata
|
207
|
-
return unless user_specified_options.has_key? :errata
|
208
|
-
@errata ||= if user_specified_options[:errata].is_a? ::Hash
|
209
|
-
::Errata.new user_specified_options[:errata]
|
210
|
-
else
|
211
|
-
user_specified_options[:errata]
|
212
|
-
end
|
213
|
-
end
|
214
|
-
|
215
|
-
# Get the format in the form of RemoteTable::Format::Excel, etc.
|
216
|
-
#
|
217
|
-
# Note: treats all spreadsheets.google.com URLs as Format::Delimited (i.e., CSV)
|
218
|
-
#
|
219
|
-
# Default: guessed from file extension (which is usually the same as the URI, but sometimes not if you pick out a specific file from an archive)
|
220
|
-
#
|
221
|
-
# Can be specified as: :xlsx, :xls, :delimited (aka :csv and :tsv), :ods, :fixed_width, :html
|
222
|
-
def format
|
223
|
-
return Format::Delimited if uri.host == 'spreadsheets.google.com' or @uri.host == 'docs.google.com'
|
224
|
-
clue = if user_specified_options.has_key?(:format)
|
225
|
-
user_specified_options[:format]
|
226
|
-
else
|
227
|
-
t.local_file.path
|
228
|
-
end
|
229
|
-
case clue.to_s.downcase
|
230
|
-
when /xlsx/, /excelx/
|
231
|
-
Format::Excelx
|
232
|
-
when /xls/, /excel/
|
233
|
-
Format::Excel
|
234
|
-
when /csv/, /tsv/, /delimited/
|
235
|
-
Format::Delimited
|
236
|
-
when /ods/, /open_?office/
|
237
|
-
Format::OpenOffice
|
238
|
-
when /fixed_?width/
|
239
|
-
Format::FixedWidth
|
240
|
-
when /htm/
|
241
|
-
Format::HTML
|
242
|
-
when /xml/
|
243
|
-
Format::XML
|
244
|
-
when /yaml/, /yml/
|
245
|
-
Format::Yaml
|
246
|
-
else
|
247
|
-
Format::Delimited
|
248
|
-
end
|
249
|
-
end
|
250
|
-
end
|
251
|
-
end
|