remote_table 3.0.0.alpha → 3.0.0.beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -0
- data/lib/remote_table.rb +37 -4
- data/lib/remote_table/delimited.rb +29 -19
- data/lib/remote_table/fixed_width.rb +5 -3
- data/lib/remote_table/processed_by_nokogiri.rb +5 -3
- data/lib/remote_table/version.rb +1 -1
- data/remote_table.gemspec +2 -0
- data/test/data/lots of spaces.csv +2 -0
- data/test/data/table.html +17 -0
- data/test/helper.rb +3 -1
- data/test/test_remote.rb +8 -13
- data/test/test_remote_table.rb +21 -0
- metadata +38 -2
data/CHANGELOG
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
3.0.0.beta / 2013-07-30
|
2
|
+
|
3
|
+
* Breaking changes
|
4
|
+
|
5
|
+
* Strip whitespace from headers unless you provide them as an array
|
6
|
+
* No longer passing any options on to CSV... use :delimiter instead of :col_sep
|
7
|
+
* Include columns with blank headers as "empty_N"
|
8
|
+
|
9
|
+
* Enhancements
|
10
|
+
|
11
|
+
* Support relative paths with spaces
|
12
|
+
|
1
13
|
3.0.0.alpha / 2013-07-25
|
2
14
|
|
3
15
|
* Breaking changes
|
data/lib/remote_table.rb
CHANGED
@@ -53,7 +53,7 @@ class RemoteTable
|
|
53
53
|
# Guess compression based on URL. Used internally.
|
54
54
|
# @return [Symbol,nil]
|
55
55
|
def guess_compression(url)
|
56
|
-
extname =
|
56
|
+
extname = extname(url).downcase
|
57
57
|
case extname
|
58
58
|
when /gz/, /gunzip/
|
59
59
|
:gz
|
@@ -69,7 +69,7 @@ class RemoteTable
|
|
69
69
|
# Guess packing from URL. Used internally.
|
70
70
|
# @return [Symbol,nil]
|
71
71
|
def guess_packing(url)
|
72
|
-
basename =
|
72
|
+
basename = basename(url).downcase
|
73
73
|
if basename.include?('.tar') or basename.include?('.tgz')
|
74
74
|
:tar
|
75
75
|
end
|
@@ -109,6 +109,24 @@ class RemoteTable
|
|
109
109
|
uri.query = params.join('&')
|
110
110
|
uri.to_s
|
111
111
|
end
|
112
|
+
|
113
|
+
private
|
114
|
+
|
115
|
+
# Last component of the path extracted from +url+ (via #path, so both URLs and
# local paths are accepted). Used by guess_packing.
# @param url [String]
# @return [String]
def basename(url)
|
116
|
+
::File.basename path(url)
|
117
|
+
end
|
118
|
+
|
119
|
+
# File extension of the path extracted from +url+ (via #path, so both URLs and
# local paths are accepted). Used by guess_compression.
# @param url [String]
# @return [String]
def extname(url)
|
120
|
+
::File.extname path(url)
|
121
|
+
end
|
122
|
+
|
123
|
+
# Reduce a URL or a local path to a plain path string.
#
# Anything containing "://" is parsed as a URI and only its path component is
# returned. Everything else is treated as a local path and expanded to an
# absolute path, which is what lets relative paths (including ones containing
# spaces) be handled without going through the URI parser.
#
# @param url [String] a URL or a local filesystem path
# @return [String]
def path(url)
  if url.include?('://')
    ::URI.parse(url).path
  else
    # Reference the top-level ::File explicitly, as basename and extname do,
    # so a File constant nested under this namespace can never shadow it.
    ::File.expand_path url
  end
end
|
112
130
|
end
|
113
131
|
|
114
132
|
EXTERNAL_ENCODING = 'UTF-8'
|
@@ -129,11 +147,13 @@ class RemoteTable
|
|
129
147
|
:keep_blank_rows => false,
|
130
148
|
:skip => 0,
|
131
149
|
:encoding => 'UTF-8',
|
132
|
-
:delimiter => ','
|
150
|
+
:delimiter => ',',
|
151
|
+
:quote_char => '"',
|
133
152
|
}
|
134
153
|
OLD_SETTING_NAMES = {
|
135
154
|
:pre_select => [:select],
|
136
155
|
:pre_reject => [:reject],
|
156
|
+
:delimiter => [:col_sep],
|
137
157
|
}
|
138
158
|
|
139
159
|
include ::Enumerable
|
@@ -178,6 +198,13 @@ class RemoteTable
|
|
178
198
|
# Headers specified by the user: +:first_row+ (the default), +false+, or a list of headers.
|
179
199
|
# @return [:first_row,false,Array<String>]
|
180
200
|
attr_reader :headers
|
201
|
+
|
202
|
+
# Quote character for delimited files.
|
203
|
+
#
|
204
|
+
# Defaults to double quotes.
|
205
|
+
#
|
206
|
+
# @return [String]
|
207
|
+
attr_reader :quote_char
|
181
208
|
|
182
209
|
# The sheet specified by the user as a number or a string.
|
183
210
|
# @return [String,Integer]
|
@@ -199,7 +226,7 @@ class RemoteTable
|
|
199
226
|
# @return [String]
|
200
227
|
attr_reader :encoding
|
201
228
|
|
202
|
-
# The delimiter, a.k.a. column separator. Passed to Ruby CSV as +:col_sep+. Default is
|
229
|
+
# The delimiter, a.k.a. column separator. Passed to Ruby CSV as +:col_sep+. Default is ','.
|
203
230
|
# @return [String]
|
204
231
|
attr_reader :delimiter
|
205
232
|
|
@@ -377,6 +404,7 @@ class RemoteTable
|
|
377
404
|
if headers.is_a?(::Array) and headers.any?(&:blank?)
|
378
405
|
raise ::ArgumentError, "[remote_table] If you specify headers, none of them can be blank"
|
379
406
|
end
|
407
|
+
@quote_char = grab settings, :quote_char
|
380
408
|
|
381
409
|
@compression = grab(settings, :compression) || RemoteTable.guess_compression(url)
|
382
410
|
@packing = grab(settings, :packing) || RemoteTable.guess_packing(url)
|
@@ -422,6 +450,7 @@ class RemoteTable
|
|
422
450
|
end
|
423
451
|
else
|
424
452
|
mark_download!
|
453
|
+
preprocess!
|
425
454
|
memo = _each do |row|
|
426
455
|
parser.parse(row).each do |virtual_row|
|
427
456
|
virtual_row.row_hash = ::HashDigest.hexdigest row
|
@@ -481,6 +510,10 @@ class RemoteTable
|
|
481
510
|
end
|
482
511
|
|
483
512
|
private
|
513
|
+
|
514
|
+
# Hook called by #each right after mark_download! and before rows are iterated.
# The default implementation does nothing.
def preprocess!
|
515
|
+
# noop here; other parts of the library define their own preprocess! to clean up the downloaded copy
|
516
|
+
end
|
484
517
|
|
485
518
|
def mark_download!
|
486
519
|
@download_count_mutex.synchronize do
|
@@ -14,24 +14,15 @@ class RemoteTable
|
|
14
14
|
Engine = ::FasterCSV
|
15
15
|
end
|
16
16
|
|
17
|
-
|
18
|
-
:unconverted_fields,
|
19
|
-
:col_sep,
|
20
|
-
:row_sep,
|
21
|
-
:return_headers,
|
22
|
-
:header_converters,
|
23
|
-
:quote_char,
|
24
|
-
:converters,
|
25
|
-
:force_quotes,
|
26
|
-
]
|
27
|
-
|
28
|
-
# Yield each row using Ruby's CSV parser (FasterCSV on Ruby 1.8).
|
29
|
-
def _each
|
17
|
+
# Prepare the downloaded copy of a delimited file before any row is read.
# These steps used to run at the start of _each; they now live in this hook.
# The helpers are called in exactly this order (their definitions are not visible here).
def preprocess!
|
30
18
|
delete_harmful!
|
31
19
|
convert_eol_to_unix!
|
32
20
|
transliterate_whole_file_to_utf8!
|
33
21
|
skip_rows!
|
22
|
+
end
|
34
23
|
|
24
|
+
# Yield each row using Ruby's CSV parser (FasterCSV on Ruby 1.8).
|
25
|
+
def _each
|
35
26
|
Engine.new(local_copy.encoded_io, csv_options).each do |row|
|
36
27
|
|
37
28
|
some_value_present = false
|
@@ -55,7 +46,6 @@ class RemoteTable
|
|
55
46
|
# represent the row as a hash
|
56
47
|
hash = ::ActiveSupport::OrderedHash.new
|
57
48
|
row.each do |k, v|
|
58
|
-
next unless k.present?
|
59
49
|
v = v.to_s
|
60
50
|
if not some_value_present and not keep_blank_rows and v.present?
|
61
51
|
some_value_present = true
|
@@ -81,11 +71,31 @@ class RemoteTable
|
|
81
71
|
#
|
82
72
|
# @return [Hash]
|
83
73
|
def csv_options
  # Only these four settings are forwarded to the CSV engine.
  # Hash rockets are used (not 1.9-only `key:` literals) to match the rest of
  # the library and to keep this file parseable on Ruby 1.8, where Engine is
  # FasterCSV.
  {
    :skip_blanks => !keep_blank_rows,
    :headers => headers,
    :col_sep => delimiter,
    :quote_char => quote_char,
  }
end
|
81
|
+
|
82
|
+
# Headers in the form handed to the CSV engine, memoized in @_headers.
#
# * +false+ / +nil+       -> +false+ (no header row)
# * +:first_row+ / +true+ -> the first line is read from the local copy and
#   parsed; in each header, runs of whitespace are collapsed to one space and
#   the result is stripped; blank headers become "empty_1", "empty_2", ...
# * an Array              -> returned as given (no stripping)
#
# @return [false, Array<String>]
# @raise [RuntimeError] if the configured headers are of any other kind
def headers
  return @_headers if defined?(@_headers)
  @_headers = case @headers
  when FalseClass, NilClass
    false
  when :first_row, TrueClass
    i = 0
    # Consumes the header line from the IO, so the engine starts at the first data row.
    line = local_copy.encoded_io.gets
    Engine.parse_line(line).map do |v|
      header = v.to_s.gsub(/\s+/, ' ').strip
      header.present? ? header : "empty_#{i+=1}"
    end
  when Array
    @headers
  else
    # Inspect the raw setting. Calling #headers here would re-enter this very
    # method before @_headers is assigned and recurse until the stack overflows.
    raise "Invalid headers: #{@headers.inspect}"
  end
end
|
90
100
|
end
|
91
101
|
end
|
@@ -13,15 +13,17 @@ class RemoteTable
|
|
13
13
|
@definition_mutex = ::Mutex.new
|
14
14
|
end
|
15
15
|
|
16
|
-
def
|
17
|
-
require 'fixed_width-multibyte'
|
18
|
-
|
16
|
+
# Prepare the downloaded copy of a fixed-width file before any row is read.
# These steps used to run at the start of _each; they now live in this hook.
# The helpers are called in exactly this order (their definitions are not visible here).
def preprocess!
|
19
17
|
delete_harmful!
|
20
18
|
convert_eol_to_unix!
|
21
19
|
transliterate_whole_file_to_utf8!
|
22
20
|
crop_rows!
|
23
21
|
skip_rows!
|
24
22
|
cut_columns!
|
23
|
+
end
|
24
|
+
|
25
|
+
def _each
|
26
|
+
require 'fixed_width-multibyte'
|
25
27
|
|
26
28
|
fixed_width_parser.parse[:rows].each do |row|
|
27
29
|
some_value_present = false
|
@@ -5,6 +5,11 @@ class RemoteTable
|
|
5
5
|
SINGLE_SPACE = ' '
|
6
6
|
SOFT_HYPHEN = '­'
|
7
7
|
|
8
|
+
# Prepare the downloaded XML/HTML copy before Nokogiri parses it.
# These two calls previously ran inside _each, after the row_css/row_xpath check.
def preprocess!
|
9
|
+
delete_harmful!
|
10
|
+
transliterate_whole_file_to_utf8!
|
11
|
+
end
|
12
|
+
|
8
13
|
# Yield each row using Nokogiri.
|
9
14
|
def _each
|
10
15
|
require 'nokogiri'
|
@@ -16,9 +21,6 @@ class RemoteTable
|
|
16
21
|
unless row_css or row_xpath
|
17
22
|
raise ::ArgumentError, "[remote_table] Need :row_css or :row_xpath in order to process XML or HTML"
|
18
23
|
end
|
19
|
-
|
20
|
-
delete_harmful!
|
21
|
-
transliterate_whole_file_to_utf8!
|
22
24
|
|
23
25
|
xml = nokogiri_class.parse(unescaped_xml_without_soft_hyphens, nil, RemoteTable::EXTERNAL_ENCODING)
|
24
26
|
(row_css ? xml.css(row_css) : xml.xpath(row_xpath)).each do |row|
|
data/lib/remote_table/version.rb
CHANGED
data/remote_table.gemspec
CHANGED
data/test/helper.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
require 'bundler/setup'
|
2
|
+
|
2
3
|
require 'minitest/spec'
|
3
|
-
require 'minitest/autorun'
|
4
4
|
require 'minitest/reporters'
|
5
|
+
require 'minitest/autorun'
|
6
|
+
# require 'pry-rescue/minitest'
|
5
7
|
#MiniTest::Unit.runner = MiniTest::SuiteRunner.new
|
6
8
|
#MiniTest::Unit.runner.reporters << MiniTest::Reporters::SpecReporter.new
|
7
9
|
require 'remote_table'
|
data/test/test_remote.rb
CHANGED
@@ -36,11 +36,12 @@ describe RemoteTable do
|
|
36
36
|
end
|
37
37
|
|
38
38
|
it "open a csv inside a zip file" do
|
39
|
-
t = RemoteTable.new
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
39
|
+
t = RemoteTable.new('http://www.epa.gov/climatechange/Downloads/ghgemissions/2011-Annex-Tables.zip',
|
40
|
+
:filename => 'Annex Tables/Table A-93.csv',
|
41
|
+
:skip => 1,
|
42
|
+
:headers => %w{ age LDGV LDGT HDGV LDDV LDDT HDDT MC },
|
43
|
+
:select => proc { |row| row['age'].to_i.to_s == row['age'] })
|
44
|
+
t[0]['LDGV'].must_equal '5.20%'
|
44
45
|
end
|
45
46
|
|
46
47
|
it 'not blow up if each is called twice' do
|
@@ -60,12 +61,6 @@ describe RemoteTable do
|
|
60
61
|
t.send(:cache).length.must_equal 0
|
61
62
|
end
|
62
63
|
|
63
|
-
# fixes ArgumentError: invalid byte sequence in UTF-8
|
64
|
-
it %{safely strip soft hyphens and read windows-1252 html} do
|
65
|
-
t = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-A.htm", :row_xpath => '//table[2]//table[1]//tr[3]//tr', :column_xpath => 'td', :encoding => 'windows-1252'
|
66
|
-
t.rows.detect { |row| row['Model'] == 'A300B4600' }.wont_equal nil
|
67
|
-
end
|
68
|
-
|
69
64
|
it %{transliterate characters from ISO-8859-1} do
|
70
65
|
t = RemoteTable.new :url => 'http://static.brighterplanet.com/science/data/consumables/pets/breed_genders.csv', :encoding => 'ISO-8859-1'
|
71
66
|
t.rows.detect { |row| row['name'] == 'Briquet Griffon Vendéen' }.wont_equal nil
|
@@ -85,8 +80,8 @@ describe RemoteTable do
|
|
85
80
|
time1.wont_equal time2
|
86
81
|
end
|
87
82
|
|
88
|
-
it %{
|
89
|
-
t = RemoteTable.new 'http://www.iso.org/iso/
|
83
|
+
it %{reads country names} do
|
84
|
+
t = RemoteTable.new 'http://www.iso.org/iso/country_names_and_code_elements_txt', :skip => 1, :headers => false, :delimiter => ';'
|
90
85
|
t[1][0].must_equal %{ÅLAND ISLANDS}
|
91
86
|
end
|
92
87
|
|
data/test/test_remote_table.rb
CHANGED
@@ -14,6 +14,13 @@ describe RemoteTable do
|
|
14
14
|
by_path.rows.must_equal by_url.rows
|
15
15
|
end
|
16
16
|
|
17
|
+
it "strips whitespace from headers" do
|
18
|
+
t = RemoteTable.new 'test/data/lots of spaces.csv'
|
19
|
+
t[0]['a one'].must_equal 'a1'
|
20
|
+
t[0]['b two'].must_equal 'b2'
|
21
|
+
t[0]['c three'].must_equal 'c3'
|
22
|
+
end
|
23
|
+
|
17
24
|
{
|
18
25
|
# IMPOSSIBLE "../data/list-en1-semic-3.office-2011-for-mac-sp1-excel-95.binary.xls" => {:format=>"xls", :encoding=>"binary"},
|
19
26
|
"../data/list-en1-semic-3.office-2011-for-mac-sp1.binary.xlsx" => {:format=>"xlsx"},
|
@@ -59,4 +66,18 @@ describe RemoteTable do
|
|
59
66
|
t[1]['name'].must_equal 'Derek Kastner'
|
60
67
|
t[1]['city'].must_equal 'Lansing'
|
61
68
|
end
|
69
|
+
|
70
|
+
it "reads html with xpath" do
|
71
|
+
t = RemoteTable.new 'test/data/table.html', row_xpath: '//tr', column_xpath: 'td'
|
72
|
+
t[0]['h1'].must_equal 'a'
|
73
|
+
t[1]['h3'].must_equal 'f'
|
74
|
+
end
|
75
|
+
|
76
|
+
# fixes ArgumentError: invalid byte sequence in UTF-8
|
77
|
+
# disabled because the xpath seems to be somehow broken - works in chrome
|
78
|
+
it %{safely strip soft hyphens and read windows-1252 html} do
|
79
|
+
row_xpath = '/html/body/table[2]/tbody/tr/td/center/table/tbody/tr[3]/td/table/tbody/tr[2]/td[1]'
|
80
|
+
t = RemoteTable.new 'test/data/faa-aircraft.html', :row_xpath => row_xpath, :column_xpath => 'td', :encoding => 'windows-1252'
|
81
|
+
t.rows.detect { |row| row['Model'] == 'A300B4600' }.wont_equal nil
|
82
|
+
end
|
62
83
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: remote_table
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.0.
|
4
|
+
version: 3.0.0.beta
|
5
5
|
prerelease: 6
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2013-07-
|
13
|
+
date: 2013-07-31 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: activesupport
|
@@ -220,6 +220,38 @@ dependencies:
|
|
220
220
|
- - ! '>='
|
221
221
|
- !ruby/object:Gem::Version
|
222
222
|
version: '0'
|
223
|
+
- !ruby/object:Gem::Dependency
|
224
|
+
name: pry-rescue
|
225
|
+
requirement: !ruby/object:Gem::Requirement
|
226
|
+
none: false
|
227
|
+
requirements:
|
228
|
+
- - ! '>='
|
229
|
+
- !ruby/object:Gem::Version
|
230
|
+
version: '0'
|
231
|
+
type: :development
|
232
|
+
prerelease: false
|
233
|
+
version_requirements: !ruby/object:Gem::Requirement
|
234
|
+
none: false
|
235
|
+
requirements:
|
236
|
+
- - ! '>='
|
237
|
+
- !ruby/object:Gem::Version
|
238
|
+
version: '0'
|
239
|
+
- !ruby/object:Gem::Dependency
|
240
|
+
name: pry-stack_explorer
|
241
|
+
requirement: !ruby/object:Gem::Requirement
|
242
|
+
none: false
|
243
|
+
requirements:
|
244
|
+
- - ! '>='
|
245
|
+
- !ruby/object:Gem::Version
|
246
|
+
version: '0'
|
247
|
+
type: :development
|
248
|
+
prerelease: false
|
249
|
+
version_requirements: !ruby/object:Gem::Requirement
|
250
|
+
none: false
|
251
|
+
requirements:
|
252
|
+
- - ! '>='
|
253
|
+
- !ruby/object:Gem::Version
|
254
|
+
version: '0'
|
223
255
|
description: Open Google Docs spreadsheets, local or remote XLSX, XLS, ODS, CSV (comma
|
224
256
|
separated), TSV (tab separated), other delimited, fixed-width files, and shapefiles.
|
225
257
|
Returns an Array of Arrays or Hashes, depending on whether there are headers.
|
@@ -269,7 +301,9 @@ files:
|
|
269
301
|
- test/data/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma
|
270
302
|
- test/data/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html
|
271
303
|
- test/data/list-en1-semic-3.original.iso-8859-1.csv
|
304
|
+
- test/data/lots of spaces.csv
|
272
305
|
- test/data/ranges.csv
|
306
|
+
- test/data/table.html
|
273
307
|
- test/helper.rb
|
274
308
|
- test/test_big.rb
|
275
309
|
- test/test_errata.rb
|
@@ -322,7 +356,9 @@ test_files:
|
|
322
356
|
- test/data/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma
|
323
357
|
- test/data/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html
|
324
358
|
- test/data/list-en1-semic-3.original.iso-8859-1.csv
|
359
|
+
- test/data/lots of spaces.csv
|
325
360
|
- test/data/ranges.csv
|
361
|
+
- test/data/table.html
|
326
362
|
- test/helper.rb
|
327
363
|
- test/test_big.rb
|
328
364
|
- test/test_errata.rb
|