remote_table 3.0.0.alpha → 3.0.0.beta
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +12 -0
- data/lib/remote_table.rb +37 -4
- data/lib/remote_table/delimited.rb +29 -19
- data/lib/remote_table/fixed_width.rb +5 -3
- data/lib/remote_table/processed_by_nokogiri.rb +5 -3
- data/lib/remote_table/version.rb +1 -1
- data/remote_table.gemspec +2 -0
- data/test/data/lots of spaces.csv +2 -0
- data/test/data/table.html +17 -0
- data/test/helper.rb +3 -1
- data/test/test_remote.rb +8 -13
- data/test/test_remote_table.rb +21 -0
- metadata +38 -2
data/CHANGELOG
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
3.0.0.beta / 2013-07-30
|
2
|
+
|
3
|
+
* Breaking changes
|
4
|
+
|
5
|
+
* Strip whitespace from headers unless you provide them as an array
|
6
|
+
* No longer passing any options on to CSV... use :delimiter instead of :col_sep
|
7
|
+
* Include columns with blank headers as "empty_N"
|
8
|
+
|
9
|
+
* Enhancements
|
10
|
+
|
11
|
+
* Support relative paths with spaces
|
12
|
+
|
1
13
|
3.0.0.alpha / 2013-07-25
|
2
14
|
|
3
15
|
* Breaking changes
|
data/lib/remote_table.rb
CHANGED
@@ -53,7 +53,7 @@ class RemoteTable
|
|
53
53
|
# Guess compression based on URL. Used internally.
|
54
54
|
# @return [Symbol,nil]
|
55
55
|
def guess_compression(url)
|
56
|
-
extname =
|
56
|
+
extname = extname(url).downcase
|
57
57
|
case extname
|
58
58
|
when /gz/, /gunzip/
|
59
59
|
:gz
|
@@ -69,7 +69,7 @@ class RemoteTable
|
|
69
69
|
# Guess packing from URL. Used internally.
|
70
70
|
# @return [Symbol,nil]
|
71
71
|
def guess_packing(url)
|
72
|
-
basename =
|
72
|
+
basename = basename(url).downcase
|
73
73
|
if basename.include?('.tar') or basename.include?('.tgz')
|
74
74
|
:tar
|
75
75
|
end
|
@@ -109,6 +109,24 @@ class RemoteTable
|
|
109
109
|
uri.query = params.join('&')
|
110
110
|
uri.to_s
|
111
111
|
end
|
112
|
+
|
113
|
+
private
|
114
|
+
|
115
|
+
def basename(url)
|
116
|
+
::File.basename path(url)
|
117
|
+
end
|
118
|
+
|
119
|
+
def extname(url)
|
120
|
+
::File.extname path(url)
|
121
|
+
end
|
122
|
+
|
123
|
+
def path(url)
|
124
|
+
if url.include?('://')
|
125
|
+
::URI.parse(url).path
|
126
|
+
else
|
127
|
+
File.expand_path url
|
128
|
+
end
|
129
|
+
end
|
112
130
|
end
|
113
131
|
|
114
132
|
EXTERNAL_ENCODING = 'UTF-8'
|
@@ -129,11 +147,13 @@ class RemoteTable
|
|
129
147
|
:keep_blank_rows => false,
|
130
148
|
:skip => 0,
|
131
149
|
:encoding => 'UTF-8',
|
132
|
-
:delimiter => ','
|
150
|
+
:delimiter => ',',
|
151
|
+
:quote_char => '"',
|
133
152
|
}
|
134
153
|
OLD_SETTING_NAMES = {
|
135
154
|
:pre_select => [:select],
|
136
155
|
:pre_reject => [:reject],
|
156
|
+
:delimiter => [:col_sep],
|
137
157
|
}
|
138
158
|
|
139
159
|
include ::Enumerable
|
@@ -178,6 +198,13 @@ class RemoteTable
|
|
178
198
|
# Headers specified by the user: +:first_row+ (the default), +false+, or a list of headers.
|
179
199
|
# @return [:first_row,false,Array<String>]
|
180
200
|
attr_reader :headers
|
201
|
+
|
202
|
+
# Quote character for delimited files.
|
203
|
+
#
|
204
|
+
# Defaults to double quotes.
|
205
|
+
#
|
206
|
+
# @return [String]
|
207
|
+
attr_reader :quote_char
|
181
208
|
|
182
209
|
# The sheet specified by the user as a number or a string.
|
183
210
|
# @return [String,Integer]
|
@@ -199,7 +226,7 @@ class RemoteTable
|
|
199
226
|
# @return [String]
|
200
227
|
attr_reader :encoding
|
201
228
|
|
202
|
-
# The delimiter, a.k.a. column separator. Passed to Ruby CSV as +:col_sep+. Default is
|
229
|
+
# The delimiter, a.k.a. column separator. Passed to Ruby CSV as +:col_sep+. Default is ','.
|
203
230
|
# @return [String]
|
204
231
|
attr_reader :delimiter
|
205
232
|
|
@@ -377,6 +404,7 @@ class RemoteTable
|
|
377
404
|
if headers.is_a?(::Array) and headers.any?(&:blank?)
|
378
405
|
raise ::ArgumentError, "[remote_table] If you specify headers, none of them can be blank"
|
379
406
|
end
|
407
|
+
@quote_char = grab settings, :quote_char
|
380
408
|
|
381
409
|
@compression = grab(settings, :compression) || RemoteTable.guess_compression(url)
|
382
410
|
@packing = grab(settings, :packing) || RemoteTable.guess_packing(url)
|
@@ -422,6 +450,7 @@ class RemoteTable
|
|
422
450
|
end
|
423
451
|
else
|
424
452
|
mark_download!
|
453
|
+
preprocess!
|
425
454
|
memo = _each do |row|
|
426
455
|
parser.parse(row).each do |virtual_row|
|
427
456
|
virtual_row.row_hash = ::HashDigest.hexdigest row
|
@@ -481,6 +510,10 @@ class RemoteTable
|
|
481
510
|
end
|
482
511
|
|
483
512
|
private
|
513
|
+
|
514
|
+
def preprocess!
|
515
|
+
# noop, overridden sometimes
|
516
|
+
end
|
484
517
|
|
485
518
|
def mark_download!
|
486
519
|
@download_count_mutex.synchronize do
|
@@ -14,24 +14,15 @@ class RemoteTable
|
|
14
14
|
Engine = ::FasterCSV
|
15
15
|
end
|
16
16
|
|
17
|
-
|
18
|
-
:unconverted_fields,
|
19
|
-
:col_sep,
|
20
|
-
:row_sep,
|
21
|
-
:return_headers,
|
22
|
-
:header_converters,
|
23
|
-
:quote_char,
|
24
|
-
:converters,
|
25
|
-
:force_quotes,
|
26
|
-
]
|
27
|
-
|
28
|
-
# Yield each row using Ruby's CSV parser (FasterCSV on Ruby 1.8).
|
29
|
-
def _each
|
17
|
+
def preprocess!
|
30
18
|
delete_harmful!
|
31
19
|
convert_eol_to_unix!
|
32
20
|
transliterate_whole_file_to_utf8!
|
33
21
|
skip_rows!
|
22
|
+
end
|
34
23
|
|
24
|
+
# Yield each row using Ruby's CSV parser (FasterCSV on Ruby 1.8).
|
25
|
+
def _each
|
35
26
|
Engine.new(local_copy.encoded_io, csv_options).each do |row|
|
36
27
|
|
37
28
|
some_value_present = false
|
@@ -55,7 +46,6 @@ class RemoteTable
|
|
55
46
|
# represent the row as a hash
|
56
47
|
hash = ::ActiveSupport::OrderedHash.new
|
57
48
|
row.each do |k, v|
|
58
|
-
next unless k.present?
|
59
49
|
v = v.to_s
|
60
50
|
if not some_value_present and not keep_blank_rows and v.present?
|
61
51
|
some_value_present = true
|
@@ -81,11 +71,31 @@ class RemoteTable
|
|
81
71
|
#
|
82
72
|
# @return [Hash]
|
83
73
|
def csv_options
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
74
|
+
{
|
75
|
+
skip_blanks: !keep_blank_rows,
|
76
|
+
headers: headers,
|
77
|
+
col_sep: delimiter,
|
78
|
+
quote_char: quote_char,
|
79
|
+
}
|
80
|
+
end
|
81
|
+
|
82
|
+
def headers
|
83
|
+
return @_headers if defined?(@_headers)
|
84
|
+
@_headers = case @headers
|
85
|
+
when FalseClass, NilClass
|
86
|
+
false
|
87
|
+
when :first_row, TrueClass
|
88
|
+
i = 0
|
89
|
+
line = local_copy.encoded_io.gets
|
90
|
+
Engine.parse_line(line).map do |v|
|
91
|
+
header = v.to_s.gsub(/\s+/, ' ').strip
|
92
|
+
header.present? ? header : "empty_#{i+=1}"
|
93
|
+
end
|
94
|
+
when Array
|
95
|
+
@headers
|
96
|
+
else
|
97
|
+
raise "Invalid headers: #{headers.inspect}"
|
98
|
+
end
|
89
99
|
end
|
90
100
|
end
|
91
101
|
end
|
@@ -13,15 +13,17 @@ class RemoteTable
|
|
13
13
|
@definition_mutex = ::Mutex.new
|
14
14
|
end
|
15
15
|
|
16
|
-
def
|
17
|
-
require 'fixed_width-multibyte'
|
18
|
-
|
16
|
+
def preprocess!
|
19
17
|
delete_harmful!
|
20
18
|
convert_eol_to_unix!
|
21
19
|
transliterate_whole_file_to_utf8!
|
22
20
|
crop_rows!
|
23
21
|
skip_rows!
|
24
22
|
cut_columns!
|
23
|
+
end
|
24
|
+
|
25
|
+
def _each
|
26
|
+
require 'fixed_width-multibyte'
|
25
27
|
|
26
28
|
fixed_width_parser.parse[:rows].each do |row|
|
27
29
|
some_value_present = false
|
@@ -5,6 +5,11 @@ class RemoteTable
|
|
5
5
|
SINGLE_SPACE = ' '
|
6
6
|
SOFT_HYPHEN = '­'
|
7
7
|
|
8
|
+
def preprocess!
|
9
|
+
delete_harmful!
|
10
|
+
transliterate_whole_file_to_utf8!
|
11
|
+
end
|
12
|
+
|
8
13
|
# Yield each row using Nokogiri.
|
9
14
|
def _each
|
10
15
|
require 'nokogiri'
|
@@ -16,9 +21,6 @@ class RemoteTable
|
|
16
21
|
unless row_css or row_xpath
|
17
22
|
raise ::ArgumentError, "[remote_table] Need :row_css or :row_xpath in order to process XML or HTML"
|
18
23
|
end
|
19
|
-
|
20
|
-
delete_harmful!
|
21
|
-
transliterate_whole_file_to_utf8!
|
22
24
|
|
23
25
|
xml = nokogiri_class.parse(unescaped_xml_without_soft_hyphens, nil, RemoteTable::EXTERNAL_ENCODING)
|
24
26
|
(row_css ? xml.css(row_css) : xml.xpath(row_xpath)).each do |row|
|
data/lib/remote_table/version.rb
CHANGED
data/remote_table.gemspec
CHANGED
data/test/helper.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
require 'bundler/setup'
|
2
|
+
|
2
3
|
require 'minitest/spec'
|
3
|
-
require 'minitest/autorun'
|
4
4
|
require 'minitest/reporters'
|
5
|
+
require 'minitest/autorun'
|
6
|
+
# require 'pry-rescue/minitest'
|
5
7
|
#MiniTest::Unit.runner = MiniTest::SuiteRunner.new
|
6
8
|
#MiniTest::Unit.runner.reporters << MiniTest::Reporters::SpecReporter.new
|
7
9
|
require 'remote_table'
|
data/test/test_remote.rb
CHANGED
@@ -36,11 +36,12 @@ describe RemoteTable do
|
|
36
36
|
end
|
37
37
|
|
38
38
|
it "open a csv inside a zip file" do
|
39
|
-
t = RemoteTable.new
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
39
|
+
t = RemoteTable.new('http://www.epa.gov/climatechange/Downloads/ghgemissions/2011-Annex-Tables.zip',
|
40
|
+
:filename => 'Annex Tables/Table A-93.csv',
|
41
|
+
:skip => 1,
|
42
|
+
:headers => %w{ age LDGV LDGT HDGV LDDV LDDT HDDT MC },
|
43
|
+
:select => proc { |row| row['age'].to_i.to_s == row['age'] })
|
44
|
+
t[0]['LDGV'].must_equal '5.20%'
|
44
45
|
end
|
45
46
|
|
46
47
|
it 'not blow up if each is called twice' do
|
@@ -60,12 +61,6 @@ describe RemoteTable do
|
|
60
61
|
t.send(:cache).length.must_equal 0
|
61
62
|
end
|
62
63
|
|
63
|
-
# fixes ArgumentError: invalid byte sequence in UTF-8
|
64
|
-
it %{safely strip soft hyphens and read windows-1252 html} do
|
65
|
-
t = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-A.htm", :row_xpath => '//table[2]//table[1]//tr[3]//tr', :column_xpath => 'td', :encoding => 'windows-1252'
|
66
|
-
t.rows.detect { |row| row['Model'] == 'A300B4600' }.wont_equal nil
|
67
|
-
end
|
68
|
-
|
69
64
|
it %{transliterate characters from ISO-8859-1} do
|
70
65
|
t = RemoteTable.new :url => 'http://static.brighterplanet.com/science/data/consumables/pets/breed_genders.csv', :encoding => 'ISO-8859-1'
|
71
66
|
t.rows.detect { |row| row['name'] == 'Briquet Griffon Vendéen' }.wont_equal nil
|
@@ -85,8 +80,8 @@ describe RemoteTable do
|
|
85
80
|
time1.wont_equal time2
|
86
81
|
end
|
87
82
|
|
88
|
-
it %{
|
89
|
-
t = RemoteTable.new 'http://www.iso.org/iso/
|
83
|
+
it %{reads country names} do
|
84
|
+
t = RemoteTable.new 'http://www.iso.org/iso/country_names_and_code_elements_txt', :skip => 1, :headers => false, :delimiter => ';'
|
90
85
|
t[1][0].must_equal %{ÅLAND ISLANDS}
|
91
86
|
end
|
92
87
|
|
data/test/test_remote_table.rb
CHANGED
@@ -14,6 +14,13 @@ describe RemoteTable do
|
|
14
14
|
by_path.rows.must_equal by_url.rows
|
15
15
|
end
|
16
16
|
|
17
|
+
it "strips whitespace from headers" do
|
18
|
+
t = RemoteTable.new 'test/data/lots of spaces.csv'
|
19
|
+
t[0]['a one'].must_equal 'a1'
|
20
|
+
t[0]['b two'].must_equal 'b2'
|
21
|
+
t[0]['c three'].must_equal 'c3'
|
22
|
+
end
|
23
|
+
|
17
24
|
{
|
18
25
|
# IMPOSSIBLE "../data/list-en1-semic-3.office-2011-for-mac-sp1-excel-95.binary.xls" => {:format=>"xls", :encoding=>"binary"},
|
19
26
|
"../data/list-en1-semic-3.office-2011-for-mac-sp1.binary.xlsx" => {:format=>"xlsx"},
|
@@ -59,4 +66,18 @@ describe RemoteTable do
|
|
59
66
|
t[1]['name'].must_equal 'Derek Kastner'
|
60
67
|
t[1]['city'].must_equal 'Lansing'
|
61
68
|
end
|
69
|
+
|
70
|
+
it "reads html with xpath" do
|
71
|
+
t = RemoteTable.new 'test/data/table.html', row_xpath: '//tr', column_xpath: 'td'
|
72
|
+
t[0]['h1'].must_equal 'a'
|
73
|
+
t[1]['h3'].must_equal 'f'
|
74
|
+
end
|
75
|
+
|
76
|
+
# fixes ArgumentError: invalid byte sequence in UTF-8
|
77
|
+
# disabled because the xpath appears to be somehow broken - works in chrome
|
78
|
+
it %{safely strip soft hyphens and read windows-1252 html} do
|
79
|
+
row_xpath = '/html/body/table[2]/tbody/tr/td/center/table/tbody/tr[3]/td/table/tbody/tr[2]/td[1]'
|
80
|
+
t = RemoteTable.new 'test/data/faa-aircraft.html', :row_xpath => row_xpath, :column_xpath => 'td', :encoding => 'windows-1252'
|
81
|
+
t.rows.detect { |row| row['Model'] == 'A300B4600' }.wont_equal nil
|
82
|
+
end
|
62
83
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: remote_table
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.0.
|
4
|
+
version: 3.0.0.beta
|
5
5
|
prerelease: 6
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2013-07-
|
13
|
+
date: 2013-07-31 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: activesupport
|
@@ -220,6 +220,38 @@ dependencies:
|
|
220
220
|
- - ! '>='
|
221
221
|
- !ruby/object:Gem::Version
|
222
222
|
version: '0'
|
223
|
+
- !ruby/object:Gem::Dependency
|
224
|
+
name: pry-rescue
|
225
|
+
requirement: !ruby/object:Gem::Requirement
|
226
|
+
none: false
|
227
|
+
requirements:
|
228
|
+
- - ! '>='
|
229
|
+
- !ruby/object:Gem::Version
|
230
|
+
version: '0'
|
231
|
+
type: :development
|
232
|
+
prerelease: false
|
233
|
+
version_requirements: !ruby/object:Gem::Requirement
|
234
|
+
none: false
|
235
|
+
requirements:
|
236
|
+
- - ! '>='
|
237
|
+
- !ruby/object:Gem::Version
|
238
|
+
version: '0'
|
239
|
+
- !ruby/object:Gem::Dependency
|
240
|
+
name: pry-stack_explorer
|
241
|
+
requirement: !ruby/object:Gem::Requirement
|
242
|
+
none: false
|
243
|
+
requirements:
|
244
|
+
- - ! '>='
|
245
|
+
- !ruby/object:Gem::Version
|
246
|
+
version: '0'
|
247
|
+
type: :development
|
248
|
+
prerelease: false
|
249
|
+
version_requirements: !ruby/object:Gem::Requirement
|
250
|
+
none: false
|
251
|
+
requirements:
|
252
|
+
- - ! '>='
|
253
|
+
- !ruby/object:Gem::Version
|
254
|
+
version: '0'
|
223
255
|
description: Open Google Docs spreadsheets, local or remote XLSX, XLS, ODS, CSV (comma
|
224
256
|
separated), TSV (tab separated), other delimited, fixed-width files, and shapefiles.
|
225
257
|
Returns an Array of Arrays or Hashes, depending on whether there are headers.
|
@@ -269,7 +301,9 @@ files:
|
|
269
301
|
- test/data/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma
|
270
302
|
- test/data/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html
|
271
303
|
- test/data/list-en1-semic-3.original.iso-8859-1.csv
|
304
|
+
- test/data/lots of spaces.csv
|
272
305
|
- test/data/ranges.csv
|
306
|
+
- test/data/table.html
|
273
307
|
- test/helper.rb
|
274
308
|
- test/test_big.rb
|
275
309
|
- test/test_errata.rb
|
@@ -322,7 +356,9 @@ test_files:
|
|
322
356
|
- test/data/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma
|
323
357
|
- test/data/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html
|
324
358
|
- test/data/list-en1-semic-3.original.iso-8859-1.csv
|
359
|
+
- test/data/lots of spaces.csv
|
325
360
|
- test/data/ranges.csv
|
361
|
+
- test/data/table.html
|
326
362
|
- test/helper.rb
|
327
363
|
- test/test_big.rb
|
328
364
|
- test/test_errata.rb
|