remote_table 3.0.0.alpha → 3.0.0.beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,15 @@
1
+ 3.0.0.beta / 2013-07-30
2
+
3
+ * Breaking changes
4
+
5
+ * Strip whitespace from headers unless you provide them as an array
6
+ * Not passing any options on to CSV... use :delimiter instead of :col_sep
7
+ * Include columns with blank headers as "empty_N"
8
+
9
+ * Enhancements
10
+
11
+ * Support relative paths with spaces
12
+
1
13
  3.0.0.alpha / 2013-07-25
2
14
 
3
15
  * Breaking changes
data/lib/remote_table.rb CHANGED
@@ -53,7 +53,7 @@ class RemoteTable
53
53
  # Guess compression based on URL. Used internally.
54
54
  # @return [Symbol,nil]
55
55
  def guess_compression(url)
56
- extname = ::File.extname(::URI.parse(url).path).downcase
56
+ extname = extname(url).downcase
57
57
  case extname
58
58
  when /gz/, /gunzip/
59
59
  :gz
@@ -69,7 +69,7 @@ class RemoteTable
69
69
  # Guess packing from URL. Used internally.
70
70
  # @return [Symbol,nil]
71
71
  def guess_packing(url)
72
- basename = ::File.basename(::URI.parse(url).path).downcase
72
+ basename = basename(url).downcase
73
73
  if basename.include?('.tar') or basename.include?('.tgz')
74
74
  :tar
75
75
  end
@@ -109,6 +109,24 @@ class RemoteTable
109
109
  uri.query = params.join('&')
110
110
  uri.to_s
111
111
  end
112
+
113
+ private
114
+
115
+ def basename(url)
116
+ ::File.basename path(url)
117
+ end
118
+
119
+ def extname(url)
120
+ ::File.extname path(url)
121
+ end
122
+
123
+ def path(url)
124
+ if url.include?('://')
125
+ ::URI.parse(url).path
126
+ else
127
+ File.expand_path url
128
+ end
129
+ end
112
130
  end
113
131
 
114
132
  EXTERNAL_ENCODING = 'UTF-8'
@@ -129,11 +147,13 @@ class RemoteTable
129
147
  :keep_blank_rows => false,
130
148
  :skip => 0,
131
149
  :encoding => 'UTF-8',
132
- :delimiter => ','
150
+ :delimiter => ',',
151
+ :quote_char => '"',
133
152
  }
134
153
  OLD_SETTING_NAMES = {
135
154
  :pre_select => [:select],
136
155
  :pre_reject => [:reject],
156
+ :delimiter => [:col_sep],
137
157
  }
138
158
 
139
159
  include ::Enumerable
@@ -178,6 +198,13 @@ class RemoteTable
178
198
  # Headers specified by the user: +:first_row+ (the default), +false+, or a list of headers.
179
199
  # @return [:first_row,false,Array<String>]
180
200
  attr_reader :headers
201
+
202
+ # Quote character for delimited files.
203
+ #
204
+ # Defaults to double quotes.
205
+ #
206
+ # @return [String]
207
+ attr_reader :quote_char
181
208
 
182
209
  # The sheet specified by the user as a number or a string.
183
210
  # @return[String,Integer]
@@ -199,7 +226,7 @@ class RemoteTable
199
226
  # @return [String]
200
227
  attr_reader :encoding
201
228
 
202
- # The delimiter, a.k.a. column separator. Passed to Ruby CSV as +:col_sep+. Default is :delimited.
229
+ # The delimiter, a.k.a. column separator. Passed to Ruby CSV as +:col_sep+. Default is ','.
203
230
  # @return [String]
204
231
  attr_reader :delimiter
205
232
 
@@ -377,6 +404,7 @@ class RemoteTable
377
404
  if headers.is_a?(::Array) and headers.any?(&:blank?)
378
405
  raise ::ArgumentError, "[remote_table] If you specify headers, none of them can be blank"
379
406
  end
407
+ @quote_char = grab settings, :quote_char
380
408
 
381
409
  @compression = grab(settings, :compression) || RemoteTable.guess_compression(url)
382
410
  @packing = grab(settings, :packing) || RemoteTable.guess_packing(url)
@@ -422,6 +450,7 @@ class RemoteTable
422
450
  end
423
451
  else
424
452
  mark_download!
453
+ preprocess!
425
454
  memo = _each do |row|
426
455
  parser.parse(row).each do |virtual_row|
427
456
  virtual_row.row_hash = ::HashDigest.hexdigest row
@@ -481,6 +510,10 @@ class RemoteTable
481
510
  end
482
511
 
483
512
  private
513
+
514
+ def preprocess!
515
+ # noop, overridden sometimes
516
+ end
484
517
 
485
518
  def mark_download!
486
519
  @download_count_mutex.synchronize do
@@ -14,24 +14,15 @@ class RemoteTable
14
14
  Engine = ::FasterCSV
15
15
  end
16
16
 
17
- PASSTHROUGH_CSV_SETTINGS = [
18
- :unconverted_fields,
19
- :col_sep,
20
- :row_sep,
21
- :return_headers,
22
- :header_converters,
23
- :quote_char,
24
- :converters,
25
- :force_quotes,
26
- ]
27
-
28
- # Yield each row using Ruby's CSV parser (FasterCSV on Ruby 1.8).
29
- def _each
17
+ def preprocess!
30
18
  delete_harmful!
31
19
  convert_eol_to_unix!
32
20
  transliterate_whole_file_to_utf8!
33
21
  skip_rows!
22
+ end
34
23
 
24
+ # Yield each row using Ruby's CSV parser (FasterCSV on Ruby 1.8).
25
+ def _each
35
26
  Engine.new(local_copy.encoded_io, csv_options).each do |row|
36
27
 
37
28
  some_value_present = false
@@ -55,7 +46,6 @@ class RemoteTable
55
46
  # represent the row as a hash
56
47
  hash = ::ActiveSupport::OrderedHash.new
57
48
  row.each do |k, v|
58
- next unless k.present?
59
49
  v = v.to_s
60
50
  if not some_value_present and not keep_blank_rows and v.present?
61
51
  some_value_present = true
@@ -81,11 +71,31 @@ class RemoteTable
81
71
  #
82
72
  # @return [Hash]
83
73
  def csv_options
84
- memo = other_options.slice(*PASSTHROUGH_CSV_SETTINGS)
85
- memo[:skip_blanks] = !keep_blank_rows
86
- memo[:headers] ||= headers
87
- memo[:col_sep] ||= delimiter
88
- memo
74
+ {
75
+ skip_blanks: !keep_blank_rows,
76
+ headers: headers,
77
+ col_sep: delimiter,
78
+ quote_char: quote_char,
79
+ }
80
+ end
81
+
82
+ def headers
83
+ return @_headers if defined?(@_headers)
84
+ @_headers = case @headers
85
+ when FalseClass, NilClass
86
+ false
87
+ when :first_row, TrueClass
88
+ i = 0
89
+ line = local_copy.encoded_io.gets
90
+ Engine.parse_line(line).map do |v|
91
+ header = v.to_s.gsub(/\s+/, ' ').strip
92
+ header.present? ? header : "empty_#{i+=1}"
93
+ end
94
+ when Array
95
+ @headers
96
+ else
97
+ raise "Invalid headers: #{headers.inspect}"
98
+ end
89
99
  end
90
100
  end
91
101
  end
@@ -13,15 +13,17 @@ class RemoteTable
13
13
  @definition_mutex = ::Mutex.new
14
14
  end
15
15
 
16
- def _each
17
- require 'fixed_width-multibyte'
18
-
16
+ def preprocess!
19
17
  delete_harmful!
20
18
  convert_eol_to_unix!
21
19
  transliterate_whole_file_to_utf8!
22
20
  crop_rows!
23
21
  skip_rows!
24
22
  cut_columns!
23
+ end
24
+
25
+ def _each
26
+ require 'fixed_width-multibyte'
25
27
 
26
28
  fixed_width_parser.parse[:rows].each do |row|
27
29
  some_value_present = false
@@ -5,6 +5,11 @@ class RemoteTable
5
5
  SINGLE_SPACE = ' '
6
6
  SOFT_HYPHEN = '&shy;'
7
7
 
8
+ def preprocess!
9
+ delete_harmful!
10
+ transliterate_whole_file_to_utf8!
11
+ end
12
+
8
13
  # Yield each row using Nokogiri.
9
14
  def _each
10
15
  require 'nokogiri'
@@ -16,9 +21,6 @@ class RemoteTable
16
21
  unless row_css or row_xpath
17
22
  raise ::ArgumentError, "[remote_table] Need :row_css or :row_xpath in order to process XML or HTML"
18
23
  end
19
-
20
- delete_harmful!
21
- transliterate_whole_file_to_utf8!
22
24
 
23
25
  xml = nokogiri_class.parse(unescaped_xml_without_soft_hyphens, nil, RemoteTable::EXTERNAL_ENCODING)
24
26
  (row_css ? xml.css(row_css) : xml.xpath(row_xpath)).each do |row|
@@ -1,3 +1,3 @@
1
1
  class RemoteTable
2
- VERSION = '3.0.0.alpha'
2
+ VERSION = '3.0.0.beta'
3
3
  end
data/remote_table.gemspec CHANGED
@@ -31,4 +31,6 @@ Gem::Specification.new do |s|
31
31
  s.add_development_dependency 'rake'
32
32
  s.add_development_dependency 'yard'
33
33
  s.add_development_dependency 'pry'
34
+ s.add_development_dependency 'pry-rescue'
35
+ s.add_development_dependency 'pry-stack_explorer'
34
36
  end
@@ -0,0 +1,2 @@
1
+ a one, b two, c three ,
2
+ a1,b2,c3
@@ -0,0 +1,17 @@
1
+ <table>
2
+ <tr>
3
+ <td>h1</td>
4
+ <td>h2</td>
5
+ <td>h3</td>
6
+ </tr>
7
+ <tr>
8
+ <td>a</td>
9
+ <td>b</td>
10
+ <td>c</td>
11
+ </tr>
12
+ <tr>
13
+ <td>d</td>
14
+ <td>e</td>
15
+ <td>f</td>
16
+ </tr>
17
+ </table>
data/test/helper.rb CHANGED
@@ -1,7 +1,9 @@
1
1
  require 'bundler/setup'
2
+
2
3
  require 'minitest/spec'
3
- require 'minitest/autorun'
4
4
  require 'minitest/reporters'
5
+ require 'minitest/autorun'
6
+ # require 'pry-rescue/minitest'
5
7
  #MiniTest::Unit.runner = MiniTest::SuiteRunner.new
6
8
  #MiniTest::Unit.runner.reporters << MiniTest::Reporters::SpecReporter.new
7
9
  require 'remote_table'
data/test/test_remote.rb CHANGED
@@ -36,11 +36,12 @@ describe RemoteTable do
36
36
  end
37
37
 
38
38
  it "open a csv inside a zip file" do
39
- t = RemoteTable.new 'http://www.epa.gov/climatechange/emissions/downloads10/2010-Inventory-Annex-Tables.zip',
40
- :filename => 'Annex Tables/Annex 3/Table A-93.csv',
41
- :skip => 1,
42
- :select => proc { |row| row['Vehicle Age'].strip =~ /^\d+$/ }
43
- t[0]['LDGV'].must_equal '9.09%'
39
+ t = RemoteTable.new('http://www.epa.gov/climatechange/Downloads/ghgemissions/2011-Annex-Tables.zip',
40
+ :filename => 'Annex Tables/Table A-93.csv',
41
+ :skip => 1,
42
+ :headers => %w{ age LDGV LDGT HDGV LDDV LDDT HDDT MC },
43
+ :select => proc { |row| row['age'].to_i.to_s == row['age'] })
44
+ t[0]['LDGV'].must_equal '5.20%'
44
45
  end
45
46
 
46
47
  it 'not blow up if each is called twice' do
@@ -60,12 +61,6 @@ describe RemoteTable do
60
61
  t.send(:cache).length.must_equal 0
61
62
  end
62
63
 
63
- # fixes ArgumentError: invalid byte sequence in UTF-8
64
- it %{safely strip soft hyphens and read windows-1252 html} do
65
- t = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-A.htm", :row_xpath => '//table[2]//table[1]//tr[3]//tr', :column_xpath => 'td', :encoding => 'windows-1252'
66
- t.rows.detect { |row| row['Model'] == 'A300B4600' }.wont_equal nil
67
- end
68
-
69
64
  it %{transliterate characters from ISO-8859-1} do
70
65
  t = RemoteTable.new :url => 'http://static.brighterplanet.com/science/data/consumables/pets/breed_genders.csv', :encoding => 'ISO-8859-1'
71
66
  t.rows.detect { |row| row['name'] == 'Briquet Griffon Vendéen' }.wont_equal nil
@@ -85,8 +80,8 @@ describe RemoteTable do
85
80
  time1.wont_equal time2
86
81
  end
87
82
 
88
- it %{recode as UTF-8 even ISO-8859-1 (or any other encoding)} do
89
- t = RemoteTable.new 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';', :encoding => 'ISO-8859-1'
83
+ it %{reads country names} do
84
+ t = RemoteTable.new 'http://www.iso.org/iso/country_names_and_code_elements_txt', :skip => 1, :headers => false, :delimiter => ';'
90
85
  t[1][0].must_equal %{ÅLAND ISLANDS}
91
86
  end
92
87
 
@@ -14,6 +14,13 @@ describe RemoteTable do
14
14
  by_path.rows.must_equal by_url.rows
15
15
  end
16
16
 
17
+ it "strips whitespace from headers" do
18
+ t = RemoteTable.new 'test/data/lots of spaces.csv'
19
+ t[0]['a one'].must_equal 'a1'
20
+ t[0]['b two'].must_equal 'b2'
21
+ t[0]['c three'].must_equal 'c3'
22
+ end
23
+
17
24
  {
18
25
  # IMPOSSIBLE "../data/list-en1-semic-3.office-2011-for-mac-sp1-excel-95.binary.xls" => {:format=>"xls", :encoding=>"binary"},
19
26
  "../data/list-en1-semic-3.office-2011-for-mac-sp1.binary.xlsx" => {:format=>"xlsx"},
@@ -59,4 +66,18 @@ describe RemoteTable do
59
66
  t[1]['name'].must_equal 'Derek Kastner'
60
67
  t[1]['city'].must_equal 'Lansing'
61
68
  end
69
+
70
+ it "reads html with xpath" do
71
+ t = RemoteTable.new 'test/data/table.html', row_xpath: '//tr', column_xpath: 'td'
72
+ t[0]['h1'].must_equal 'a'
73
+ t[1]['h3'].must_equal 'f'
74
+ end
75
+
76
+ # fixes ArgumentError: invalid byte sequence in UTF-8
77
+ # NOTE: was marked as disabled because the xpath seems to be somehow broken here - it works in Chrome
78
+ it %{safely strip soft hyphens and read windows-1252 html} do
79
+ row_xpath = '/html/body/table[2]/tbody/tr/td/center/table/tbody/tr[3]/td/table/tbody/tr[2]/td[1]'
80
+ t = RemoteTable.new 'test/data/faa-aircraft.html', :row_xpath => row_xpath, :column_xpath => 'td', :encoding => 'windows-1252'
81
+ t.rows.detect { |row| row['Model'] == 'A300B4600' }.wont_equal nil
82
+ end
62
83
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: remote_table
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.0.alpha
4
+ version: 3.0.0.beta
5
5
  prerelease: 6
6
6
  platform: ruby
7
7
  authors:
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2013-07-26 00:00:00.000000000 Z
13
+ date: 2013-07-31 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: activesupport
@@ -220,6 +220,38 @@ dependencies:
220
220
  - - ! '>='
221
221
  - !ruby/object:Gem::Version
222
222
  version: '0'
223
+ - !ruby/object:Gem::Dependency
224
+ name: pry-rescue
225
+ requirement: !ruby/object:Gem::Requirement
226
+ none: false
227
+ requirements:
228
+ - - ! '>='
229
+ - !ruby/object:Gem::Version
230
+ version: '0'
231
+ type: :development
232
+ prerelease: false
233
+ version_requirements: !ruby/object:Gem::Requirement
234
+ none: false
235
+ requirements:
236
+ - - ! '>='
237
+ - !ruby/object:Gem::Version
238
+ version: '0'
239
+ - !ruby/object:Gem::Dependency
240
+ name: pry-stack_explorer
241
+ requirement: !ruby/object:Gem::Requirement
242
+ none: false
243
+ requirements:
244
+ - - ! '>='
245
+ - !ruby/object:Gem::Version
246
+ version: '0'
247
+ type: :development
248
+ prerelease: false
249
+ version_requirements: !ruby/object:Gem::Requirement
250
+ none: false
251
+ requirements:
252
+ - - ! '>='
253
+ - !ruby/object:Gem::Version
254
+ version: '0'
223
255
  description: Open Google Docs spreadsheets, local or remote XLSX, XLS, ODS, CSV (comma
224
256
  separated), TSV (tab separated), other delimited, fixed-width files, and shapefiles.
225
257
  Returns an Array of Arrays or Hashes, depending on whether there are headers.
@@ -269,7 +301,9 @@ files:
269
301
  - test/data/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma
270
302
  - test/data/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html
271
303
  - test/data/list-en1-semic-3.original.iso-8859-1.csv
304
+ - test/data/lots of spaces.csv
272
305
  - test/data/ranges.csv
306
+ - test/data/table.html
273
307
  - test/helper.rb
274
308
  - test/test_big.rb
275
309
  - test/test_errata.rb
@@ -322,7 +356,9 @@ test_files:
322
356
  - test/data/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma
323
357
  - test/data/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html
324
358
  - test/data/list-en1-semic-3.original.iso-8859-1.csv
359
+ - test/data/lots of spaces.csv
325
360
  - test/data/ranges.csv
361
+ - test/data/table.html
326
362
  - test/helper.rb
327
363
  - test/test_big.rb
328
364
  - test/test_errata.rb