remote_table 3.0.0.alpha → 3.0.0.beta

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,15 @@
1
+ 3.0.0.beta / 2013-07-30
2
+
3
+ * Breaking changes
4
+
5
+ * Strip whitespace from headers unless you provide them as an array
6
+ * Not passing any options on to CSV... use :delimiter instead of :col_sep
7
+ * Include columns with blank headers as "empty_N"
8
+
9
+ * Enhancements
10
+
11
+ * Support relative paths with spaces
12
+
1
13
  3.0.0.alpha / 2013-07-25
2
14
 
3
15
  * Breaking changes
data/lib/remote_table.rb CHANGED
@@ -53,7 +53,7 @@ class RemoteTable
53
53
  # Guess compression based on URL. Used internally.
54
54
  # @return [Symbol,nil]
55
55
  def guess_compression(url)
56
- extname = ::File.extname(::URI.parse(url).path).downcase
56
+ extname = extname(url).downcase
57
57
  case extname
58
58
  when /gz/, /gunzip/
59
59
  :gz
@@ -69,7 +69,7 @@ class RemoteTable
69
69
  # Guess packing from URL. Used internally.
70
70
  # @return [Symbol,nil]
71
71
  def guess_packing(url)
72
- basename = ::File.basename(::URI.parse(url).path).downcase
72
+ basename = basename(url).downcase
73
73
  if basename.include?('.tar') or basename.include?('.tgz')
74
74
  :tar
75
75
  end
@@ -109,6 +109,24 @@ class RemoteTable
109
109
  uri.query = params.join('&')
110
110
  uri.to_s
111
111
  end
112
+
113
+ private
114
+
115
+ def basename(url)
116
+ ::File.basename path(url)
117
+ end
118
+
119
+ def extname(url)
120
+ ::File.extname path(url)
121
+ end
122
+
123
+ def path(url)
124
+ if url.include?('://')
125
+ ::URI.parse(url).path
126
+ else
127
+ File.expand_path url
128
+ end
129
+ end
112
130
  end
113
131
 
114
132
  EXTERNAL_ENCODING = 'UTF-8'
@@ -129,11 +147,13 @@ class RemoteTable
129
147
  :keep_blank_rows => false,
130
148
  :skip => 0,
131
149
  :encoding => 'UTF-8',
132
- :delimiter => ','
150
+ :delimiter => ',',
151
+ :quote_char => '"',
133
152
  }
134
153
  OLD_SETTING_NAMES = {
135
154
  :pre_select => [:select],
136
155
  :pre_reject => [:reject],
156
+ :delimiter => [:col_sep],
137
157
  }
138
158
 
139
159
  include ::Enumerable
@@ -178,6 +198,13 @@ class RemoteTable
178
198
  # Headers specified by the user: +:first_row+ (the default), +false+, or a list of headers.
179
199
  # @return [:first_row,false,Array<String>]
180
200
  attr_reader :headers
201
+
202
+ # Quote character for delimited files.
203
+ #
204
+ # Defaults to double quotes.
205
+ #
206
+ # @return [String]
207
+ attr_reader :quote_char
181
208
 
182
209
  # The sheet specified by the user as a number or a string.
183
210
  # @return [String,Integer]
@@ -199,7 +226,7 @@ class RemoteTable
199
226
  # @return [String]
200
227
  attr_reader :encoding
201
228
 
202
- # The delimiter, a.k.a. column separator. Passed to Ruby CSV as +:col_sep+. Default is :delimited.
229
+ # The delimiter, a.k.a. column separator. Passed to Ruby CSV as +:col_sep+. Default is ','.
203
230
  # @return [String]
204
231
  attr_reader :delimiter
205
232
 
@@ -377,6 +404,7 @@ class RemoteTable
377
404
  if headers.is_a?(::Array) and headers.any?(&:blank?)
378
405
  raise ::ArgumentError, "[remote_table] If you specify headers, none of them can be blank"
379
406
  end
407
+ @quote_char = grab settings, :quote_char
380
408
 
381
409
  @compression = grab(settings, :compression) || RemoteTable.guess_compression(url)
382
410
  @packing = grab(settings, :packing) || RemoteTable.guess_packing(url)
@@ -422,6 +450,7 @@ class RemoteTable
422
450
  end
423
451
  else
424
452
  mark_download!
453
+ preprocess!
425
454
  memo = _each do |row|
426
455
  parser.parse(row).each do |virtual_row|
427
456
  virtual_row.row_hash = ::HashDigest.hexdigest row
@@ -481,6 +510,10 @@ class RemoteTable
481
510
  end
482
511
 
483
512
  private
513
+
514
+ def preprocess!
515
+ # noop, overridden sometimes
516
+ end
484
517
 
485
518
  def mark_download!
486
519
  @download_count_mutex.synchronize do
@@ -14,24 +14,15 @@ class RemoteTable
14
14
  Engine = ::FasterCSV
15
15
  end
16
16
 
17
- PASSTHROUGH_CSV_SETTINGS = [
18
- :unconverted_fields,
19
- :col_sep,
20
- :row_sep,
21
- :return_headers,
22
- :header_converters,
23
- :quote_char,
24
- :converters,
25
- :force_quotes,
26
- ]
27
-
28
- # Yield each row using Ruby's CSV parser (FasterCSV on Ruby 1.8).
29
- def _each
17
+ def preprocess!
30
18
  delete_harmful!
31
19
  convert_eol_to_unix!
32
20
  transliterate_whole_file_to_utf8!
33
21
  skip_rows!
22
+ end
34
23
 
24
+ # Yield each row using Ruby's CSV parser (FasterCSV on Ruby 1.8).
25
+ def _each
35
26
  Engine.new(local_copy.encoded_io, csv_options).each do |row|
36
27
 
37
28
  some_value_present = false
@@ -55,7 +46,6 @@ class RemoteTable
55
46
  # represent the row as a hash
56
47
  hash = ::ActiveSupport::OrderedHash.new
57
48
  row.each do |k, v|
58
- next unless k.present?
59
49
  v = v.to_s
60
50
  if not some_value_present and not keep_blank_rows and v.present?
61
51
  some_value_present = true
@@ -81,11 +71,31 @@ class RemoteTable
81
71
  #
82
72
  # @return [Hash]
83
73
  def csv_options
84
- memo = other_options.slice(*PASSTHROUGH_CSV_SETTINGS)
85
- memo[:skip_blanks] = !keep_blank_rows
86
- memo[:headers] ||= headers
87
- memo[:col_sep] ||= delimiter
88
- memo
74
+ {
75
+ skip_blanks: !keep_blank_rows,
76
+ headers: headers,
77
+ col_sep: delimiter,
78
+ quote_char: quote_char,
79
+ }
80
+ end
81
+
82
+ def headers
83
+ return @_headers if defined?(@_headers)
84
+ @_headers = case @headers
85
+ when FalseClass, NilClass
86
+ false
87
+ when :first_row, TrueClass
88
+ i = 0
89
+ line = local_copy.encoded_io.gets
90
+ Engine.parse_line(line).map do |v|
91
+ header = v.to_s.gsub(/\s+/, ' ').strip
92
+ header.present? ? header : "empty_#{i+=1}"
93
+ end
94
+ when Array
95
+ @headers
96
+ else
97
+ raise "Invalid headers: #{headers.inspect}"
98
+ end
89
99
  end
90
100
  end
91
101
  end
@@ -13,15 +13,17 @@ class RemoteTable
13
13
  @definition_mutex = ::Mutex.new
14
14
  end
15
15
 
16
- def _each
17
- require 'fixed_width-multibyte'
18
-
16
+ def preprocess!
19
17
  delete_harmful!
20
18
  convert_eol_to_unix!
21
19
  transliterate_whole_file_to_utf8!
22
20
  crop_rows!
23
21
  skip_rows!
24
22
  cut_columns!
23
+ end
24
+
25
+ def _each
26
+ require 'fixed_width-multibyte'
25
27
 
26
28
  fixed_width_parser.parse[:rows].each do |row|
27
29
  some_value_present = false
@@ -5,6 +5,11 @@ class RemoteTable
5
5
  SINGLE_SPACE = ' '
6
6
  SOFT_HYPHEN = '&shy;'
7
7
 
8
+ def preprocess!
9
+ delete_harmful!
10
+ transliterate_whole_file_to_utf8!
11
+ end
12
+
8
13
  # Yield each row using Nokogiri.
9
14
  def _each
10
15
  require 'nokogiri'
@@ -16,9 +21,6 @@ class RemoteTable
16
21
  unless row_css or row_xpath
17
22
  raise ::ArgumentError, "[remote_table] Need :row_css or :row_xpath in order to process XML or HTML"
18
23
  end
19
-
20
- delete_harmful!
21
- transliterate_whole_file_to_utf8!
22
24
 
23
25
  xml = nokogiri_class.parse(unescaped_xml_without_soft_hyphens, nil, RemoteTable::EXTERNAL_ENCODING)
24
26
  (row_css ? xml.css(row_css) : xml.xpath(row_xpath)).each do |row|
@@ -1,3 +1,3 @@
1
1
  class RemoteTable
2
- VERSION = '3.0.0.alpha'
2
+ VERSION = '3.0.0.beta'
3
3
  end
data/remote_table.gemspec CHANGED
@@ -31,4 +31,6 @@ Gem::Specification.new do |s|
31
31
  s.add_development_dependency 'rake'
32
32
  s.add_development_dependency 'yard'
33
33
  s.add_development_dependency 'pry'
34
+ s.add_development_dependency 'pry-rescue'
35
+ s.add_development_dependency 'pry-stack_explorer'
34
36
  end
@@ -0,0 +1,2 @@
1
+ a one, b two, c three ,
2
+ a1,b2,c3
@@ -0,0 +1,17 @@
1
+ <table>
2
+ <tr>
3
+ <td>h1</td>
4
+ <td>h2</td>
5
+ <td>h3</td>
6
+ </tr>
7
+ <tr>
8
+ <td>a</td>
9
+ <td>b</td>
10
+ <td>c</td>
11
+ </tr>
12
+ <tr>
13
+ <td>d</td>
14
+ <td>e</td>
15
+ <td>f</td>
16
+ </tr>
17
+ </table>
data/test/helper.rb CHANGED
@@ -1,7 +1,9 @@
1
1
  require 'bundler/setup'
2
+
2
3
  require 'minitest/spec'
3
- require 'minitest/autorun'
4
4
  require 'minitest/reporters'
5
+ require 'minitest/autorun'
6
+ # require 'pry-rescue/minitest'
5
7
  #MiniTest::Unit.runner = MiniTest::SuiteRunner.new
6
8
  #MiniTest::Unit.runner.reporters << MiniTest::Reporters::SpecReporter.new
7
9
  require 'remote_table'
data/test/test_remote.rb CHANGED
@@ -36,11 +36,12 @@ describe RemoteTable do
36
36
  end
37
37
 
38
38
  it "open a csv inside a zip file" do
39
- t = RemoteTable.new 'http://www.epa.gov/climatechange/emissions/downloads10/2010-Inventory-Annex-Tables.zip',
40
- :filename => 'Annex Tables/Annex 3/Table A-93.csv',
41
- :skip => 1,
42
- :select => proc { |row| row['Vehicle Age'].strip =~ /^\d+$/ }
43
- t[0]['LDGV'].must_equal '9.09%'
39
+ t = RemoteTable.new('http://www.epa.gov/climatechange/Downloads/ghgemissions/2011-Annex-Tables.zip',
40
+ :filename => 'Annex Tables/Table A-93.csv',
41
+ :skip => 1,
42
+ :headers => %w{ age LDGV LDGT HDGV LDDV LDDT HDDT MC },
43
+ :select => proc { |row| row['age'].to_i.to_s == row['age'] })
44
+ t[0]['LDGV'].must_equal '5.20%'
44
45
  end
45
46
 
46
47
  it 'not blow up if each is called twice' do
@@ -60,12 +61,6 @@ describe RemoteTable do
60
61
  t.send(:cache).length.must_equal 0
61
62
  end
62
63
 
63
- # fixes ArgumentError: invalid byte sequence in UTF-8
64
- it %{safely strip soft hyphens and read windows-1252 html} do
65
- t = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-A.htm", :row_xpath => '//table[2]//table[1]//tr[3]//tr', :column_xpath => 'td', :encoding => 'windows-1252'
66
- t.rows.detect { |row| row['Model'] == 'A300B4600' }.wont_equal nil
67
- end
68
-
69
64
  it %{transliterate characters from ISO-8859-1} do
70
65
  t = RemoteTable.new :url => 'http://static.brighterplanet.com/science/data/consumables/pets/breed_genders.csv', :encoding => 'ISO-8859-1'
71
66
  t.rows.detect { |row| row['name'] == 'Briquet Griffon Vendéen' }.wont_equal nil
@@ -85,8 +80,8 @@ describe RemoteTable do
85
80
  time1.wont_equal time2
86
81
  end
87
82
 
88
- it %{recode as UTF-8 even ISO-8859-1 (or any other encoding)} do
89
- t = RemoteTable.new 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';', :encoding => 'ISO-8859-1'
83
+ it %{reads country names} do
84
+ t = RemoteTable.new 'http://www.iso.org/iso/country_names_and_code_elements_txt', :skip => 1, :headers => false, :delimiter => ';'
90
85
  t[1][0].must_equal %{ÅLAND ISLANDS}
91
86
  end
92
87
 
@@ -14,6 +14,13 @@ describe RemoteTable do
14
14
  by_path.rows.must_equal by_url.rows
15
15
  end
16
16
 
17
+ it "strips whitespace from headers" do
18
+ t = RemoteTable.new 'test/data/lots of spaces.csv'
19
+ t[0]['a one'].must_equal 'a1'
20
+ t[0]['b two'].must_equal 'b2'
21
+ t[0]['c three'].must_equal 'c3'
22
+ end
23
+
17
24
  {
18
25
  # IMPOSSIBLE "../data/list-en1-semic-3.office-2011-for-mac-sp1-excel-95.binary.xls" => {:format=>"xls", :encoding=>"binary"},
19
26
  "../data/list-en1-semic-3.office-2011-for-mac-sp1.binary.xlsx" => {:format=>"xlsx"},
@@ -59,4 +66,18 @@ describe RemoteTable do
59
66
  t[1]['name'].must_equal 'Derek Kastner'
60
67
  t[1]['city'].must_equal 'Lansing'
61
68
  end
69
+
70
+ it "reads html with xpath" do
71
+ t = RemoteTable.new 'test/data/table.html', row_xpath: '//tr', column_xpath: 'td'
72
+ t[0]['h1'].must_equal 'a'
73
+ t[1]['h3'].must_equal 'f'
74
+ end
75
+
76
+ # fixes ArgumentError: invalid byte sequence in UTF-8
77
+ # disabled because the xpath may be somehow broken - works in chrome
78
+ it %{safely strip soft hyphens and read windows-1252 html} do
79
+ row_xpath = '/html/body/table[2]/tbody/tr/td/center/table/tbody/tr[3]/td/table/tbody/tr[2]/td[1]'
80
+ t = RemoteTable.new 'test/data/faa-aircraft.html', :row_xpath => row_xpath, :column_xpath => 'td', :encoding => 'windows-1252'
81
+ t.rows.detect { |row| row['Model'] == 'A300B4600' }.wont_equal nil
82
+ end
62
83
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: remote_table
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.0.alpha
4
+ version: 3.0.0.beta
5
5
  prerelease: 6
6
6
  platform: ruby
7
7
  authors:
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2013-07-26 00:00:00.000000000 Z
13
+ date: 2013-07-31 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: activesupport
@@ -220,6 +220,38 @@ dependencies:
220
220
  - - ! '>='
221
221
  - !ruby/object:Gem::Version
222
222
  version: '0'
223
+ - !ruby/object:Gem::Dependency
224
+ name: pry-rescue
225
+ requirement: !ruby/object:Gem::Requirement
226
+ none: false
227
+ requirements:
228
+ - - ! '>='
229
+ - !ruby/object:Gem::Version
230
+ version: '0'
231
+ type: :development
232
+ prerelease: false
233
+ version_requirements: !ruby/object:Gem::Requirement
234
+ none: false
235
+ requirements:
236
+ - - ! '>='
237
+ - !ruby/object:Gem::Version
238
+ version: '0'
239
+ - !ruby/object:Gem::Dependency
240
+ name: pry-stack_explorer
241
+ requirement: !ruby/object:Gem::Requirement
242
+ none: false
243
+ requirements:
244
+ - - ! '>='
245
+ - !ruby/object:Gem::Version
246
+ version: '0'
247
+ type: :development
248
+ prerelease: false
249
+ version_requirements: !ruby/object:Gem::Requirement
250
+ none: false
251
+ requirements:
252
+ - - ! '>='
253
+ - !ruby/object:Gem::Version
254
+ version: '0'
223
255
  description: Open Google Docs spreadsheets, local or remote XLSX, XLS, ODS, CSV (comma
224
256
  separated), TSV (tab separated), other delimited, fixed-width files, and shapefiles.
225
257
  Returns an Array of Arrays or Hashes, depending on whether there are headers.
@@ -269,7 +301,9 @@ files:
269
301
  - test/data/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma
270
302
  - test/data/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html
271
303
  - test/data/list-en1-semic-3.original.iso-8859-1.csv
304
+ - test/data/lots of spaces.csv
272
305
  - test/data/ranges.csv
306
+ - test/data/table.html
273
307
  - test/helper.rb
274
308
  - test/test_big.rb
275
309
  - test/test_errata.rb
@@ -322,7 +356,9 @@ test_files:
322
356
  - test/data/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma
323
357
  - test/data/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html
324
358
  - test/data/list-en1-semic-3.original.iso-8859-1.csv
359
+ - test/data/lots of spaces.csv
325
360
  - test/data/ranges.csv
361
+ - test/data/table.html
326
362
  - test/helper.rb
327
363
  - test/test_big.rb
328
364
  - test/test_errata.rb