remote_table 2.1.2 → 3.0.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. data/CHANGELOG +12 -3
  2. data/README.markdown +1 -8
  3. data/lib/remote_table.rb +72 -87
  4. data/lib/remote_table/fixed_width.rb +5 -5
  5. data/lib/remote_table/local_copy.rb +1 -1
  6. data/lib/remote_table/plaintext.rb +3 -3
  7. data/lib/remote_table/processed_by_roo.rb +6 -4
  8. data/lib/remote_table/version.rb +1 -1
  9. data/remote_table.gemspec +3 -4
  10. data/test/{support → data}/airports.utf8.csv +0 -0
  11. data/test/data/color.csv +3 -0
  12. data/test/{fixtures → data}/data.yml +0 -0
  13. data/test/{support → data}/list-en1-semic-3.neooffice.binary.ods +0 -0
  14. data/test/{support → data}/list-en1-semic-3.neooffice.iso-8859-1.csv +0 -0
  15. data/test/{support → data}/list-en1-semic-3.neooffice.iso-8859-1.fixed_width-64 +0 -0
  16. data/test/{support → data}/list-en1-semic-3.neooffice.utf-8.csv +0 -0
  17. data/test/{support → data}/list-en1-semic-3.neooffice.utf-8.fixed_width-62 +0 -0
  18. data/test/{support → data}/list-en1-semic-3.neooffice.utf-8.html +0 -0
  19. data/test/{support → data}/list-en1-semic-3.neooffice.utf-8.xml +0 -0
  20. data/test/{support → data}/list-en1-semic-3.office-2011-for-mac-sp1-excel-95.binary.xls +0 -0
  21. data/test/{support → data}/list-en1-semic-3.office-2011-for-mac-sp1.binary.xls +0 -0
  22. data/test/{support → data}/list-en1-semic-3.office-2011-for-mac-sp1.binary.xlsx +0 -0
  23. data/test/{support → data}/list-en1-semic-3.office-2011-for-mac-sp1.iso-8859-1.html +0 -0
  24. data/test/{support → data}/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma +0 -0
  25. data/test/{support → data}/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html +0 -0
  26. data/test/{support → data}/list-en1-semic-3.original.iso-8859-1.csv +0 -0
  27. data/test/data/ranges.csv +4 -0
  28. data/test/test_errata.rb +2 -2
  29. data/test/test_local.rb +10 -0
  30. data/test/test_old_syntax.rb +0 -13
  31. data/test/test_parser.rb +24 -0
  32. data/test/test_remote.rb +113 -0
  33. data/test/test_remote_table.rb +30 -165
  34. data/test/test_transpose.rb +11 -0
  35. metadata +86 -66
  36. checksums.yaml +0 -15
  37. data/lib/remote_table/shp.rb +0 -30
  38. data/lib/remote_table/transformer.rb +0 -29
  39. data/test/test_old_transform.rb +0 -47
  40. data/test/test_shapefile.rb +0 -13
data/CHANGELOG CHANGED
@@ -1,8 +1,17 @@
1
- 2.1.2 / 2013-10-08
1
+ 3.0.0.alpha / 2013-07-25
2
2
 
3
- * Bug fixes
3
+ * Breaking changes
4
+
5
+ * Just use :encoding to specify a file's internal/initial/original encoding. No more :internal_encoding, this was jargony.
6
+ * No more shapefile support - hard to imagine a shapefile as a table
7
+ * No more :transform, legacy or not
8
+ * For :errata, now you pass an object that responds to #rejects?(row) and #correct!(row)
9
+ * No more :errata_settings
10
+
11
+ * Enhancements
4
12
 
5
- * Correctly detect format from filename - don't get confused if a filename has "xls" in it (thanks @activefx #10)
13
+ * :parser option takes an object that responds to #parse(row) and returns an array of one or more rows.
14
+ * RemoteTable.transpose(url, key_key, value_key) helper
6
15
 
7
16
  2.1.1 / 2013-03-25
8
17
 
data/README.markdown CHANGED
@@ -1,6 +1,6 @@
1
1
  # remote_table
2
2
 
3
- Open Google Docs spreadsheets, local or remote XLSX, XLS, ODS, CSV (comma separated), TSV (tab separated), SHP (ESRI shapefiles), other delimited, fixed-width files.
3
+ Open Google Docs spreadsheets, local or remote XLSX, XLS, ODS, CSV (comma separated), TSV (tab separated), other delimited, fixed-width files.
4
4
 
5
5
  Tested on MRI 1.8, MRI 1.9, and JRuby 1.6.7+. Thread-safe.
6
6
 
@@ -441,12 +441,6 @@ Everything is forced into UTF-8. You can improve the quality of the conversion b
441
441
  [ 'spacer', 1 ],
442
442
  [ 'header1', 10, { :type => :string } ]]
443
443
 
444
- # ESRI Shapefile from NREL
445
- require 'geo_ruby'
446
- require 'dbf'
447
- RemoteTable.new 'http://www.nrel.gov/gis/cfm/data/GIS_Data_Technology_Specific/United_States/Solar/High_Resolution/Lower_48_DNI_High_Resolution.zip',
448
- :format => :shp
449
-
450
444
  ## Requirements
451
445
 
452
446
  * Unix tools like curl, iconv, perl, cat, cut, tail, etc. accessible from your `$PATH`
@@ -455,7 +449,6 @@ Everything is forced into UTF-8. You can improve the quality of the conversion b
455
449
  ## Wishlist
456
450
 
457
451
  * Win32 compat
458
- * The new "custom parser" syntax (aka transformer) hasn't been defined yet... only the old-style syntax is available
459
452
 
460
453
  ## Authors
461
454
 
data/lib/remote_table.rb CHANGED
@@ -14,7 +14,6 @@ end
14
14
  require 'hash_digest'
15
15
 
16
16
  require 'remote_table/local_copy'
17
- require 'remote_table/transformer'
18
17
 
19
18
  require 'remote_table/plaintext'
20
19
  require 'remote_table/processed_by_roo'
@@ -27,7 +26,6 @@ require 'remote_table/fixed_width'
27
26
  require 'remote_table/html'
28
27
  require 'remote_table/xml'
29
28
  require 'remote_table/yaml'
30
- require 'remote_table/shp'
31
29
 
32
30
  class Hash
33
31
  # Added by remote_table to store a hash (think checksum) of the data with which a particular Hash is initialized.
@@ -44,6 +42,14 @@ end
44
42
  # Open Google Docs spreadsheets, local or remote XLSX, XLS, ODS, CSV (comma separated), TSV (tab separated), other delimited, fixed-width files.
45
43
  class RemoteTable
46
44
  class << self
45
+ # Transpose two columns into a mapping from one to the other.
46
+ def transpose(url, key_key, value_key, options = {})
47
+ new(url, options).inject({}) do |memo, row|
48
+ memo[row[key_key]] = row[value_key]
49
+ memo
50
+ end
51
+ end
52
+
47
53
  # Guess compression based on URL. Used internally.
48
54
  # @return [Symbol,nil]
49
55
  def guess_compression(url)
@@ -72,26 +78,24 @@ class RemoteTable
72
78
  # Guess file format from the basename. Since a file might be decompressed and/or pulled out of an archive with a glob, this usually can't be called until a file is downloaded.
73
79
  # @return [Symbol,nil]
74
80
  def guess_format(basename)
75
- case basename.to_s.downcase.strip
76
- when /ods\z/, /open_?office\z/
81
+ case basename.to_s.downcase
82
+ when /ods/, /open_?office/
77
83
  :ods
78
- when /xlsx\z/, /excelx\z/
84
+ when /xlsx/, /excelx/
79
85
  :xlsx
80
- when /xls\z/, /excel\z/
86
+ when /xls/, /excel/
81
87
  :xls
82
- when /csv\z/, /tsv\z/, /delimited\z/
88
+ when /csv/, /tsv/, /delimited/
83
89
  # note that there is no RemoteTable::Csv class - it's normalized to :delimited
84
90
  :delimited
85
- when /fixed_?width\z/
91
+ when /fixed_?width/
86
92
  :fixed_width
87
- when /html?\z/
93
+ when /htm/
88
94
  :html
89
- when /xml\z/
95
+ when /xml/
90
96
  :xml
91
- when /yaml\z/, /yml\z/
97
+ when /yaml/, /yml/
92
98
  :yaml
93
- when /shp\z/
94
- :shp
95
99
  end
96
100
  end
97
101
 
@@ -107,14 +111,6 @@ class RemoteTable
107
111
  end
108
112
  end
109
113
 
110
- # @private
111
- # Here to support legacy code.
112
- class Transform
113
- def self.row_hash(row)
114
- ::HashDigest.hexdigest row
115
- end
116
- end
117
-
118
114
  EXTERNAL_ENCODING = 'UTF-8'
119
115
  EXTERNAL_ENCODING_ICONV = 'UTF-8//TRANSLIT'
120
116
  GOOGLE_DOCS_SPREADSHEET = [
@@ -124,7 +120,7 @@ class RemoteTable
124
120
  VALID = {
125
121
  :compression => [:gz, :zip, :bz2, :exe],
126
122
  :packing => [:tar],
127
- :format => [:xlsx, :xls, :delimited, :ods, :fixed_width, :html, :xml, :yaml, :csv, :shp],
123
+ :format => [:xlsx, :xls, :delimited, :ods, :fixed_width, :html, :xml, :yaml, :csv],
128
124
  }
129
125
  DEFAULT = {
130
126
  :streaming => false,
@@ -132,19 +128,16 @@ class RemoteTable
132
128
  :headers => :first_row,
133
129
  :keep_blank_rows => false,
134
130
  :skip => 0,
135
- :internal_encoding => 'UTF-8',
131
+ :encoding => 'UTF-8',
136
132
  :delimiter => ','
137
133
  }
138
134
  OLD_SETTING_NAMES = {
139
- :internal_encoding => [:encoding],
140
- :transform_settings => [:transform],
141
135
  :pre_select => [:select],
142
136
  :pre_reject => [:reject],
143
- :errata_settings => [:errata],
144
137
  }
145
138
 
146
139
  include ::Enumerable
147
-
140
+
148
141
  # The URL of the local or remote file.
149
142
  #
150
143
  # @example Local
@@ -169,10 +162,6 @@ class RemoteTable
169
162
  # @return [Integer]
170
163
  attr_reader :download_count
171
164
 
172
- # @private
173
- # Used internally to access the transformer (aka parser).
174
- attr_reader :transformer
175
-
176
165
  # @private
177
166
  # Used internally to access a downloaded copy of the file.
178
167
  # @return [RemoteTable::LocalCopy]
@@ -185,39 +174,39 @@ class RemoteTable
185
174
  # Whether to warn the user on multiple downloads. Defaults to true.
186
175
  # @return [true,false]
187
176
  attr_reader :warn_on_multiple_downloads
188
-
177
+
189
178
  # Headers specified by the user: +:first_row+ (the default), +false+, or a list of headers.
190
179
  # @return [:first_row,false,Array<String>]
191
180
  attr_reader :headers
192
-
181
+
193
182
  # The sheet specified by the user as a number or a string.
194
183
  # @return [String,Integer]
195
184
  attr_reader :sheet
196
-
185
+
197
186
  # Whether to keep blank rows. Default is false.
198
187
  # @return [true,false]
199
188
  attr_reader :keep_blank_rows
200
-
189
+
201
190
  # Form data to POST in the download request. It should probably be in +application/x-www-form-urlencoded+.
202
191
  # @return [String]
203
192
  attr_reader :form_data
204
-
193
+
205
194
  # How many rows to skip at the beginning of the file or table. Default is 0.
206
195
  # @return [Integer]
207
196
  attr_reader :skip
208
197
 
209
- # The original encoding of the source file. Default is UTF-8. Previously passed as +:encoding+.
198
+ # The original encoding of the source file. Default is UTF-8.
210
199
  # @return [String]
211
- attr_reader :internal_encoding
212
-
200
+ attr_reader :encoding
201
+
213
202
  # The delimiter, a.k.a. column separator. Passed to Ruby CSV as +:col_sep+. Default is a comma (',').
214
203
  # @return [String]
215
204
  attr_reader :delimiter
216
-
205
+
217
206
  # The XPath used to find rows in HTML or XML.
218
207
  # @return [String]
219
208
  attr_reader :row_xpath
220
-
209
+
221
210
  # The XPath used to find columns in HTML or XML.
222
211
  # @return [String]
223
212
  attr_reader :column_xpath
@@ -225,11 +214,11 @@ class RemoteTable
225
214
  # The CSS selector used to find rows in HTML or XML.
226
215
  # @return [String]
227
216
  attr_reader :row_css
228
-
217
+
229
218
  # The CSS selector used to find columns in HTML or XML.
230
219
  # @return [String]
231
220
  attr_reader :column_css
232
-
221
+
233
222
  # The format of the source file. Can be +:xlsx+, +:xls+, +:delimited+, +:ods+, +:fixed_width+, +:html+, +:xml+, +:yaml+.
234
223
  # @return [Symbol]
235
224
  attr_reader :format
@@ -241,7 +230,7 @@ class RemoteTable
241
230
  # The packing type. Guessed from URL if not provided. Only +:tar+ is supported.
242
231
  # @return [Symbol]
243
232
  attr_reader :packing
244
-
233
+
245
234
  # The glob used to pick a file out of an archive.
246
235
  #
247
236
  # @return [String]
@@ -249,7 +238,7 @@ class RemoteTable
249
238
  # @example Pick out the only CSV in a ZIP file
250
239
  # RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :glob => '/*.csv'
251
240
  attr_reader :glob
252
-
241
+
253
242
  # The filename, which can be used to pick a file out of an archive.
254
243
  #
255
244
  # @return [String]
@@ -267,7 +256,7 @@ class RemoteTable
267
256
  # # ALMOST
268
257
  # RemoteTable.new 'file:///atoz.txt', :cut => '1,12,13,15,19,20'
269
258
  attr_reader :cut
270
-
259
+
271
260
  # Use a range of rows in a plaintext file.
272
261
  #
273
262
  # @return [Range]
@@ -278,7 +267,7 @@ class RemoteTable
278
267
  # :select => proc { |row| CbecsEnergyIntensity::NAICS_CODE_SYNTHESIZER.call(row) },
279
268
  # :crop => (21..37))
280
269
  attr_reader :crop
281
-
270
+
282
271
  # The fixed-width schema, given as a multi-dimensional array.
283
272
  #
284
273
  # @return [Array<Array{String,Integer,Hash}>]
@@ -293,30 +282,29 @@ class RemoteTable
293
282
  # [ 'spacer', 12 ],
294
283
  # [ 'header6', 10, { :type => :string } ]])
295
284
  attr_reader :schema
296
-
285
+
297
286
  # If you somehow already defined a fixed-width schema (so you can re-use it?), specify it here.
298
287
  # @return [String,Symbol]
299
288
  attr_reader :schema_name
300
-
289
+
301
290
  # A proc that decides whether to include a row. Previously passed as +:select+.
302
291
  # @return [Proc]
303
292
  attr_reader :pre_select
304
-
293
+
305
294
  # A proc that decides whether to reject (exclude) a row. Previously passed as +:reject+.
306
295
  # @return [Proc]
307
296
  attr_reader :pre_reject
308
297
 
309
- # Settings to create a transformer.
310
- # @return [Hash]
311
- attr_reader :transform_settings
312
-
313
- # A hash to initialize an Errata instance to be used on every row. Applied after creating +row_hash+ and before passing to +:synthesize+ procs, etc. Previously passed as +:errata+.
298
+ # An object that responds to #rejects?(row) and #correct!(row). Applied after creating +row_hash+.
299
+ #
300
+ # * #rejects?(row) - if row should be treated like it doesn't exist
301
+ # * #correct!(row) - destructively update a row to fix something
314
302
  #
315
- # See the Errata library at https://github.com/seamusabshere/errata
303
+ # See the Errata library at https://github.com/seamusabshere/errata for an example implementation.
316
304
  #
317
305
  # @return [#rejects?, #correct!]
318
- attr_reader :errata_settings
319
-
306
+ attr_reader :errata
307
+
320
308
  # The format of the source file. Can be specified as: :xlsx, :xls, :delimited (aka :csv), :ods, :fixed_width, :html, :xml, :yaml
321
309
  #
322
310
  # Note: treats all +docs.google.com+ and +spreadsheets.google.com+ URLs as +:delimited+.
@@ -326,6 +314,20 @@ class RemoteTable
326
314
  # @return [Symbol]
327
315
  attr_reader :format
328
316
 
317
+ # @private
318
+ class NullParser
319
+ def parse(row)
320
+ [row]
321
+ end
322
+ end
323
+
324
+ # An object that responds to #parse(row) and returns an array of one or more rows.
325
+ #
326
+ # @return [#parse]
327
+ def parser
328
+ @final_parser ||= (@parser || NullParser.new)
329
+ end
330
+
329
331
  # Options passed by the user that may be passed through to the underlying parsing library.
330
332
  # @return [Hash]
331
333
  attr_reader :other_options
@@ -354,7 +356,6 @@ class RemoteTable
354
356
  def initialize(*args)
355
357
  @download_count_mutex = ::Mutex.new
356
358
  @extend_bang_mutex = ::Mutex.new
357
- @errata_mutex = ::Mutex.new
358
359
 
359
360
  @cache = []
360
361
  @download_count = 0
@@ -387,25 +388,24 @@ class RemoteTable
387
388
  @keep_blank_rows = grab settings, :keep_blank_rows
388
389
  @form_data = grab settings, :form_data
389
390
  @skip = grab settings, :skip
390
- @internal_encoding = grab settings, :internal_encoding
391
+ @encoding = grab settings, :encoding
391
392
  @row_xpath = grab settings, :row_xpath
392
393
  @column_xpath = grab settings, :column_xpath
393
394
  @row_css = grab settings, :row_css
394
395
  @column_css = grab settings, :column_css
395
396
  @glob = grab settings, :glob
396
397
  @filename = grab settings, :filename
397
- @transform_settings = grab settings, :transform_settings
398
398
  @cut = grab settings, :cut
399
399
  @crop = grab settings, :crop
400
400
  @schema = grab settings, :schema
401
401
  @schema_name = grab settings, :schema_name
402
402
  @pre_select = grab settings, :pre_select
403
403
  @pre_reject = grab settings, :pre_reject
404
- @errata_settings = grab settings, :errata_settings
404
+ @errata = grab settings, :errata
405
+ @parser = grab settings, :parser
405
406
 
406
407
  @other_options = settings
407
-
408
- @transformer = Transformer.new self
408
+
409
409
  @local_copy = LocalCopy.new self
410
410
  end
411
411
 
@@ -423,7 +423,7 @@ class RemoteTable
423
423
  else
424
424
  mark_download!
425
425
  memo = _each do |row|
426
- transformer.transform(row).each do |virtual_row|
426
+ parser.parse(row).each do |virtual_row|
427
427
  virtual_row.row_hash = ::HashDigest.hexdigest row
428
428
  if errata
429
429
  next if errata.rejects? virtual_row
@@ -447,7 +447,7 @@ class RemoteTable
447
447
 
448
448
  # @deprecated
449
449
  alias :each_row :each
450
-
450
+
451
451
  # @return [Array<Hash,Array>] All rows.
452
452
  def to_a
453
453
  if fully_cached?
@@ -459,7 +459,7 @@ class RemoteTable
459
459
 
460
460
  # @deprecated
461
461
  alias :rows :to_a
462
-
462
+
463
463
  # Get a row by row number. Zero-based.
464
464
  #
465
465
  # @return [Hash,Array]
@@ -470,33 +470,18 @@ class RemoteTable
470
470
  to_a[row_number]
471
471
  end
472
472
  end
473
-
473
+
474
474
  # Clear the row cache in case it helps your GC.
475
475
  #
476
476
  # @return [nil]
477
477
  def free
478
478
  @fully_cached = false
479
- @errata = nil
480
479
  cache.clear
481
480
  nil
482
481
  end
483
482
 
484
- # @private
485
- def errata
486
- @errata || @errata_mutex.synchronize do
487
- @errata ||= begin
488
- if defined?(::Errata) and errata_settings.is_a?(::Errata)
489
- ::Kernel.warn %{[remote_table] Passing :errata_settings as an Errata object is deprecated. Please pass a Hash of settings instead.}
490
- errata_settings
491
- elsif errata_settings.is_a?(::Hash)
492
- ::Errata.new errata_settings
493
- end
494
- end
495
- end
496
- end
497
-
498
483
  private
499
-
484
+
500
485
  def mark_download!
501
486
  @download_count_mutex.synchronize do
502
487
  @download_count += 1
@@ -504,12 +489,12 @@ class RemoteTable
504
489
  if warn_on_multiple_downloads and download_count > 1
505
490
  ::Kernel.warn "[remote_table] #{url} has been downloaded #{download_count} times."
506
491
  end
507
- end
508
-
492
+ end
493
+
509
494
  def fully_cached!
510
495
  @fully_cached = true
511
496
  end
512
-
497
+
513
498
  def fully_cached?
514
499
  !!@fully_cached
515
500
  end
@@ -9,7 +9,7 @@ class RemoteTable
9
9
 
10
10
  # @private
11
11
  def after_extend
12
- @parser_mutex = ::Mutex.new
12
+ @fixed_width_parser_mutex = ::Mutex.new
13
13
  @definition_mutex = ::Mutex.new
14
14
  end
15
15
 
@@ -23,7 +23,7 @@ class RemoteTable
23
23
  skip_rows!
24
24
  cut_columns!
25
25
 
26
- parser.parse[:rows].each do |row|
26
+ fixed_width_parser.parse[:rows].each do |row|
27
27
  some_value_present = false
28
28
  hash = ::ActiveSupport::OrderedHash.new
29
29
  row.each do |k, v|
@@ -43,9 +43,9 @@ class RemoteTable
43
43
 
44
44
  private
45
45
 
46
- def parser
47
- @parser || @parser_mutex.synchronize do
48
- @parser ||= begin
46
+ def fixed_width_parser
47
+ @fixed_width_parser || @fixed_width_parser_mutex.synchronize do
48
+ @fixed_width_parser ||= begin
49
49
  if ::FixedWidth::Section.private_instance_methods.map(&:to_sym).include?(:unpacker)
50
50
  raise ::RuntimeError, "[remote_table] You need to use exclusively the fixed_width-multibyte library https://github.com/seamusabshere/fixed_width"
51
51
  end