remote_table 2.1.2 → 3.0.0.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. data/CHANGELOG +12 -3
  2. data/README.markdown +1 -8
  3. data/lib/remote_table.rb +72 -87
  4. data/lib/remote_table/fixed_width.rb +5 -5
  5. data/lib/remote_table/local_copy.rb +1 -1
  6. data/lib/remote_table/plaintext.rb +3 -3
  7. data/lib/remote_table/processed_by_roo.rb +6 -4
  8. data/lib/remote_table/version.rb +1 -1
  9. data/remote_table.gemspec +3 -4
  10. data/test/{support → data}/airports.utf8.csv +0 -0
  11. data/test/data/color.csv +3 -0
  12. data/test/{fixtures → data}/data.yml +0 -0
  13. data/test/{support → data}/list-en1-semic-3.neooffice.binary.ods +0 -0
  14. data/test/{support → data}/list-en1-semic-3.neooffice.iso-8859-1.csv +0 -0
  15. data/test/{support → data}/list-en1-semic-3.neooffice.iso-8859-1.fixed_width-64 +0 -0
  16. data/test/{support → data}/list-en1-semic-3.neooffice.utf-8.csv +0 -0
  17. data/test/{support → data}/list-en1-semic-3.neooffice.utf-8.fixed_width-62 +0 -0
  18. data/test/{support → data}/list-en1-semic-3.neooffice.utf-8.html +0 -0
  19. data/test/{support → data}/list-en1-semic-3.neooffice.utf-8.xml +0 -0
  20. data/test/{support → data}/list-en1-semic-3.office-2011-for-mac-sp1-excel-95.binary.xls +0 -0
  21. data/test/{support → data}/list-en1-semic-3.office-2011-for-mac-sp1.binary.xls +0 -0
  22. data/test/{support → data}/list-en1-semic-3.office-2011-for-mac-sp1.binary.xlsx +0 -0
  23. data/test/{support → data}/list-en1-semic-3.office-2011-for-mac-sp1.iso-8859-1.html +0 -0
  24. data/test/{support → data}/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma +0 -0
  25. data/test/{support → data}/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html +0 -0
  26. data/test/{support → data}/list-en1-semic-3.original.iso-8859-1.csv +0 -0
  27. data/test/data/ranges.csv +4 -0
  28. data/test/test_errata.rb +2 -2
  29. data/test/test_local.rb +10 -0
  30. data/test/test_old_syntax.rb +0 -13
  31. data/test/test_parser.rb +24 -0
  32. data/test/test_remote.rb +113 -0
  33. data/test/test_remote_table.rb +30 -165
  34. data/test/test_transpose.rb +11 -0
  35. metadata +86 -66
  36. checksums.yaml +0 -15
  37. data/lib/remote_table/shp.rb +0 -30
  38. data/lib/remote_table/transformer.rb +0 -29
  39. data/test/test_old_transform.rb +0 -47
  40. data/test/test_shapefile.rb +0 -13
data/CHANGELOG CHANGED
@@ -1,8 +1,17 @@
1
- 2.1.2 / 2013-10-08
1
+ 3.0.0.alpha / 2013-07-25
2
2
 
3
- * Bug fixes
3
+ * Breaking changes
4
+
5
+ * Just use :encoding to specify a file's internal/initial/original encoding. No more :internal_encoding, this was jargony.
6
+ * No more shapefile support - hard to imagine a shapefile as a table
7
+ * No more :transform, legacy or not
8
+ * For :errata, now you pass an object that responds to #rejects?(row) and #correct!(row)
9
+ * No more :errata_settings
10
+
11
+ * Enhancements
4
12
 
5
- * Correctly detect format from filename - don't get confused if a filename has "xls" in it (thanks @activefx #10)
13
+ * :parser option takes an object that responds to #parse(row) and returns an array of one or more rows.
14
+ * RemoteTable.transpose(url, key_key, value_key) helper
6
15
 
7
16
  2.1.1 / 2013-03-25
8
17
 
data/README.markdown CHANGED
@@ -1,6 +1,6 @@
1
1
  # remote_table
2
2
 
3
- Open Google Docs spreadsheets, local or remote XLSX, XLS, ODS, CSV (comma separated), TSV (tab separated), SHP (ESRI shapefiles), other delimited, fixed-width files.
3
+ Open Google Docs spreadsheets, local or remote XLSX, XLS, ODS, CSV (comma separated), TSV (tab separated), other delimited, fixed-width files.
4
4
 
5
5
  Tested on MRI 1.8, MRI 1.9, and JRuby 1.6.7+. Thread-safe.
6
6
 
@@ -441,12 +441,6 @@ Everything is forced into UTF-8. You can improve the quality of the conversion b
441
441
  [ 'spacer', 1 ],
442
442
  [ 'header1', 10, { :type => :string } ]]
443
443
 
444
- # ESRI Shapefile from NREL
445
- require 'geo_ruby'
446
- require 'dbf'
447
- RemoteTable.new 'http://www.nrel.gov/gis/cfm/data/GIS_Data_Technology_Specific/United_States/Solar/High_Resolution/Lower_48_DNI_High_Resolution.zip',
448
- :format => :shp
449
-
450
444
  ## Requirements
451
445
 
452
446
  * Unix tools like curl, iconv, perl, cat, cut, tail, etc. accessible from your `$PATH`
@@ -455,7 +449,6 @@ Everything is forced into UTF-8. You can improve the quality of the conversion b
455
449
  ## Wishlist
456
450
 
457
451
  * Win32 compat
458
- * The new "custom parser" syntax (aka transformer) hasn't been defined yet... only the old-style syntax is available
459
452
 
460
453
  ## Authors
461
454
 
data/lib/remote_table.rb CHANGED
@@ -14,7 +14,6 @@ end
14
14
  require 'hash_digest'
15
15
 
16
16
  require 'remote_table/local_copy'
17
- require 'remote_table/transformer'
18
17
 
19
18
  require 'remote_table/plaintext'
20
19
  require 'remote_table/processed_by_roo'
@@ -27,7 +26,6 @@ require 'remote_table/fixed_width'
27
26
  require 'remote_table/html'
28
27
  require 'remote_table/xml'
29
28
  require 'remote_table/yaml'
30
- require 'remote_table/shp'
31
29
 
32
30
  class Hash
33
31
  # Added by remote_table to store a hash (think checksum) of the data with which a particular Hash is initialized.
@@ -44,6 +42,14 @@ end
44
42
  # Open Google Docs spreadsheets, local or remote XLSX, XLS, ODS, CSV (comma separated), TSV (tab separated), other delimited, fixed-width files.
45
43
  class RemoteTable
46
44
  class << self
45
+ # Transpose two columns into a mapping from one to the other.
46
+ def transpose(url, key_key, value_key, options = {})
47
+ new(url, options).inject({}) do |memo, row|
48
+ memo[row[key_key]] = row[value_key]
49
+ memo
50
+ end
51
+ end
52
+
47
53
  # Guess compression based on URL. Used internally.
48
54
  # @return [Symbol,nil]
49
55
  def guess_compression(url)
@@ -72,26 +78,24 @@ class RemoteTable
72
78
  # Guess file format from the basename. Since a file might be decompressed and/or pulled out of an archive with a glob, this usually can't be called until a file is downloaded.
73
79
  # @return [Symbol,nil]
74
80
  def guess_format(basename)
75
- case basename.to_s.downcase.strip
76
- when /ods\z/, /open_?office\z/
81
+ case basename.to_s.downcase
82
+ when /ods/, /open_?office/
77
83
  :ods
78
- when /xlsx\z/, /excelx\z/
84
+ when /xlsx/, /excelx/
79
85
  :xlsx
80
- when /xls\z/, /excel\z/
86
+ when /xls/, /excel/
81
87
  :xls
82
- when /csv\z/, /tsv\z/, /delimited\z/
88
+ when /csv/, /tsv/, /delimited/
83
89
  # note that there is no RemoteTable::Csv class - it's normalized to :delimited
84
90
  :delimited
85
- when /fixed_?width\z/
91
+ when /fixed_?width/
86
92
  :fixed_width
87
- when /html?\z/
93
+ when /htm/
88
94
  :html
89
- when /xml\z/
95
+ when /xml/
90
96
  :xml
91
- when /yaml\z/, /yml\z/
97
+ when /yaml/, /yml/
92
98
  :yaml
93
- when /shp\z/
94
- :shp
95
99
  end
96
100
  end
97
101
 
@@ -107,14 +111,6 @@ class RemoteTable
107
111
  end
108
112
  end
109
113
 
110
- # @private
111
- # Here to support legacy code.
112
- class Transform
113
- def self.row_hash(row)
114
- ::HashDigest.hexdigest row
115
- end
116
- end
117
-
118
114
  EXTERNAL_ENCODING = 'UTF-8'
119
115
  EXTERNAL_ENCODING_ICONV = 'UTF-8//TRANSLIT'
120
116
  GOOGLE_DOCS_SPREADSHEET = [
@@ -124,7 +120,7 @@ class RemoteTable
124
120
  VALID = {
125
121
  :compression => [:gz, :zip, :bz2, :exe],
126
122
  :packing => [:tar],
127
- :format => [:xlsx, :xls, :delimited, :ods, :fixed_width, :html, :xml, :yaml, :csv, :shp],
123
+ :format => [:xlsx, :xls, :delimited, :ods, :fixed_width, :html, :xml, :yaml, :csv],
128
124
  }
129
125
  DEFAULT = {
130
126
  :streaming => false,
@@ -132,19 +128,16 @@ class RemoteTable
132
128
  :headers => :first_row,
133
129
  :keep_blank_rows => false,
134
130
  :skip => 0,
135
- :internal_encoding => 'UTF-8',
131
+ :encoding => 'UTF-8',
136
132
  :delimiter => ','
137
133
  }
138
134
  OLD_SETTING_NAMES = {
139
- :internal_encoding => [:encoding],
140
- :transform_settings => [:transform],
141
135
  :pre_select => [:select],
142
136
  :pre_reject => [:reject],
143
- :errata_settings => [:errata],
144
137
  }
145
138
 
146
139
  include ::Enumerable
147
-
140
+
148
141
  # The URL of the local or remote file.
149
142
  #
150
143
  # @example Local
@@ -169,10 +162,6 @@ class RemoteTable
169
162
  # @return [Integer]
170
163
  attr_reader :download_count
171
164
 
172
- # @private
173
- # Used internally to access the transformer (aka parser).
174
- attr_reader :transformer
175
-
176
165
  # @private
177
166
  # Used internally to access a downloaded copy of the file.
178
167
  # @return [RemoteTable::LocalCopy]
@@ -185,39 +174,39 @@ class RemoteTable
185
174
  # Whether to warn the user on multiple downloads. Defaults to true.
186
175
  # @return [true,false]
187
176
  attr_reader :warn_on_multiple_downloads
188
-
177
+
189
178
  # Headers specified by the user: +:first_row+ (the default), +false+, or a list of headers.
190
179
  # @return [:first_row,false,Array<String>]
191
180
  attr_reader :headers
192
-
181
+
193
182
  # The sheet specified by the user as a number or a string.
194
183
  # @return [String,Integer]
195
184
  attr_reader :sheet
196
-
185
+
197
186
  # Whether to keep blank rows. Default is false.
198
187
  # @return [true,false]
199
188
  attr_reader :keep_blank_rows
200
-
189
+
201
190
  # Form data to POST in the download request. It should probably be in +application/x-www-form-urlencoded+.
202
191
  # @return [String]
203
192
  attr_reader :form_data
204
-
193
+
205
194
  # How many rows to skip at the beginning of the file or table. Default is 0.
206
195
  # @return [Integer]
207
196
  attr_reader :skip
208
197
 
209
- # The original encoding of the source file. Default is UTF-8. Previously passed as +:encoding+.
198
+ # The original encoding of the source file. Default is UTF-8.
210
199
  # @return [String]
211
- attr_reader :internal_encoding
212
-
200
+ attr_reader :encoding
201
+
213
202
  # The delimiter, a.k.a. column separator. Passed to Ruby CSV as +:col_sep+. Default is a comma (',').
214
203
  # @return [String]
215
204
  attr_reader :delimiter
216
-
205
+
217
206
  # The XPath used to find rows in HTML or XML.
218
207
  # @return [String]
219
208
  attr_reader :row_xpath
220
-
209
+
221
210
  # The XPath used to find columns in HTML or XML.
222
211
  # @return [String]
223
212
  attr_reader :column_xpath
@@ -225,11 +214,11 @@ class RemoteTable
225
214
  # The CSS selector used to find rows in HTML or XML.
226
215
  # @return [String]
227
216
  attr_reader :row_css
228
-
217
+
229
218
  # The CSS selector used to find columns in HTML or XML.
230
219
  # @return [String]
231
220
  attr_reader :column_css
232
-
221
+
233
222
  # The format of the source file. Can be +:xlsx+, +:xls+, +:delimited+, +:ods+, +:fixed_width+, +:html+, +:xml+, +:yaml+.
234
223
  # @return [Symbol]
235
224
  attr_reader :format
@@ -241,7 +230,7 @@ class RemoteTable
241
230
  # The packing type. Guessed from URL if not provided. Only +:tar+ is supported.
242
231
  # @return [Symbol]
243
232
  attr_reader :packing
244
-
233
+
245
234
  # The glob used to pick a file out of an archive.
246
235
  #
247
236
  # @return [String]
@@ -249,7 +238,7 @@ class RemoteTable
249
238
  # @example Pick out the only CSV in a ZIP file
250
239
  # RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :glob => '/*.csv'
251
240
  attr_reader :glob
252
-
241
+
253
242
  # The filename, which can be used to pick a file out of an archive.
254
243
  #
255
244
  # @return [String]
@@ -267,7 +256,7 @@ class RemoteTable
267
256
  # # ALMOST
268
257
  # RemoteTable.new 'file:///atoz.txt', :cut => '1,12,13,15,19,20'
269
258
  attr_reader :cut
270
-
259
+
271
260
  # Use a range of rows in a plaintext file.
272
261
  #
273
262
  # @return [Range]
@@ -278,7 +267,7 @@ class RemoteTable
278
267
  # :select => proc { |row| CbecsEnergyIntensity::NAICS_CODE_SYNTHESIZER.call(row) },
279
268
  # :crop => (21..37))
280
269
  attr_reader :crop
281
-
270
+
282
271
  # The fixed-width schema, given as a multi-dimensional array.
283
272
  #
284
273
  # @return [Array<Array{String,Integer,Hash}>]
@@ -293,30 +282,29 @@ class RemoteTable
293
282
  # [ 'spacer', 12 ],
294
283
  # [ 'header6', 10, { :type => :string } ]])
295
284
  attr_reader :schema
296
-
285
+
297
286
  # If you somehow already defined a fixed-width schema (so you can re-use it?), specify it here.
298
287
  # @return [String,Symbol]
299
288
  attr_reader :schema_name
300
-
289
+
301
290
  # A proc that decides whether to include a row. Previously passed as +:select+.
302
291
  # @return [Proc]
303
292
  attr_reader :pre_select
304
-
293
+
305
294
  # A proc that decides whether to exclude a row. Previously passed as +:reject+.
306
295
  # @return [Proc]
307
296
  attr_reader :pre_reject
308
297
 
309
- # Settings to create a transformer.
310
- # @return [Hash]
311
- attr_reader :transform_settings
312
-
313
- # A hash to initialize an Errata instance to be used on every row. Applied after creating +row_hash+ and before passing to +:synthesize+ procs, etc. Previously passed as +:errata+.
298
+ # An object that responds to #rejects?(row) and #correct!(row). Applied after creating +row_hash+.
299
+ #
300
+ # * #rejects?(row) - if row should be treated like it doesn't exist
301
+ # * #correct!(row) - destructively update a row to fix something
314
302
  #
315
- # See the Errata library at https://github.com/seamusabshere/errata
303
+ # See the Errata library at https://github.com/seamusabshere/errata for an example implementation.
316
304
  #
317
305
  # @return [#rejects?, #correct!]
318
- attr_reader :errata_settings
319
-
306
+ attr_reader :errata
307
+
320
308
  # The format of the source file. Can be specified as: :xlsx, :xls, :delimited (aka :csv), :ods, :fixed_width, :html, :xml, :yaml
321
309
  #
322
310
  # Note: treats all +docs.google.com+ and +spreadsheets.google.com+ URLs as +:delimited+.
@@ -326,6 +314,20 @@ class RemoteTable
326
314
  # @return [Symbol]
327
315
  attr_reader :format
328
316
 
317
+ # @private
318
+ class NullParser
319
+ def parse(row)
320
+ [row]
321
+ end
322
+ end
323
+
324
+ # An object that responds to #parse(row) and returns an array of one or more rows.
325
+ #
326
+ # @return [#parse]
327
+ def parser
328
+ @final_parser ||= (@parser || NullParser.new)
329
+ end
330
+
329
331
  # Options passed by the user that may be passed through to the underlying parsing library.
330
332
  # @return [Hash]
331
333
  attr_reader :other_options
@@ -354,7 +356,6 @@ class RemoteTable
354
356
  def initialize(*args)
355
357
  @download_count_mutex = ::Mutex.new
356
358
  @extend_bang_mutex = ::Mutex.new
357
- @errata_mutex = ::Mutex.new
358
359
 
359
360
  @cache = []
360
361
  @download_count = 0
@@ -387,25 +388,24 @@ class RemoteTable
387
388
  @keep_blank_rows = grab settings, :keep_blank_rows
388
389
  @form_data = grab settings, :form_data
389
390
  @skip = grab settings, :skip
390
- @internal_encoding = grab settings, :internal_encoding
391
+ @encoding = grab settings, :encoding
391
392
  @row_xpath = grab settings, :row_xpath
392
393
  @column_xpath = grab settings, :column_xpath
393
394
  @row_css = grab settings, :row_css
394
395
  @column_css = grab settings, :column_css
395
396
  @glob = grab settings, :glob
396
397
  @filename = grab settings, :filename
397
- @transform_settings = grab settings, :transform_settings
398
398
  @cut = grab settings, :cut
399
399
  @crop = grab settings, :crop
400
400
  @schema = grab settings, :schema
401
401
  @schema_name = grab settings, :schema_name
402
402
  @pre_select = grab settings, :pre_select
403
403
  @pre_reject = grab settings, :pre_reject
404
- @errata_settings = grab settings, :errata_settings
404
+ @errata = grab settings, :errata
405
+ @parser = grab settings, :parser
405
406
 
406
407
  @other_options = settings
407
-
408
- @transformer = Transformer.new self
408
+
409
409
  @local_copy = LocalCopy.new self
410
410
  end
411
411
 
@@ -423,7 +423,7 @@ class RemoteTable
423
423
  else
424
424
  mark_download!
425
425
  memo = _each do |row|
426
- transformer.transform(row).each do |virtual_row|
426
+ parser.parse(row).each do |virtual_row|
427
427
  virtual_row.row_hash = ::HashDigest.hexdigest row
428
428
  if errata
429
429
  next if errata.rejects? virtual_row
@@ -447,7 +447,7 @@ class RemoteTable
447
447
 
448
448
  # @deprecated
449
449
  alias :each_row :each
450
-
450
+
451
451
  # @return [Array<Hash,Array>] All rows.
452
452
  def to_a
453
453
  if fully_cached?
@@ -459,7 +459,7 @@ class RemoteTable
459
459
 
460
460
  # @deprecated
461
461
  alias :rows :to_a
462
-
462
+
463
463
  # Get a row by row number. Zero-based.
464
464
  #
465
465
  # @return [Hash,Array]
@@ -470,33 +470,18 @@ class RemoteTable
470
470
  to_a[row_number]
471
471
  end
472
472
  end
473
-
473
+
474
474
  # Clear the row cache in case it helps your GC.
475
475
  #
476
476
  # @return [nil]
477
477
  def free
478
478
  @fully_cached = false
479
- @errata = nil
480
479
  cache.clear
481
480
  nil
482
481
  end
483
482
 
484
- # @private
485
- def errata
486
- @errata || @errata_mutex.synchronize do
487
- @errata ||= begin
488
- if defined?(::Errata) and errata_settings.is_a?(::Errata)
489
- ::Kernel.warn %{[remote_table] Passing :errata_settings as an Errata object is deprecated. Please pass a Hash of settings instead.}
490
- errata_settings
491
- elsif errata_settings.is_a?(::Hash)
492
- ::Errata.new errata_settings
493
- end
494
- end
495
- end
496
- end
497
-
498
483
  private
499
-
484
+
500
485
  def mark_download!
501
486
  @download_count_mutex.synchronize do
502
487
  @download_count += 1
@@ -504,12 +489,12 @@ class RemoteTable
504
489
  if warn_on_multiple_downloads and download_count > 1
505
490
  ::Kernel.warn "[remote_table] #{url} has been downloaded #{download_count} times."
506
491
  end
507
- end
508
-
492
+ end
493
+
509
494
  def fully_cached!
510
495
  @fully_cached = true
511
496
  end
512
-
497
+
513
498
  def fully_cached?
514
499
  !!@fully_cached
515
500
  end
@@ -9,7 +9,7 @@ class RemoteTable
9
9
 
10
10
  # @private
11
11
  def after_extend
12
- @parser_mutex = ::Mutex.new
12
+ @fixed_width_parser_mutex = ::Mutex.new
13
13
  @definition_mutex = ::Mutex.new
14
14
  end
15
15
 
@@ -23,7 +23,7 @@ class RemoteTable
23
23
  skip_rows!
24
24
  cut_columns!
25
25
 
26
- parser.parse[:rows].each do |row|
26
+ fixed_width_parser.parse[:rows].each do |row|
27
27
  some_value_present = false
28
28
  hash = ::ActiveSupport::OrderedHash.new
29
29
  row.each do |k, v|
@@ -43,9 +43,9 @@ class RemoteTable
43
43
 
44
44
  private
45
45
 
46
- def parser
47
- @parser || @parser_mutex.synchronize do
48
- @parser ||= begin
46
+ def fixed_width_parser
47
+ @fixed_width_parser || @fixed_width_parser_mutex.synchronize do
48
+ @fixed_width_parser ||= begin
49
49
  if ::FixedWidth::Section.private_instance_methods.map(&:to_sym).include?(:unpacker)
50
50
  raise ::RuntimeError, "[remote_table] You need to use exclusively the fixed_width-multibyte library https://github.com/seamusabshere/fixed_width"
51
51
  end