iron-import 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,119 +18,242 @@
18
18
  # end
19
19
  # end
20
20
  #
21
- # The row.all? call will verify that each row passed contains a value for all defined columns.
22
- #
23
21
  # A more realistic and complex example follows:
24
22
  #
25
- # importer = Importer.build do
23
+ # Importer.build do
24
+ # # Define our columns and settings
26
25
  # column :order_number do
27
- # match /order (num.*|id)/i
26
+ # header /order (num.*|id)/i
27
+ # type :int
28
28
  # end
29
- # column :date
30
- # column :amount
29
+ # column :date do
30
+ # type :date
31
+ # end
32
+ # column :amount do
33
+ # type :cents
34
+ # end
35
+ #
36
+ # # Filter out any rows missing an order number
37
+ # filter do |row|
38
+ # !row[:order_number].nil?
39
+ # end
40
+ #
41
+ # end.import('/path/to/file.csv', format: :csv) do |row|
42
+ # # Process each row as basically a hash of :column_key => value,
43
+ # # only called on import success
44
+ # Order.create(row.to_hash)
45
+ #
46
+ # end.on_error do
47
+ # # If we have any errors, do something
48
+ # raise error_summary
31
49
  # end
32
50
  #
33
51
  class Importer
34
52
 
35
- # Array of error message or nil for each non-header row
36
- attr_accessor :errors, :warnings
37
- attr_accessor :sheets
38
- attr_reader :data, :custom_reader
53
+ # Inner class for holding load-time data that gets reset on each load call
54
+ class Data
55
+ attr_accessor :start_row, :rows
56
+ def initialize
57
+ @start_row = nil
58
+ @rows = []
59
+ end
60
+ end
61
+
62
+ # Array of defined columns
63
+ attr_reader :columns
64
+ # Array of error messages collected during an import/process run
65
+ attr_accessor :errors
66
+ # Custom reader, if one has been defined using #on_file or #on_stream
67
+ attr_reader :custom_reader
68
+ # Set to the format selected during the most recent import
69
+ attr_reader :format
70
+ # Import data
71
+ attr_reader :data
72
+ # Missing headers post-import
73
+ attr_reader :missing_headers
74
+
75
+ # When true, skips header detection
76
+ dsl_flag :headerless
77
+ # Explicitly sets the row number (1-indexed) where data rows begin,
78
+ # usually left defaulted to nil to automatically start after the header
79
+ # row.
80
+ dsl_accessor :start_row
81
+ # Set to a block/lambda taking a parsed but unvalidated row as a hash,
82
+ # return true to keep, false to skip.
83
+ dsl_accessor :filter
39
84
  # Source file/stream encoding, assumes UTF-8 if none specified
40
85
  dsl_accessor :encoding
41
86
 
87
+ # Create a new importer! See #build for details on what to do
88
+ # in the block.
42
89
  def self.build(options = {}, &block)
43
90
  importer = Importer.new(options)
44
91
  importer.build(&block)
45
92
  importer
46
93
  end
47
94
 
95
+ # Ye standard constructor!
48
96
  def initialize(options = {})
97
+ @scopes = {}
49
98
  @encoding = 'UTF-8'
50
- @sheets = {}
99
+ @headerless = false
100
+
101
+ @filter = nil
102
+ @columns = []
51
103
 
52
104
  reset
53
105
  end
54
106
 
55
- # Takes a block, and sets self to be importer instance, so you can
56
- # just call #column, #sheet, etc. directly.
107
+ # Call to define the importer's column configuration and other setup options.
108
+ #
109
+ # The following builder options are available:
110
+ #
111
+ # importer = Importer.build do
112
+ # # Don't try to look for a header using column definitions, there is no header
113
+ # headerless!
114
+ #
115
+ # # Manually set the start row for data, defaults to nil
116
+ # # indicating that the data rows start immediately following the header.
117
+ # start_row 4
118
+ #
119
+ # # Define a filter that will skip unneeded rows. The filter command takes
120
+ # # a block that receives the parsed (but not validated!) row data as an
121
+ # # associative hash of :col_key => <parsed value>, and returns
122
+ # # true to keep the row or false to exclude it.
123
+ # filter do |row|
124
+ # row[:id].to_i > 5000
125
+ # end
126
+ #
127
+ # # If you need to process a type of input that isn't built in, define
128
+ # # a custom reader with #on_file or #on_stream
129
+ # on_file do |path|
130
+ # ... read file at path, return array of each row's raw column values ...
131
+ # end
132
+ #
133
+ # # Got a multi-block format like Excel or HTML? You can optionally limit
134
+ # # searching by setting a scope or scopes to search:
135
+ # scope :xls, 'Sheet 2'
136
+ # # Or set a bunch of scopes in one go:
137
+ # scopes :html => ['div > table.data', 'table.aux-data'],
138
+ # :xls => [2, 'Orders']
139
+ #
140
+ # # Of course, the main thing you're going to do is to define columns. See the
141
+ # # Column class' notes for options when defining a column. Note that
142
+ # # you can define columns using either hash-style:
143
+ # column :id, :type => :integer
144
+ # # or builder-style:
145
+ # column :name do
146
+ # header /company\s*name/i
147
+ # type :string
148
+ # end
149
+ # end
57
150
  def build(&block)
58
151
  DslProxy.exec(self, &block) if block
59
152
  self
60
153
  end
61
154
 
62
- # For the common case where there is only one "sheet", e.g. CSV files.
63
- def default_sheet(&block)
64
- sheet(1, true, &block)
65
- end
66
-
67
- # Access a Sheet definition by id (either number (1-N) or sheet name).
68
- # Used during #build calls to define a sheet with a passed block, like so:
155
+ # Add a new column definition to our list, allows customizing the new
156
+ # column with a builder block. See Importer::Column docs for
157
+ # options. In lieu of a builder mode, you can pass the same values
158
+ # as key => value pairs in the options hash to this method, so:
69
159
  #
70
- # Importer.build do
71
- # sheet(1) do
72
- # column :store_name
73
- # column :store_address
74
- # end
75
- # sheet('Orders') do
76
- # column :id
77
- # column :price
78
- # filter do |row|
79
- # row[:price].prensent?
80
- # end
160
+ # column(:foo) do
161
+ # type :string
162
+ # parse do |val|
163
+ # val.to_s.upcase
81
164
  # end
82
165
  # end
83
- def sheet(id, create=true, &block)
84
- # Find the sheet, creating it if needed (and requested!)
85
- if @sheets[id].nil?
86
- if create
87
- @sheets[id] = Sheet.new(self, id)
88
- else
89
- return nil
90
- end
166
+ #
167
+ # Is equivalent to:
168
+ #
169
+ # column(:foo, :type => :string, :parse => lambda {|val| val.to_s.upcase})
170
+ #
171
+ # Use whichever you prefer!
172
+ def column(key, options_hash = {}, &block)
173
+ # Find existing column with key to allow re-opening an existing definition
174
+ col = @columns.detect {|c| c.key == key }
175
+ unless col
176
+ # if none found, add a new one
177
+ col = Column.new(self, key, options_hash)
178
+ @columns << col
91
179
  end
92
- sheet = @sheets[id]
93
180
 
94
- # Allow customization by DSL block if requested
95
- sheet.build(&block) if block
181
+ # Customize if needed
182
+ DslProxy::exec(col, &block) if block
96
183
 
97
- # Return the sheet
98
- sheet
184
+ col
185
+ end
186
+
187
+ # Limit the search scope for a single format (:xls, :xlsx, :html, :custom)
188
+ # to the given value or values - the meaning and format of scopes is determined
189
+ # by that format's data reader.
190
+ def scope(format, *scopes)
191
+ @scopes[format] = scopes.flatten
192
+ end
193
+
194
+ # Limit the search scope for more than one format at a time. For example, if
195
+ # you support both XLS and XLSX formats (and why wouldn't you?) then you
196
+ # could tell the importer to look only at the sheets named "Orders" and
197
+ # "Legacy Orders" like so:
198
+ #
199
+ # scopes :xls => ['Orders', 'Legacy Orders'],
200
+ # :xlsx => ['Orders', 'Legacy Orders']
201
+ #
202
+ def scopes(map = :__read__)
203
+ if map == :__read__
204
+ return @scopes
205
+ else
206
+ map.each_pair do |format, scope|
207
+ scope(format, scope)
208
+ end
209
+ end
99
210
  end
100
211
 
101
- # Define a custom file reader to implement your own sheet parsing.
212
+ # Define a custom file reader to implement your own parsing. Pass
213
+ # a block accepting a file path, and returning an array of arrays (rows of
214
+ # raw column values). Use #add_error(msg) to add a reading error.
215
+ #
216
+ # Adding a custom reader will change the importer's default
217
+ # format to :custom, though you can override it when calling #import as
218
+ # usual.
219
+ #
220
+ # Only one of #on_file or #on_stream needs to be implemented - the importer
221
+ # will cross convert as needed!
222
+ #
223
+ # Example:
224
+ #
225
+ # on_file do |path|
226
+ # # Read a file line by line
227
+ # File.readlines(path).collect do |line|
228
+ # # Each line has colon-separated values, so split 'em up
229
+ # line.split(/\s*:\s*/)
230
+ # end
231
+ # end
232
+ #
102
233
  def on_file(&block)
103
234
  @custom_reader = CustomReader.new(self) unless @custom_reader
104
235
  @custom_reader.set_reader(:file, block)
105
236
  end
106
237
 
238
+ # Just like #on_file, but for streams. Pass
239
+ # a block accepting a stream, and returning an array of arrays (rows of
240
+ # raw column values). Use #add_error(msg) to add a reading error.
241
+ #
242
+ # Example:
243
+ #
244
+ # on_stream do |stream|
245
+ # # Stream contains rows separated by a | char
246
+ # stream.readlines('|').collect do |line|
247
+ # # Each line has 3 fields of 10 characters each
248
+ # [line[0...10], line[10...20], line[20...30]]
249
+ # end
250
+ # end
251
+ #
107
252
  def on_stream(&block)
108
253
  @custom_reader = CustomReader.new(self) unless @custom_reader
109
254
  @custom_reader.set_reader(:stream, block)
110
255
  end
111
256
 
112
- # Very, very commonly we only want to deal with the default sheet. In this case,
113
- # let folks skip the sheet(n) do ... end block wrapper and just define columns
114
- # against the main importer. Internally, proxy those calls to the first sheet.
115
- def column(*args, &block)
116
- default_sheet.column(*args, &block)
117
- end
118
-
119
- # Ditto for filters
120
- def filter(*args, &block)
121
- default_sheet.filter(*args, &block)
122
- end
123
-
124
- # Ditto for start row too
125
- def start_row(row_num)
126
- default_sheet.start_row(row_num)
127
- end
128
-
129
- # More facading
130
- def headerless!
131
- default_sheet.headerless!
132
- end
133
-
134
257
  # First call to a freshly #build'd importer, this will read the file/stream/path supplied,
135
258
  # validate the required values, run custom validations... basically pre-parse and
136
259
  # massage the supplied data. It will return true on success, or false if one
@@ -139,87 +262,293 @@ class Importer
139
262
  # You may supply various options for the import using the options hash. Supported
140
263
  # options include:
141
264
  #
142
- # format: one of :auto, :csv, :xls, :xlsx, defaults to :auto, forces treating the supplied
143
- # source as the specified format, or auto-detects if set to :auto
265
+ # format: one of :auto, :csv, :html, :xls, :xlsx, defaults to :auto, forces treating the supplied
266
+ # source as the specified format, or attempts to auto-detect if set to :auto
267
+ # scope: specify the search scope for the data/format, overriding any scope set with #scope
144
268
  # encoding: source encoding override, defaults to guessing based on input
145
269
  #
146
- # Generally, you should be able to throw a source at it and it should work. The
270
+ # Generally, you should be able to throw a path or stream at it and it should work. The
147
271
  # options exist to allow overriding in cases where the automation heuristics
148
272
  # have failed and the input type is known by the caller.
149
273
  #
274
+ # If you're trying to import from a raw string, use Importer#import_string instead.
275
+ #
150
276
  # After #import has completed successfully, you can process the resulting data
151
- # using #process or extract the raw data by calling #to_hash or #sheet(num).to_a
152
- def import(path_or_stream, options = {})
277
+ # using #process or extract the raw data by calling #to_a to get an array of row hashes
278
+ #
279
+ # Note that as of version 0.7.0, there is a more compact operation mode enabled by passing
280
+ # a block to this call:
281
+ #
282
+ # importer.import(...) do |row|
283
+ # # Process each row here
284
+ # end
285
+ #
286
+ # In this mode, the block is called with each row as in #process, conditionally on no
287
+ # errors. In addition, when a block is passed, true/false is not returned (as the
288
+ # block is already conditionally called). Instead, it will return the importer to allow
289
+ # chaining to #on_error or other calls.
290
+ def import(path_or_stream, options = {}, &block)
153
291
  # Clear all our load-time state, including all rows, header locations... you name it
154
292
  reset
155
293
 
156
294
  # Get the reader for this format
157
295
  default = @custom_reader ? :custom : :auto
158
- format = options.delete(:format) { default }
159
- if format == :custom
296
+ @format = options.delete(:format) { default }
297
+ if @format == :custom
160
298
  # Custom format selected, use our internal custom reader
161
- @data = @custom_reader
299
+ @reader = @custom_reader
162
300
 
163
- elsif format && format != :auto
301
+ elsif @format && @format != :auto
164
302
  # Explicit format requested
165
- @data = DataReader::for_format(self, format)
166
- unless @data
167
- add_error("Unable to find format handler for format #{format} - aborting")
168
- return
169
- end
303
+ @reader = DataReader::for_format(self, @format)
170
304
 
171
305
  else
172
306
  # Auto select
173
- @data = DataReader::for_source(self, path_or_stream)
307
+ @reader = DataReader::for_source(self, path_or_stream)
308
+ @format = @reader.format
309
+ end
310
+
311
+ # Verify we got one
312
+ unless @reader
313
+ add_error("Unable to find format handler for format :#{format} on import of #{path_or_stream.class.name} source - aborting")
314
+ return
315
+ end
316
+
317
+ # What scopes (if any) should we limit our searching to?
318
+ scopes = options.delete(:scope) { @scopes[@format] }
319
+ if scopes && !scopes.is_a?(Array)
320
+ scopes = [scopes]
174
321
  end
175
322
 
176
323
  # Read in the data!
177
- @data.load(path_or_stream)
324
+ @reader.load(path_or_stream, scopes) do |raw_rows|
325
+ # Find our column layout, start of data, etc
326
+ if find_header(raw_rows)
327
+ # Now, run all the data and add it as a Row instance
328
+ raw_rows.each_with_index do |raw, index|
329
+ row_num = index + 1
330
+ if row_num >= @data.start_row
331
+ add_row(row_num, raw)
332
+ end
333
+ end
334
+ # We've found a workable sheet/table/whatever, stop looking
335
+ true
336
+
337
+ else
338
+ # This sheet/table/whatever didn't have the needed header, try
339
+ # the next one (if any)
340
+ false
341
+ end
342
+ end
343
+
344
+ # If we have any missing headers, note that fact
345
+ if @missing_headers && @missing_headers.count > 0
346
+ add_error("Unable to locate required column header for column(s): " + @missing_headers.collect{|c| ":#{c}"}.list_join(', '))
347
+ end
348
+
349
+ # If we're here with no errors, we rule!
350
+ success = !has_errors?
351
+
352
+ if block
353
+ # New way, if block is passed, process it on success
354
+ process(&block) if success
355
+ self
356
+ else
357
+ # Old way, return result
358
+ success
359
+ end
178
360
  end
179
361
 
180
- # Process a specific sheet, or the default sheet if none is provided. Your
181
- # passed block will be handed one Row at a time.
182
- def process(sheet_id = nil, &block)
183
- s = sheet(sheet_id, false) || default_sheet
184
- s.process(&block)
362
+ # Use this form of import for the common case of having a raw CSV or HTML string.
363
+ def import_string(string, options = {}, &block)
364
+ # Get a format here if needed
365
+ if options[:format].nil?
366
+ if @custom_reader
367
+ format = :custom
368
+ else
369
+ format = string.include?('<table') && string.include?('</tr>') ? :html : :csv
370
+ end
371
+ options[:format] = format
372
+ end
373
+
374
+ # Do the import, converting the string to a stream
375
+ import(StringIO.new(string), options, &block)
376
+ end
377
+
378
+ # Call with a block accepting a single Importer::Row with contents that
379
+ # look like :column_key => <parsed value>. Any filtered rows
380
+ # will not be present. If you want to register an error, simply
381
+ # raise "some text" and it will be added to the importer's error
382
+ # list for display to the user, logging, or whatever.
383
+ def process
384
+ @data.rows.each do |row|
385
+ begin
386
+ yield row
387
+ rescue Exception => e
388
+ add_error(row, e.to_s)
389
+ end
390
+ end
185
391
  end
186
392
 
187
- def add_error(context, msg = nil)
188
- if context.is_a?(String) && msg.nil?
189
- msg = context
190
- context = nil
393
+ def on_error(&block)
394
+ raise 'Invalid block passed to Importer#on_error: block may accept 0, 1 or 2 arguments' if block.arity > 2
395
+
396
+ if has_errors?
397
+ case block.arity
398
+ when 0 then DslProxy.exec(self, &block)
399
+ when 1 then DslProxy.exec(self, @errors, &block)
400
+ when 2 then DslProxy.exec(self, @errors, error_summary, &block)
401
+ end
191
402
  end
192
- @errors << Error.new(context, msg)
403
+
404
+ self
193
405
  end
194
406
 
407
+ # Process the raw values for the first rows in a sheet,
408
+ # and attempt to build a map of the column layout, and
409
+ # detect the first row of real data
410
+ def find_header(raw_rows)
411
+ if headerless?
412
+ # Use implicit or explicit column position when told to not look for a header
413
+ next_index = 0
414
+ @columns.each do |col|
415
+ unless col.position.nil?
416
+ next_index = col.fixed_index
417
+ end
418
+ col.data.index = next_index
419
+ next_index += 1
420
+ end
421
+ @data.start_row = @start_row || 1
422
+ @missing_headers = nil
423
+ return true
424
+
425
+ else
426
+ # Match by testing
427
+ missing = nil
428
+ raw_rows.each_with_index do |row, i|
429
+ # Um, have data?
430
+ next unless row
431
+
432
+ # Set up for this iteration
433
+ remaining = @columns.dup
434
+
435
+ # Step through this row's raw values, and look for a matching column for all columns
436
+ row.each_with_index do |val, i|
437
+ col = remaining.detect {|c| c.match_header?(val.to_s, i) }
438
+ if col
439
+ remaining -= [col]
440
+ col.data.index = i
441
+ end
442
+ end
443
+
444
+ if remaining.empty?
445
+ # Found all columns, have a map, update our start row to be the next line and return!
446
+ @data.start_row = @start_row || i+2
447
+ @missing_headers = nil
448
+ return true
449
+ else
450
+ missing = remaining if (missing.nil? || missing.count > remaining.count)
451
+ end
452
+ end
453
+
454
+ # If we get here, we're hosed
455
+ @missing_headers = missing.collect(&:key) if @missing_headers.nil? || @missing_headers.count > missing.count
456
+ false
457
+ end
458
+ end
459
+
460
+ # Add a new row to our stash, parsing/filtering/validating as we go!
461
+ def add_row(line, raw_data)
462
+ # Gracefully handle custom parsers that return nil for a row's data
463
+ raw_data ||= []
464
+ # Add the row
465
+ row = Row.new(self, line)
466
+
467
+ # Parse out the values
468
+ values = {}
469
+ @columns.each do |col|
470
+ index = col.data.index
471
+ raw_val = raw_data[index]
472
+ if col.parse
473
+ # Use custom parser if this column has one
474
+ val = col.parse_value(row, raw_val)
475
+ else
476
+ # Otherwise use our standard parser
477
+ val = @reader.parse_value(raw_val, col.type)
478
+ end
479
+ values[col.key] = val
480
+ end
481
+
482
+ # Set the values and filter if needed
483
+ row.set_values(values)
484
+ return nil if @filter && !@filter.call(row)
485
+
486
+ # Row is desired, now validate values
487
+ @columns.each do |col|
488
+ val = values[col.key]
489
+ col.validate_value(row, val)
490
+ end
491
+
492
+ # We is good
493
+ @data.rows << row
494
+ row
495
+ end
496
+
497
+ # When true, one or more errors have been recorded during this import/process
498
+ # cycle.
195
499
  def has_errors?
196
500
  @errors.any?
197
501
  end
198
502
 
199
- def add_warning(context, msg)
503
+ # Add an error to our error list. Will result in a failed import.
504
+ def add_error(context, msg = nil)
200
505
  if context.is_a?(String) && msg.nil?
201
506
  msg = context
202
507
  context = nil
203
508
  end
204
- @warnings << Error.new(context, msg)
205
- end
206
-
207
- def has_warnings?
208
- @warnings.any?
509
+ @errors << Error.new(context, msg)
209
510
  end
210
511
 
211
- # Returns a human-readable summary of the errors present on the importer
512
+ # Returns a human-readable summary of the errors present on the importer, or
513
+ # nil if no errors are present
212
514
  def error_summary
515
+ # Simple case
213
516
  return nil unless has_errors?
214
- @errors.collect(&:summary).list_join(', ')
517
+
518
+ # Group by error text - we often get the same error dozens of times
519
+ list = {}
520
+ @errors.each do |err|
521
+ errs = list[err.text] || []
522
+ errs << err
523
+ list[err.text] = errs
524
+ end
525
+
526
+ # Build summary & return
527
+ list.values.collect do |errs|
528
+ summary = errs.first.summary
529
+ if errs.count == 1
530
+ summary
531
+ else
532
+ errs.count.to_s + ' x ' + summary
533
+ end
534
+ end.list_join(', ')
535
+ end
536
+
537
+ # After calling #import, you can dump the final values for each row
538
+ # as an array of hashes. Useful in debugging! For general processing,
539
+ # use #process or the block form of #import instead.
540
+ def to_a
541
+ @data.rows.collect(&:values)
215
542
  end
216
543
 
217
544
  protected
218
545
 
219
546
  def reset
220
547
  @errors = []
221
- @warnings = []
222
- @sheets.values.each(&:reset)
548
+ @missing_headers = nil
549
+ @format = nil
550
+ @reader = nil
551
+ @data = Data.new
223
552
  end
224
553
 
225
554
  end
@@ -2,12 +2,12 @@ class Importer
2
2
 
3
3
  class Row
4
4
 
5
- attr_reader :sheet, :line, :values
5
+ attr_reader :line, :values
6
6
 
7
- def initialize(sheet, line, value_hash = nil)
8
- @sheet = sheet
7
+ def initialize(importer, line, value_hash = nil)
8
+ @importer = importer
9
9
  @line = line
10
- @values = value_hash
10
+ set_values(value_hash)
11
11
  end
12
12
 
13
13
  def set_values(value_hash)
@@ -15,8 +15,9 @@ class Importer
15
15
  end
16
16
 
17
17
  # True when all columns have a non-nil value, useful in filtering out junk
18
- # rows
18
+ # rows. Pass in one or more keys to check only those keys for presence.
19
19
  def all?(*keys)
20
+ keys.flatten!
20
21
  if keys.any?
21
22
  # Check only the specified keys
22
23
  valid = true
@@ -33,25 +34,28 @@ class Importer
33
34
  end
34
35
  end
35
36
 
37
+ # True when all row columns have nil values.
36
38
  def empty?
37
39
  @values.values.all?(&:nil?)
38
40
  end
39
41
 
40
- # Returns the value of a column
42
+ # Returns the value of a column.
41
43
  def [](column_key)
42
44
  @values[column_key]
43
45
  end
44
-
46
+
47
+ # The row's name, e.g. 'Row 4'
45
48
  def to_s
46
49
  "Row #{@line}"
47
50
  end
48
51
 
49
- def add_error(msg)
50
- @sheet.importer.add_error(self, msg)
52
+ # This row's values as a hash of :column_key => <parsed + validated value>
53
+ def to_hash
54
+ @values.dup
51
55
  end
52
56
 
53
- def add_warning(msg)
54
- @sheet.importer.add_warning(self, msg)
57
+ def add_error(msg)
58
+ @importer.add_error(self, msg)
55
59
  end
56
60
 
57
61
  end