iron-import 0.6.1 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -18,119 +18,242 @@
18
18
  # end
19
19
  # end
20
20
  #
21
- # The row.all? call will verify that each row passed contains a value for all defined columns.
22
- #
23
21
  # A more realistic and complex example follows:
24
22
  #
25
- # importer = Importer.build do
23
+ # Importer.build do
24
+ # # Define our columns and settings
26
25
  # column :order_number do
27
- # match /order (num.*|id)/i
26
+ # header /order (num.*|id)/i
27
+ # type :int
28
28
  # end
29
- # column :date
30
- # column :amount
29
+ # column :date do
30
+ # type :date
31
+ # end
32
+ # column :amount do
33
+ # type :cents
34
+ # end
35
+ #
36
+ # # Filter out any rows missing an order number
37
+ # filter do |row|
38
+ # !row[:order_number].nil?
39
+ # end
40
+ #
41
+ # end.import('/path/to/file.csv', format: :csv) do |row|
42
+ # # Process each row as basically a hash of :column_key => value,
43
+ # # only called on import success
44
+ # Order.create(row.to_hash)
45
+ #
46
+ # end.on_error do
47
+ # # If we have any errors, do something
48
+ # raise error_summary
31
49
  # end
32
50
  #
33
51
  class Importer
34
52
 
35
- # Array of error message or nil for each non-header row
36
- attr_accessor :errors, :warnings
37
- attr_accessor :sheets
38
- attr_reader :data, :custom_reader
53
+ # Inner class for holding load-time data that gets reset on each load call
54
+ class Data
55
+ attr_accessor :start_row, :rows
56
+ def initialize
57
+ @start_row = nil
58
+ @rows = []
59
+ end
60
+ end
61
+
62
+ # Array of defined columns
63
+ attr_reader :columns
64
+ # Array of error messages collected during an import/process run
65
+ attr_accessor :errors
66
+ # Custom reader, if one has been defined using #on_file or #on_stream
67
+ attr_reader :custom_reader
68
+ # Set to the format selected during the most recent import
69
+ attr_reader :format
70
+ # Import data
71
+ attr_reader :data
72
+ # Missing headers post-import
73
+ attr_reader :missing_headers
74
+
75
+ # When true, skips header detection
76
+ dsl_flag :headerless
77
+ # Explicitly sets the row number (1-indexed) where data rows begin,
78
+ # usually left defaulted to nil to automatically start after the header
79
+ # row.
80
+ dsl_accessor :start_row
81
+ # Set to a block/lambda taking a parsed but unvalidated row as a hash,
82
+ # return true to keep, false to skip.
83
+ dsl_accessor :filter
39
84
  # Source file/stream encoding, assumes UTF-8 if none specified
40
85
  dsl_accessor :encoding
41
86
 
87
+ # Create a new importer! See #build for details on what to do
88
+ # in the block.
42
89
  def self.build(options = {}, &block)
43
90
  importer = Importer.new(options)
44
91
  importer.build(&block)
45
92
  importer
46
93
  end
47
94
 
95
+ # Ye standard constructor!
48
96
  def initialize(options = {})
97
+ @scopes = {}
49
98
  @encoding = 'UTF-8'
50
- @sheets = {}
99
+ @headerless = false
100
+
101
+ @filter = nil
102
+ @columns = []
51
103
 
52
104
  reset
53
105
  end
54
106
 
55
- # Takes a block, and sets self to be importer instance, so you can
56
- # just call #column, #sheet, etc. directly.
107
+ # Call to define the importer's column configuration and other setup options.
108
+ #
109
+ # The following builder options are available:
110
+ #
111
+ # importer = Importer.build do
112
+ # # Don't try to look for a header using column definitions, there is no header
113
+ # headerless!
114
+ #
115
+ # # Manually set the start row for data, defaults to nil
116
+ # # indicating that the data rows start immediately following the header.
117
+ # start_row 4
118
+ #
119
+ # # Define a filter that will skip unneeded rows. The filter command takes
120
+ # # a block that receives the parsed (but not validated!) row data as an
121
+ # # associative hash of :col_key => <parsed value>, and returns
122
+ # # true to keep the row or false to exclude it.
123
+ # filter do |row|
124
+ # row[:id].to_i > 5000
125
+ # end
126
+ #
127
+ # # If you need to process a type of input that isn't built in, define
128
+ # # a custom reader with #on_file or #on_stream
129
+ # on_file do |path|
130
+ # ... read file at path, return array of each row's raw column values ...
131
+ # end
132
+ #
133
+ # # Got a multi-block format like Excel or HTML? You can optionally limit
134
+ # # searching by setting a scope or scopes to search:
135
+ # scope :xls, 'Sheet 2'
136
+ # # Or set a bunch of scopes in one go:
137
+ # scopes :html => ['div > table.data', 'table.aux-data'],
138
+ # :xls => [2, 'Orders']
139
+ #
140
+ # # Of course, the main thing you're going to do is to define columns. See the
141
+ # # Column class' notes for options when defining a column. Note that
142
+ # # you can define columns using either hash-style:
143
+ # column :id, :type => :integer
144
+ # # or builder-style:
145
+ # column :name do
146
+ # header /company\s*name/i
147
+ # type :string
148
+ # end
149
+ # end
57
150
  def build(&block)
58
151
  DslProxy.exec(self, &block) if block
59
152
  self
60
153
  end
61
154
 
62
- # For the common case where there is only one "sheet", e.g. CSV files.
63
- def default_sheet(&block)
64
- sheet(1, true, &block)
65
- end
66
-
67
- # Access a Sheet definition by id (either number (1-N) or sheet name).
68
- # Used during #build calls to define a sheet with a passed block, like so:
155
+ # Add a new column definition to our list, allows customizing the new
156
+ # column with a builder block. See Importer::Column docs for
157
+ # options. In lieu of a builder mode, you can pass the same values
158
+ # as key => value pairs in the options hash to this method, so:
69
159
  #
70
- # Importer.build do
71
- # sheet(1) do
72
- # column :store_name
73
- # column :store_address
74
- # end
75
- # sheet('Orders') do
76
- # column :id
77
- # column :price
78
- # filter do |row|
79
- # row[:price].prensent?
80
- # end
160
+ # column(:foo) do
161
+ # type :string
162
+ # parse do |val|
163
+ # val.to_s.upcase
81
164
  # end
82
165
  # end
83
- def sheet(id, create=true, &block)
84
- # Find the sheet, creating it if needed (and requested!)
85
- if @sheets[id].nil?
86
- if create
87
- @sheets[id] = Sheet.new(self, id)
88
- else
89
- return nil
90
- end
166
+ #
167
+ # Is equivalent to:
168
+ #
169
+ # column(:foo, :type => :string, :parse => lambda {|val| val.to_s.upcase})
170
+ #
171
+ # Use whichever you prefer!
172
+ def column(key, options_hash = {}, &block)
173
+ # Find existing column with key to allow re-opening an existing definition
174
+ col = @columns.detect {|c| c.key == key }
175
+ unless col
176
+ # if none found, add a new one
177
+ col = Column.new(self, key, options_hash)
178
+ @columns << col
91
179
  end
92
- sheet = @sheets[id]
93
180
 
94
- # Allow customization by DSL block if requested
95
- sheet.build(&block) if block
181
+ # Customize if needed
182
+ DslProxy::exec(col, &block) if block
96
183
 
97
- # Return the sheet
98
- sheet
184
+ col
185
+ end
186
+
187
+ # Limit the search scope for a single format (:xls, :xlsx, :html, :custom)
188
+ # to the given value or values - the meaning and format of scopes is determined
189
+ # by that format's data reader.
190
+ def scope(format, *scopes)
191
+ @scopes[format] = scopes.flatten
192
+ end
193
+
194
+ # Limit the search scope for more than one format at a time. For example, if
195
+ # you support both XLS and XLSX formats (and why wouldn't you?) then you
196
+ # could tell the importer to look only at the sheets named "Orders" and
197
+ # "Legacy Orders" like so:
198
+ #
199
+ # scopes :xls => ['Orders', 'Legacy Orders'],
200
+ # :xlsx => ['Orders', 'Legacy Orders']
201
+ #
202
+ def scopes(map = :__read__)
203
+ if map == :__read__
204
+ return @scopes
205
+ else
206
+ map.each_pair do |format, scope|
207
+ scope(format, scope)
208
+ end
209
+ end
99
210
  end
100
211
 
101
- # Define a custom file reader to implement your own sheet parsing.
212
+ # Define a custom file reader to implement your own parsing. Pass
213
+ # a block accepting a file path, and returning an array of arrays (rows of
214
+ # raw column values). Use #add_error(msg) to add a reading error.
215
+ #
216
+ # Adding a custom stream parser will change the importer's default
217
+ # format to :custom, though you can override it when calling #import as
218
+ # usual.
219
+ #
220
+ # Only one of #on_file or #on_stream needs to be implemented - the importer
221
+ # will cross convert as needed!
222
+ #
223
+ # Example:
224
+ #
225
+ # on_file do |path|
226
+ # # Read a file line by line
227
+ # File.readlines(path).collect do |line|
228
+ # # Each line has colon-separated values, so split 'em up
229
+ # line.split(/\s*:\s*/)
230
+ # end
231
+ # end
232
+ #
102
233
  def on_file(&block)
103
234
  @custom_reader = CustomReader.new(self) unless @custom_reader
104
235
  @custom_reader.set_reader(:file, block)
105
236
  end
106
237
 
238
+ # Just like #on_file, but for streams. Pass
239
+ # a block accepting a stream, and returning an array of arrays (rows of
240
+ # raw column values). Use #add_error(msg) to add a reading error.
241
+ #
242
+ # Example:
243
+ #
244
+ # on_stream do |stream|
245
+ # # Stream contains rows separated by a | char
246
+ # stream.readlines('|').collect do |line|
247
+ # # Each line has 3 fields of 10 characters each
248
+ # [line[0...10], line[10...20], line[20...30]]
249
+ # end
250
+ # end
251
+ #
107
252
  def on_stream(&block)
108
253
  @custom_reader = CustomReader.new(self) unless @custom_reader
109
254
  @custom_reader.set_reader(:stream, block)
110
255
  end
111
256
 
112
- # Very, very commonly we only want to deal with the default sheet. In this case,
113
- # let folks skip the sheet(n) do ... end block wrapper and just define columns
114
- # against the main importer. Internally, proxy those calls to the first sheet.
115
- def column(*args, &block)
116
- default_sheet.column(*args, &block)
117
- end
118
-
119
- # Ditto for filters
120
- def filter(*args, &block)
121
- default_sheet.filter(*args, &block)
122
- end
123
-
124
- # Ditto for start row too
125
- def start_row(row_num)
126
- default_sheet.start_row(row_num)
127
- end
128
-
129
- # More facading
130
- def headerless!
131
- default_sheet.headerless!
132
- end
133
-
134
257
  # First call to a freshly #build'd importer, this will read the file/stream/path supplied,
135
258
  # validate the required values, run custom validations... basically pre-parse and
136
259
  # massage the supplied data. It will return true on success, or false if one
@@ -139,87 +262,293 @@ class Importer
139
262
  # You may supply various options for the import using the options hash. Supported
140
263
  # options include:
141
264
  #
142
- # format: one of :auto, :csv, :xls, :xlsx, defaults to :auto, forces treating the supplied
143
- # source as the specified format, or auto-detects if set to :auto
265
+ # format: one of :auto, :csv, :html, :xls, :xlsx, defaults to :auto, forces treating the supplied
266
+ # source as the specified format, or attempts to auto-detect if set to :auto
267
+ # scope: specify the search scope for the data/format, overriding any scope set with #scope
144
268
  # encoding: source encoding override, defaults to guessing based on input
145
269
  #
146
- # Generally, you should be able to throw a source at it and it should work. The
270
+ # Generally, you should be able to throw a path or stream at it and it should work. The
147
271
  # options exist to allow overriding in cases where the automation heuristics
148
272
  # have failed and the input type is known by the caller.
149
273
  #
274
+ # If you're trying to import from a raw string, use Importer#import_string instead.
275
+ #
150
276
  # After #import has completed successfully, you can process the resulting data
151
- # using #process or extract the raw data by calling #to_hash or #sheet(num).to_a
152
- def import(path_or_stream, options = {})
277
+ # using #process or extract the raw data by calling #to_a to get an array of row hashes
278
+ #
279
+ # Note that as of version 0.7.0, there is a more compact operation mode enabled by passing
280
+ # a block to this call:
281
+ #
282
+ # importer.import(...) do |row|
283
+ # # Process each row here
284
+ # end
285
+ #
286
+ # In this mode, the block is called with each row as in #process, conditionally on no
287
+ # errors. In addition, when a block is passed, true/false is not returned (as the
288
+ # block is already conditionally called). Instead, it will return the importer to allow
289
+ # chaining to #on_error or other calls.
290
+ def import(path_or_stream, options = {}, &block)
153
291
  # Clear all our load-time state, including all rows, header locations... you name it
154
292
  reset
155
293
 
156
294
  # Get the reader for this format
157
295
  default = @custom_reader ? :custom : :auto
158
- format = options.delete(:format) { default }
159
- if format == :custom
296
+ @format = options.delete(:format) { default }
297
+ if @format == :custom
160
298
  # Custom format selected, use our internal custom reader
161
- @data = @custom_reader
299
+ @reader = @custom_reader
162
300
 
163
- elsif format && format != :auto
301
+ elsif @format && @format != :auto
164
302
  # Explicit format requested
165
- @data = DataReader::for_format(self, format)
166
- unless @data
167
- add_error("Unable to find format handler for format #{format} - aborting")
168
- return
169
- end
303
+ @reader = DataReader::for_format(self, @format)
170
304
 
171
305
  else
172
306
  # Auto select
173
- @data = DataReader::for_source(self, path_or_stream)
307
+ @reader = DataReader::for_source(self, path_or_stream)
308
+ @format = @reader.format
309
+ end
310
+
311
+ # Verify we got one
312
+ unless @reader
313
+ add_error("Unable to find format handler for format :#{format} on import of #{path_or_stream.class.name} source - aborting")
314
+ return
315
+ end
316
+
317
+ # What scopes (if any) should we limit our searching to?
318
+ scopes = options.delete(:scope) { @scopes[@format] }
319
+ if scopes && !scopes.is_a?(Array)
320
+ scopes = [scopes]
174
321
  end
175
322
 
176
323
  # Read in the data!
177
- @data.load(path_or_stream)
324
+ @reader.load(path_or_stream, scopes) do |raw_rows|
325
+ # Find our column layout, start of data, etc
326
+ if find_header(raw_rows)
327
+ # Now, run all the data and add it as a Row instance
328
+ raw_rows.each_with_index do |raw, index|
329
+ row_num = index + 1
330
+ if row_num >= @data.start_row
331
+ add_row(row_num, raw)
332
+ end
333
+ end
334
+ # We've found a workable sheet/table/whatever, stop looking
335
+ true
336
+
337
+ else
338
+ # This sheet/table/whatever didn't have the needed header, try
339
+ # the next one (if any)
340
+ false
341
+ end
342
+ end
343
+
344
+ # If we have any missing headers, note that fact
345
+ if @missing_headers && @missing_headers.count > 0
346
+ add_error("Unable to locate required column header for column(s): " + @missing_headers.collect{|c| ":#{c}"}.list_join(', '))
347
+ end
348
+
349
+ # If we're here with no errors, we rule!
350
+ success = !has_errors?
351
+
352
+ if block
353
+ # New way, if block is passed, process it on success
354
+ process(&block) if success
355
+ self
356
+ else
357
+ # Old way, return result
358
+ success
359
+ end
178
360
  end
179
361
 
180
- # Process a specific sheet, or the default sheet if none is provided. Your
181
- # passed block will be handed one Row at a time.
182
- def process(sheet_id = nil, &block)
183
- s = sheet(sheet_id, false) || default_sheet
184
- s.process(&block)
362
+ # Use this form of import for the common case of having a raw CSV or HTML string.
363
+ def import_string(string, options = {}, &block)
364
+ # Get a format here if needed
365
+ if options[:format].nil?
366
+ if @custom_reader
367
+ format = :custom
368
+ else
369
+ format = string.include?('<table') && string.include?('</tr>') ? :html : :csv
370
+ end
371
+ options[:format] = format
372
+ end
373
+
374
+ # Do the import, converting the string to a stream
375
+ import(StringIO.new(string), options, &block)
376
+ end
377
+
378
+ # Call with a block accepting a single Importer::Row with contents that
379
+ # look like :column_key => <parsed value>. Any filtered rows
380
+ # will not be present. If you want to register an error, simply
381
+ # raise "some text" and it will be added to the importer's error
382
+ # list for display to the user, logging, or whatever.
383
+ def process
384
+ @data.rows.each do |row|
385
+ begin
386
+ yield row
387
+ rescue Exception => e
388
+ add_error(row, e.to_s)
389
+ end
390
+ end
185
391
  end
186
392
 
187
- def add_error(context, msg = nil)
188
- if context.is_a?(String) && msg.nil?
189
- msg = context
190
- context = nil
393
+ def on_error(&block)
394
+ raise 'Invalid block passed to Importer#on_error: block may accept 0, 1 or 2 arguments' if block.arity > 2
395
+
396
+ if has_errors?
397
+ case block.arity
398
+ when 0 then DslProxy.exec(self, &block)
399
+ when 1 then DslProxy.exec(self, @errors, &block)
400
+ when 2 then DslProxy.exec(self, @errors, error_summary, &block)
401
+ end
191
402
  end
192
- @errors << Error.new(context, msg)
403
+
404
+ self
193
405
  end
194
406
 
407
+ # Process the raw values for the first rows in a sheet,
408
+ # and attempt to build a map of the column layout, and
409
+ # detect the first row of real data
410
+ def find_header(raw_rows)
411
+ if headerless?
412
+ # Use implicit or explicit column position when told to not look for a header
413
+ next_index = 0
414
+ @columns.each do |col|
415
+ unless col.position.nil?
416
+ next_index = col.fixed_index
417
+ end
418
+ col.data.index = next_index
419
+ next_index += 1
420
+ end
421
+ @data.start_row = @start_row || 1
422
+ @missing_headers = nil
423
+ return true
424
+
425
+ else
426
+ # Match by testing
427
+ missing = nil
428
+ raw_rows.each_with_index do |row, i|
429
+ # Um, have data?
430
+ next unless row
431
+
432
+ # Set up for this iteration
433
+ remaining = @columns.dup
434
+
435
+ # Step through this row's raw values, and look for a matching column for all columns
436
+ row.each_with_index do |val, i|
437
+ col = remaining.detect {|c| c.match_header?(val.to_s, i) }
438
+ if col
439
+ remaining -= [col]
440
+ col.data.index = i
441
+ end
442
+ end
443
+
444
+ if remaining.empty?
445
+ # Found all columns, have a map, update our start row to be the next line and return!
446
+ @data.start_row = @start_row || i+2
447
+ @missing_headers = nil
448
+ return true
449
+ else
450
+ missing = remaining if (missing.nil? || missing.count > remaining.count)
451
+ end
452
+ end
453
+
454
+ # If we get here, we're hosed
455
+ @missing_headers = missing.collect(&:key) if @missing_headers.nil? || @missing_headers.count > missing.count
456
+ false
457
+ end
458
+ end
459
+
460
+ # Add a new row to our stash, parsing/filtering/validating as we go!
461
+ def add_row(line, raw_data)
462
+ # Gracefully handle custom parsers that return nil for a row's data
463
+ raw_data ||= []
464
+ # Add the row
465
+ row = Row.new(self, line)
466
+
467
+ # Parse out the values
468
+ values = {}
469
+ @columns.each do |col|
470
+ index = col.data.index
471
+ raw_val = raw_data[index]
472
+ if col.parse
473
+ # Use custom parser if this column has one
474
+ val = col.parse_value(row, raw_val)
475
+ else
476
+ # Otherwise use our standard parser
477
+ val = @reader.parse_value(raw_val, col.type)
478
+ end
479
+ values[col.key] = val
480
+ end
481
+
482
+ # Set the values and filter if needed
483
+ row.set_values(values)
484
+ return nil if @filter && !@filter.call(row)
485
+
486
+ # Row is desired, now validate values
487
+ @columns.each do |col|
488
+ val = values[col.key]
489
+ col.validate_value(row, val)
490
+ end
491
+
492
+ # We is good
493
+ @data.rows << row
494
+ row
495
+ end
496
+
497
+ # When true, one or more errors have been recorded during this import/process
498
+ # cycle.
195
499
  def has_errors?
196
500
  @errors.any?
197
501
  end
198
502
 
199
- def add_warning(context, msg)
503
+ # Add an error to our error list. Will result in a failed import.
504
+ def add_error(context, msg = nil)
200
505
  if context.is_a?(String) && msg.nil?
201
506
  msg = context
202
507
  context = nil
203
508
  end
204
- @warnings << Error.new(context, msg)
205
- end
206
-
207
- def has_warnings?
208
- @warnings.any?
509
+ @errors << Error.new(context, msg)
209
510
  end
210
511
 
211
- # Returns a human-readable summary of the errors present on the importer
512
+ # Returns a human-readable summary of the errors present on the importer, or
513
+ # nil if no errors are present
212
514
  def error_summary
515
+ # Simple case
213
516
  return nil unless has_errors?
214
- @errors.collect(&:summary).list_join(', ')
517
+
518
+ # Group by error text - we often get the same error dozens of times
519
+ list = {}
520
+ @errors.each do |err|
521
+ errs = list[err.text] || []
522
+ errs << err
523
+ list[err.text] = errs
524
+ end
525
+
526
+ # Build summary & return
527
+ list.values.collect do |errs|
528
+ summary = errs.first.summary
529
+ if errs.count == 1
530
+ summary
531
+ else
532
+ errs.count.to_s + ' x ' + summary
533
+ end
534
+ end.list_join(', ')
535
+ end
536
+
537
+ # After calling #import, you can dump the final values for each row
538
+ # as an array of hashes. Useful in debugging! For general processing,
539
+ # use #process or the block form of #import instead.
540
+ def to_a
541
+ @data.rows.collect(&:values)
215
542
  end
216
543
 
217
544
  protected
218
545
 
219
546
  def reset
220
547
  @errors = []
221
- @warnings = []
222
- @sheets.values.each(&:reset)
548
+ @missing_headers = nil
549
+ @format = nil
550
+ @reader = nil
551
+ @data = Data.new
223
552
  end
224
553
 
225
554
  end
@@ -2,12 +2,12 @@ class Importer
2
2
 
3
3
  class Row
4
4
 
5
- attr_reader :sheet, :line, :values
5
+ attr_reader :line, :values
6
6
 
7
- def initialize(sheet, line, value_hash = nil)
8
- @sheet = sheet
7
+ def initialize(importer, line, value_hash = nil)
8
+ @importer = importer
9
9
  @line = line
10
- @values = value_hash
10
+ set_values(value_hash)
11
11
  end
12
12
 
13
13
  def set_values(value_hash)
@@ -15,8 +15,9 @@ class Importer
15
15
  end
16
16
 
17
17
  # True when all columns have a non-nil value, useful in filtering out junk
18
- # rows
18
+ # rows. Pass in one or more keys to check only those keys for presence.
19
19
  def all?(*keys)
20
+ keys.flatten!
20
21
  if keys.any?
21
22
  # Check only the specified keys
22
23
  valid = true
@@ -33,25 +34,28 @@ class Importer
33
34
  end
34
35
  end
35
36
 
37
+ # True when all row columns have nil values.
36
38
  def empty?
37
39
  @values.values.all?(&:nil?)
38
40
  end
39
41
 
40
- # Returns the value of a column
42
+ # Returns the value of a column.
41
43
  def [](column_key)
42
44
  @values[column_key]
43
45
  end
44
-
46
+
47
+ # The row's name, e.g. 'Row 4'
45
48
  def to_s
46
49
  "Row #{@line}"
47
50
  end
48
51
 
49
- def add_error(msg)
50
- @sheet.importer.add_error(self, msg)
52
+ # This row's values as a hash of :column_key => <parsed + validated value>
53
+ def to_hash
54
+ @values.dup
51
55
  end
52
56
 
53
- def add_warning(msg)
54
- @sheet.importer.add_warning(self, msg)
57
+ def add_error(msg)
58
+ @importer.add_error(self, msg)
55
59
  end
56
60
 
57
61
  end