dreader 0.5.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,495 @@
1
+ require "roo"
2
+ require "logger"
3
+ require "fast_excel"
4
+
5
+ require "dreader/column"
6
+ require "dreader/options"
7
+ require "dreader/util"
8
+
9
+ module Dreader
10
+ #
11
+ # This is where the real stuff begins
12
+ #
13
+ module Engine
14
+ # the options declared through the `options` DSL (stored as a Hash)
15
+ attr_accessor :declared_options
16
+ # the column specifications declared through `column` / `columns`
17
+ attr_accessor :declared_columns
18
+ # the example records declared through `example`
19
+ attr_accessor :declared_examples
20
+ # the virtual column specifications declared through `virtual_column`
21
+ attr_accessor :declared_virtual_columns
22
+ # the block declared through `mapping`
23
+ attr_accessor :declared_mapping
24
+
25
+ # the rows produced by the last call to `read`
26
+ attr_reader :table
27
+
28
# DSL entry point for declaring options.
#
# The block is evaluated in the context of a fresh Options object; the
# hash it produces is stored in @declared_options (and returned).
def options(&block)
  collector = Options.new
  collector.instance_eval(&block)
  @declared_options = collector.to_hash
end
37
+
38
# DSL entry point for declaring a single column.
#
# - `name` is either the name of the column or a one-entry Hash in the
#   form { name => colref }
# - `block` (optional) is evaluated in the context of a new Column; it
#   typically contains the `process` and `check` declarations, which are
#   used, respectively, to make a cell into the desired data and to check
#   whether the desired data is ok
#
# The resulting specification is appended to @declared_columns, which is
# returned.
def column(name, &block)
  spec = Column.new
  # a column may be declared without a block (e.g. only name and colref),
  # exactly as `columns` allows
  spec.instance_eval(&block) if block

  @declared_columns ||= []
  @declared_columns <<
    if name.is_a?(Hash)
      key, colref = name.first
      spec.to_hash.merge(name: key, colref: colref)
    else
      spec.to_hash.merge(name: name)
    end
end
57
+
58
# DSL entry point for declaring several columns at once.
#
# - `hash` is a hash in the form { symbolic_name: colref }
# - `block` (optional) is evaluated on every column being declared
#
#   columns({ name: "B", age: "C" }) is equivalent to:
#
#   column :name do
#     colref "B"
#   end
#   column :age do
#     colref "C"
#   end
#
#   columns({ name: "B", age: "C" }) do
#     process do |cell|
#       cell.strip
#     end
#   end
#
#   is equivalent to:
#
#   column :name do
#     colref "B"
#     process do |cell|
#       cell.strip
#     end
#   end
#   column :age do
#     colref "C"
#     process do |cell|
#       cell.strip
#     end
#   end
def columns(hash, &block)
  hash.each do |name, reference|
    spec = Column.new
    spec.colref reference
    spec.instance_eval(&block) if block

    (@declared_columns ||= []) << spec.to_hash.merge(name: name)
  end
end
101
+
102
# Register an example record.
#
# `hash` is appended as is to @declared_examples (created on first use);
# `template` looks its keys up among the declared column names.
def example(hash)
  (@declared_examples ||= []) << hash
end
106
+
107
# DSL entry point for declaring a virtual column, that is, a derived
# attribute.
#
# The code specified in a virtual column is executed after reading a row
# and before applying the mapping function.
#
# Virtual column declarations are executed in the order in which they
# are defined.
def virtual_column(name, &block)
  spec = Column.new
  spec.instance_eval(&block)

  (@declared_virtual_columns ||= []) << spec.to_hash.merge(name: name)
end
120
+
121
+ # define what we do with each line we read
122
+ # - `block` is the code which takes a `row` as input and processes it;
123
+ # `row` is a hash in which each spreadsheet cell is accessible under
124
+ # its column name. Each cell is in turn a hash with the following keys:
125
+ # :row, :col, :value, :error (virtual columns carry :virtual and :value)
126
+ def mapping(&block)
127
+ @declared_mapping = block
128
+ end
129
+
130
# Read a file and store its content internally (in @table).
#
# @param args [Hash] possibly overriding any of the parameters set in the
#        initial options. This allows you, for instance, to apply the
#        same column specification to different files and different
#        sheets. Keys used here: :filename, :sheet, :first_row, :last_row,
#        :logger, :logger_level, :debug, :n, :virtual, :mapping.
#
# @return the data read from filename, in the form of an array of
#         hashes (or of whatever the mapping returns, when :mapping is
#         set)
#
# @raise [ArgumentError] if `args` is not a Hash
def read(args = {})
  # validate before merging: merging a non-Hash would fail with an
  # unrelated error before we could report the real problem
  unless args.is_a?(Hash)
    message = "#{__callee__}: this function takes a Hash as input"
    @logger&.error message
    raise ArgumentError, message
  end

  # args override values in options (if defined); options might never
  # have been declared, hence the fallback to {}
  declared = @declared_options || {}
  options = declared.merge(args)

  @logger = options[:logger] || Logger.new($stdout)
  @logger.level = options[:logger_level] || Logger::WARN
  @debug = options[:debug] == true

  spreadsheet = open_spreadsheet(options[:filename])
  sheet = spreadsheet.sheet(options[:sheet] || 0)
  first_row = options[:first_row] || 1
  last_row = options[:last_row] || sheet.last_row

  if @debug
    # override some defaults
    @logger.level = Logger::DEBUG

    # override the number of lines read
    n_lines = options[:n] ? options[:n].to_i : 10
    last_row = first_row + n_lines - 1

    # apply some defaults for debugging, if not defined in the options
    %i[check_raw process check].each do |key|
      options[key] = true unless options.key?(key)
    end
  end

  { current: declared, debug: options }.each do |label, configuration|
    @logger.debug "[dreader] #{label.capitalize} configuration:"
    configuration.each do |key, value|
      @logger.debug "  #{key}: #{value}"
    end
  end

  @errors = []

  @table = (first_row..last_row).map do |row_number|
    r = { row_number: row_number, row_errors: [] }

    # this has side-effects on r
    columns_on(r, row_number, sheet)

    # this has side-effects on r
    virtual_columns_on(r) if options[:virtual] || options[:mapping]

    options[:mapping] ? mappings_on(r) : r
  end
end
193
+
194
# Fill `r` with one cell per declared column, taken from `row_number` of
# `sheet`; returns `r`.
#
# For every column: the raw cell is stored, the raw checks are run, the
# `process` code (if any) replaces the value, and the checks are run on
# the processed value.
#
# TODO: PASS A ROW (and not row_number and sheet)
def columns_on(r, row_number, sheet)
  @declared_columns.each do |colspec|
    colname = colspec[:name]
    colref = colspec[:colref]
    cell = sheet.cell(row_number, colref)

    r[colname] = { row: row_number, col: colref, value: cell, error: false }

    # check raw data
    check_data(colspec[:checks_raw], r, colname)

    # process data
    position = coord(row_number, colref, cell)
    begin
      processor = colspec[:process]
      processed = processor ? processor.call(cell) : cell
      @logger.debug "[dreader] #{colname} process #{position} yields '#{processed}' (#{processed.class})"
      r[colname][:value] = processed
    rescue => e
      @logger.error "[dreader] #{colname} process #{position} raises an exception"
      raise e
    end

    # check data after process - notice that now r contains the value
    # processed by process
    check_data(colspec[:checks], r, colname)
  end

  r
end
232
+
233
+ alias load read # `load` is another name for `read`
234
+
235
# Invoke `read` with the :debug option forced to true.
#
# In debug mode `read` switches the logger to DEBUG level and reads only
# the first `n` rows (10, if :n is not specified).
def debug(args = {})
  debug_args = args.merge(debug: true)
  read(debug_args)
end
240
+
241
# get (processed) row number
#
# - row_number is the row to get: index starts at 1.
#
# get_row(1) gets the first line read, that is, the row specified
# by `first_row` in `options` (or in read)
#
# You need to invoke read first.
#
# @return the requested row, or nil (after logging an error) when
#         `row_number` is not positive, is beyond the rows read, or
#         nothing has been read yet
def get_row(row_number)
  if row_number <= 0
    @logger&.error "[dreader] 'row_number' is zero or negative (first row is 1)."
    return nil
  end

  # a library must not terminate the host process: report and return nil
  if @table.nil? || row_number > @table.size
    @logger&.error "[dreader] 'row_number' is out of range (did you invoke read?)"
    return nil
  end

  @table[row_number - 1]
end
259
+
260
+ # return an array of hashes with all the errors we have encountered
261
+ # an empty array is good news
262
+ attr_reader :errors
263
+
264
# Execute the virtual column specification on every row of @table
# (each row is modified in place).
def virtual_columns
  @table.each do |row|
    virtual_columns_on(row)
  end
end
268
+
269
# Compute the virtual columns of `row`, with side effects on `row`.
#
# For each declared virtual column (in declaration order): a cell marked
# `virtual: true` is added to the row, the raw checks are run on the full
# row, the `process` code (if any) is called with the row and its result
# stored under :value, and the checks are run again on the full row.
#
# Nothing happens if no virtual column has been declared.
def virtual_columns_on(row)
  # @declared_virtual_columns is nil until `virtual_column` is first used
  (@declared_virtual_columns || []).each do |virtualcol|
    colname = virtualcol[:name]
    row[colname] = { virtual: true }

    check_data(virtualcol[:checks_raw], row, colname, full_row: true)

    begin
      # add the cell to the table
      if virtualcol[:process]
        row[colname][:value] = virtualcol[:process].call(row)
      end
    rescue => e
      r = row[:row_number]
      @logger.error "[dreader] #{colname} process raises an exception at row #{r}"
      raise e
    end

    # check data after process -- we also have the processed value of
    # the virtual column
    check_data(virtualcol[:checks], row, colname, full_row: true)
  end
end
293
+
294
# Apply the mapping code to every row of @table and return the array of
# results; it makes sense to invoke it only once.
#
# It is based on `map`, so that it can be used functionally: @table
# itself is not replaced.
def mappings
  @table.map(&method(:mappings_on))
end
302
+
303
# Call the declared mapping on `row` and return its result; return nil
# when no mapping has been declared.
def mappings_on(row)
  return nil if @declared_mapping.nil?

  @declared_mapping.call(row)
end
306
+
307
# another way to access the data read (same object as `table`)
def data = @table
311
+
312
# string representation of the data read
def to_s = @table.to_s
315
+
316
+ #
317
+ # headers validation functions
318
+ #
319
# Tell whether all the headers found in the file match the declared
# columns.
#
# @param hash [Hash] options overriding the declared ones; forwarded to
#        `compare_headers`
# @return [Boolean] true if every entry returned by `compare_headers` is
#         marked as correct
def correct_headers?(hash = {})
  compare_headers(hash).values.all? { |header| header[:correct] == true }
end
323
+
324
+ def compare_headers(hash = {})
325
+ options = @declared_options.merge(hash)
326
+
327
+ spreadsheet = open_spreadsheet(options[:filename])
328
+ sheet = spreadsheet.sheet(options[:sheet] || 0)
329
+ header_row_number = options[:first_row] - 1 || 1
330
+
331
+ output_hash = {}
332
+ @declared_columns.map do |colspec|
333
+ cell = sheet.cell(row_number, colspec[:colref])
334
+ human_readable = colspec[:name].to_s.split("_").map(&:capitalize).join(" ")
335
+
336
+ output_hash[colspec[:colref]] = {
337
+ header_in_spec: colspec[:name],
338
+ human_readable: human_readable,
339
+ header_in_file: cell,
340
+ correct: cell == human_readable
341
+ }
342
+ end
343
+
344
+ output_hash
345
+ end
346
+
347
# Generate a template from the column specification and write it to the
# file specified by the :template_filename option.
#
# - first row is a header, determined by colspec
# - second row includes the documentation string, to document values in
#   the columns
# - the example records (if any) follow
#
# @param hash [Hash] options overriding the declared ones
#        (:template_filename, :first_row)
def template(hash = {})
  options = (@declared_options || {}).merge(hash)
  filename = options[:template_filename]
  # @logger is set by `read`, which might not have been invoked
  logger = @logger || Logger.new($stdout)

  workbook = FastExcel.open(filename, constant_memory: true)
  worksheet = workbook.add_worksheet("Template")
  worksheet.auto_width = true

  # first_row is indexed from 1 by roo and from 0 by FastExcel
  first_row = [(options[:first_row] || 0) - 2, 0].max
  bold = workbook.bold_format

  #
  # somehow fast excel seems to allow to set cells row by row
  #

  # here we write the first row
  @declared_columns.each do |colspec|
    human_readable = colspec[:name].to_s.split("_").map(&:capitalize).join(" ")
    colref = colref_to_i(colspec[:colref])
    worksheet.write_string(first_row, colref, human_readable, bold)
  end

  # here we create a note with the legenda
  @declared_columns.each do |colspec|
    colref = colref_to_i(colspec[:colref])
    worksheet.write_string(first_row + 1, colref, colspec[:doc], nil)
  end

  # here we write some example records (there might be none); they are
  # placed relative to the header, so that they always come after it.
  # NOTE(review): one row is left empty after the legenda, as it was
  # when the header is in the first row -- confirm it is intended
  (@declared_examples || []).each_with_index do |example_hash, index|
    example_hash.each do |colname, value|
      colspec = @declared_columns.find { |x| x[:name] == colname }
      if colspec
        colref = colref_to_i(colspec[:colref])
        # write_string is used for all values: make sure we pass a string
        worksheet.write_string(first_row + 3 + index, colref, value.to_s, nil)
      else
        logger.error "generate_template: #{colname} used in example is not defined"
      end
    end
  end

  workbook.close
end
395
+
396
+ private
397
+
398
# list of keys we support in options. We remove them when reading
# the CSV file, since what is left is passed to the CSV reader: the
# first line lists the keys locating the data, the second the keys
# used by `read` and `template`
OPTION_KEYS = %i[
  filename sheet first_row last_row logger logger_level
  debug n check_raw process check virtual mapping template_filename
].freeze
403
+
404
# Open `filename` with the Roo class matching its extension (.csv, .tsv,
# .ods, .xls, .xlsx; matching is case-insensitive).
#
# For CSV and TSV files the declared options, stripped of the keys listed
# in OPTION_KEYS, are passed along as CSV options (TSV files also get a
# tab as column separator).
#
# @raise [RuntimeError] if the extension is not among the supported ones
def open_spreadsheet(filename)
  ext = File.extname(filename)

  case ext.downcase
  when ".csv", ".tsv"
    # options might never have been declared, hence the fallback to {}
    csv_options = (@declared_options || {}).except(*OPTION_KEYS)
    csv_options = csv_options.merge(col_sep: "\t") if ext.downcase == ".tsv"
    Roo::CSV.new(filename, csv_options: csv_options)
  when ".ods"
    Roo::OpenOffice.new(filename)
  when ".xls"
    Roo::Excel.new(filename)
  when ".xlsx"
    Roo::Excelx.new(filename)
  else
    raise "Unknown extension: #{ext}"
  end
end
424
+
425
# Convert a column reference in letters into a zero-based column index:
# "A" => 0, "Z" => 25, "AA" => 26. Letter case is ignored.
#
# Integers are returned unchanged.
# NOTE(review): integer column references are presumably one-based when
# used for reading, in which case they should be decremented here --
# confirm before changing.
def colref_to_i(colref)
  return colref if colref.is_a?(Integer)

  # bijective base 26: each letter is a digit between 1 and 26
  one_based = colref.to_s.upcase.each_char.reduce(0) do |value, char|
    value * 26 + (char.ord - "A".ord + 1)
  end
  one_based - 1
end
435
+
436
# performs check on data (this is the same code for check_raw and
# check)
#
# @params
# - check_spec :: the set of checks to perform, as pairs in the form
#                 error_message => lambda; nil means no checks
#
# - hash :: the hash of the row being processed
#
# - colname :: the name of the column we are checking (if we check
#              a column) or the column we are computing (virtual
#              columns)
#
# - full_row :: if true, pass the full row to check rather than
#               the value of a column
#
# A check passes only if it returns exactly `true`; anything else marks
# the cell as erroneous and records an error both in @errors and in the
# :row_errors of the row. Exceptions raised by a check are logged and
# re-raised.
def check_data(check_spec, hash, colname, full_row: false)
  (check_spec || {}).each do |error_message, check_function|
    # here we extract values by distinguishing whether we check the
    # full row or a single column
    if full_row
      value = hash
      row = hash[:row_number]
      col = :multiple_columns
    else
      value, row, col = hash[colname].values_at(:value, :row, :col)
    end
    position = coord(row, col, value)

    begin
      pass = check_function.call(value)
      @logger.debug "[dreader] check #{colname}/#{error_message} at #{position} yields: '#{pass}'"

      if pass != true
        hash[colname][:error] = true
        error = {
          callee: __callee__,
          cell_value: value, colname: colname,
          row: row, col: col,
          message: error_message,
          content: pass
        }
        @errors << error
        hash[:row_errors] << error
      end
    rescue => e
      @logger.error "[dreader] check #{colname}/#{error_message} raises an exception at #{position}"
      raise e
    end
  end
end
490
+
491
# Build the label used in log messages to identify a cell: row and
# column, followed by the value in parentheses when a value is given
# (`false` included; nothing is appended for nil).
def coord(row, col, value = nil)
  label = "#{row}#{col}"
  value.nil? ? label : "#{label} (#{value})"
end
494
+ end
495
+ end
@@ -0,0 +1,16 @@
1
module Dreader
  # Service class implementing the options DSL: every message sent to an
  # instance is recorded as an option, using the name of the message as
  # key and its first argument as value.
  class Options
    def initialize
      @attributes = {}
    end

    # record `name` => first argument (nil when called without arguments)
    def method_missing(name, *args, &_block)
      @attributes.store(name, args.first)
    end

    # the options recorded so far
    def to_hash = @attributes
  end
end
@@ -0,0 +1,86 @@
1
module Dreader
  # Utility functions to simplify importing data into
  # ActiveRecords
  class Util
    # given a hash returned by Engine, return the same hash with
    # keys directly bound to the content of the :value sub-key
    #
    # Example
    #
    #   hash = {name: {value: "A", ...}, surname: {value: "B", ...}}
    #   simplify hash
    #   {name: "A", surname: "B"}
    #
    # all keys which are not part of the data read (row_number and
    # row_errors) are removed
    def self.simplify(hash)
      hash.except(:row_number, :row_errors).transform_values { |cell| cell[:value] }
    end

    # Given a "simplified" hash restructure it according to the second
    # argument.
    #
    # Use it for generating hashes with nested attributes, which
    # follows Rails conventions.
    #
    # @params:
    # - hash the hash to restructure
    # - new_structure splat arguments which specify how to (re)structure
    #   the values in hash. Each element is either a symbol or a Hash
    #   binding one or more new keys to the list of keys to nest under it
    #
    # Example
    #
    #   hash = { name: "A", surname: "B", address: "via Piave", city: "Genoa" }
    #
    #   restructure(hash, :name, :surname)
    #   { name: "A", surname: "B" }
    #
    #   restructure(hash, :name, address_attributes: [:address, :city])
    #   { name: "A", address_attributes: { address: "via Piave", city: "Genoa" } }
    #
    def self.restructure(hash, *new_structure)
      new_structure.each_with_object({}) do |element, result|
        if element.is_a?(Hash)
          # several nested groups passed together end up in a single Hash:
          # process all of its entries
          element.each do |key, nested|
            result[key] = restructure(hash, *nested)
          end
        else
          result[element] = hash[element]
        end
      end
    end

    # an alias for Hash.slice
    # keys is an array of keys
    def self.slice(hash, keys)
      hash.slice(*keys)
    end

    # remove all `keys` from `hash`
    def self.clean(hash, keys)
      hash.reject { |key, _| keys.include?(key) }
    end

    # given a hash, return a new hash with key and whose value is
    # the hash
    #
    # Example:
    #
    #   hash = {name: "A", size: 10}
    #   prepend hash, :product_attributes
    #   {product_attributes: {name: "A", size: 10}}
    #
    def self.prepend(hash, key)
      { key => hash }
    end

    #
    # Retrieve all errors related to row/col from an Array of error
    # messages (all the errors of the row, if col is not specified)
    #
    def self.errors(errors_array, row, col = nil)
      errors_array.select do |error|
        error[:row] == row && (col.nil? || error[:col] == col)
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Dreader
2
- VERSION = "0.5.0"
2
+ VERSION = "1.1.0"
3
3
  end