dreader 0.5.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,495 @@
1
+ require "roo"
2
+ require "logger"
3
+ require "fast_excel"
4
+
5
+ require "dreader/column"
6
+ require "dreader/options"
7
+ require "dreader/util"
8
+
9
+ module Dreader
10
+ #
11
+ # This is where the real stuff begins
12
+ #
13
+ module Engine
14
+ # the options we passed
15
+ attr_accessor :declared_options
16
+ # the specification of the columns to process
17
+ attr_accessor :declared_columns
18
+ # some example lines
19
+ attr_accessor :declared_examples
20
+ # the specification of the virtual columns
21
+ attr_accessor :declared_virtual_columns
22
+ # the mapping rules
23
+ attr_accessor :declared_mapping
24
+
25
+ # the data we read
26
+ attr_reader :table
27
+
28
+ # define a DSL for options
29
+ # any string is processed as an option and it ends up in the
30
+ # @options hash
31
# Runs +block+ with a fresh Options collector as +self+ and remembers the
# settings it gathered.
#
# @yield option declarations, one per line (e.g. `filename "data.csv"`)
# @return [Hash] the collected options, also stored in @declared_options
def options(&block)
  @declared_options = Options.new.tap { |dsl| dsl.instance_eval(&block) }.to_hash
end
37
+
38
+ # define a DSL for column specification
39
+ # - `name` is the name of the column
40
+ # - `block` contains two declarations, `process` and `check`, which are
41
+ # used, respectively, to make a cell into the desired data and to check
42
+ # whether the desired data is ok
43
# Declares one column.
#
# @param name [Symbol, Hash] the column name, or a one-pair Hash
#   `{ name => colref }` carrying both the name and the column reference
# @yield the column declarations, evaluated against a fresh Column
# @return [Array<Hash>] the list of declared columns, with the new one appended
def column(name, &block)
  spec = Column.new
  spec.instance_eval(&block)

  # a Hash argument carries the name as its first key and the colref as
  # the matching value
  identity =
    if name.instance_of?(Hash)
      { name: name.keys.first, colref: name.values.first }
    else
      { name: name }
    end

  (@declared_columns ||= []) << spec.to_hash.merge(identity)
end
57
+
58
+ # define a DSL for multiple column specification (bulk_declare)
59
+ #
60
+ # - hash is a hash in the form { symbolic_name: colref }
61
+ #
62
+ # i.bulk_declare {name: "B", age: "C"} is equivalent to:
63
+ #
64
+ # i.column :name do
65
+ # colref "B"
66
+ # end
67
+ # i.column :age do
68
+ # colref "C"
69
+ # end
70
+ #
71
+ # i.bulk_declare {name: "B", age: "C"} do
72
+ # process do |cell|
73
+ # cell.strip
74
+ # end
75
+ # end
76
+ #
77
+ # is equivalent to:
78
+ #
79
+ # i.column :name do
80
+ # colref "B"
81
+ # process do |cell|
82
+ # cell.strip
83
+ # end
84
+ # end
85
+ # i.column :age do
86
+ # colref "C"
87
+ # process do |cell|
88
+ # cell.strip
89
+ # end
90
+ # end
91
# Declares several columns at once.
#
# @param hash [Hash] pairs in the form `{ symbolic_name: colref }`
# @yield optional declarations applied to every column in +hash+
# @return [Hash] +hash+ itself
def columns(hash, &block)
  hash.each do |name, ref|
    spec = Column.new
    spec.colref ref
    spec.instance_eval(&block) if block

    (@declared_columns ||= []) << spec.to_hash.merge({ name: name })
  end
end
101
+
102
# Records one example record.
#
# @param hash [Hash] the example, appended as-is to @declared_examples
# @return [Array<Hash>] all the examples declared so far
def example(hash)
  (@declared_examples ||= []) << hash
end
106
+
107
+ # virtual columns define derived attributes
108
+ # the code specified in the virtual column is executed after reading
109
+ # a row and before applying the mapping function
110
+ #
111
+ # virtual column declarations are executed in the order in which
112
+ # they are defined
113
# Declares a derived (virtual) column.
#
# @param name [Symbol] the key under which the column is stored
# @yield the column declarations, evaluated against a fresh Column
# @return [Array<Hash>] the virtual columns declared so far
def virtual_column(name, &block)
  spec = Column.new
  spec.instance_eval(&block)

  (@declared_virtual_columns ||= []) << spec.to_hash.merge({ name: name })
end
120
+
121
+ # define what we do with each line we read
122
+ # - `block` is the code which takes as input a `row` and processes it.
123
+ # `row` is a hash in which each spreadsheet cell is accessible under
124
+ # the column names. Each cell has the following values:
125
+ # :value, :error, :row_number, :col_number
126
# Stores +block+ as the mapping to apply to each row.
#
# @yield [row] the code to run on every row
# @return [Proc] the stored block
def mapping(&block) = (@declared_mapping = block)
129
+
130
+ # read a file and store it internally
131
+ #
132
+ # @param hash, a hash, possibly overriding any of the parameters
133
+ # set in the initial options. This allows you, for
134
+ # instance, to apply the same column specification to
135
+ # different files and different sheets
136
+ #
137
+ # @return the data read from filename, in the form of an array of
138
+ # hashes
139
# Reads the spreadsheet and stores the result in @table.
#
# @param args [Hash] overrides for the declared options; the keys read here
#   are :filename, :sheet, :first_row, :last_row, :logger, :logger_level,
#   :debug, :n, :virtual and :mapping
# @return [Array] one entry per row: the row Hash, or the value returned
#   by the mapping when the :mapping option is set
# @raise [ArgumentError] if +args+ is not a Hash
def read(args = {})
  # no `options` block may have been declared: treat that as "no options"
  declared = @declared_options || {}

  # validate before merging, otherwise Hash#merge fails first with an
  # unrelated TypeError and this check is never reached
  unless args.is_a?(Hash)
    message = "#{__callee__}: this function takes a Hash as input"
    (@logger || declared[:logger] || Logger.new($stdout)).error message
    raise ArgumentError, message
  end

  # args override values in the declared options
  options = declared.merge(args)

  @logger = options[:logger] || Logger.new($stdout)
  @logger.level = options[:logger_level] || Logger::WARN
  @debug = options[:debug] == true

  spreadsheet = open_spreadsheet(options[:filename])
  sheet = spreadsheet.sheet(options[:sheet] || 0)
  first_row = options[:first_row] || 1
  last_row = options[:last_row] || sheet.last_row

  if @debug
    # override some defaults
    @logger.level = Logger::DEBUG

    # override the number of lines read
    n_lines = options[:n] ? options[:n].to_i : 10
    last_row = first_row + n_lines - 1

    # apply some defaults for debugging, if not defined in the options
    %i[check_raw process check].each do |key|
      options[key] = true unless options.key?(key)
    end
  end

  { current: declared, debug: options }.each do |label, settings|
    @logger.debug "[dreader] #{label.capitalize} configuration:"
    settings.each do |key, value|
      @logger.debug " #{key}: #{value}"
    end
  end

  @errors = []

  @table = (first_row..last_row).map do |row_number|
    r = { row_number: row_number, row_errors: [] }

    # this has side-effects on r
    columns_on(r, row_number, sheet)

    # this has side-effects on r
    virtual_columns_on(r) if options[:virtual] || options[:mapping]

    options[:mapping] ? mappings_on(r) : r
  end
end
193
+
194
+ # TODO: PASS A ROW (and not row_number and sheet)
195
# Fills +r+ with one entry per declared column, read from +sheet+ at
# +row_number+. Each entry is checked raw, processed, then checked again.
#
# @param r [Hash] the row being built (modified in place)
# @param row_number [Integer] the spreadsheet row to read
# @param sheet [#cell] the sheet to read cells from
# @return [Hash] +r+
def columns_on(r, row_number, sheet)
  @declared_columns.each do |spec|
    name = spec[:name]
    ref = spec[:colref]
    raw = sheet.cell(row_number, ref)

    r[name] = { row: row_number, col: ref, value: raw, error: false }

    # checks on the value as read from the file
    check_data(spec[:checks_raw], r, name)

    where = coord(row_number, ref, raw)
    begin
      converter = spec[:process]
      result = converter ? converter.call(raw) : raw
      @logger.debug "[dreader] #{name} process #{where} yields '#{result}' (#{result.class})"
      r[name][:value] = result
    rescue StandardError
      @logger.error "[dreader] #{name} process #{where} raises an exception"
      raise
    end

    # checks on the processed value, which r now contains
    check_data(spec[:checks], r, name)
  end

  r
end
232
+
233
+ alias load read
234
+
235
+ # show to stdout the first `n` records we read from the file given the
236
+ # current configuration
237
# Runs `read` with the :debug option forced to true.
#
# @param args [Hash] the same overrides accepted by `read`
# @return whatever `read` returns
def debug(args = {})
  debug_args = args.merge(debug: true)
  read(debug_args)
end
240
+
241
+ # get (processed) row number
242
+ #
243
+ # - row_number is the row to get: index starts at 1.
244
+ #
245
+ # get_row(1) get the first line read, that is, the row specified
246
+ # by `first_row` in `options` (or in read)
247
+ #
248
+ # You need to invoke read first
249
# Returns the row at position +row_number+ of the data read so far.
#
# Out-of-range requests are logged and answered with nil. Library code must
# not terminate the host process, so this no longer calls `exit`.
#
# @param row_number [Integer] 1-based position in the table
# @return [Object, nil] the row, or nil when +row_number+ is out of range
#   or nothing has been read yet
def get_row(row_number)
  # @logger is only set by `read`; fall back so the hint is still shown
  logger = @logger || Logger.new($stdout)

  if @table.nil? || row_number > @table.size
    logger.error "[dreader] 'row_number' is out of range (did you invoke read?)"
    nil
  elsif row_number <= 0
    logger.error "[dreader] 'row_number' is zero or negative (first row is 1)."
    nil
  else
    @table[row_number - 1]
  end
end
259
+
260
+ # return an array of hashes with all the errors we have encountered
261
+ # an empty array is good news
262
+ attr_reader :errors
263
+
264
# Applies the virtual column specification to every row of @table.
#
# @return [Array] @table
def virtual_columns
  @table.each(&method(:virtual_columns_on))
end
268
+
269
+ # Compute the virtual columns for a row, with side effects on row
270
# Adds every declared virtual column to +row+, in declaration order.
#
# Each virtual column is stored as `{ virtual: true }`, plus a :value key
# when the column declares a process function. Checks receive the full row.
# Does nothing when no virtual column has been declared.
#
# @param row [Hash] the row to extend (modified in place)
# @raise re-raises any error raised by a process function, after logging it
def virtual_columns_on(row)
  # no virtual_column declaration leaves the list nil; that must not crash
  # `read` when only a mapping is used
  (@declared_virtual_columns || []).each do |virtualcol|
    colname = virtualcol[:name]
    row[colname] = { virtual: true }

    check_data(virtualcol[:checks_raw], row, colname, full_row: true)

    begin
      # add the computed cell to the row
      row[colname][:value] = virtualcol[:process].call(row) if virtualcol[:process]
    rescue StandardError
      @logger.error "[dreader] #{colname} process raises an exception at row #{row[:row_number]}"
      raise
    end

    # check data after process -- the processed value is now available
    check_data(virtualcol[:checks], row, colname, full_row: true)
  end
end
293
+
294
+ # apply the mapping code to the array; it makes sense to invoke it only
295
+ # once.
296
+ #
297
+ # the mapping is applied only if it is defined and it uses map, so that
298
+ # it can be used functionally
299
# Runs the mapping on every row of @table.
#
# @return [Array] the mapping result for each row; @table is not replaced
def mappings
  @table.map(&method(:mappings_on))
end
302
+
303
# Applies the declared mapping to a single row.
#
# @param row [Hash] the row handed to the mapping
# @return the mapping result, or nil when no mapping has been declared
def mappings_on(row)
  return if @declared_mapping.nil?

  @declared_mapping.call(row)
end
306
+
307
+ # an alias
308
# @return the contents of @table (same value as the `table` reader)
def data = @table
311
+
312
+ def to_s
313
+ @table.to_s
314
+ end
315
+
316
+ #
317
+ # headers validation functions
318
+ #
319
# Tells whether every declared column has the expected header in the file.
#
# @param hash [Hash] option overrides, forwarded to `compare_headers`
#   (they were previously accepted but silently ignored)
# @return [Boolean] true when all headers are marked as correct
def correct_headers?(hash = {})
  compare_headers(hash).values.all? { |header| header[:correct] == true }
end
323
+
324
# Compares the header found in the file with the one expected for each
# declared column. The expected header is the column name with underscores
# turned into spaces and every word capitalized.
#
# The header row is the one right above :first_row. When :first_row is not
# set, or is 1, row 1 is used.
#
# @param hash [Hash] option overrides (:filename, :sheet, :first_row)
# @return [Hash] keyed by colref; each value has :header_in_spec,
#   :human_readable, :header_in_file and :correct
def compare_headers(hash = {})
  options = (@declared_options || {}).merge(hash)

  spreadsheet = open_spreadsheet(options[:filename])
  sheet = spreadsheet.sheet(options[:sheet] || 0)

  # never go above row 1: spreadsheet rows are indexed from 1
  header_row_number = [(options[:first_row] || 2) - 1, 1].max

  (@declared_columns || []).each_with_object({}) do |colspec, output_hash|
    cell = sheet.cell(header_row_number, colspec[:colref])
    human_readable = colspec[:name].to_s.split("_").map(&:capitalize).join(" ")

    output_hash[colspec[:colref]] = {
      header_in_spec: colspec[:name],
      human_readable: human_readable,
      header_in_file: cell,
      correct: cell == human_readable
    }
  end
end
346
+
347
+ # generate a template from the column specification
348
+ # first row is a header, determined by colspec
349
+ # second row includes the documentation string, to document values in
350
+ # the columns
351
# Writes a template workbook built from the column specification: a bold
# header row, a row with each column's :doc string, then the declared
# examples.
#
# @param hash [Hash] option overrides; :template_filename is the output
#   file, :first_row positions the header row
# @return whatever `workbook.close` returns
def template(hash = {})
  options = (@declared_options || {}).merge(hash)
  filename = options[:template_filename]
  # @logger is only set by `read`, which may not have been invoked
  logger = @logger || Logger.new($stdout)
  column_specs = @declared_columns || []

  workbook = FastExcel.open(filename, constant_memory: true)
  worksheet = workbook.add_worksheet("Template")
  worksheet.auto_width = true

  # first_row is indexed from 1 by roo and from 0 by FastExcel
  first_row = [(options[:first_row] || 0) - 2, 0].max
  bold = workbook.bold_format

  # cells are written one row at a time, top to bottom

  # header row
  column_specs.each do |colspec|
    human_readable = colspec[:name].to_s.split("_").map(&:capitalize).join(" ")
    worksheet.write_string(first_row, colref_to_i(colspec[:colref]), human_readable, bold)
  end

  # legend row, with the documentation string of each column
  column_specs.each do |colspec|
    worksheet.write_string(first_row + 1, colref_to_i(colspec[:colref]), colspec[:doc], nil)
  end

  # example records; there may be none
  # NOTE(review): examples start at the fixed row index 3, regardless of
  # first_row -- confirm this is intended
  (@declared_examples || []).each_with_index do |example_hash, index|
    example_hash.each do |colname, value|
      colspec = column_specs.find { |x| x[:name] == colname }
      if colspec
        worksheet.write_string(index + 3, colref_to_i(colspec[:colref]), value, nil)
      else
        logger.error "generate_template: #{colname} used in example is not defined"
      end
    end
  end

  workbook.close
end
395
+
396
+ private
397
+
398
+ # list of keys we support in options. We remove them when reading
399
+ # the CSV file
400
# Every key listed here is consumed by dreader itself and must not reach
# the CSV parser. Besides the file-related keys, this includes the flags
# read by `read` and `template`.
OPTION_KEYS = %i[
  filename sheet first_row last_row logger logger_level
  debug n check_raw process check virtual mapping template_filename
].freeze
403
+
404
# Opens +filename+ with the Roo reader matching its extension. The match is
# case-insensitive, so "DATA.CSV" is handled like "data.csv".
#
# For .csv and .tsv files, the declared options that are not listed in
# OPTION_KEYS are handed to the reader as CSV options.
#
# @param filename [String] path of the file to open
# @return the Roo object for the file
# @raise [RuntimeError] when the extension is not supported
def open_spreadsheet(filename)
  ext = File.extname(filename)
  # no `options` block may have been declared
  declared = @declared_options || {}

  case ext.downcase
  when ".csv"
    csv_options = declared.except(*OPTION_KEYS)
    Roo::CSV.new(filename, csv_options:)
  when ".tsv"
    csv_options = declared.except(*OPTION_KEYS).merge({ col_sep: "\t" })
    Roo::CSV.new(filename, csv_options:)
  when ".ods"
    Roo::OpenOffice.new(filename)
  when ".xls"
    Roo::Excel.new(filename)
  when ".xlsx"
    Roo::Excelx.new(filename)
  else
    raise "Unknown extension: #{ext}"
  end
end
424
+
425
# Converts a spreadsheet column reference into a 0-based column index
# ("A" => 0, "Z" => 25, "AA" => 26). Letters are case-insensitive.
#
# @param colref [String, Symbol, Integer] the column reference
# @return [Integer] the 0-based index; an Integer is returned unchanged
# @raise [ArgumentError] when +colref+ is neither an Integer nor made of
#   letters only
#
# NOTE(review): Integer references are passed through as-is, while letters
# become 0-based. Confirm callers do not pass 1-based integers here.
def colref_to_i(colref)
  return colref if colref.instance_of?(Integer)

  letters = colref.to_s.upcase
  unless letters.match?(/\A[A-Z]+\z/)
    raise ArgumentError, "invalid column reference: #{colref.inspect}"
  end

  # base-26 with digits A=1..Z=26, then shifted to a 0-based index
  letters.each_char.reduce(0) { |acc, char| acc * 26 + (char.ord - "A".ord + 1) } - 1
end
435
+
436
+ # performs check on data (this is the same code for check_raw and
437
+ # check)
438
+ #
439
+ # @params
440
+ # - check_spec :: the set of checks to perform (an array of hashes
441
+ # in the form error_message: lambda
442
+ #
443
+ # - hash :: either the hash of a column or the hash of a row
444
+ #
445
+ # - colname :: the name of the column we are checking (if we check
446
+ # a column) or the column we are computing (virtual
447
+ # columns)
448
+ #
449
+ # - full_row :: if true, pass the full row to check rather than
450
+ # the value of a column
451
+ #
452
+ # - debug :: a boolean
453
# Runs every check in +check_spec+ and records the ones that do not pass.
#
# A check passes only when its function returns exactly true. Any other
# result marks the column as in error and adds an error entry to both
# @errors and the row's :row_errors list.
#
# @param check_spec [Hash] error message => check function
# @param hash [Hash] the row being checked
# @param colname [Symbol] the column being checked or computed
# @param full_row [Boolean] when true the check receives the whole row,
#   otherwise the value of +colname+
# @return +check_spec+
# @raise re-raises any error raised by a check function, after logging it
def check_data(check_spec, hash, colname, full_row: false)
  check_spec.each do |error_message, check_function|
    # what the check sees, and where it comes from, depends on full_row
    value, row, col =
      if full_row
        [hash, hash[:row_number], :multiple_columns]
      else
        cell = hash[colname]
        [cell[:value], cell[:row], cell[:col]]
      end
    where = coord(row, col, value)

    begin
      pass = check_function.call(value)
      @logger.debug "[dreader] check #{colname}/#{error_message} at #{where} yields: '#{pass}'"

      unless pass == true
        hash[colname][:error] = true
        failure = {
          callee: __callee__,
          cell_value: value, colname: colname,
          row: row, col: col,
          message: error_message,
          content: pass
        }
        @errors << failure
        hash[:row_errors] << failure
      end
    rescue StandardError
      @logger.error "[dreader] check #{colname}/#{error_message} raises an exception at #{where}"
      raise
    end
  end
end
490
+
491
# Builds the label used in log messages to identify a cell.
#
# @param row [Object] the row identifier
# @param col [Object] the column identifier
# @param value [Object, nil] the cell value; omitted from the label only
#   when nil, so a `false` value is still shown
# @return [String] "<row><col>" or "<row><col> (<value>)"
def coord(row, col, value = nil)
  position = "#{row}#{col}"
  # no trailing space when there is no value to show
  value.nil? ? position : "#{position} (#{value})"
end
494
+ end
495
+ end
@@ -0,0 +1,16 @@
1
module Dreader
  # Collector behind the options DSL: any message that is not an existing
  # method is recorded as an option whose value is its first argument.
  class Options
    def initialize
      @attributes = {}
    end

    # Records +option+ with the first of +values+ (nil when none is given).
    # A block, if passed, is ignored.
    #
    # @return the value stored
    def method_missing(option, *values, &_block)
      @attributes.store(option, values.first)
    end

    # @return [Hash] the options recorded so far
    def to_hash
      @attributes
    end
  end
end
@@ -0,0 +1,86 @@
1
module Dreader
  # Utility functions to simplify importing data into ActiveRecords
  class Util
    # Given a row hash returned by Engine, return a hash whose keys are
    # bound directly to the content of each :value sub-key. The keys which
    # are not part of the data read (:row_number and :row_errors) are
    # dropped.
    #
    # Example
    #
    #   hash = {name: {value: "A", ...}, surname: {value: "B", ...}}
    #   simplify hash
    #   {name: "A", surname: "B"}
    #
    # @param hash [Hash] a row hash
    # @return [Hash] column name => value
    def self.simplify(hash)
      hash.except(:row_number, :row_errors).transform_values { |cell| cell[:value] }
    end

    # Given a "simplified" hash, restructure it according to the remaining
    # arguments. Use it to generate hashes with nested attributes, following
    # Rails conventions.
    #
    # Each element of +new_structure+ is either a key to copy from +hash+ or
    # a Hash mapping a new key to the list of keys to nest under it. Every
    # pair of such a Hash is processed, so several nested groups can be
    # given in one call.
    #
    # Example
    #
    #   hash = { name: "A", surname: "B", address: "via Piave", city: "Genoa" }
    #
    #   restructure(hash, :name, :surname)
    #   { name: "A", surname: "B" }
    #
    #   restructure(hash, :name, address_attributes: [:address, :city])
    #   { name: "A", address_attributes: { address: "via Piave", city: "Genoa" } }
    #
    # @param hash [Hash] the hash to restructure
    # @param new_structure [Array<Symbol, Hash>] the desired structure
    # @return [Hash] the restructured hash
    def self.restructure(hash, *new_structure)
      new_structure.each_with_object({}) do |item, result|
        if item.instance_of?(Hash)
          # trailing `key: [...]` arguments arrive as ONE Hash: handle
          # every pair, not just the first
          item.each { |key, nested| result[key] = restructure(hash, *nested) }
        else
          result[item] = hash[item]
        end
      end
    end

    # An alias for Hash#slice.
    #
    # @param hash [Hash]
    # @param keys [Array] the keys to keep
    # @return [Hash]
    def self.slice(hash, keys)
      hash.slice(*keys)
    end

    # Remove all +keys+ from +hash+.
    #
    # @param hash [Hash]
    # @param keys [Array] the keys to drop
    # @return [Hash] a new hash without +keys+
    def self.clean(hash, keys)
      hash.reject { |key, _| keys.include?(key) }
    end

    # Given a hash, return a new hash with +key+ as its only key and the
    # hash as its value.
    #
    # Example:
    #
    #   hash = {name: "A", size: 10}
    #   prepend hash, :product_attributes
    #   {product_attributes: {name: "A", size: 10}}
    #
    def self.prepend(hash, key)
      { key => hash }
    end

    # Retrieve all errors related to row/col from an Array of error
    # messages.
    #
    # @param errors_array [Array<Hash>] errors with :row and :col keys
    # @param row the row to select
    # @param col the column to select; nil selects every column of +row+
    # @return [Array<Hash>] the matching errors
    def self.errors(errors_array, row, col = nil)
      errors_array.select do |error|
        error[:row] == row && (col.nil? || error[:col] == col)
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Dreader
2
- VERSION = "0.5.0"
2
+ VERSION = "1.1.0"
3
3
  end