dreader 0.4.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,473 @@
1
+ require "roo"
2
+ require "logger"
3
+ require "fast_excel"
4
+
5
+ require "dreader/column"
6
+ require "dreader/options"
7
+ require "dreader/util"
8
+
9
+ module Dreader
10
+ #
11
+ # This is where the real stuff begins
12
+ #
13
+ class Engine
14
+ # TODO: make the writer into private methods (need to be accessed only
15
+ # in the initializer) and demote to attr_reader
16
+
17
+ # the options we passed
18
+ attr_accessor :options
19
+ # the specification of the columns to process
20
+ attr_accessor :colspec
21
+ # some example lines
22
+ attr_accessor :examples
23
+ # the specification of the virtual columns
24
+ attr_accessor :virtualcols
25
+ # the mapping rules
26
+ attr_accessor :mapping
27
+
28
+ # the data we read
29
+ attr_reader :table
30
+
31
+ # variables declared in the class which need to be propagated in
32
+ # the instance
33
# Names of the class-level DSL settings that are copied into each instance.
# Frozen so the shared list cannot be mutated by accident at runtime.
INSTANTIATE = %i[options colspec examples virtualcols].freeze
34
+
35
# Build an engine instance from the class-level DSL declarations.
#
# Every setting declared through the class DSL is copied into the matching
# instance variable; settings that were never declared fall back to an
# empty default. The logger writes to $stdout at WARN level until `read`
# is given a different logger or level.
def initialize
  @logger = Logger.new($stdout)
  @logger.level = Logger::WARN

  # populate the instance with the variables defined in the class
  @options = defined?(@@options) ? @@options : {}
  @colspec = defined?(@@colspec) ? @@colspec : []
  @examples = defined?(@@examples) ? @@examples : []
  @virtualcols = defined?(@@virtualcols) ? @@virtualcols : []
  # the block stored by `self.mapping` must be propagated too, otherwise
  # `process` finds @mapping unset and silently does nothing
  @mapping = defined?(@@mapping) ? @@mapping : nil

  # start from empty results so that `table`, `errors`, `get_row` and
  # `process` can be used safely before the first call to `read`
  @table = []
  @errors = []
end
45
+
46
+ # define a DSL for options
47
+ # any string is processed as an option and it ends up in the
48
+ # @options hash
49
# DSL entry point for options.
#
# The block is evaluated against a fresh Options collector and the
# resulting hash is stored at class level.
#
# @return [Hash] the collected options
def self.options(&block)
  collector = Options.new
  collector.instance_eval(&block)
  @@options = collector.to_hash
end
55
+
56
+ # define a DSL for column specification
57
+ # - `name` is the name of the column
58
+ # - `block` contains two declarations, `process` and `check`, which are
59
+ # used, respectively, to make a cell into the desired data and to check
60
+ # whether the desired data is ok
61
# Declare a single column.
#
# @param name [Symbol, Hash] either the column name, or a one-entry hash
#   in the form { name => colref }
# @param block optional declarations evaluated against a Column
# @return [Array<Hash>] the column specifications declared so far
def self.column(name, &block)
  column = Column.new
  # the block is optional, consistently with `columns`: without this guard
  # a block-less declaration raises inside instance_eval
  column.instance_eval(&block) if block

  @@colspec ||= []

  # is_a? (rather than instance_of?) also accepts Hash subclasses
  identity =
    if name.is_a?(Hash)
      { name: name.keys.first, colref: name.values.first }
    else
      { name: name }
    end

  @@colspec << column.to_hash.merge(identity)
end
75
+
76
+ # define a DSL for multiple column specification (bulk_declare)
77
+ #
78
+ # - hash is a hash in the form { symbolic_name: colref }
79
+ #
80
+ # i.bulk_declare {name: "B", age: "C"} is equivalent to:
81
+ #
82
+ # i.column :name do
83
+ # colref "B"
84
+ # end
85
+ # i.column :age do
86
+ # colref "C"
87
+ # end
88
+ #
89
+ # i.bulk_declare {name: "B", age: "C"} do
90
+ # process do |cell|
91
+ # cell.strip
92
+ # end
93
+ # end
94
+ #
95
+ # is equivalent to:
96
+ #
97
+ # i.column :name do
98
+ # colref "B"
99
+ # process do |cell|
100
+ # cell.strip
101
+ # end
102
+ # end
103
+ # i.column :age do
104
+ # colref "C"
105
+ # process do |cell|
106
+ # cell.strip
107
+ # end
108
+ # end
109
# Declare several columns at once.
#
# @param hash [Hash] entries in the form { symbolic_name => colref }
# @param block optional declarations applied to every column in `hash`
# @return [Hash] the hash received as input
def self.columns(hash, &block)
  hash.each do |name, colref|
    spec = Column.new
    spec.colref colref
    spec.instance_eval(&block) if block

    @@colspec ||= []
    @@colspec << spec.to_hash.merge({ name: name })
  end
end
119
+
120
# Register an example record; examples are stored at class level in
# declaration order.
#
# @param hash [Hash] the example record
# @return [Array<Hash>] all the examples registered so far
def self.example(hash)
  (@@examples ||= []) << hash
end
124
+
125
+ # virtual columns define derived attributes
126
+ # the code specified in the virtual column is executed after reading
127
+ # a row and before applying the mapping function
128
+ #
129
+ # virtual column declarations are executed in the order in which
130
+ # they are defined
131
# Declare a virtual (derived) column.
#
# @param name [Symbol] the name of the virtual column
# @param block declarations evaluated against a Column
# @return [Array<Hash>] the virtual column specifications declared so far
def self.virtual_column(name, &block)
  spec = Column.new
  spec.instance_eval(&block)

  (@@virtualcols ||= []) << spec.to_hash.merge({ name: name })
end
138
+
139
+ # define what we do with each line we read
140
+ # - `block` is the code which takes as input a `row` and processes
141
+ # `row` is a hash in which each spreadsheet cell is accessible under
142
+ # the column names. Each cell has the following values:
143
+ # :value, :error, :row_number, :col_number
144
# Store, at class level, the block to be invoked on each row.
#
# @return [Proc, nil] the block received (nil when no block is given)
def self.mapping(&block)
  @@mapping = block
end
147
+
148
+ # read a file and store it internally
149
+ #
150
+ # @param hash, a hash, possibly overriding any of the parameters
151
+ # set in the initial options. This allows you, for
152
+ # instance, to apply the same column specification to
153
+ # different files and different sheets
154
+ #
155
+ # @return the data read from filename, in the form of an array of
156
+ # hashes
157
# Read a spreadsheet, apply the column declarations to every row and store
# the result in @table.
#
# @param args [Hash] overrides for the options declared with the DSL; the
#   keys read here are :filename, :sheet, :first_row, :last_row, :logger,
#   :logger_level, :debug and :n
# @return [Array<Hash>] one hash per row, with :row_number, :row_errors and
#   one entry per declared column
# @raise [ArgumentError] if `args` is not a Hash
def read(args = {})
  unless args.is_a?(Hash)
    message = "#{__callee__}: this function takes a Hash as input"
    @logger.error message
    # ArgumentError (with a message) instead of a bare Exception: callers
    # rescuing Exception still catch it, and a plain `rescue` now does too
    raise ArgumentError, message
  end

  options = (@options || {}).merge(args)

  @logger = options[:logger] if options[:logger]
  @logger.level = options[:logger_level] if options[:logger_level]
  @debug = options[:debug] == true

  spreadsheet = Dreader::Engine.open_spreadsheet(options[:filename])
  sheet = spreadsheet.sheet(options[:sheet] || 0)
  first_row = options[:first_row] || 1
  last_row = options[:last_row] || sheet.last_row

  if @debug
    # override some defaults
    @logger.level = Logger::DEBUG

    # override the number of lines read; the computation starts from the
    # *effective* first row, since options[:first_row] may be unset
    options[:n] ||= 10
    last_row = first_row + options[:n].to_i - 1

    # apply some defaults for debugging, if not defined in the options
    %i[check_raw process check].each do |key|
      options[key] = true unless options.key?(key)
    end
  end

  { current: @options || {}, debug: options }.each do |k, v|
    @logger.debug "#{k.capitalize} configuration:"
    v.each do |key, value|
      @logger.debug " #{key}: #{value}"
    end
  end

  @table = []
  @errors = []

  (first_row..last_row).each do |row_number|
    r = { row_number: row_number, row_errors: [] }

    @colspec.each do |spec|
      colref = spec[:colref]
      cell = sheet.cell(row_number, colref)
      colname = spec[:name]

      r[colname] = {
        row: row_number,
        col: colref,
        value: cell,
        error: false
      }
      @logger.debug "#{__callee__} [ROW DECL] Processing row: #{row_number}, col: #{colref}"

      # check raw data
      check_data(spec[:checks_raw], r, colname)

      # process data
      position = coord(row_number, colref, cell)
      begin
        processed = spec[:process] ? spec[:process].call(cell) : cell
        @logger.debug "process on #{colname} at #{position} yields: '#{processed}' (#{processed.class})"
        r[colname][:value] = processed
      rescue => e
        @logger.error "#{__callee__}: process on :#{colname} raised an exception at #{position}"
        raise e
      end

      # check data after process - notice that now r contains the value
      # processed by process
      check_data(spec[:checks], r, colname)
    end

    @table << r
  end

  @table
end
238
+
239
+ alias load read
240
+
241
+ # show to stdout the first `n` records we read from the file given the
242
+ # current configuration
243
# Run `read` with the :debug option forced to true.
#
# @param args [Hash] the same overrides accepted by `read`
# @return whatever `read` returns
def debug(args = {})
  debug_args = args.merge(debug: true)
  read(debug_args)
end
246
+
247
+ # get (processed) row number
248
+ #
249
+ # - row_number is the row to get: index starts at 1.
250
+ #
251
+ # get_row(1) get the first line read, that is, the row specified
252
+ # by `first_row` in `options` (or in read)
253
+ #
254
+ # You need to invoke read first
255
# Return one of the rows stored by `read`.
#
# @param row_number [Integer] 1-based position in the table
# @return [Hash, nil] the row, or nil (after logging an error) when
#   `row_number` is out of range or nothing has been read yet
def get_row(row_number)
  rows = @table || []

  if row_number <= 0
    @logger.error "#{__callee__}: 'row_number' is zero or negative (first row is 1)."
    nil
  elsif row_number > rows.size
    # a library must not terminate the host process: log and return nil
    # instead of calling `exit`
    @logger.error "#{__callee__}: 'row_number' is out of range (did you invoke read?)"
    nil
  else
    rows[row_number - 1]
  end
end
265
+
266
+ # return an array of hashes with all the errors we have encountered
267
+ # an empty array is a good news
268
+ attr_reader :errors
269
+
270
# Compute the virtual columns for every row of @table.
#
# For each row and each virtual column, in declaration order: add a
# placeholder cell, run the raw checks on the full row, store the result of
# `process` (when declared) as the cell value, then run the checks again.
#
# @param hash [Hash] accepted but not used
# @return [Array<Hash>] the table
def virtual_columns(hash = {})
  @table.each do |row|
    @virtualcols.each do |spec|
      name = spec[:name]
      row[name] = { virtual: true }

      check_data(spec[:checks_raw], row, name, full_row: true)

      begin
        # add the cell to the table
        processor = spec[:process]
        row[name][:value] = processor.call(row) if processor
      rescue => e
        failing_row = row[:row_number]
        @logger.error "#{__callee__}: process for virtual column :#{name} raised an exception at row #{failing_row}"
        raise e
      end

      # the row now also holds the processed value of the virtual column
      check_data(spec[:checks], row, name, full_row: true)
    end
  end
end
295
+
296
+ # apply the mapping code to the array
297
+ # it makes sense to invoke it only once
298
+ #
299
+ # the mapping is applied only if it is defined
300
# Invoke the mapping on every row of the table; rows are left untouched
# when no mapping is set.
#
# @return [Array<Hash>] the table
def process
  @table.each do |row|
    next if @mapping.nil?

    @mapping.call(row)
  end
end
303
+
304
# String representation of the engine: the string form of the table.
def to_s
  "#{@table}"
end
307
+
308
+ #
309
+ # headers validation functions
310
+ #
311
# Tell whether every header found in the file matches the column
# specification.
#
# @param hash [Hash] option overrides, forwarded to `compare_headers`
# @return [Boolean] true only if every column is reported as correct
def correct_headers?(hash = {})
  # the overrides must reach compare_headers, otherwise a filename or
  # sheet passed here is silently ignored
  compare_headers(hash).values.all? { |column| column[:correct] == true }
end
315
+
316
# Compare the header row of the file with the headers expected from the
# column specification.
#
# The expected header of a column is its name with underscores replaced by
# spaces and every word capitalized (:first_name => "First Name"). The
# header row is taken to be the row right before :first_row.
#
# @param hash [Hash] overrides for the options (:filename, :sheet,
#   :first_row)
# @return [Hash] one entry per column, keyed by colref, with
#   :header_in_spec, :human_readable, :header_in_file and :correct
def compare_headers(hash = {})
  options = (@options || {}).merge(hash)

  spreadsheet = Dreader::Engine.open_spreadsheet(options[:filename])
  sheet = spreadsheet.sheet(options[:sheet] || 0)
  # default to row 1 when :first_row is not set, and never go below 1,
  # since roo rows are numbered from 1
  header_row_number = [(options[:first_row] || 2) - 1, 1].max

  @colspec.each_with_object({}) do |spec, output_hash|
    cell = sheet.cell(header_row_number, spec[:colref])
    human_readable = spec[:name].to_s.split("_").map(&:capitalize).join(" ")

    output_hash[spec[:colref]] = {
      header_in_spec: spec[:name],
      human_readable: human_readable,
      header_in_file: cell,
      correct: cell == human_readable
    }
  end
end
338
+
339
+ # generate a template from the column specification
340
+ # first row is a header, determined by colspec
341
+ # second row includes the documentation string, to document values in
342
+ # the columns
343
# Generate a spreadsheet template from the column specification.
#
# The header row holds the human readable column names, the row after it
# holds the documentation string of each column, and the example records
# follow. Rows are written strictly top to bottom.
#
# @param hash [Hash] overrides for the options (:template_filename,
#   :first_row)
# @return the value returned by closing the workbook
def template(hash = {})
  options = (@options || {}).merge(hash)
  filename = options[:template_filename]

  workbook = FastExcel.open(filename, constant_memory: true)
  worksheet = workbook.add_worksheet("Template")
  worksheet.auto_width = true

  # first_row is indexed from 1 by roo and from 0 by FastExcel; the header
  # sits on the row right before the first data row
  header_row = [(options[:first_row] || 0) - 2, 0].max
  bold = workbook.bold_format

  # here we write the first row
  @colspec.each do |spec|
    human_readable = spec[:name].to_s.split("_").map(&:capitalize).join(" ")
    worksheet.write_string(header_row, colref_to_i(spec[:colref]), human_readable, bold)
  end

  # here we create a note with the legenda
  @colspec.each do |spec|
    worksheet.write_string(header_row + 1, colref_to_i(spec[:colref]), spec[:doc], nil)
  end

  # here we write some example records; they are placed relative to the
  # header (same layout as before when the header is on the first row), so
  # that they can never overlap the header or the legenda
  @examples.each_with_index do |example_hash, index|
    example_hash.each do |colname, value|
      spec = @colspec.find { |x| x[:name] == colname }
      if spec
        worksheet.write_string(header_row + 3 + index, colref_to_i(spec[:colref]), value, nil)
      else
        # Logger has no `err` method: use `error`
        @logger.error "generate_template: #{colname} used in example is not defined"
      end
    end
  end

  workbook.close
end
387
+
388
+ private
389
+
390
# Open a spreadsheet with the Roo reader matching the file extension.
#
# NOTE: `private` has no effect on methods defined with `def self.`, and
# this method is invoked with an explicit receiver elsewhere in the class,
# so it is (and has to stay) publicly callable.
#
# @param filename [String] path of the file to open
# @return the Roo object for the file
# @raise [RuntimeError] if the extension is not one of .csv, .tsv, .ods,
#   .xls, .xlsx (compared ignoring case)
def self.open_spreadsheet(filename)
  # to_s turns a missing filename into an "Unknown extension" error
  # rather than an obscure TypeError
  ext = File.extname(filename.to_s)

  # match ignoring case, so that "DATA.XLSX" is accepted as well
  case ext.downcase
  when ".csv" then Roo::CSV.new(filename)
  when ".tsv" then Roo::CSV.new(filename, csv_options: { col_sep: "\t" })
  when ".ods" then Roo::OpenOffice.new(filename)
  when ".xls" then Roo::Excel.new(filename)
  when ".xlsx" then Roo::Excelx.new(filename)
  else raise "Unknown extension: #{ext}"
  end
end
402
+
403
# Convert a column reference into a 0-based column index.
#
# Letters are read as a bijective base-26 number, ignoring case:
# "A" => 0, "Z" => 25, "AA" => 26.
# Integers are taken to be 1-based column numbers (they are passed
# unchanged to roo when reading, and roo numbers columns from 1), so they
# are shifted by one to designate the same column as the matching letter.
#
# @param colref [String, Symbol, Integer] the column reference
# @return [Integer] the 0-based column index
def colref_to_i(colref)
  return colref - 1 if colref.is_a?(Integer)

  one_based = colref.to_s.upcase.each_char.reduce(0) do |total, char|
    total * 26 + (char.ord - "A".ord + 1)
  end
  one_based - 1
end
413
+
414
+ # performs check on data (this is the same code for check_raw and
415
+ # check)
416
+ #
417
+ # @params
418
+ # - check_spec :: the set of checks to perform (an array of hashes
419
+ # in the form error_message: lambda
420
+ #
421
+ # - hash :: either the hash of a column or the hash of a row
422
+ #
423
+ # - colname :: the name of the column we are checking (if we check
424
+ # a column) or the column we are computing (virtual
425
+ # columns)
426
+ #
427
+ # - full_row :: if true, pass the full row to check rather than
428
+ # the value of a column
429
+ #
430
+ # - debug :: a boolean
431
# Run a set of checks on a cell or on a full row, recording every failure.
#
# A check passes only if it returns exactly `true`; any other value is
# stored as the :content of the error. Each failure marks the cell as in
# error and is appended both to @errors and to the :row_errors of the row.
#
# @param check_spec [Hash, nil] checks in the form error_message => lambda;
#   nil is treated as "no checks"
# @param hash [Hash] the row being read
# @param colname [Symbol] the column being checked or computed
# @param full_row [Boolean] when true the whole row is passed to each
#   check, rather than the value of the column
def check_data(check_spec, hash, colname, full_row: false)
  # a column declared without checks must not break reading
  (check_spec || {}).each do |error_message, check_function|
    # here we extract values by distinguishing whether the hash is that of
    # column or that of a row
    if full_row
      value = hash
      row = hash[:row_number]
      col = :multiple_columns
    else
      value = hash[colname][:value]
      row = hash[colname][:row]
      col = hash[colname][:col]
    end
    position = coord(row, col, value)

    begin
      pass = check_function.call(value)
      @logger.debug "check for #{colname}/#{error_message} at #{position} yields: '#{pass}'"

      unless pass == true
        hash[colname][:error] = true
        error = {
          callee: __callee__,
          cell_value: value, colname: colname,
          row: row, col: col,
          message: error_message,
          content: pass
        }
        @errors << error
        hash[:row_errors] << error
      end
    rescue => e
      @logger.error "#{__callee__} for #{colname}/#{error_message} raised an exception at #{position}"
      raise e
    end
  end
end
468
+
469
# Describe the position and content of a cell, for log messages.
#
# @return [String] "row: <row>, col: <col>, value: <cell>"
def coord(row, col, cell)
  format("row: %s, col: %s, value: %s", row, col, cell)
end
472
+ end
473
+ end
@@ -0,0 +1,16 @@
1
+ module Dreader
2
+ # service class to implement the options DSL language
3
# Collector behind the options DSL: every message sent to an instance is
# recorded as an option named after the message.
class Options
  def initialize
    @attributes = {}
  end

  # Record `name` as an option whose value is the first argument received
  # (nil when there is none); further arguments and the block are ignored.
  #
  # @return the value stored
  def method_missing(name, *args, &block)
    @attributes.store(name, args.first)
  end

  # @return [Hash] the options collected so far (the internal hash itself,
  #   not a copy)
  def to_hash
    @attributes
  end
end
16
+ end
@@ -0,0 +1,71 @@
1
+ module Dreader
2
+ # Utilities function to simplify importing data into
3
+ # ActiveRecords
4
# Utility functions to simplify importing the rows returned by Engine
# (e.g. into ActiveRecord models).
class Util
  # Keys added by Engine to each row which are not part of the data read.
  BOOKKEEPING_KEYS = %i[row_number row_errors].freeze

  # Bind each column name directly to the content of its :value sub-key,
  # dropping the bookkeeping keys (:row_number and :row_errors).
  #
  #   simplify({ name: { value: "A" }, surname: { value: "B" } })
  #   # => { name: "A", surname: "B" }
  #
  # @param hash [Hash] a row returned by Engine
  # @return [Hash] column name => value
  def self.simplify(hash)
    hash.each_with_object({}) do |(colname, cell), simplified|
      next if BOOKKEEPING_KEYS.include?(colname)

      simplified[colname] = cell[:value]
    end
  end

  # Keep the `kept` keys at the top level and move the `moved_keys` below
  # `subordinate_key`.
  #
  #   hash = { name: "A", surname: "B", address: "via XX Settembre", city: "Genoa" }
  #   restructure(hash, [:name, :surname], :address_attributes, [:address, :city])
  #   # => { name: "A", surname: "B",
  #   #      address_attributes: { address: "via XX Settembre", city: "Genoa" } }
  #
  # @param hash [Hash] the hash to restructure
  # @param kept [Array] keys which remain at the top level
  # @param subordinate_key the key under which `moved_keys` are nested
  # @param moved_keys [Array] keys to nest
  # @return [Hash] the restructured hash
  def self.restructure(hash, kept, subordinate_key, moved_keys)
    # Hash#slice wants the keys as separate arguments (passing the array
    # selects nothing) and prepend takes the hash first, then the key
    head = slice(hash, kept)
    subordinate = prepend(slice(hash, moved_keys), subordinate_key)
    head.merge(subordinate)
  end

  # Hash#slice taking the keys as an array.
  #
  # @param keys [Array] the keys to keep
  def self.slice(hash, keys)
    hash.slice(*keys)
  end

  # Return a copy of `hash` without the entries whose key is in `keys`.
  def self.clean(hash, keys)
    hash.reject { |key, _| keys.include?(key) }
  end

  # Wrap `hash` in a new hash, under `key`.
  #
  #   prepend({ name: "A", size: 10 }, :product_attributes)
  #   # => { product_attributes: { name: "A", size: 10 } }
  #
  # NOTE: on this class, this method shadows Module#prepend.
  def self.prepend(hash, key)
    { key => hash }
  end

  # Select, from an array of errors, those related to `row` and, when
  # `col` is given, to that column only.
  #
  # @param errors_array [Array<Hash>] errors with :row and :col keys
  # @return [Array<Hash>] the matching errors
  def self.errors(errors_array, row, col = nil)
    errors_array.select do |error|
      error[:row] == row && (col.nil? || error[:col] == col)
    end
  end
end
71
+ end
@@ -1,3 +1,3 @@
1
1
  module Dreader
2
- VERSION = "0.4.2"
2
+ VERSION = "1.0.0"
3
3
  end