dreader 0.5.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,473 @@
1
+ require "roo"
2
+ require "logger"
3
+ require "fast_excel"
4
+
5
+ require "dreader/column"
6
+ require "dreader/options"
7
+ require "dreader/util"
8
+
9
+ module Dreader
10
+ #
11
+ # This is where the real stuff begins
12
+ #
13
+ class Engine
14
+ # TODO: make the writer into private methods (need to be accessed only
15
+ # in the initializer) and demote to attr_reader
16
+
17
# Read/write state of an engine instance:
#   options     - the options we passed
#   colspec     - the specification of the columns to process
#   examples    - some example lines
#   virtualcols - the specification of the virtual columns
#   mapping     - the mapping rules
attr_accessor :options, :colspec, :examples, :virtualcols, :mapping

# the data we read (filled in by `read`)
attr_reader :table
30
+
31
# variables declared in the class which need to be propagated in
# the instance (frozen: the list is a constant and must not be mutated)
INSTANTIATE = %i[options colspec examples virtualcols].freeze
34
+
35
# Build an engine instance from the declarations made at class level.
#
# A logger writing warnings (and above) to $stdout is installed; `read`
# can replace it through the :logger / :logger_level options.
def initialize
  @logger = Logger.new($stdout)
  @logger.level = Logger::WARN

  # populate the instance with the variables defined in the class
  @options = defined?(@@options) ? @@options : {}
  @colspec = defined?(@@colspec) ? @@colspec : []
  @examples = defined?(@@examples) ? @@examples : []
  @virtualcols = defined?(@@virtualcols) ? @@virtualcols : []
  # the block declared with the class-level `mapping` DSL must reach the
  # instance too, otherwise `process` has nothing to call
  @mapping = defined?(@@mapping) ? @@mapping : nil
end
45
+
46
# DSL entry point for options.
#
# The block is evaluated against a fresh Options object; what that object
# returns from `to_hash` is stored in the class-level options.
def self.options(&block)
  dsl = Options.new.tap { |recorder| recorder.instance_eval(&block) }
  @@options = dsl.to_hash
end
55
+
56
# DSL entry point for the specification of one column.
#
# - `name` is the name of the column; when it is a Hash, its first key is
#   used as the name and its first value as the column reference
# - `block` is evaluated against a new Column (it typically contains the
#   `process` and `check` declarations used to turn a cell into the
#   desired data and to verify it)
def self.column(name, &block)
  spec = Column.new
  spec.instance_eval(&block)

  identity =
    if name.instance_of?(Hash)
      { name: name.keys.first, colref: name.values.first }
    else
      { name: name }
    end

  @@colspec ||= []
  @@colspec << spec.to_hash.merge(identity)
end
75
+
76
# DSL entry point to declare several columns at once.
#
# - `hash` has the form { symbolic_name: colref }
# - `block` (optional) is evaluated against every column declared
#
#   columns({ name: "B", age: "C" })
#
# is equivalent to:
#
#   column :name do
#     colref "B"
#   end
#   column :age do
#     colref "C"
#   end
#
# while
#
#   columns({ name: "B", age: "C" }) do
#     process do |cell|
#       cell.strip
#     end
#   end
#
# is equivalent to:
#
#   column :name do
#     colref "B"
#     process do |cell|
#       cell.strip
#     end
#   end
#   column :age do
#     colref "C"
#     process do |cell|
#       cell.strip
#     end
#   end
def self.columns(hash, &block)
  hash.each do |name, reference|
    spec = Column.new
    spec.colref reference
    spec.instance_eval(&block) if block

    @@colspec ||= []
    @@colspec << spec.to_hash.merge({ name: name })
  end
end
119
+
120
# Append `hash` to the class-level list of example rows (the list is
# created on first use).
def self.example(hash)
  @@examples = [] unless defined?(@@examples) && @@examples
  @@examples.push(hash)
end
124
+
125
# Declare a virtual column, that is, a derived attribute.
#
# The code specified in the virtual column is executed after reading
# a row and before applying the mapping function.
#
# Virtual column declarations are executed in the order in which
# they are defined.
def self.virtual_column(name, &block)
  spec = Column.new.tap { |column| column.instance_eval(&block) }
  (@@virtualcols ||= []) << spec.to_hash.merge({ name: name })
end
138
+
139
+ # store the block describing what we do with each line we read
139
+ # - `block` is the code which takes as input a `row` and processes it;
140
+ # `row` is a hash in which each spreadsheet cell is accessible under
141
+ # the column names. Each cell built by `read` has the following keys:
142
+ # :value, :error, :row, :col
143
+ def self.mapping(&block)
144
+ @@mapping = block
145
+ end
147
+
148
# read a file and store it internally
#
# @param args [Hash] possibly overriding any of the parameters set in the
#   initial options. This allows you, for instance, to apply the same
#   column specification to different files and different sheets.
#   Keys read here: :filename, :sheet, :first_row, :last_row, :logger,
#   :logger_level, :debug and :n (number of rows read in debug mode,
#   10 by default)
#
# @return [Array<Hash>] the data read from filename: one hash per row with
#   :row_number, :row_errors and, under each column name, a hash with
#   :row, :col, :value and :error
#
# @raise [ArgumentError] if `args` is not a Hash
def read(args = {})
  unless args.is_a?(Hash)
    message = "#{__callee__}: this function takes a Hash as input"
    @logger.error message
    raise ArgumentError, message
  end

  options = (@options || {}).merge(args)

  @logger = options[:logger] if options[:logger]
  @logger.level = options[:logger_level] if options[:logger_level]
  @debug = options[:debug] == true

  spreadsheet = Dreader::Engine.open_spreadsheet(options[:filename])
  sheet = spreadsheet.sheet(options[:sheet] || 0)
  first_row = options[:first_row] || 1
  last_row = options[:last_row] || sheet.last_row

  if @debug
    # override some defaults
    @logger.level = Logger::DEBUG

    # override the number of lines read; the window starts at the
    # effective first row, so it also works when :first_row is not given
    options[:n] ||= 10
    last_row = first_row + options[:n].to_i - 1

    # apply some defaults for debugging, if not defined in the options
    %i[check_raw process check].each do |key|
      options[key] = true unless options.key?(key)
    end
  end

  { current: @options, debug: options }.each do |label, config|
    @logger.debug "#{label.capitalize} configuration:"
    (config || {}).each do |key, value|
      @logger.debug " #{key}: #{value}"
    end
  end

  @table = []
  @errors = []

  (first_row..last_row).each do |row_number|
    r = { row_number: row_number, row_errors: [] }

    @colspec.each do |colspec|
      colref = colspec[:colref]
      colname = colspec[:name]
      cell = sheet.cell(row_number, colref)

      r[colname] = {
        row: row_number,
        col: colref,
        value: cell,
        error: false
      }
      @logger.debug "#{__callee__} [ROW DECL] Processing row: #{row_number}, col: #{colref}"

      # check raw data
      check_data(colspec[:checks_raw], r, colname)

      # process data
      position = coord(row_number, colref, cell)
      begin
        processed = colspec[:process] ? colspec[:process].call(cell) : cell
        @logger.debug "process on #{colname} at #{position} yields: '#{processed}' (#{processed.class})"
        r[colname][:value] = processed
      rescue => e
        @logger.error "#{__callee__}: process on :#{colname} raised an exception at #{position}"
        raise e
      end

      # check data after process - notice that now r contains the value
      # processed by process
      check_data(colspec[:checks], r, colname)
    end

    @table << r
  end

  @table
end
238
+
239
+ alias load read
240
+
241
# Run `read` with the :debug option forced to true, on top of whatever
# is given in `args` (in debug mode `read` logs at DEBUG level and
# limits the number of rows it reads).
def debug(args = {})
  debug_args = args.merge(debug: true)
  read(debug_args)
end
246
+
247
# get (processed) row number
#
# - row_number is the row to get: index starts at 1.
#
# get_row(1) get the first line read, that is, the row specified
# by `first_row` in `options` (or in read)
#
# You need to invoke read first: when no table has been read yet, or
# `row_number` is past the last row read, an error is logged and the
# process exits.
#
# @return [Hash, nil] the row, or nil (after logging an error) when
#   `row_number` is zero or negative
def get_row(row_number)
  if @table.nil? || row_number > @table.size
    @logger.error "#{__callee__}: 'row_number' is out of range (did you invoke read?)"
    # NOTE(review): terminating the process from library code is drastic;
    # kept because callers may rely on it
    exit
  elsif row_number <= 0
    @logger.error "#{__callee__}: 'row_number' is zero or negative (first row is 1)."
    nil
  else
    @table[row_number - 1]
  end
end
265
+
266
+ # return an array of hashes with all the errors we have encountered
266
+ # (an empty array is good news); the array is reset by each `read`
267
+ attr_reader :errors
269
+
270
# Compute the virtual columns of every row of the table read.
#
# For each row and each virtual column (in declaration order): the raw
# checks are run on the full row, the `process` code (if any) is called
# with the row and its result stored as the :value of the column, then
# the regular checks are run on the full row.
#
# `hash` is accepted but not used.
#
# @return the table
def virtual_columns(hash = {})
  @table.each do |row|
    @virtualcols.each do |spec|
      colname = spec[:name]
      row[colname] = { virtual: true }

      check_data(spec[:checks_raw], row, colname, full_row: true)

      begin
        # add the cell to the table
        compute = spec[:process]
        row[colname][:value] = compute.call(row) if compute
      rescue => e
        @logger.error "#{__callee__}: process for virtual column :#{colname} raised an exception at row #{row[:row_number]}"
        raise e
      end

      # at this point the processed value of the virtual column is
      # available to the checks as well
      check_data(spec[:checks], row, colname, full_row: true)
    end
  end
end
295
+
296
# Call the mapping code once for every row of the table.
# It makes sense to invoke it only once.
#
# Nothing is called when no mapping is defined.
#
# @return the table
def process
  @table.each do |row|
    next unless @mapping

    @mapping.call(row)
  end
end
303
+
304
# String form of the engine: the string form of the table read so far.
def to_s
  rows = @table
  rows.to_s
end
307
+
308
+ #
309
+ # headers validation functions
310
+ #
311
# Tell whether every header found in the file matches the column
# specification.
#
# @param hash [Hash] options overriding the ones of the instance; they
#   are handed over to `compare_headers`
# @return [Boolean] true when all the columns reported by
#   `compare_headers` are marked as correct
def correct_headers?(hash = {})
  compare_headers(hash).each_value.all? { |column| column[:correct] == true }
end
315
+
316
# Compare the headers found in the file with those expected from the
# column specification.
#
# The expected header of a column is its name, split on "_" and with each
# word capitalized (:first_name gives "First Name"). The header row is
# the one right above `first_row`, and never above row 1; row 1 is used
# when `first_row` is not set.
#
# @param hash [Hash] options overriding the ones of the instance
#   (:filename, :sheet and :first_row are read here)
# @return [Hash] for each column reference, a hash with :header_in_spec,
#   :human_readable, :header_in_file and :correct
def compare_headers(hash = {})
  options = @options.merge(hash)

  spreadsheet = Dreader::Engine.open_spreadsheet(options[:filename])
  sheet = spreadsheet.sheet(options[:sheet] || 0)
  header_row_number = [(options[:first_row] || 1) - 1, 1].max

  @colspec.each_with_object({}) do |colspec, output|
    cell = sheet.cell(header_row_number, colspec[:colref])
    human_readable = colspec[:name].to_s.split("_").map(&:capitalize).join(" ")

    output[colspec[:colref]] = {
      header_in_spec: colspec[:name],
      human_readable: human_readable,
      header_in_file: cell,
      correct: cell == human_readable
    }
  end
end
338
+
339
# generate a template from the column specification
#
# - the first row written is a header, determined by colspec (column
#   names split on "_" and capitalized, in bold)
# - the second row includes the documentation string (:doc), to document
#   values in the columns
# - the example records follow
#
# An example key which does not match any column is reported through the
# logger and skipped.
#
# @param hash [Hash] options overriding the ones of the instance
#   (:template_filename and :first_row are read here)
# @return whatever closing the workbook returns
def template(hash = {})
  options = @options.merge(hash)
  filename = options[:template_filename]

  workbook = FastExcel.open(filename, constant_memory: true)
  worksheet = workbook.add_worksheet("Template")
  worksheet.auto_width = true

  # first_row is indexed from 1 by roo and from 0 by FastExcel
  first_row = [(options[:first_row] || 0) - 2, 0].max
  bold = workbook.bold_format

  #
  # cells are written one row at a time, top to bottom (presumably a
  # requirement of the constant_memory mode -- TODO confirm)
  #

  # here we write the first row
  @colspec.each do |colspec|
    human_readable = colspec[:name].to_s.split("_").map(&:capitalize).join(" ")
    worksheet.write_string(first_row, colref_to_i(colspec[:colref]), human_readable, bold)
  end

  # here we create a note with the legenda
  @colspec.each do |colspec|
    worksheet.write_string(first_row + 1, colref_to_i(colspec[:colref]), colspec[:doc], nil)
  end

  # here we write some example records
  # NOTE(review): examples start at row index 3 whatever first_row is --
  # confirm this is intended
  @examples.each_with_index do |example_hash, index|
    example_hash.each do |colname, value|
      colspec = @colspec.find { |spec| spec[:name] == colname }
      if colspec
        worksheet.write_string(index + 3, colref_to_i(colspec[:colref]), value, nil)
      else
        @logger.error "generate_template: #{colname} used in example is not defined"
      end
    end
  end

  workbook.close
end
387
+
388
+ private
389
+
390
# Open `filename` with the Roo class matching its extension.
#
# The extension is matched ignoring case, so "DATA.XLSX" is opened like
# "data.xlsx". Although it follows the `private` keyword, this is a
# `def self.` method, which `private` does not affect: it is invoked as
# Dreader::Engine.open_spreadsheet.
#
# @param filename [String] path of the file to open
# @return the Roo object for the file
# @raise [RuntimeError] if the extension is not .csv, .tsv, .ods, .xls
#   or .xlsx
def self.open_spreadsheet(filename)
  ext = File.extname(filename)

  case ext.downcase
  when ".csv" then Roo::CSV.new(filename)
  when ".tsv" then Roo::CSV.new(filename, csv_options: { col_sep: "\t" })
  when ".ods" then Roo::OpenOffice.new(filename)
  when ".xls" then Roo::Excel.new(filename)
  when ".xlsx" then Roo::Excelx.new(filename)
  else raise "Unknown extension: #{ext}"
  end
end
402
+
403
# Convert a column reference into a 0-based column index, as needed to
# write cells in `template`.
#
# - letters are read as a spreadsheet column name, ignoring case:
#   "A" => 0, "Z" => 25, "AA" => 26
# - an Integer is taken to be 1-based (the same value is handed to
#   `sheet.cell` when reading) and is shifted to 0-based
#
# @param colref [String, Symbol, Integer] the column reference
# @return [Integer] the 0-based index of the column
def colref_to_i(colref)
  return colref - 1 if colref.is_a?(Integer)

  one_based = colref.to_s.upcase.each_char.reduce(0) do |value, char|
    value * 26 + (char.ord - "A".ord + 1)
  end
  one_based - 1
end
413
+
414
# performs check on data (this is the same code for check_raw and
# check)
#
# Every check is called with the value under test; any result other
# than `true` is an error: the column is flagged (:error set to true)
# and a description of the error is appended both to @errors and to the
# :row_errors of the row.
#
# @params
# - check_spec :: the set of checks to perform, as pairs
#                 error_message => lambda; nil means no checks
#
# - hash :: the hash of the row being checked
#
# - colname :: the name of the column we are checking (if we check
#              a column) or the column we are computing (virtual
#              columns)
#
# - full_row :: if true, pass the full row to check rather than
#               the value of a column
def check_data(check_spec, hash, colname, full_row: false)
  (check_spec || {}).each do |error_message, check_function|
    # what is checked depends on whether we look at a column or at a
    # whole row
    if full_row
      value = hash
      row = hash[:row_number]
      col = :multiple_columns
    else
      cell = hash[colname]
      value = cell[:value]
      row = cell[:row]
      col = cell[:col]
    end
    position = coord(row, col, value)

    begin
      pass = check_function.call(value)
      @logger.debug "check for #{colname}/#{error_message} at #{position} yields: '#{pass}'"

      if pass != true
        hash[colname][:error] = true
        error = {
          callee: __callee__,
          cell_value: value, colname: colname,
          row: row, col: col,
          message: error_message,
          content: pass
        }
        @errors << error
        hash[:row_errors] << error
      end
    rescue => e
      @logger.error "#{__callee__} for #{colname}/#{error_message} raised an exception at #{position}"
      raise e
    end
  end
end
468
+
469
# Describe a cell (its row, its column and its value) for log messages.
def coord(row, col, cell)
  format("row: %s, col: %s, value: %s", row, col, cell)
end
472
+ end
473
+ end
@@ -0,0 +1,16 @@
1
module Dreader
  # Service class implementing the options DSL: every method called on
  # an instance is recorded as an option.
  class Options
    def initialize
      @attributes = {}
    end

    # Record `name` as an option whose value is the first argument of the
    # call (nil when there is none); further arguments and the block are
    # ignored.
    def method_missing(name, *args, &block)
      @attributes.store(name, args.first)
    end

    # The options recorded so far, as a hash name => value.
    def to_hash
      @attributes
    end
  end
end
@@ -0,0 +1,71 @@
1
module Dreader
  # Utility functions to simplify importing data into
  # ActiveRecords
  class Util
    # given a hash returned by Engine, return the same hash with
    # keys directly bound to the content of the :value sub-key
    #
    # Example
    #
    #   hash = {name: {value: "A", ...}, surname: {value: "B", ...}}
    #   simplify hash
    #   {name: "A", surname: "B"}
    #
    # remove all keys which are not part of the data read (row_number and
    # row_errors)
    def self.simplify(hash)
      (hash.keys - %i[row_number row_errors]).map do |colname|
        [colname, hash[colname][:value]]
      end.to_h
    end

    # given a hash returned by Engine, keep the "kept" keys in the top
    # of the hierarchy and move the "moved_keys" below the
    # "subordinate_key"
    #
    # Example
    #
    #   hash = { name: "A", surname: "B", address: "via XX Settembre", city: "Genoa" }
    #   restructure hash, [:name, :surname], :address_attributes, [:address, :city]
    #   {name: "A", surname: "B", address_attributes: {address: "via XX Settembre", city: "Genoa"}}
    #
    def self.restructure(hash, kept, subordinate_key, moved_keys)
      # `kept` and `moved_keys` are arrays of keys: they must be splatted
      # for Hash#slice, which is what the `slice` helper does
      head = slice(hash, kept)
      subordinate = prepend(slice(hash, moved_keys), subordinate_key)
      head.merge subordinate
    end

    # an alias for Hash.slice
    # keys is an array of keys
    def self.slice(hash, keys)
      hash.slice(*keys)
    end

    # remove all `keys` from `hash`
    def self.clean(hash, keys)
      hash.reject { |key, _| keys.include?(key) }
    end

    # given a hash, return a new hash with key and whose value is
    # the hash
    #
    # Example:
    #
    #   hash = {name: "A", size: 10}
    #   prepend hash, :product_attributes
    #   {product_attributes: {name: "A", size: 10}}
    #
    # NOTE(review): on this class the method replaces Module#prepend
    def self.prepend(hash, key)
      { key => hash }
    end

    #
    # Retrieve all errors related to row/col from an Array of error messages
    # (all the errors of the row, when col is nil)
    #
    def self.errors(errors_array, row, col = nil)
      errors_array.select do |error|
        error[:row] == row && (col.nil? || error[:col] == col)
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Dreader
2
- VERSION = "0.5.0"
2
+ VERSION = "1.0.0"
3
3
  end