hlsv 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,979 @@
1
+ # Copyright (c) 2026 AdClin
2
+ # Licensed under the GNU Affero General Public License v3.0 or later.
3
+ # See the LICENSE file for details.
4
+
5
+ require 'yaml'
6
+ require 'nokogiri'
7
+ require 'csv'
8
+ require 'fileutils'
9
+ require 'pathname'
10
+ require 'erb'
11
+
12
+ require 'spreadsheet'
13
+ require 'rubyXL'
14
+ require 'rubyXL/convenience_methods/cell'
15
+ require 'rubyXL/convenience_methods/color'
16
+ require 'rubyXL/convenience_methods/font'
17
+ require 'rubyXL/convenience_methods/workbook'
18
+ require 'rubyXL/convenience_methods/worksheet'
19
+
20
+
21
+ require_relative 'xpt'
22
+ require_relative 'html2word'
23
+ module Hlsv
24
+ class FindKeys
25
+
26
+ attr_reader :verbose
27
+ attr_reader :config
28
+ attr_reader :ds_list
29
+ attr_reader :study_name
30
+ attr_reader :web_mode
31
+
32
+ attr_accessor :report
33
+ attr_accessor :ds_with_issue
34
+
35
# Build the checker from a configuration file.
#
# config_file - path to the configuration (loaded via load_config; YAML)
# verbose:    - when true, extra detail is emitted during the key search
# web_mode:   - when true, the report shows 'config.yaml' instead of the real
#               config path (web upload use case)
#
# Side effects: loads the config, scans the data directory for .xpt files,
# loads define.xml, creates the Report object and cleans the output directory.
def initialize(config_file, verbose: false, web_mode: false)

  @verbose = verbose
  @web_mode = web_mode
  # datasets flagged per issue family (filled by high_level_check)
  @ds_with_issue = {
    ascii: [],
    define: [],
    data: [],
  }

  # summary structure handed to the Report at the end of initialization
  report_init = {
    config_name: nil,
    data_information: {},
    define_information: {},
  }

  # load the config file
  # read YAML, csv, xls, xlsx only
  @config = load_config(config_file)
  # in web mode the real upload path is meaningless to the reader
  if @web_mode
    report_init[:config_name] = 'config.yaml'
  else
    report_init[:config_name] = config_file
  end

  # save the study name; it is also used as the report output directory
  @study_name = config['study_name']
  report_name = "#{@study_name}/#{File.basename(@study_name)}_high_level_check.html"
  @report = Report.new(report_name, @study_name, outname_ext)

  # get all xpt files in the directory present in the config file
  # (data_directory parameter); backslashes normalized for Windows paths
  dir = config['data_directory'].gsub('\\', '/')
  @ds_list = Dir["#{dir}/*"].select { |f| File.extname(f) == '.xpt' }

  report_init[:data_information] = {
    directory_name: dir,
    file_number: @ds_list.size,
  }

  # load define.xml with Nokogiri and extract the key variables:
  # {dataset => array of variable names}, '-' when no define requested,
  # nil when loading failed
  @define_keys, define_report = load_define()
  report_init[:define_information] = define_report

  @report.add_init(report_init)

  # remove all previous csv files, in case of rerun
  prepare_output_dir
end
85
+
86
###
# Check some high level information.
# Restriction: each dataset export is loaded only once to avoid performance issues.
# Process:
# - Load the dataset records
# - Check the presence of non-ASCII characters in data
# - Check the validity of the key referenced in define.xml
# - Search the minimal key with the variables list present in the config file
def high_level_check

  @ds_list.each do |dsf|
    # dataset name, e.g. "dm.xpt" -> "DM"
    ds = File.basename(dsf).split('.')[0].upcase

    # shortcut if dataset in the excluded list
    next if config['excluded_ds'].include? ds

    # load all records
    # only XPT
    # TODO json
    records = load_dsfile(dsf)

    # initialize the report structure for this dataset
    ds_report = {
      record_count: records&.size,
      candidates_type: nil,
      ascii_check: nil,
      define_key_check: nil,
      data_key_check: nil,
      verbose_details: []
    }

    # shortcut if no records in the file
    if records.empty?
      ds_report[:record_count] = 0
      @report.add_dataset_report(ds, ds_report)
      next
    end

    # search non-ASCII characters
    ascii_issues, valid_ascii = ascii_search(records)
    ds_report[:ascii_check] = {
      valid: valid_ascii,
      issues: ascii_issues
    }

    @ds_with_issue[:ascii] << ds unless valid_ascii

    # check define keys
    # FIX: valid_define was previously only assigned in the first branch, so it
    # stayed nil when the dataset was absent from the define — datasets were
    # wrongly flagged as define issues even when the report said valid: true
    # (the '-' "no define requested" case).
    if @define_keys && @define_keys[ds]
      define_key, valid_define, duplicate_file = key_check_define(ds, @define_keys[ds], records)
      ds_report[:define_key_check] = {
        valid: valid_define,
        key: define_key,
        duplicate_file: duplicate_file
      }
    elsif @define_keys == '-'
      # no define requested: not an issue
      valid_define = true
      ds_report[:define_key_check] = {
        valid: true,
        absent: true,
      }
    else
      # dataset missing from the define (or define failed to load)
      valid_define = false
      ds_report[:define_key_check] = {
        valid: false,
        absent: true,
      }
    end

    @ds_with_issue[:define] << ds unless valid_define

    # check data keys
    data_key_info, valid_data, candidates_type = key_check_data(ds, records)
    ds_report[:data_key_check] = data_key_info
    ds_report[:candidates_type] = candidates_type
    @ds_with_issue[:data] << ds unless valid_data

    @report.add_dataset_report(ds, ds_report)
  end
end
165
+
166
# Search a valid key for +ds+ according to the config.
# Returns [result, valid, candidates_type] in the standard case.
# For the special SE and SV datasets, several alternative candidate lists are
# tried and an array of results is returned; valid is true when at least one
# list works (candidates_type stays nil for those, as before).
def key_check_data(ds, records)
  # special cases: datasets with several acceptable candidate key lists
  special_candidates = {
    'SE' => [
      [:USUBJID, :EPOCH, :SUBJID],
      [:USUBJID, :TAETORD, :SUBJID]
    ],
    'SV' => [
      [:USUBJID, :SVSTDTC, :SUBJID],
      [:USUBJID, :VISITNUM, :SUBJID]
    ]
  }

  if (candidates_lists = special_candidates[ds])
    # try every list and report each outcome
    results = candidates_lists.map do |keys_candidates|
      try_candidates(ds, keys_candidates, records)
    end
    # valid when at least one candidate list yields a valid key
    [results, results.any? { |r| r[:valid] }]
  else
    # standard case: a single candidate list derived from the config
    keys_candidates, candidates_type = candidates(ds)
    result = try_candidates(ds, keys_candidates, records)
    [result, result[:valid], candidates_type]
  end
end

# Run look_keys on one candidate list and normalize the outcome into the
# hash shape consumed by the report.
private def try_candidates(ds, keys_candidates, records)
  result = look_keys(ds, keys_candidates, records)
  if result[:valid]
    { valid: true, key: result[:key], candidates: keys_candidates }
  else
    { valid: false, last_valid_key: result[:last_valid_key],
      duplicate_file: result[:duplicate_file], candidates: keys_candidates }
  end
end
228
+
229
# Create the list of candidate variables for the key search of dataset +ds+.
# - non-trial-design datasets always start with USUBJID
# - candidates come either from a fixed list (QS, DC/DM, SUPP--) or from the
#   config file, keyed by dataset class/name
# - variables are prefixed with the dataset name, except VISITNUM and SUBJID
# Returns [keys_candidates (array of symbols), rpt_cand (description string)].
private def candidates(ds)

  # datasets whose candidates come from a config entry:
  # dataset list => [config key, report label]
  config_driven = {
    %w(AE BE CE DV HO MH) =>
      ['event_key', 'General Observation, event dataset'],
    %w(CM EC EX ML PR SU) =>
      ['intervention_key', 'General Observation, intervention dataset'],
    %w(BS DA DD EG IE IS LB MB MI MK MO PC PE PF PP RP RS SC SS TR TU VS ZI) =>
      ['finding_key', 'General Observation, finding dataset'],
    %w(FA ZA) =>
      ['finding_about_key', 'General Observation, finding about dataset'],
    %w(DS) => ['ds_key', 'Special Dataset, DS'],
    %w(RELREC) => ['relrec_key', 'Special Dataset, RELREC'],
    %w(CO) => ['CO_key', 'Special Dataset, CO'],
    %w(TA) => ['TA_key', 'Trial Design Dataset, TA'],
    %w(TE) => ['TE_key', 'Trial Design Dataset, TE'],
    %w(TI) => ['TI_key', 'Trial Design Dataset, TI'],
    %w(TS) => ['TS_key', 'Trial Design Dataset, TS'],
    %w(TV) => ['TV_key', 'Trial Design Dataset, TV'],
  }

  if (entry = config_driven.find { |ds_names, _| ds_names.include?(ds) })
    config_key, label = entry.last
    expected_candidates = @config[config_key]&.split(' ')
    rpt_cand =
      if expected_candidates.nil?
        '- No candidates keys variables, please check the config file'
      else
        label
      end
  elsif ds == 'QS'
    # fixed candidates, not configurable
    expected_candidates = %w(CAT TESTCD VISITNUM)
    rpt_cand = 'General Observation, finding dataset'
  elsif %w(DC DM).include? ds
    expected_candidates = ['SUBJID']
    rpt_cand = 'Demographic Dataset'
  elsif ds.start_with? 'SUPP'
    expected_candidates = %w(USUBJID IDVAR IDVARVAL QNAM)
    rpt_cand = 'SUPP dataset'
  else
    puts "Unknown dataset: #{ds}"
    rpt_cand = "no key candidate for #{ds}"
    expected_candidates = []
  end

  # config entry missing: fall back to USUBJID alone and warn
  if expected_candidates.nil?
    expected_candidates = []
    puts "#{ds}: No variables are specified in the configuration file for the key search."
  end

  # transform the variable list into symbols;
  # prefix with the dataset name where expected (trial design, RELREC, CO and
  # SUPP-- datasets use the names as-is, without USUBJID)
  if %w(TA TE TI TS TV RELREC CO).include?(ds) || ds.start_with?('SUPP')
    keys_candidates = expected_candidates.map(&:to_sym)
  else
    keys_candidates = [:USUBJID] +
                      expected_candidates.map do |c|
                        if %w(VISITNUM SUBJID).include? c
                          c.to_sym
                        else
                          "#{ds}#{c}".to_sym
                        end
                      end
  end

  # remove variables unexpected for specific datasets
  if ds == 'TU'
    # no visit or timepoint are expected in the TU key
    keys_candidates.delete(:VISITNUM)
    keys_candidates.delete(:TUTPTNUM)
  end

  [keys_candidates, rpt_cand]
end
377
+
378
# Incrementally test the candidate key, variable by variable, until a
# combination uniquely identifies every record.
#
# ds              - dataset name (used to name the duplicate CSV on failure)
# keys_candidates - ordered array of candidate variable symbols
# records         - array of {variable => value} hashes
#
# Returns {valid: true, key: [...]} when a unique key is found, or
# {valid: false, last_valid_key: [...], duplicate_file: path} after exhausting
# all candidates (the duplicates are written to a CSV via
# display_duplicate_records).
#
# NOTE(review): report_key is accumulated throughout but never returned or
# stored — presumably leftover logging; confirm whether it should feed the
# report's verbose_details.
private def look_keys(ds, keys_candidates, records)
  # initialize the duplicate counters with the number of records in the dataset
  # (worst case: every record in one group)
  prev_records_max = records.size
  prev_nb_sub = records.size
  # variables found to not reduce duplication; skipped in later iterations
  useless_vars = []
  # report lines accumulated during the search
  report_key = []

  # display the variables checked during the search
  report_key << "- the key search is based on the following variables in the given order: #{keys_candidates.join(', ')}"

  # loop on all candidate key variables, widening the key one variable at a time
  keys_candidates.size.times do |i|

    # tested key: first i+1 candidates (slice returns a new array,
    # so deleting from it does not touch keys_candidates)
    keys_vars = keys_candidates[..i]
    # remove variables already proven useless
    useless_vars.each do |v|
      keys_vars.delete(v)
    end
    report_key << " - trying #{keys_vars.join(', ')}" if @verbose

    # group the records by the tuple of key values
    keys_records = records.group_by do |p|
      keys_vars.map { |var| p[var] }
    end

    # get the maximum group size and the number of duplicated groups
    records_max = 0
    nb_sub = 0
    keys_records.each do |values, list|
      records_max = list.size if list.size > records_max
      if list.size > 1
        nb_sub += 1
      end
    end

    # success: every group has exactly one record — the key is unique
    if records_max == 1
      return {
        valid: true,
        key: keys_vars
      }
    end

    # duplicates still present
    report_key << " - nope: #{nb_sub} keys variables have a maximum #{records_max} records for #{keys_vars.join(', ')}" if @verbose
    # if the current duplicate counts are not better than the previous ones,
    # the variable just added did not help: mark it useless so it is removed
    # from the next iterations (the previous counters stay as the baseline)
    if prev_records_max <= records_max && prev_nb_sub <= nb_sub
      report_key << " - #{keys_vars.last} not useful"
      useless_vars << keys_vars.last
    end

    # current duplicate counts become the baseline for the next iteration
    prev_records_max = records_max
    prev_nb_sub = nb_sub

    # all candidates exhausted without finding a unique key
    if i == (keys_candidates.size-1)
      # last valid key = candidates minus the useless ones
      last_valid_keys = keys_candidates.reject { |kc| useless_vars.include? kc }

      # compile all duplicates with the last valid key
      duplicate_records = records.group_by do |p|
        last_valid_keys.map { |var| p[var] }
      end

      # write the duplicates to a CSV file
      file_name = display_duplicate_records(ds, duplicate_records, :data)

      report_key << "- no more candidates variables"
      report_key << " - the last interesting variables checked for the key are #{last_valid_keys.join(', ')}"
      if outname_ext == 'csv'
        report_key << " - all duplicates are present here: #{file_name}"
      else
        report_key << " - all duplicates are present here: #{output_file}, sheet: #{File.basename(file_name, '.csv')}"
      end
      return {
        valid: false,
        last_valid_key: last_valid_keys,
        duplicate_file: file_name
      }
    end
  end

end
469
+
470
# Validate the key declared in define.xml against the actual records.
#
# ds      - dataset name (used to name the duplicate CSV on failure)
# keys    - array of key variable names from the define (defaults to USUBJID)
# records - array of {variable => value} hashes
#
# Returns [keys, valid, duplicate_file]; duplicate_file is nil when the key is
# unique, otherwise the path of the CSV listing the duplicated groups.
def key_check_define(ds, keys, records)
  keys = [:USUBJID] if keys.nil?

  # group the records by the tuple of key values
  grouped = records.group_by do |record|
    keys.map { |variable| record[variable.to_sym] }
  end

  # size of the biggest group: 1 means the key uniquely identifies records
  largest_group = grouped.map { |_, members| members.size }.max || 0

  return [keys, true, nil] if largest_group == 1

  [keys, false, display_duplicate_records(ds, grouped, :define)]
end
487
+
488
# Search for non-ASCII characters in the record values.
#
# records - array of {variable => value} hashes
#
# Returns [issues, valid]: issues is an array of "VAR: \"value\"" strings for
# every offending value, valid is true when no issue was found.
def ascii_search(records)
  # list of offending values
  non_ascii_list = []
  # read all values of all records
  records.each do |record|
    record.each do |var, value|
      # FIX: only strings can contain non-ASCII text; the previous
      # nil/Integer/Float skip crashed on any other value type (e.g. Symbol).
      next unless value.is_a?(String)
      # match? avoids allocating a MatchData for every value
      if value.match?(/[^[:ascii:]]/)
        non_ascii_list << "#{var}: #{value.inspect}"
      end
    end
  end

  [non_ascii_list, non_ascii_list.empty?]
end
504
+
505
###
# Load the configuration file (YAML).
# NOTE(review): despite the "YAML, csv, xls, xlsx" mention at the call site,
# only YAML is parsed here — confirm whether other formats are converted
# upstream. YAML.load_file performs full YAML deserialization; prefer
# YAML.safe_load_file if the config can come from untrusted users.
def load_config(file)
  YAML.load_file(file)
end
510
+
511
###
# Load an XPT transport file and return its records as an array of hashes
# ({variable symbol => value}), one hash per observation.
def load_dsfile(infile)
  reader = SAS::XPT::Reader.new(infile)

  datasets = reader.library.datasets
  # only the first dataset of the transport file is used
  puts "several datasets in one xpt, only the first one is kept" if datasets.size > 1
  dataset = datasets.first

  # variable names as symbols, in column order
  variable_names = dataset.variables.map { |v| v.name.to_sym }

  # pair each observation's values with the variable names
  dataset.observations.map do |observation|
    variable_names.zip(observation).to_h
  end
end
529
+
530
####
# Load define.xml with Nokogiri when present and extract the key variables.
# Returns [keys, define_report] where keys is a hash {dataset => [variables]},
# '-' when no define is requested ('-' in the config), or nil on load failure.
def load_define
  # structure for the report
  define_report = {
    define_path: nil,
    load: false,
  }

  # initialize from the config file
  define_path = @config['define_path']

  # FIX: a missing 'define_path' entry used to crash on File.basename(nil);
  # it is now reported as a load failure (keys stay nil).
  if define_path.nil?
    puts "⚠ Error loading define.xml : no 'define_path' in the configuration file"
    define_report[:load] = false
    return [get_define_keys(nil), define_report]
  end

  # append the file name when only a directory is given
  if define_path != '-' && File.basename(define_path) != 'define.xml'
    define_path = "#{define_path}/define.xml"
  end
  # store the resolved path
  define_report[:define_path] = define_path

  # '-' means "no define": not an error, but nothing to load either
  if define_path == '-'
    define_report[:load] = false
    define_node = '-'
  else
    begin
      define_node = File.open(define_path, "rb") { |io| Nokogiri::XML(io, &:noblanks) }
      define_report[:load] = true
    rescue => e
      puts "⚠ Error loading define.xml : #{e.message}"
      define_report[:load] = false
      define_node = nil
    end
  end

  [get_define_keys(define_node), define_report]
end
567
+
568
# Extract the key variables by dataset from a Nokogiri define.xml node.
# Returns a hash {dataset name => [key variable names]},
# '-' when no define was requested, nil when the define could not be loaded.
# - SUPP datasets are not excluded here
# - key order (KeySequence value) is not supported
private def get_define_keys(define_node)
  # shortcut if no define
  return nil if define_node.nil?
  # shortcut if define = '-'
  return '-' if define_node == '-'

  # store all ItemGroupDef (one per dataset)
  item_group = define_node.css("ItemGroupDef")
  # index every ItemDef (variable definition) by its OID for fast lookup
  item_def = define_node.css("ItemDef").group_by { |id| id['OID'] }

  # keep only the dataset name & the key variables
  define_keys = {}
  # loop on all item groups (datasets)
  item_group.each do |ig|
    key_list = []
    ds = ig["Name"].upcase
    # loop on all items (variables)
    ig.css("ItemRef").each do |ir|
      # only variables with a KeySequence belong to the key
      index = ir["KeySequence"]&.to_i
      next if index.nil?
      # get the ItemDef of the variable by the ItemRef OID
      item = item_def[ir["ItemOID"]]
      # FIX: an ItemRef pointing to a missing ItemDef used to crash on
      # item.size; it is now reported and skipped.
      if item.nil? || item.empty?
        puts "no ItemDef found for OID: #{ir["ItemOID"]}"
        next
      end
      # expect exactly one item per variable
      unless item.size == 1
        puts "several item with the same OID: #{item.size}"
        item.each do |i|
          puts i
        end
        puts "only the first is kept"
      end
      # store the variable name
      key_list << item.first["Name"]
    end

    # assign the key to the dataset
    define_keys[ds] = key_list
  end

  define_keys
end
613
+
614
###
# Display helpers
#
# Write every duplicated key group of +keys_records+ to a CSV file and return
# its path. +type+ (:data or :define) is used in the file name; the 'No'
# column numbers the duplicate groups.
def display_duplicate_records(ds, keys_records, type)
  # FIX: the original if/else on outname_ext produced the exact same path in
  # both branches; the dead conditional is removed. A .csv is always written —
  # the Excel conversion happens later from these files.
  display_out = "#{dir_out}/#{type}_#{ds}.csv"

  # header comes from the keys of the first record
  # NOTE(review): assumes keys_records is non-empty and that all records share
  # the same variables — confirm callers.
  header = ['No'] + keys_records.first.last.first.keys

  # keep only the groups that actually contain duplicates
  duplicate = keys_records.reject { |_key, list| list.size == 1 }

  CSV.open(display_out, "w") do |csv|
    # header
    csv << header
    # body
    # FIX: the per-group duplicate.keys.index(key) lookup was O(n^2);
    # each_with_index yields the same 1-based group number in O(n).
    duplicate.each_with_index do |(_key, list), group_idx|
      list.each do |record|
        csv << [group_idx + 1] + record.values
      end
    end
  end

  display_out
end
647
+
648
# Write the HTML report.
# Propagates the Excel output file name (nil for plain CSV output) to the
# Report object before rendering, then saves the HTML file.
def write_html_report
  @report.output_file = output_file
  @report.save_html
end
653
+
654
# Convert the HTML report into a Word (.docx) document.
# The HTML report is generated first when it does not exist yet, then parsed
# with RiReportParser and rebuilt with DocxWriter (see html2word).
def write_word_report

  # write the html report unless it already exists
  write_html_report unless File.exist?(@report.report_name)

  # compute the output name: same path, '.html' replaced by '.docx'
  report_html = @report.report_name
  report_word = report_html[...-5] + '.docx'

  # parse html into intermediate blocks
  puts "Parsing #{report_html}..."
  blocks = RiReportParser.new(report_html).parse
  puts " -> #{blocks.size} blocks extracted"

  # create the word document from the blocks
  puts "Building #{report_word}..."
  DocxWriter.new(report_word).write(blocks)
  puts " -> Done: #{report_word}"
end
673
+
674
# Write all duplicate CSV files into a single .xls workbook (one sheet per
# CSV), with a formatted header row and an alternating background color per
# duplicate group. The workbook is saved under output_file.
private def write_xls(csv_files)
  # create the xls workbook
  book = Spreadsheet::Workbook.new

  # workbook default format
  format_default = Spreadsheet::Format.new(
    font: Spreadsheet::Font.new('Verdana', size: 8)
  )

  # alternating-row format (duplicate group highlighting)
  format_gray = Spreadsheet::Format.new(
    font: Spreadsheet::Font.new('Verdana', size: 8),
    pattern: 1,
    pattern_fg_color: :silver
  )

  # header format
  format_header = Spreadsheet::Format.new(
    weight: :bold, # TODO, not working, dont known why
    font: Spreadsheet::Font.new('Verdana', size: 8),
    pattern: 1,
    pattern_fg_color: :grey
  )

  # loop on csv files
  csv_files.each do |csv_file|
    # sheet name: basename without the first 11 characters
    # NOTE(review): the [11...] slice presumably strips a fixed file-name
    # prefix — confirm against the CSV naming scheme.
    sheet_name = File.basename(csv_file, '.csv')[11...]

    # create worksheet
    sheet = book.create_worksheet(name: sheet_name)

    # state for the alternating background color
    previous_first_cell_value = nil
    should_color = false

    # read csv & write worksheet
    row_index = 0
    CSV.foreach(csv_file, headers: false, encoding: 'UTF-8') do |row|

      # toggle the background whenever the duplicate number (first cell) changes
      # FIX: this was wrapped in `if row_index >= 0`, which is always true —
      # the dead guard is removed without changing behavior.
      current_first_cell_value = row[0]
      if current_first_cell_value != previous_first_cell_value
        should_color = !should_color # color switch
        previous_first_cell_value = current_first_cell_value
      end

      # set format: header first, then alternate gray/default
      if row_index == 0
        sheet.row(row_index).default_format = format_header
      elsif should_color
        sheet.row(row_index).default_format = format_gray
      else
        sheet.row(row_index).default_format = format_default
      end

      # write line by line
      sheet.row(row_index).concat(row)
      row_index += 1
    end

    # auto-size the columns (min 10 chars, max 50)
    if row_index > 0
      num_columns = sheet.row(0).size

      (0...num_columns).each do |col_index|
        max_length = 10 # min length

        # scan every row of the column
        (0...row_index).each do |r|
          cell_value = sheet.row(r)[col_index].to_s
          cell_length = cell_value.length + 2 # +2 for margin
          max_length = cell_length if cell_length > max_length
        end

        sheet.column(col_index).width = [max_length, 50].min
      end
    end
  end

  # save the workbook
  book.write(output_file)
end
762
+
763
# Write all duplicate CSV files into a single .xlsx workbook (one sheet per
# CSV), with a bold header row and an alternating background color per
# duplicate group. The workbook is saved under output_file.
private def write_xlsx(csv_files)
  # create the workbook
  workbook = RubyXL::Workbook.new

  # delete the default worksheet
  workbook.worksheets.delete_at(0)

  # loop on all csv files
  csv_files.each do |csv_file|
    # create the sheet name from the csv name
    sheet_name = File.basename(csv_file, '.csv')

    # create worksheet
    worksheet = workbook.add_worksheet(sheet_name)

    # state for the alternating background color
    previous_first_cell_value = nil
    should_color = false

    # read csv & write worksheet
    row_index = 0
    CSV.foreach(csv_file, headers: false, encoding: 'UTF-8') do |row|

      # toggle the background whenever the duplicate number (first cell) changes
      # FIX: this check only depends on the row, so it is hoisted out of the
      # per-cell loop (it used to run once per cell, with the same outcome),
      # and the always-true `row_index >= 0` guard is dropped — behavior
      # unchanged.
      unless row.empty?
        current_first_cell_value = row[0]
        if current_first_cell_value != previous_first_cell_value
          should_color = !should_color # color switch
          previous_first_cell_value = current_first_cell_value
        end
      end

      row.each_with_index do |cell_value, col_index|
        # write the cell
        cell = worksheet.add_cell(row_index, col_index, cell_value)

        # set format: bold gray header, then alternating fill
        if row_index == 0
          cell.change_font_bold(true)
          cell.change_font_name('Verdana')
          cell.change_font_size(8)
          cell.change_fill('C0C0C0')
        else
          cell.change_font_name('Verdana')
          cell.change_font_size(8)
          cell.change_fill('DCDCDC') if should_color
        end
      end

      row_index += 1
    end

    # auto-size the columns (min 10 chars, max 50)
    if row_index > 0
      num_columns = worksheet[0].cells.length

      (0...num_columns).each do |col_index|
        max_length = 10 # minimum length

        # scan every row of the column
        (0...row_index).each do |r|
          cell = worksheet[r][col_index]
          if cell && cell.value
            cell_length = cell.value.to_s.length + 2 # +2 for margin
            max_length = cell_length if cell_length > max_length
          end
        end

        # column width is the computed length, capped at 50
        col = worksheet.cols.get_range(col_index)
        col.width = [max_length * 1.2, 50].min
        col.custom_width = true
      end
    end
  end

  # save the workbook
  workbook.write(output_file)
end
845
+
846
+ ###
847
+ # Helpers
848
+
849
# Prepare the output directory:
# - blank config value: the current directory is used and left untouched
# - existing directory: delete the CSV files of a previous run
# - otherwise: create the directory
# Returns the directory path.
# FIX: the path-resolution logic was duplicated from dir_out; it now delegates
# to it so both helpers cannot drift apart.
def prepare_output_dir
  dir = dir_out

  # blank configuration resolves to '.': leave the current directory alone
  # (matches the original behavior, which never deleted files in that case)
  return dir if dir == '.'

  if Dir.exist?(dir)
    # directory already present: remove CSVs left by a previous run
    Dir.glob("#{dir}/**/*.csv").each { |f| File.delete(f) }
  else
    # no directory yet: create it
    FileUtils.mkdir_p(dir)
  end

  dir
end
868
+
869
# Resolve the output directory from the study name and the config value;
# falls back to the current directory when the resolved path is blank.
def dir_out
  path = "#{study_name}/#{config['output_directory']}"
  blank = path.nil? || path.strip.empty?
  blank ? '.' : path
end
874
+
875
# Extension of the duplicate output: 'xls' or 'xlsx' when requested by the
# config ('output_type'), plain 'csv' otherwise.
def outname_ext
  requested = config['output_type']
  %w(xls xlsx).include?(requested) ? requested : 'csv'
end
883
+
884
# Name of the aggregated Excel file holding all duplicate sheets,
# or nil when plain CSV output is used.
def output_file
  return nil if outname_ext == 'csv'

  "duplicates.#{outname_ext}"
end
892
+
893
# All datasets having at least one issue, whatever its kind (ascii, define
# or data), without duplicates.
def dslist_invalid
  @ds_with_issue.values.inject([]) { |acc, names| acc | names }
end
897
+
898
###
# Report class — collects the run summary and the per-dataset results, then
# renders them to an HTML file through an ERB template.
class Report
  attr_reader :report_name
  attr_reader :study_name
  attr_reader :externe_file_type

  attr_accessor :init_reports
  attr_accessor :dataset_reports
  attr_accessor :output_file

  # report_name       - path of the HTML file to write
  # study_name        - study directory (web mode) or bare study name
  # externe_file_type - extension of the external duplicate files (csv/xls/xlsx)
  def initialize(report_name, study_name, externe_file_type)
    @report_name = report_name
    @study_name = study_name
    @externe_file_type = externe_file_type
    @init_reports = {}
    @dataset_reports = {}
  end

  # store the summary built by FindKeys#initialize
  def add_init(text_hash)
    @init_reports = text_hash
  end

  # store one dataset result built by FindKeys#high_level_check
  def add_dataset_report(ds_name, report_data)
    @dataset_reports[ds_name] = report_data
  end

  # report title: the last path component for web access, the bare study
  # name for manual runs
  def title
    @study_name.include?('/') ? File.basename(@study_name) : @study_name
  end

  # render the HTML from the ERB template
  def generate_html
    template_path = File.expand_path('../../../views/report_template.erb', __FILE__)
    raise "Template file not found: #{template_path}" unless File.exist?(template_path)

    # inline the logo as a base64 data URI when it is available
    logo_path = File.expand_path('../../../public/Contact-LOGO.png', __FILE__)
    @logo_base64 = nil
    if File.exist?(logo_path)
      require 'base64'
      @logo_base64 = "data:image/png;base64,#{Base64.strict_encode64(File.binread(logo_path))}"
    end

    ERB.new(File.read(template_path), trim_mode: '-').result(binding)
  end

  # write the rendered HTML to disk (defaults to report_name)
  def save_html(output_path = nil)
    target = output_path || @report_name
    File.write(target, generate_html)
    puts "Rapport HTML généré : #{target}"
  end
end
969
+
970
+ end
971
+ end
972
+
973
+ # config_file = 'config.yaml'
974
+ # fk = FindKeys.new(config_file, verbose: true)
975
+ # fk.high_level_check
976
+ # fk.write_html_report
977
+ # fk.write_word_report
978
+ # puts fk.ds_with_issue
979
+ # puts fk.dslist_invalid