hlsv 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE +676 -0
- data/README.md +356 -0
- data/bin/hlsv +4 -0
- data/config.default.yaml +19 -0
- data/lib/hlsv/cli.rb +85 -0
- data/lib/hlsv/find_keys.rb +979 -0
- data/lib/hlsv/html2word.rb +602 -0
- data/lib/hlsv/mon_script.rb +169 -0
- data/lib/hlsv/version.rb +5 -0
- data/lib/hlsv/web_app.rb +569 -0
- data/lib/hlsv/xpt/dataset.rb +38 -0
- data/lib/hlsv/xpt/library.rb +28 -0
- data/lib/hlsv/xpt/reader.rb +367 -0
- data/lib/hlsv/xpt/variable.rb +130 -0
- data/lib/hlsv/xpt.rb +11 -0
- data/lib/hlsv.rb +49 -0
- data/public/Contact-LOGO.png +0 -0
- data/public/app.js +569 -0
- data/public/styles.css +586 -0
- data/public/styles_csv.css +448 -0
- data/views/csv_view.erb +85 -0
- data/views/index.erb +233 -0
- data/views/report_template.erb +1144 -0
- metadata +176 -0
|
@@ -0,0 +1,979 @@
|
|
|
1
|
+
# Copyright (c) 2026 AdClin
|
|
2
|
+
# Licensed under the GNU Affero General Public License v3.0 or later.
|
|
3
|
+
# See the LICENSE file for details.
|
|
4
|
+
|
|
5
|
+
require 'yaml'
|
|
6
|
+
require 'nokogiri'
|
|
7
|
+
require 'csv'
|
|
8
|
+
require 'fileutils'
|
|
9
|
+
require 'pathname'
|
|
10
|
+
require 'erb'
|
|
11
|
+
|
|
12
|
+
require 'spreadsheet'
|
|
13
|
+
require 'rubyXL'
|
|
14
|
+
require 'rubyXL/convenience_methods/cell'
|
|
15
|
+
require 'rubyXL/convenience_methods/color'
|
|
16
|
+
require 'rubyXL/convenience_methods/font'
|
|
17
|
+
require 'rubyXL/convenience_methods/workbook'
|
|
18
|
+
require 'rubyXL/convenience_methods/worksheet'
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
require_relative 'xpt'
|
|
22
|
+
require_relative 'html2word'
|
|
23
|
+
module Hlsv
|
|
24
|
+
class FindKeys
|
|
25
|
+
|
|
26
|
+
attr_reader :verbose
|
|
27
|
+
attr_reader :config
|
|
28
|
+
attr_reader :ds_list
|
|
29
|
+
attr_reader :study_name
|
|
30
|
+
attr_reader :web_mode
|
|
31
|
+
|
|
32
|
+
attr_accessor :report
|
|
33
|
+
attr_accessor :ds_with_issue
|
|
34
|
+
|
|
35
|
+
# Build a FindKeys checker from a YAML configuration file.
#
# config_file - path to the YAML configuration
# verbose:    - when true, extra detail is pushed into the report
# web_mode:   - when true, the report shows a generic config name
#               instead of the real file path
def initialize(config_file, verbose: false, web_mode: false)
  @verbose = verbose
  @web_mode = web_mode
  @ds_with_issue = { ascii: [], define: [], data: [] }

  # load the config file (YAML; see load_config)
  @config = load_config(config_file)

  report_init = {
    config_name: @web_mode ? 'config.yaml' : config_file,
    data_information: {},
    define_information: {},
  }

  # save the study name and create the report shell
  @study_name = config['study_name']
  report_name = "#{@study_name}/#{File.basename(@study_name)}_high_level_check.html"
  @report = Report.new(report_name, @study_name, outname_ext)

  # collect every .xpt file from the configured data directory
  data_dir = config['data_directory'].gsub('\\', '/')
  @ds_list = Dir["#{data_dir}/*"].select { |path| File.extname(path) == '.xpt' }

  report_init[:data_information] = {
    directory_name: data_dir,
    file_number: @ds_list.size,
  }

  # parse define.xml (when present) and keep the key variables per dataset
  @define_keys, define_report = load_define
  report_init[:define_information] = define_report

  @report.add_init(report_init)

  # clear any CSV left over from a previous run
  prepare_output_dir
end
|
|
85
|
+
|
|
86
|
+
###
|
|
87
|
+
# Check some high level information
|
|
88
|
+
# Restriction: the dataset export is load only one time to avoid some performance issue
|
|
89
|
+
# Process:
|
|
90
|
+
# - Load the dataset records
|
|
91
|
+
# - Check the presence of non-ASCII character in data
|
|
92
|
+
# - Check the validity of the key that is referenced in define.xml.
|
|
93
|
+
# - Search the minimal key with the variables list present in the config file
|
|
94
|
+
###
# Check some high level information.
# Restriction: each dataset is loaded only once to avoid performance issues.
# Process:
# - load the dataset records
# - check for non-ASCII characters in the data
# - check the validity of the key referenced in define.xml
# - search the minimal key using the candidate variables from the config
#
# Results are accumulated into @report and @ds_with_issue.
def high_level_check
  @ds_list.each do |dsf|
    # store the dataset name
    ds = File.basename(dsf).split('.')[0].upcase

    # shortcut if dataset in the excluded list
    next if config['excluded_ds'].include? ds

    # load all records (only XPT; TODO json)
    records = load_dsfile(dsf)

    # initialize the report structure for this dataset
    ds_report = {
      record_count: records&.size,
      candidates_type: nil,
      ascii_check: nil,
      define_key_check: nil,
      data_key_check: nil,
      verbose_details: []
    }

    # shortcut when the file holds no records (nil treated as empty)
    if records.nil? || records.empty?
      ds_report[:record_count] = 0
      @report.add_dataset_report(ds, ds_report)
      next
    end

    # search non-ASCII characters
    ascii_issues, valid_ascii = ascii_search(records)
    ds_report[:ascii_check] = {
      valid: valid_ascii,
      issues: ascii_issues
    }
    @ds_with_issue[:ascii] << ds unless valid_ascii

    # check define keys
    # bug fix: valid_define must be set in every branch, otherwise a define
    # marked as intentionally absent ('-') was still counted as an issue
    if @define_keys && @define_keys != '-' && @define_keys[ds]
      define_key, valid_define, duplicate_file = key_check_define(ds, @define_keys[ds], records)
      ds_report[:define_key_check] = {
        valid: valid_define,
        key: define_key,
        duplicate_file: duplicate_file
      }
    elsif @define_keys == '-'
      # define.xml explicitly absent in the config: not an issue
      valid_define = true
      ds_report[:define_key_check] = {
        valid: true,
        absent: true,
      }
    else
      valid_define = false
      ds_report[:define_key_check] = {
        valid: false,
        absent: true,
      }
    end

    @ds_with_issue[:define] << ds unless valid_define

    # check data keys
    data_key_info, valid_data, candidates_type = key_check_data(ds, records)
    ds_report[:data_key_check] = data_key_info
    ds_report[:candidates_type] = candidates_type
    @ds_with_issue[:data] << ds unless valid_data

    @report.add_dataset_report(ds, ds_report)
  end
end
|
|
165
|
+
|
|
166
|
+
# search key according to the config
|
|
167
|
+
# Search the key for +ds+ according to the config.
#
# Special cases SE and SV try several alternative candidate key lists and
# return [results, valid] where results is an Array of one summary per
# list. Every other dataset returns [summary, valid, candidates_type].
def key_check_data(ds, records)
  # SE and SV accept several alternative candidate key lists
  special_candidates = {
    'SE' => [
      [:USUBJID, :EPOCH, :SUBJID],
      [:USUBJID, :TAETORD, :SUBJID]
    ],
    'SV' => [
      [:USUBJID, :SVSTDTC, :SUBJID],
      [:USUBJID, :VISITNUM, :SUBJID]
    ]
  }

  if special_candidates.key?(ds)
    results = special_candidates[ds].map do |keys_candidates|
      summarize_key_search(ds, keys_candidates, records)
    end

    # valid when at least one candidate list yields a valid key
    [results, results.any? { |r| r[:valid] }]
  else
    # standard case: a single candidate list derived from the config
    keys_candidates, candidates_type = candidates(ds)
    summary = summarize_key_search(ds, keys_candidates, records)

    [summary, summary[:valid], candidates_type]
  end
end

# Run look_keys on one candidate list and flatten its result into a
# report-friendly Hash (shared by the SE/SV and standard paths).
private def summarize_key_search(ds, keys_candidates, records)
  result = look_keys(ds, keys_candidates, records)
  if result[:valid]
    { valid: true, key: result[:key], candidates: keys_candidates }
  else
    { valid: false, last_valid_key: result[:last_valid_key],
      duplicate_file: result[:duplicate_file], candidates: keys_candidates }
  end
end
|
|
228
|
+
|
|
229
|
+
# create a list of the candidate variables for the key search
|
|
230
|
+
# - always starts with USUBJID
|
|
231
|
+
# - a fix list of candidate is fixed by default
|
|
232
|
+
# - some additional variables can be added via the config file
|
|
233
|
+
# Build the list of candidate variables for the key search of +ds+.
# - most datasets start with USUBJID
# - the candidate list comes from the config (per dataset class) or is fixed
# - candidate names are prefixed with the domain (AE + SEQ -> :AESEQ)
#   except cross-domain variables (VISITNUM, SUBJID)
#
# Returns [keys_candidates (Array of Symbols), rpt_cand (String label)].
private def candidates(ds)
  # candidate keys per dataset class (event, intervention, finding, ...)
  expected_candidates, rpt_cand =
    if %w(AE BE CE DV HO MH).include? ds
      [@config['event_key']&.split(' '), 'General Observation, event dataset']
    elsif %w(CM EC EX ML PR SU).include? ds
      [@config['intervention_key']&.split(' '), 'General Observation, intervention dataset']
    elsif %w(BS DA DD EG IE IS LB MB MI MK MO PC PE PF PP RP RS SC SS TR TU VS ZI).include? ds
      [@config['finding_key']&.split(' '), 'General Observation, finding dataset']
    elsif ds == 'QS'
      [%w(CAT TESTCD VISITNUM), 'General Observation, finding dataset']
    elsif %w(FA ZA).include? ds
      [@config['finding_about_key']&.split(' '), 'General Observation, finding about dataset']
    elsif %w(DC DM).include? ds
      [['SUBJID'], 'Demographic Dataset']
    elsif ds == 'DS'
      [@config['ds_key']&.split(' '), 'Special Dataset, DS']
    elsif ds == 'RELREC'
      [@config['relrec_key']&.split(' '), 'Special Dataset, RELREC']
    elsif ds == 'CO'
      [@config['CO_key']&.split(' '), 'Special Dataset, CO']
    elsif %w(TA TE TI TS TV).include? ds
      [@config["#{ds}_key"]&.split(' '), "Trial Design Dataset, #{ds}"]
    elsif ds.start_with? 'SUPP'
      [%w(USUBJID IDVAR IDVARVAL QNAM), 'SUPP dataset']
    else
      puts "Unknown dataset: #{ds}"
      [[], "no key candidate for #{ds}"]
    end

  # config-driven classes yield nil when their config entry is missing
  if expected_candidates.nil?
    expected_candidates = []
    rpt_cand = '- No candidates keys variables, please check the config file'
    puts "#{ds}: No variables are specified in the configuration file for the key search."
  end

  # transform variable names into symbols; trial design, RELREC, CO and
  # SUPP datasets are used as-is, the rest are prefixed with the domain
  keys_candidates =
    if %w(TA TE TI TS TV RELREC CO).include?(ds) || ds.start_with?('SUPP')
      expected_candidates.map(&:to_sym)
    else
      [:USUBJID] + expected_candidates.map { |c|
        %w(VISITNUM SUBJID).include?(c) ? c.to_sym : "#{ds}#{c}".to_sym
      }
    end

  # remove variables unexpected for specific datasets:
  # no visit or timepoint is expected in a TU key
  if ds == 'TU'
    keys_candidates.delete(:VISITNUM)
    keys_candidates.delete(:TUTPTNUM)
  end

  [keys_candidates, rpt_cand]
end
|
|
377
|
+
|
|
378
|
+
# look variable by variable if the key are valid
|
|
379
|
+
# Search a valid key by adding the candidate variables one at a time, in
# the given order, until no key-value tuple groups more than one record.
# A candidate that does not reduce either the maximum group size or the
# number of duplicated tuples is dropped from subsequent attempts.
#
# Returns { valid: true, key: [...] } on success, or
# { valid: false, last_valid_key: [...], duplicate_file: path } after the
# duplicates have been dumped to CSV by display_duplicate_records.
#
# NOTE(review): report_key is built throughout but never returned or
# stored — the verbose trace appears to be lost; confirm intent.
private def look_keys(ds, keys_candidates, records)
  # start the duplicate counters at the record count of the dataset
  prev_records_max = records.size
  prev_nb_sub = records.size
  # candidates proven not to help, removed from later attempts
  useless_vars = []
  # verbose trace of the search
  report_key = []

  # display the variables checked during the search
  report_key << "- the key search is based on the following variables in the given order: #{keys_candidates.join(', ')}"

  # loop on all candidate key variables
  keys_candidates.size.times do |i|

    # tested key: the first i+1 candidates ([..i] is a new Array, safe to mutate)
    keys_vars = keys_candidates[..i]
    # remove the variables already proven useless
    useless_vars.each do |v|
      keys_vars.delete(v)
    end
    report_key << "  - trying #{keys_vars.join(', ')}" if @verbose

    # group the records by the tuple of key values
    keys_records = records.group_by do |p|
      keys_vars.map { |var| p[var] }
    end

    # largest group size and number of duplicated tuples for this key
    records_max = 0
    nb_sub = 0
    keys_records.each do |values, list|
      records_max = list.size if list.size > records_max
      if list.size > 1
        nb_sub += 1
      end
    end

    # success: every tuple identifies exactly one record
    if records_max == 1
      return {
        valid: true,
        key: keys_vars
      }
    end

    # duplicates still present
    report_key << "  - nope: #{nb_sub} keys variables have a maximum #{records_max} records for #{keys_vars.join(', ')}" if @verbose
    # if the current duplicate counts did not improve on the previous
    # attempt, the newest variable adds nothing: mark it useless so it is
    # removed from the next attempts
    if prev_records_max <= records_max && prev_nb_sub <= nb_sub
      report_key << "  - #{keys_vars.last} not useful"
      useless_vars << keys_vars.last
    end

    # carry the counts over to the next attempt
    prev_records_max = records_max
    prev_nb_sub = nb_sub

    # no more candidates: give up and dump the duplicates
    if i == (keys_candidates.size-1)
      # the best key found is every candidate not proven useless
      last_valid_keys = keys_candidates.reject { |kc| useless_vars.include? kc }

      # compile all duplicates under the last valid key
      duplicate_records = records.group_by do |p|
        last_valid_keys.map { |var| p[var] }
      end

      # write the CSV file
      file_name = display_duplicate_records(ds, duplicate_records, :data)

      report_key << "- no more candidates variables"
      report_key << "  - the last interesting variables checked for the key are #{last_valid_keys.join(', ')}"
      if outname_ext == 'csv'
        report_key << "  - all duplicates are present here: #{file_name}"
      else
        report_key << "  - all duplicates are present here: #{output_file}, sheet: #{File.basename(file_name, '.csv')}"
      end
      return {
        valid: false,
        last_valid_key: last_valid_keys,
        duplicate_file: file_name
      }
    end
  end

end
|
|
469
|
+
|
|
470
|
+
# search the key
|
|
471
|
+
# Validate the key declared in define.xml against the loaded records.
#
# ds      - dataset name (used to name the duplicate CSV on failure)
# keys    - key variable names from define.xml (nil falls back to USUBJID)
# records - Array of record Hashes keyed by Symbol
#
# Returns [keys, valid, duplicate_file]; duplicate_file is nil when the
# key uniquely identifies every record.
def key_check_define(ds, keys, records)
  keys ||= [:USUBJID]

  # group the records by the tuple of key values
  grouped = records.group_by { |record| keys.map { |var| record[var.to_sym] } }

  # size of the largest group (0 when there are no records at all)
  max_group = grouped.map { |_tuple, members| members.size }.max || 0

  if max_group == 1
    [keys, true, nil]
  else
    # dump the duplicated records to CSV and report the file path
    [keys, false, display_duplicate_records(ds, grouped, :define)]
  end
end
|
|
487
|
+
|
|
488
|
+
# search non-ASCII character
|
|
489
|
+
# Scan every value of every record for non-ASCII characters.
#
# records - Array of record Hashes (variable Symbol => value)
#
# Returns [issues, valid]: issues is an Array of "VAR: value" strings and
# valid is true when nothing was flagged.
def ascii_search(records)
  issues = records.flat_map do |record|
    record.filter_map do |var, value|
      # numeric and missing values cannot carry a stray character
      next if value.nil? || value.is_a?(Integer) || value.is_a?(Float)

      "#{var}: #{value.inspect}" if value.match(/[^[:ascii:]]/)
    end
  end

  [issues, issues.empty?]
end
|
|
504
|
+
|
|
505
|
+
###
|
|
506
|
+
# load config file
|
|
507
|
+
# Parse the YAML configuration file and return it as a Hash.
#
# file - path to a YAML file (see config.default.yaml for the expected keys)
#
# NOTE(review): on Ruby >= 3.1 (Psych 4) YAML.load_file is safe-load by
# default, so YAML aliases and arbitrary object types are rejected —
# confirm the shipped config files do not rely on them.
def load_config(file)
  YAML.load_file(file)
end
|
|
510
|
+
|
|
511
|
+
###
|
|
512
|
+
# load xpt file
|
|
513
|
+
###
# Load one .xpt transport file and return its records as an Array of
# Hashes keyed by variable Symbol. Only the first dataset of the transport
# is read; a warning is printed when the file holds more than one.
def load_dsfile(infile)
  reader = SAS::XPT::Reader.new(infile)

  # an XPT container may embed several datasets; keep the first only
  puts "several datasets in one xpt, only the first one is kept" if reader.library.datasets.size > 1
  first_dataset = reader.library.datasets.first

  # variable names as symbols, in column order
  variable_names = first_dataset.variables.map { |v| v.name.to_sym }

  # pair each observation row with the variable names
  first_dataset.observations.map do |row_values|
    variable_names.zip(row_values).to_h
  end
end
|
|
529
|
+
|
|
530
|
+
####
|
|
531
|
+
# load define.xml with Nokogiri when present
|
|
532
|
+
# export the key variables by dataset (hash)
|
|
533
|
+
# nil otherwise
|
|
534
|
+
####
# Load define.xml with Nokogiri when the config points at one.
#
# Returns [define_keys, define_report]:
# - define_keys: Hash of dataset name => key variable names, '-' when the
#   config marks the define as intentionally absent, nil on load failure
# - define_report: Hash with :define_path and :load for the report
def load_define
  # structure for report
  define_report = {
    define_path: nil,
    load: false,
  }

  # initialize from config file
  define_path = @config['define_path']

  # robustness: a missing 'define_path' entry would otherwise crash on
  # File.basename(nil) below, outside the rescue
  if define_path.nil?
    puts "⚠ No 'define_path' entry in the configuration file"
    return [nil, define_report]
  end

  # append the file name when the config only gives the directory
  if define_path != '-' && File.basename(define_path) != 'define.xml'
    define_path = "#{define_path}/define.xml"
  end
  # store
  define_report[:define_path] = define_path

  # '-' means "no define.xml for this study": recorded, not an error
  if define_path == '-'
    define_report[:load] = false
    define_node = '-'
  else
    begin
      define_node = File.open(define_path, "rb") { |io| Nokogiri::XML(io, &:noblanks) }
      define_report[:load] = true
    rescue => e
      puts "⚠ Error loading define.xml : #{e.message}"
      define_report[:load] = false
      define_node = nil
    end
  end

  [get_define_keys(define_node), define_report]
end
|
|
567
|
+
|
|
568
|
+
# Get the key variables per dataset from a define.xml Nokogiri node.
#
# Returns a Hash of dataset name => ordered Array of key variable names
# (ordered by ItemRef/@KeySequence, which defines the key order in
# Define-XML), '-' when the define is marked absent, or nil when no
# define could be loaded.
private def get_define_keys(define_node)
  # shortcut if no define
  return nil if define_node.nil?
  # shortcut if define = -
  return '-' if define_node == '-'

  # all dataset definitions
  item_group = define_node.css("ItemGroupDef")
  # all variable definitions, indexed by OID
  item_def = define_node.css("ItemDef").group_by { |id| id['OID'] }

  # keep only the dataset name & its key variables
  define_keys = {}
  item_group.each do |ig|
    ds = ig["Name"].upcase

    # collect [KeySequence, variable name] pairs for this dataset
    keyed_refs = []
    ig.css("ItemRef").each do |ir|
      # variables without a KeySequence are not part of the key
      index = ir["KeySequence"]&.to_i
      next if index.nil?

      # resolve the variable definition through its OID
      item = item_def[ir["ItemOID"]]
      if item.nil?
        # broken reference in the define: report and skip the variable
        puts "no ItemDef found for OID #{ir['ItemOID']}, variable skipped"
        next
      end
      # expect one ItemDef per OID
      unless item.size == 1
        puts "several item with the same OID: #{item.size}"
        item.each do |i|
          puts i
        end
        puts "only the first is kept"
      end
      keyed_refs << [index, item.first["Name"]]
    end

    # order the key by KeySequence before assigning it to the dataset
    define_keys[ds] = keyed_refs.sort_by(&:first).map(&:last)
  end

  define_keys
end
|
|
613
|
+
|
|
614
|
+
###
|
|
615
|
+
# display
|
|
616
|
+
#
|
|
617
|
+
# create a CVS with all duplicates
|
|
618
|
+
###
# Write every duplicated group of +keys_records+ to a CSV file.
#
# ds           - dataset name, used in the file name
# keys_records - Hash of key tuple => Array of record Hashes (group_by output)
# type         - :data or :define, used in the file name
#
# Returns the path of the CSV written. Each row is prefixed with a 'No'
# column numbering the duplicate group it belongs to.
def display_duplicate_records(ds, keys_records, type)
  # a CSV is always produced; the Excel output is assembled later from
  # these CSVs, so no extension switch is needed here (the previous
  # if/else had two identical branches)
  display_out = "#{dir_out}/#{type}_#{ds}.csv"

  # TODO: the header is derived from the first record of the first group —
  # fragile if records ever stop sharing the same variables
  header = keys_records.first.last.first.keys.unshift('No')

  duplicates = keys_records.reject { |_key, list| list.size == 1 }

  CSV.open(display_out, "w") do |csv|
    # header
    csv << header
    # body: number each duplicate group, then emit its records
    # (each_with_index replaces the O(n^2) duplicate.keys.index lookup)
    duplicates.each_with_index do |(_key, list), group_index|
      list.each do |record|
        csv << record.values.unshift(group_index + 1)
      end
    end
  end

  display_out
end
|
|
647
|
+
|
|
648
|
+
# write report
|
|
649
|
+
# Render the accumulated report to HTML.
# output_file (the Excel duplicate workbook path, or nil for CSV output)
# is forwarded so the report can link the duplicate listings.
def write_html_report
  @report.output_file = output_file
  @report.save_html
end
|
|
653
|
+
|
|
654
|
+
# Convert the HTML report into a Word (.docx) document.
# The HTML report is produced first when it does not exist yet.
def write_word_report
  # make sure the HTML source exists
  write_html_report unless File.exist?(@report.report_name)

  # derive the .docx path from the .html one
  html_path = @report.report_name
  docx_path = "#{html_path[...-5]}.docx"

  # extract the report content as a list of blocks
  puts "Parsing #{html_path}..."
  parsed_blocks = RiReportParser.new(html_path).parse
  puts " -> #{parsed_blocks.size} blocks extracted"

  # render the blocks into the Word document
  puts "Building #{docx_path}..."
  DocxWriter.new(docx_path).write(parsed_blocks)
  puts " -> Done: #{docx_path}"
end
|
|
673
|
+
|
|
674
|
+
# Build a legacy .xls workbook from the duplicate CSV files, one sheet per
# CSV. Data rows alternate a grey background each time the duplicate group
# number (first column) changes, and columns are widened to fit.
private def write_xls(csv_files)
  # create xls file
  book = Spreadsheet::Workbook.new

  # workbook default format
  format_default = Spreadsheet::Format.new(
    font: Spreadsheet::Font.new('Verdana', size: 8)
  )

  # alternate-row format (grey background)
  format_gray = Spreadsheet::Format.new(
    font: Spreadsheet::Font.new('Verdana', size: 8),
    pattern: 1,
    pattern_fg_color: :silver
  )

  # header format
  format_header = Spreadsheet::Format.new(
    weight: :bold, # TODO, not working, dont known why
    font: Spreadsheet::Font.new('Verdana', size: 8),
    pattern: 1,
    pattern_fg_color: :grey
  )

  # loop on csv files
  csv_files.each do |csv_file|
    # sheet name: drops the first 11 characters of the base name —
    # presumably a fixed prefix; TODO confirm against the CSV naming
    sheet_name = File.basename(csv_file, '.csv')[11...]

    # create worksheet
    sheet = book.create_worksheet(name: sheet_name)

    # state for the alternate-colour switch
    previous_first_cell_value = nil
    should_color = false

    # read csv & write worksheet
    row_index = 0
    CSV.foreach(csv_file, headers: false, encoding: 'UTF-8') do |row|
      # flip the background colour whenever the duplicate group number
      # (first column) changes from one row to the next
      # (the previous `if row_index >= 0` guard was always true and removed)
      if row[0] != previous_first_cell_value
        should_color = !should_color
        previous_first_cell_value = row[0]
      end

      # pick the row format: header, alternate colour, or default
      sheet.row(row_index).default_format =
        if row_index == 0
          format_header
        elsif should_color
          format_gray
        else
          format_default
        end

      # write line by line
      sheet.row(row_index).concat(row)
      row_index += 1
    end

    # adjust the column widths (min 10, capped at 50)
    if row_index > 0
      num_columns = sheet.row(0).size

      (0...num_columns).each do |col_index|
        max_length = 10 # min length

        # scan the whole column for the longest cell
        (0...row_index).each do |r|
          cell_length = sheet.row(r)[col_index].to_s.length + 2 # +2 for margin
          max_length = cell_length if cell_length > max_length
        end

        sheet.column(col_index).width = [max_length, 50].min
      end
    end
  end

  # save the workbook
  book.write(output_file)
end
|
|
762
|
+
|
|
763
|
+
# Build a .xlsx workbook (RubyXL) from the duplicate CSV files, one sheet
# per CSV. Data rows alternate a grey fill each time the duplicate group
# number (first column) changes; columns are widened to fit their content.
private def write_xlsx(csv_files)
  # create workbook
  workbook = RubyXL::Workbook.new

  # delete default worksheet
  workbook.worksheets.delete_at(0)

  # loop on all csv files
  csv_files.each do |csv_file|
    # create worksheet named after the csv
    worksheet = workbook.add_worksheet(File.basename(csv_file, '.csv'))

    # state for the alternate-fill switch
    previous_first_cell_value = nil
    should_color = false

    # read csv & write worksheet
    row_index = 0
    CSV.foreach(csv_file, headers: false, encoding: 'UTF-8') do |row|
      # flip the fill whenever the group number (first column) changes;
      # hoisted out of the cell loop — the previous code re-checked this
      # once per cell, and its `if row_index >= 0` guard was always true
      if row[0] != previous_first_cell_value
        should_color = !should_color
        previous_first_cell_value = row[0]
      end

      row.each_with_index do |cell_value, col_index|
        cell = worksheet.add_cell(row_index, col_index, cell_value)

        # common font, then header/alternate styling
        cell.change_font_name('Verdana')
        cell.change_font_size(8)
        if row_index == 0
          # header row: bold on grey
          cell.change_font_bold(true)
          cell.change_fill('C0C0C0')
        elsif should_color
          cell.change_fill('DCDCDC')
        end
      end

      row_index += 1
    end

    # adjust the column widths (min 10, capped at 50)
    next unless row_index > 0

    num_columns = worksheet[0].cells.length
    (0...num_columns).each do |col_index|
      max_length = 10 # minimum length

      # scan the whole column for the longest cell
      (0...row_index).each do |r|
        cell = worksheet[r][col_index]
        if cell && cell.value
          cell_length = cell.value.to_s.length + 2 # +2 for margin
          max_length = cell_length if cell_length > max_length
        end
      end

      col = worksheet.cols.get_range(col_index)
      col.width = [max_length * 1.2, 50].min
      col.custom_width = true
    end
  end

  # save the workbook
  workbook.write(output_file)
end
|
|
845
|
+
|
|
846
|
+
###
|
|
847
|
+
# Helpers
|
|
848
|
+
|
|
849
|
+
# output directory
|
|
850
|
+
# Prepare the directory that receives the generated files.
# The path is built from the study name and the configured output
# directory; the directory is created when missing, and stale CSV
# files from a previous run are removed when it already exists.
# Returns the directory path ('.' when no usable value is configured).
def prepare_output_dir
  target = "#{study_name}/#{config['output_directory']}"

  if target.nil? || target.strip.empty?
    # No usable value: fall back to the current directory.
    target = '.'
  elsif Dir.exist?(target)
    # Directory already present: clear CSV output from earlier runs.
    Dir.glob("#{target}/**/*.csv").each { |stale| File.delete(stale) }
  else
    # Directory missing: create the full path.
    FileUtils.mkdir_p(target)
  end

  target
end
|
|
868
|
+
|
|
869
|
+
# Path of the output directory, without creating or cleaning it.
# Falls back to the current directory when no usable value is configured.
def dir_out
  path = "#{study_name}/#{config['output_directory']}"
  return '.' if path.nil? || path.strip.empty?

  path
end
|
|
874
|
+
|
|
875
|
+
# extension of the output file
|
|
876
|
+
# File extension for the generated output.
# Returns the configured type when it is an Excel format
# ('xls' or 'xlsx'); any other value falls back to 'csv'.
def outname_ext
  type = config['output_type']
  %w(xls xlsx).include?(type) ? type : 'csv'
end
|
|
883
|
+
|
|
884
|
+
# compute the file name for Excel file
|
|
885
|
+
# Name of the Excel workbook to produce, or nil when the output
# type is CSV (no single workbook file in that case).
def output_file
  ext = outname_ext
  ext == 'csv' ? nil : "duplicates.#{ext}"
end
|
|
892
|
+
|
|
893
|
+
# list of invalid datasets, whatever the issue
|
|
894
|
+
# All dataset names flagged with at least one issue, regardless of
# which check flagged them, with duplicates removed.
def dslist_invalid
  flagged = @ds_with_issue.values.flatten
  flagged.uniq
end
|
|
897
|
+
|
|
898
|
+
###
|
|
899
|
+
# Report class
|
|
900
|
+
class Report
  # Collects the data gathered during a check run and renders it as an
  # HTML report through the ERB template shipped with the gem.
  attr_reader :report_name, :study_name, :externe_file_type

  attr_accessor :init_reports, :dataset_reports, :output_file

  # report_name       -- path of the HTML file to produce
  # study_name        -- study identifier or path (used for the title)
  # externe_file_type -- type of the external files being checked
  def initialize(report_name, study_name, externe_file_type)
    @report_name = report_name
    @study_name = study_name
    @externe_file_type = externe_file_type
    @init_reports = {}
    @dataset_reports = {}
  end

  # Store the summary hash produced by FindKeys#initialize.
  def add_init(text_hash)
    @init_reports = text_hash
  end

  # Store the report produced by FindKeys#high_level_check for one dataset.
  def add_dataset_report(ds_name, report_data)
    @dataset_reports[ds_name] = report_data
  end

  # Title shown in the report: the last path component when the study
  # name looks like a path (web access), the plain name otherwise (manual use).
  def title
    @study_name.include?('/') ? File.basename(@study_name) : @study_name
  end

  # Render the HTML report from the bundled ERB template.
  # Raises when the template file cannot be found.
  def generate_html
    template_path = File.expand_path('../../../views/report_template.erb', __FILE__)
    raise "Template file not found: #{template_path}" unless File.exist?(template_path)

    # Inline the logo as a base64 data URI so the report is self-contained;
    # nil when the image is not shipped.
    logo_path = File.expand_path('../../../public/Contact-LOGO.png', __FILE__)
    @logo_base64 =
      if File.exist?(logo_path)
        require 'base64'
        "data:image/png;base64,#{Base64.strict_encode64(File.binread(logo_path))}"
      end

    ERB.new(File.read(template_path), trim_mode: '-').result(binding)
  end

  # Render the report and write it to output_path (defaults to report_name).
  def save_html(output_path = nil)
    output_path ||= @report_name
    File.write(output_path, generate_html)
    puts "Rapport HTML généré : #{output_path}"
  end
end
|
|
969
|
+
|
|
970
|
+
end
|
|
971
|
+
end
|
|
972
|
+
|
|
973
|
+
# config_file = 'config.yaml'
|
|
974
|
+
# fk = FindKeys.new(config_file, verbose: true)
|
|
975
|
+
# fk.high_level_check
|
|
976
|
+
# fk.write_html_report
|
|
977
|
+
# fk.write_word_report
|
|
978
|
+
# puts fk.ds_with_issue
|
|
979
|
+
# puts fk.dslist_invalid
|