evaltree 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. evaltree/EvalTree.py +3359 -0
  2. evaltree/__init__.py +0 -0
  3. evaltree/scripts/ComparingPartitions/.git/HEAD +1 -0
  4. evaltree/scripts/ComparingPartitions/.git/config +12 -0
  5. evaltree/scripts/ComparingPartitions/.git/description +1 -0
  6. evaltree/scripts/ComparingPartitions/.git/hooks/applypatch-msg.sample +15 -0
  7. evaltree/scripts/ComparingPartitions/.git/hooks/commit-msg.sample +24 -0
  8. evaltree/scripts/ComparingPartitions/.git/hooks/fsmonitor-watchman.sample +174 -0
  9. evaltree/scripts/ComparingPartitions/.git/hooks/post-update.sample +8 -0
  10. evaltree/scripts/ComparingPartitions/.git/hooks/pre-applypatch.sample +14 -0
  11. evaltree/scripts/ComparingPartitions/.git/hooks/pre-commit.sample +49 -0
  12. evaltree/scripts/ComparingPartitions/.git/hooks/pre-merge-commit.sample +13 -0
  13. evaltree/scripts/ComparingPartitions/.git/hooks/pre-push.sample +53 -0
  14. evaltree/scripts/ComparingPartitions/.git/hooks/pre-rebase.sample +169 -0
  15. evaltree/scripts/ComparingPartitions/.git/hooks/pre-receive.sample +24 -0
  16. evaltree/scripts/ComparingPartitions/.git/hooks/prepare-commit-msg.sample +42 -0
  17. evaltree/scripts/ComparingPartitions/.git/hooks/push-to-checkout.sample +78 -0
  18. evaltree/scripts/ComparingPartitions/.git/hooks/sendemail-validate.sample +77 -0
  19. evaltree/scripts/ComparingPartitions/.git/hooks/update.sample +128 -0
  20. evaltree/scripts/ComparingPartitions/.git/index +0 -0
  21. evaltree/scripts/ComparingPartitions/.git/info/exclude +6 -0
  22. evaltree/scripts/ComparingPartitions/.git/logs/HEAD +1 -0
  23. evaltree/scripts/ComparingPartitions/.git/logs/refs/heads/master +1 -0
  24. evaltree/scripts/ComparingPartitions/.git/logs/refs/remotes/origin/HEAD +1 -0
  25. evaltree/scripts/ComparingPartitions/.git/objects/pack/pack-b4866a13dc4806c75e665495ee6671b98a695802.idx +0 -0
  26. evaltree/scripts/ComparingPartitions/.git/objects/pack/pack-b4866a13dc4806c75e665495ee6671b98a695802.pack +0 -0
  27. evaltree/scripts/ComparingPartitions/.git/objects/pack/pack-b4866a13dc4806c75e665495ee6671b98a695802.rev +0 -0
  28. evaltree/scripts/ComparingPartitions/.git/packed-refs +5 -0
  29. evaltree/scripts/ComparingPartitions/.git/refs/heads/master +1 -0
  30. evaltree/scripts/ComparingPartitions/.git/refs/remotes/origin/HEAD +1 -0
  31. evaltree/scripts/ComparingPartitions/CP_demodata.txt +326 -0
  32. evaltree/scripts/ComparingPartitions/LICENSE +674 -0
  33. evaltree/scripts/ComparingPartitions/README.md +43 -0
  34. evaltree/scripts/ComparingPartitions/comparing_partitions.py +201 -0
  35. evaltree/scripts/ComparingPartitions/comparing_partitions_v2.py +481 -0
  36. evaltree/scripts/ComparingPartitions/metrics.py +193 -0
  37. evaltree/scripts/WGS_cluster_congruence/.git/HEAD +1 -0
  38. evaltree/scripts/WGS_cluster_congruence/.git/config +12 -0
  39. evaltree/scripts/WGS_cluster_congruence/.git/description +1 -0
  40. evaltree/scripts/WGS_cluster_congruence/.git/hooks/applypatch-msg.sample +15 -0
  41. evaltree/scripts/WGS_cluster_congruence/.git/hooks/commit-msg.sample +24 -0
  42. evaltree/scripts/WGS_cluster_congruence/.git/hooks/fsmonitor-watchman.sample +174 -0
  43. evaltree/scripts/WGS_cluster_congruence/.git/hooks/post-update.sample +8 -0
  44. evaltree/scripts/WGS_cluster_congruence/.git/hooks/pre-applypatch.sample +14 -0
  45. evaltree/scripts/WGS_cluster_congruence/.git/hooks/pre-commit.sample +49 -0
  46. evaltree/scripts/WGS_cluster_congruence/.git/hooks/pre-merge-commit.sample +13 -0
  47. evaltree/scripts/WGS_cluster_congruence/.git/hooks/pre-push.sample +53 -0
  48. evaltree/scripts/WGS_cluster_congruence/.git/hooks/pre-rebase.sample +169 -0
  49. evaltree/scripts/WGS_cluster_congruence/.git/hooks/pre-receive.sample +24 -0
  50. evaltree/scripts/WGS_cluster_congruence/.git/hooks/prepare-commit-msg.sample +42 -0
  51. evaltree/scripts/WGS_cluster_congruence/.git/hooks/push-to-checkout.sample +78 -0
  52. evaltree/scripts/WGS_cluster_congruence/.git/hooks/sendemail-validate.sample +77 -0
  53. evaltree/scripts/WGS_cluster_congruence/.git/hooks/update.sample +128 -0
  54. evaltree/scripts/WGS_cluster_congruence/.git/index +0 -0
  55. evaltree/scripts/WGS_cluster_congruence/.git/info/exclude +6 -0
  56. evaltree/scripts/WGS_cluster_congruence/.git/logs/HEAD +1 -0
  57. evaltree/scripts/WGS_cluster_congruence/.git/logs/refs/heads/main +1 -0
  58. evaltree/scripts/WGS_cluster_congruence/.git/logs/refs/remotes/origin/HEAD +1 -0
  59. evaltree/scripts/WGS_cluster_congruence/.git/objects/pack/pack-e440654b80466e9e939f543a53a1c0c50c2fc934.idx +0 -0
  60. evaltree/scripts/WGS_cluster_congruence/.git/objects/pack/pack-e440654b80466e9e939f543a53a1c0c50c2fc934.pack +0 -0
  61. evaltree/scripts/WGS_cluster_congruence/.git/objects/pack/pack-e440654b80466e9e939f543a53a1c0c50c2fc934.rev +0 -0
  62. evaltree/scripts/WGS_cluster_congruence/.git/packed-refs +3 -0
  63. evaltree/scripts/WGS_cluster_congruence/.git/refs/heads/main +1 -0
  64. evaltree/scripts/WGS_cluster_congruence/.git/refs/remotes/origin/HEAD +1 -0
  65. evaltree/scripts/WGS_cluster_congruence/LICENSE +661 -0
  66. evaltree/scripts/WGS_cluster_congruence/README.md +34 -0
  67. evaltree/scripts/WGS_cluster_congruence/comparison_outbreak_level.py +246 -0
  68. evaltree/scripts/WGS_cluster_congruence/congruence_plots.py +244 -0
  69. evaltree/scripts/WGS_cluster_congruence/get_best_part_correspondence.py +128 -0
  70. evaltree/scripts/WGS_cluster_congruence/get_stats_threshold.py +143 -0
  71. evaltree/scripts/WGS_cluster_congruence/heatmap_final_score.py +137 -0
  72. evaltree/scripts/WGS_cluster_congruence/poli_typing.py +111 -0
  73. evaltree/scripts/WGS_cluster_congruence/remove_hifen.py +46 -0
  74. evaltree/scripts/WGS_cluster_congruence/stats_outbreak_analysis.py +204 -0
  75. evaltree/scripts/WGS_cluster_congruence/stats_outbreak_analysis_snp_dists.py +150 -0
  76. evaltree/scripts/WGS_cluster_congruence/wgmlst_exercise.py +96 -0
  77. evaltree-0.1.0.dist-info/LICENSE +674 -0
  78. evaltree-0.1.0.dist-info/METADATA +219 -0
  79. evaltree-0.1.0.dist-info/RECORD +81 -0
  80. evaltree-0.1.0.dist-info/WHEEL +4 -0
  81. evaltree-0.1.0.dist-info/entry_points.txt +3 -0
evaltree/EvalTree.py ADDED
@@ -0,0 +1,3359 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ EvalTree: toolbox for comparative clustering evaluation of whole genome sequencing (WGS) pipelines for bacteria routine surveillance
5
+ By Joana Gomes Pereira
6
+ @INSA
7
+
8
+ """
9
+ version = "1.0.0"
10
+ last_updated = "2025-05-20"
11
+
12
+ import datetime
13
+ import argparse
14
+ import os
15
+ import sys
16
+ import time
17
+ import textwrap
18
+ import pandas as pd
19
+ import glob
20
+ import fnmatch
21
+ import plotly.express as px
22
+ import plotly.io as pio
23
+ import plotly.graph_objects as go
24
+ import re
25
+ import random
26
+ import numpy as np
27
+ import subprocess
28
+ from scipy import stats
29
+ import math
30
+
31
+
32
def get_path_toolbox():
    """
    Locate the running toolbox script on disk.

    The script location is resolved with os.path.realpath, so symbolic links
    are followed before the containing directory is derived from it.

    Parameters
    ----------
    None

    Returns
    -------
    path_toolbox_script: str
        Resolved absolute path of this script file.
    directory_toolbox: str
        Directory that contains the script.
    """
    script_location = os.path.realpath(__file__)
    return script_location, os.path.dirname(script_location)
55
+
56
def get_path_other_scripts(directory_toolbox):
    """
    Build the paths of the helper scripts shipped under the toolbox 'scripts' folder.

    Four scripts are located:

    - comparing_partitions_v2.py (scripts/ComparingPartitions)
    - get_best_part_correspondence.py (scripts/WGS_cluster_congruence)
    - remove_hifen.py (scripts/WGS_cluster_congruence)
    - stats_outbreak_analysis.py (scripts/WGS_cluster_congruence)

    The paths are only assembled; their existence is not checked here.

    Parameters
    ----------
    directory_toolbox: str
        Directory that contains the main toolbox script.

    Returns
    -------
    comparing_partitions_script: str
        Path to comparing_partitions_v2.py.
    get_best_part_correspondence_script: str
        Path to get_best_part_correspondence.py.
    remove_hifen_script: str
        Path to remove_hifen.py.
    stats_outbreak_script: str
        Path to stats_outbreak_analysis.py.
    """
    scripts_root = os.path.join(directory_toolbox, 'scripts')
    congruence_dir = os.path.join(scripts_root, 'WGS_cluster_congruence')

    return (
        os.path.join(scripts_root, 'ComparingPartitions', 'comparing_partitions_v2.py'),
        os.path.join(congruence_dir, 'get_best_part_correspondence.py'),
        os.path.join(congruence_dir, 'remove_hifen.py'),
        os.path.join(congruence_dir, 'stats_outbreak_analysis.py'),
    )
109
+
110
def check_input_argument(input1, input2):
    """
    Sort the two input arguments (-i1 and -i2) into folders and .tsv files.

    Each provided path must exist. Directories are collected in one list and
    files ending in '.tsv' in another; any other file extension stops the
    program. A missing (None) argument only triggers a warning.

    Parameters
    ----------
    input1: str or None
        Path given as first input argument.
    input2: str or None
        Path given as second input argument.

    Returns
    -------
    folders: list
        Input paths that are directories.
    files: list
        Input paths that are .tsv files.
    """
    folders, files = [], []

    for candidate in (input1, input2):
        if candidate is None:
            print(f"\tWarning: Only one input file was provided. Inter-pipeline cluster congruence analysis will not be performed.\n")
            continue

        if not os.path.exists(candidate):
            sys.exit(f'Error: The input {candidate} was not found.')

        if os.path.isdir(candidate):
            folders.append(candidate)
        elif os.path.isfile(candidate):
            if not candidate.endswith('.tsv'):
                sys.exit(f'\tError: Only files with the *.tsv extension are allowed. The extension of {candidate} is not allowed.')
            files.append(candidate)

    return folders, files
156
+
157
def check_folder(input_path):
    """
    Scan an input folder (e.g., a ReporTree folder) for the expected files.

    Files are recognised by their name suffix and the part of the name before
    the suffix is taken as the file prefix. All recognised files must share
    one prefix; otherwise the offending files are listed and the program stops.
    The program also stops when the folder holds no recognised file at all.

    Parameter
    ---------
    input_path: str
        Path to the directory.

    Returns
    -------
    partitions : str or None
        Path to the partition matrix file (*_partitions.tsv, excluding *_w_partitions.tsv).
    partitions_summary : str or None
        Path to the partitions summary file (*_partitions_summary.tsv).
    sample_interest : str or None
        Path to the *_SAMPLES_OF_INTEREST_partitions_summary.tsv file.
    clusterComposition : str or None
        Path to the cluster composition file (*_clusterComposition.tsv).
    final_prefix : str
        Prefix shared by all recognised files (base name only).
    input_path : str
        The input folder, as received.
    stable_region : str or None
        Path to the stable regions file (*_stableRegions.tsv).
    """
    if not input_path:
        sys.exit(f"\tError: The folder {input_path} is empty or does not exist.")

    partitions = None
    partitions_summary = None
    sample_interest = None
    clusterComposition = None
    stable_region = None

    file_prefix_map = {}

    for name in os.listdir(input_path):
        file = os.path.join(input_path, name)

        if fnmatch.fnmatch(file, "*_clusterComposition.tsv"):
            clusterComposition = file
            suffix = "_clusterComposition.tsv"

        elif fnmatch.fnmatch(file, "*_partitions.tsv") and not fnmatch.fnmatch(file, '*_w_partitions.tsv'):
            partitions = file
            suffix = "_partitions.tsv"

        elif fnmatch.fnmatch(file, "*_partitions_summary.tsv") and not fnmatch.fnmatch(file, '*_SAMPLES_OF_INTEREST_partitions_summary.tsv'):
            partitions_summary = file
            suffix = "_partitions_summary.tsv"

        elif fnmatch.fnmatch(file, "*_SAMPLES_OF_INTEREST_partitions_summary.tsv"):
            sample_interest = file
            suffix = "_SAMPLES_OF_INTEREST_partitions_summary.tsv"

        elif fnmatch.fnmatch(file, "*_stableRegions.tsv"):
            stable_region = file
            suffix = "_stableRegions.tsv"

        else:
            continue

        # The prefix is whatever precedes the recognised suffix.
        file_prefix_map[file] = file[:-len(suffix)]

    # Sorted so that the error listing below is deterministic.
    unique_prefixes = sorted(set(file_prefix_map.values()))
    list_prefixes = [os.path.basename(p) for p in unique_prefixes]

    if not unique_prefixes:
        # Without this guard an empty or unrelated folder would fail later
        # with an IndexError when the prefix is picked.
        sys.exit(f"\tError: None of the expected files (e.g., *_partitions.tsv) were found in the {input_path} folder.")

    if len(unique_prefixes) > 1:
        print(f"\nError: Multiple prefixes were found in the {input_path} folder: {' and '.join(list_prefixes)}. Please revise the structure of your input folder.")
        for prefix in unique_prefixes:
            for file, file_prefix in file_prefix_map.items():
                if file_prefix == prefix:
                    print(f" - {file}")
        sys.exit()

    final_prefix = list_prefixes[0]

    print(f'\tFiles of {input_path}:')
    print(f'\t\tPrefix: {final_prefix}')
    print(f'\t\tPartition matrix: {partitions}')
    print(f'\t\tPartition summary: {partitions_summary}')
    print(f'\t\tSample of interest: {sample_interest}')
    print(f'\t\tCluster composition: {clusterComposition}')
    print(f'\t\tStable regions: {stable_region}')

    return partitions, partitions_summary, sample_interest, clusterComposition, final_prefix, input_path, stable_region
255
+
256
def check_output(output):
    """
    Resolve the output directory.

    When no output is given, the default folder 'pipeline1_vs_pipeline2' is
    created in the current working directory. When an output is given it must
    already be a directory; otherwise the program stops with an error message.

    Parameter
    ---------
    output: str or None
        Path to the directory where the results will be saved.

    Returns
    -------
    full_path_output: str
        Absolute path to the output directory.
    rename: bool
        True when the default folder name was used, False otherwise.
    """
    rename = False

    if output is None:
        output = 'pipeline1_vs_pipeline2'
        rename = True
        os.makedirs(output, exist_ok=True)

    elif not os.path.isdir(output):
        sys.exit(f'\tError: The specified {output} is not a valid directory.')

    return os.path.abspath(output), rename
285
+
286
def check_threshold(threshold):
    """
    Validate the format of the threshold argument used to filter the partition matrix.

    The value must be either "max" or a range "X-Y" where X and Y are made of
    decimal digits only. Any other format stops the program with an error
    message.

    Parameter
    ---------
    threshold: str
        Range of thresholds to apply in the partition matrix file.

    Return
    ------
    threshold: str
        The validated threshold string, unchanged.
    """
    if threshold == 'max':
        return threshold

    parts = threshold.split('-')

    # str.isdecimal (not str.isdigit): isdigit also accepts characters such as
    # superscript digits, which int() cannot convert when the range is parsed.
    if len(parts) != 2 or not all(part.isdecimal() for part in parts):
        sys.exit(f"\tError: The threshold argument (-t) must be in the format 'X-Y', where X and Y are positive integers.")

    return threshold
313
+
314
def check_score(score):
    """
    Convert the score to a float and make sure it lies between 0 and 3 (inclusive).

    The program stops with an error message when the value cannot be converted
    or falls outside that range.

    Parameter
    ---------
    score: str
        Score value as provided by the user.

    Return
    ------
    score_value: float
        Validated score.
    """
    try:
        score_value = float(score)
    except ValueError:
        sys.exit(f"\tError: The score value {score} is not a float.")

    # Chained comparison kept on purpose: it is also False for NaN.
    in_range = 0 <= score_value <= 3
    if not in_range:
        sys.exit(f"\tError: The score value {score_value} is out of the allowed range (0 to 3).")

    return score_value
340
+
341
def check_file(file):
    """
    Inspect the structure of an input matrix file.

    The file is read as a tab-separated table. Exactly two columns are treated
    as a sequence-type matrix; more than two columns as a partition matrix.
    A table with fewer than two columns stops the program with an error message.

    Parameter
    ---------
    file: str
        Path to the input matrix file.

    Returns
    -------
    filename : str
        Absolute path to the input file.
    prefix : str
        File name up to the first '.tsv'.
    path_directory : str
        Name of the directory where the file is located.
    file_type : bool
        False for sequence-type matrix; True for partition matrix.
    n_samples : int
        Number of rows read from the file.
    n_groups : int or None
        Number of distinct values in the second column (only for sequence-type matrix).
    """
    df = pd.read_table(file)
    n_samples, nr_columns = df.shape

    if nr_columns < 2:
        # Previously this case fell through both branches and failed with an
        # UnboundLocalError on return.
        sys.exit(f"\tError: The file {file} must contain at least two tab-separated columns.")

    filename = os.path.abspath(file)
    prefix = os.path.basename(file).split('.tsv')[0]
    # Name of the parent directory, independent of the path separator.
    path_directory = os.path.basename(os.path.dirname(filename))

    if nr_columns == 2:
        n_groups = len(df.iloc[:, 1].unique())
        file_type = False
    else:
        n_groups = None
        file_type = True

    return filename, prefix, path_directory, file_type, n_samples, n_groups
388
+
389
def check_str_plots_threshold(plots_thresholds):
    """
    Split the comma-separated plot thresholds and validate the format of each one.

    Every entry must look like 'METHOD-NxM.M' (e.g., MST-4x1.0). The program
    stops with an error message at the first entry that does not.

    Parameter
    ---------
    plots_thresholds: str
        One or more thresholds provided by the user, separated by commas and without spaces.

    Return
    ------
    values: list
        The validated thresholds, in the order given.
    """
    matches_format = re.compile(r'^[A-Za-z]+-\d+x\d+\.\d+$').match

    values = []
    for entry in plots_thresholds.split(","):
        if matches_format(entry) is None:
            sys.exit(f"\tError: The value '{entry}' does not follow the expected format (e.g., METHOD-NxM.M). Multiple plots must be separated by commas and without spaces.")
        values.append(entry)

    return values
417
+
418
+ def check_combinations_arguments(plots_summary_arg, data_folder, data_files):
419
+
420
+ """
421
+ Validate the combination of command-line options against the inputs that were found.
422
+ Option flags are looked up directly in sys.argv; the problems found are collected, de-duplicated and reported in a single sys.exit call.
423
+
424
+ Parameters
425
+ ----------
426
+ plots_summary_arg: str
427
+ Type of file to be used for clustering characterization; compared against 'sample_of_interest' and 'partitions_summary'.
428
+
429
+ data_folder: list
430
+ One entry per input folder; indices 2 to 6 of each entry are read as input path, partitions summary, samples of interest, cluster composition and stable regions.
431
+
432
+ data_files: list
433
+ One entry per input file; only its length is inspected here.
434
+
435
+ Returns
436
+ -------
437
+ go_clustering: boolean
438
+ Indicates whether clustering characterization can be executed.
439
+ go_outbreaks: boolean
440
+ Indicates whether outbreak analysis can be executed.
+
+ Notes
+ -----
+ NOTE(review): the '-pcn'/'-pcp' conflict exits immediately instead of being added to the collected errors.
+ NOTE(review): in the '-to' check, len(data_folder[1]) is only tested for truthiness while len(data_folder[0]) is compared with 7 - confirm whether '== 7' was intended.
+ NOTE(review): flags are matched as exact sys.argv tokens, so other spellings of the same option would not be detected - confirm against the argument parser.
441
+ """
442
+ #print(f'\n---------------------------------------------- Function: check_combinations_arguments ----------------------------------------------\n')
443
+
444
+ type_file = plots_summary_arg
445
+
446
+ args = sys.argv
447
+ if '-pcn' in args and '-pcp' in args :
448
+ sys.exit(f"\tError: It is not possible to provide the plots_category_number (-pcn) and plots_category_percentage (-pcp) at the same time.")
449
+
450
+ errors = []
451
+ go_clustering = False
452
+ go_outbreaks = False
453
+
454
+ if data_folder:
455
+ for elem in data_folder:
456
+ partitions_summary = elem[3]
457
+ sample_of_interest = elem[4]
458
+ cluster_composition = elem[5]
459
+ input_path = elem[2]
460
+ stable_regions = elem[6]
461
+
462
+
463
+ if type_file == 'sample_of_interest':
464
+ if '-cp' not in args:
465
+ errors.append(f'\tError: For clustering analysis you must specify the column plots (-cp) argument with SAMPLE_OF_INTEREST_partitions_summary file.\n')
466
+
467
+ if '-n' in args:
468
+ errors.append(f'\tError: It is impossible to use -n argument with SAMPLE_OF_INTEREST_partitions_summary file.\n')
469
+
470
+ if '-pt' not in args:
471
+ errors.append(f'\tError: For clustering analyis you must specify the plots threshold (-pt) argument with SAMPLE_OF_INTEREST_partitions_summary file.\n')
472
+
473
+ if sample_of_interest is None:
474
+ errors.append(f'\tError: The file SAMPLE_OF_INTEREST_partitions_summary does not exist in {input_path}.\n')
475
+ go_clustering = True
476
+
477
+ if type_file == 'partitions_summary':
478
+
479
+ if '-cp' in args and '-pt' in args and partitions_summary is not None:
480
+ go_clustering = True
481
+
482
+ else:
483
+ if '-cp' in args and not '-pt' in args:
484
+ errors.append(f'\tError: For clustering analyis you must specify the plots threshold (-pt) argument.\n')
485
+
486
+ if '-pt' in args and not '-cp' in args:
487
+ errors.append(f'\tError: For clustering analysis you must specify the column plots (-cp) argument.\n')
488
+
489
+ if partitions_summary is None:
490
+ errors.append(f'\tError: The file partitions_summary does not exist in {input_path}.\n')
491
+
492
+ if stable_regions is None:
493
+ if '-n_stab' in args:
494
+ errors.append(f'\tError: It is impossible to use the -n_stab argument when the file stableRegions does not exist.\n')
495
+ if '-n_thr' in args:
496
+ errors.append(f'\tError: It is impossible to use the -n_thr argument when the file stableRegions does not exist.\n')
497
+
498
+ if '-to' in args:
499
+ if cluster_composition is None:
500
+ errors.append(f'\tError: It is impossible to use the -to argument when the cluster_composition file is not in {input_path}.\n')
501
+
502
+ if '-to' in args and len(data_folder) == 1:
503
+ errors.append(f'\tError: It is impossible to use the -to argument only with one folder.\n')
504
+
505
+ if '-to' in args and len(data_files) == 1:
506
+ errors.append(f'\tError: It is impossible to use the -to argument only with input files.\n')
507
+
508
+ if len(data_folder) == 2:
509
+ if '-to' in args:
510
+ if len(data_folder[0]) == 7 and len(data_folder[1]):
511
+ if data_folder[0][5] is not None and data_folder [1][5] is not None:
512
+ go_outbreaks = True
513
+
514
+ if data_files:
515
+ if len(data_files) == 2:
516
+ cluster_args = ['-cp', '-pt', '-n', '-ps', '-pcn', '-pcp']
517
+ for elem in cluster_args:
518
+ if elem in args:
519
+ errors.append(f'\tError: It is impossible to use the {elem} argument when input file(s) are provided.\n')
520
+
521
+ if errors:
522
+ unique_errors = set(errors)
523
+ sys.exit(f"\nThe following problems were found:\n {' '.join(unique_errors)}")
524
+
525
+ return go_clustering, go_outbreaks
526
+
527
def check_data_folders_file(data_folder, data_files):
    """
    Validate the prefixes of the inputs and build the combined prefix.

    The prefix is read from the second position of every element of
    data_folder and data_files. With exactly two inputs, their prefixes must
    differ and are joined as '<first>_vs_<second>'. The program stops with an
    error message when the two prefixes are equal or when no input is present.

    Parameters
    ----------
    data_folder: list
        List of folder-related elements. Prefix in the second position.
    data_files: list
        List of file-related elements. Prefix in the second position.

    Returns
    -------
    data_folder : list
        The same input data_folder, unchanged.
    data_files : list
        The same input data_files, unchanged.
    prefix_both: str
        Concatenated prefix inputs (e.g., 'HC_vs_GT'). Otherwise, the prefix of the first input.
    """
    check_prefixes = [elem[1] for elem in (data_folder or [])]
    check_prefixes.extend(elem[1] for elem in (data_files or []))

    if not check_prefixes:
        # Without this guard the prefix lookup below raised an IndexError.
        sys.exit("\tError: No valid input was provided, so no prefix could be determined.")

    if len(check_prefixes) == 2:
        first, second = check_prefixes
        if first == second:
            sys.exit(f"Error: Impossible to analyse inputs with the same prefix {first} and {second}.")
        prefix_both = first + '_vs_' + second
    else:
        prefix_both = check_prefixes[0]

    return data_folder, data_files, prefix_both
570
+
571
def check_range_threshold(partition_matrix, threshold, log):
    """
    Split a 'start-end' threshold range and check it against the partition matrix columns.

    The valid range goes from 0 to the number of columns of the partition
    matrix. A start value above the end value, or outside the valid range,
    stops the program. An end value outside the valid range only produces a
    warning through print_log. With 'max' the file is not read.

    Parameters
    ----------
    partition_matrix: str
        Path to the partition matrix file.
    threshold: str
        Range threshold in the format 'start-end' or 'max'.
    log:
        Log target handed to print_log.

    Returns
    -------
    start_threshold : int or None
        The start of the threshold range. None if 'max' is used.
    end_threshold : int or None
        The end of the threshold range. None if 'max' is used.
    """
    if threshold == 'max':
        return None, None

    pieces = threshold.split('-')
    start_threshold = int(pieces[0])
    end_threshold = int(pieces[1])

    n_columns = len(pd.read_table(partition_matrix).columns)
    column_range = (0, n_columns)

    if start_threshold > end_threshold:
        sys.exit(f"\tError: Start threshold {start_threshold} is greater than end threshold {end_threshold}.")

    if not (0 <= start_threshold <= n_columns):
        sys.exit(f"\tError: Start threshold {start_threshold} is outside the valid column range {column_range}.")

    if not (0 <= end_threshold <= n_columns):
        print_log(f'\t\tWarning: The final threshold ({end_threshold}) is higher than the available number of columns.',log)

    return start_threshold, end_threshold
619
+
620
def management_main_scripts(comparing_partitions_script, get_best_part_correspondence_script, remove_hifen_script, input1, input2, prefix_both, output, score, log):
    """
    Run the three congruence scripts in sequence and return the final correspondence file.

    Steps:
    1. comparing_partitions_v2.py in "between_methods" mode on input1 and input2,
       writing its results with the '{output}/{prefix_both}' tag.
    2. get_best_part_correspondence.py on the output folder with the given score.
    3. remove_hifen.py from '{output}/ALL_CORRESPONDENCE.tsv' to
       '{output}/{prefix_both}_ALL_CORRESPONDENCE.tsv'.

    Afterwards the filtered file is renamed to '{prefix_both}_All_correspondence.tsv'
    and the unfiltered 'ALL_CORRESPONDENCE.tsv' is deleted.

    The scripts are launched with the interpreter that runs the toolbox
    (sys.executable), so they see the same installed packages. A script that
    ends with a non-zero exit code is reported in the log.

    Parameters
    ----------
    comparing_partitions_script : str
        Path to comparing_partitions_v2.py.
    get_best_part_correspondence_script : str
        Path to get_best_part_correspondence.py.
    remove_hifen_script : str
        Path to remove_hifen.py.
    input1 : str
        Path to the first input file (e.g., *_partitions.tsv or sequence type matrix).
    input2 : str
        Path to the second input file (e.g., *_partitions.tsv or sequence type matrix).
    prefix_both: str
        Prefix added to the result files generated from both pipelines (e.g., XXX_vs_XXX).
    output : str
        Directory where the results will be saved.
    score : str
        Minimum score to consider two partitions as a correspondence.
    log :
        Log target handed to print_log.

    Returns
    -------
    path_all_correspondence_lower : str
        Path to the '{prefix_both}_All_correspondence.tsv' file.

    Raises
    ------
    OSError
        From os.rename / os.remove when the expected result files are missing.
    """
    # Fall back to "python" only if the running interpreter cannot be determined.
    interpreter = sys.executable or "python"

    def run_step(cmd):
        # Log the command, run it, and report a failing exit code instead of
        # ignoring it silently.
        print_log(f'\t\t\t{" ".join(cmd)}', log)
        completed = subprocess.run(cmd)
        if completed.returncode != 0:
            print_log(f'\t\tWarning: {os.path.basename(cmd[1])} ended with exit code {completed.returncode}.', log)
        print_log(f"\t\tDone.\n", log)

    print_log(f'\tObtaining the cluster congruence score ...', log)

    # 1 - Congruence scores between the two inputs
    print_log(f"\t\tRunning comparing_partitions_v2.py in “between_methods” mode.", log)
    run_step([interpreter, comparing_partitions_script, "-o1", "0", "-o2", "0", "-a", "between_methods",
              "-log", f"{output}/{prefix_both}_Comparing_partitions.log", "-t", f"{output}/{prefix_both}",
              "-i1", input1, "-i2", input2, "--keep-redundants"])

    # 2 - Corresponding points, computed from the files found in the output directory
    print_log(f'\tIdentifying the inter-pipeline “corresponding points”', log)
    print_log(f'\t\tRunning get_best_part_correspondence.py with a score of {score}.', log)
    run_step([interpreter, get_best_part_correspondence_script, "-i", output, "-s", str(score)])

    # 3 - Filter ALL_CORRESPONDENCE.tsv
    unfiltered_file = f"{output}/ALL_CORRESPONDENCE.tsv"
    filtered_file = f"{output}/{prefix_both}_ALL_CORRESPONDENCE.tsv"
    print_log("\t\tFiltering output file with remove_hifen.py.", log)
    run_step([interpreter, remove_hifen_script, "-i", unfiltered_file, "-o", filtered_file])

    path_all_correspondence_lower = f"{output}/{prefix_both}_All_correspondence.tsv"
    os.rename(filtered_file, path_all_correspondence_lower)
    os.remove(unfiltered_file)

    return path_all_correspondence_lower
702
+
703
def tendency_slop(correspondence, pipeline1, pipeline2, output_folder): #correspondence=ALL_CORRESPONDENCE.tsv

    """
    Write the trend-line statistics of the corresponding points to a *_slope.tsv file.

    This function is part of the script heatmap_final_score.py (Mixão et. al., 2024).
    For the comparison named '<pipeline1>_vs_<pipeline2>' (or the opposite order) and its
    '<name>_rev' counterpart, a linear regression of the 'method2' column on the 'method1'
    column is computed and one line per direction is written to the output file.

    Parameters
    ----------
    correspondence: str
        Path to the All_correspondence.tsv file. Its first column holds the comparison name
        and it must contain the columns 'method1' and 'method2'.
    pipeline1: str
        Prefix of the first pipeline.
    pipeline2: str
        Prefix of the second pipeline.
    output_folder: str
        Directory where '<comparison>_slope.tsv' is created.

    Return
    ------
    comparison: str
        Always an empty string; the result of the function is the file written to disk.
    """

    accepted_names = [pipeline1 + "_vs_" + pipeline2, pipeline2 + "_vs_" + pipeline1]
    table = pd.read_table(correspondence)
    key_column = table.columns[0]
    labels = pd.unique(table[key_column])

    comparison = ""
    for forward in labels:
        # Only the forward comparison of the requested pipeline pair opens an output file.
        if "_rev" in forward:
            continue
        if forward not in accepted_names:
            continue

        slope_file = output_folder + "/" + forward + "_slope.tsv"
        with open(slope_file, "w+") as out:
            print("#comp1\tcomp2\tslope\tintercept\tr_value\tp_value\tstd_err", file = out)

            for backward in labels:
                # Look for the reverse comparison that belongs to this forward comparison.
                if "_rev" not in backward:
                    continue
                if backward.split("_rev")[0] != forward:
                    continue

                subset = table.loc[table[key_column].isin([forward, backward])]
                if len(subset[key_column].values.tolist()) == 0:
                    print("No trend line will be provided as no congruence point was found!!")
                    continue

                # Forward direction first, then the reverse one, each only if it has points.
                for first, second in ((forward, backward), (backward, forward)):
                    points = subset[subset[key_column] == first]
                    if len(points["method1"].values.tolist()) == 0:
                        continue
                    slope, intercept, r_value, p_value, std_err = stats.linregress(points["method1"], points["method2"])
                    fields = [first, second, str(slope), str(intercept), str(r_value), str(p_value), str(std_err)]
                    print("\t".join(fields), file = out)

    return comparison
756
+
757
def filter_partition_matrix(partition_matrix, prefix_single, start_threshold, end_threshold, output, log):

    """
    Restrict a partition matrix to a range of threshold columns and save it as a new file.

    The first column of the matrix is always kept; of the remaining columns only those at
    positions start_threshold + 1 up to and including end_threshold + 1 are retained.

    Parameters
    ----------
    partition_matrix: str
        Path to the partition matrix (tab-separated).
    prefix_single: str
        Prefix used in the name of the filtered partition matrix file.
    start_threshold: int
        First threshold of the range to keep.
    end_threshold: int
        Last threshold of the range to keep.
    output: str
        Directory where the filtered matrix is written.
    log:
        Log handle forwarded to print_log.

    Return
    ------
    input_filtered: str
        Path to '<output>/<prefix_single>_partitions-filtered.tsv'.
    """

    table = pd.read_table(partition_matrix)
    header = table.columns

    # Sample identifier column followed by the requested slice of threshold columns.
    selected = [header[0], *header[start_threshold + 1 : end_threshold + 2]]

    input_filtered = f"{output}/{prefix_single}_partitions-filtered.tsv"
    table[selected].to_csv(input_filtered, sep = '\t', index = False)

    print_log(f'\tFiltering the partitions table for the range {start_threshold}-{end_threshold}...',log)

    return input_filtered
794
+
795
def stability_region(output, partition_matrix, prefix, comparing_partitions_script, n_stability, thr_stability, log):

    """
    Run comparing_partitions_v2.py in "stability" mode for one partition matrix.

    The command is logged before it is executed. The script's output is captured; when the
    script exits with a non-zero code, the exit code and its stderr are written to the log
    so that a missing *_stableRegions.tsv can be diagnosed.

    Parameters
    ----------
    output: str
        Path to the directory where the results will be saved.
    partition_matrix: str
        Path to the partition matrix, passed to the script as '-i1'.
    prefix: str
        Prefix of the files produced by the script ('-t <output>/<prefix>').
    comparing_partitions_script: str
        Path to the comparing_partitions_v2.py script.
    n_stability:
        Value passed to the script as '-n'.
    thr_stability:
        Value passed to the script as '-thr'.
    log:
        Log handle forwarded to print_log.

    Return
    ------
    file_stability : str
        Expected path of the *_stableRegions.tsv file ('<output>/<prefix>_stableRegions.tsv').
        The path is returned even if the script failed; its existence is not checked here.
    """

    cmd = [
        "python", comparing_partitions_script, "-i1", partition_matrix, "-o1", "0", "-a", "stability", "-n", str(n_stability), "-thr", str(thr_stability),
        "-log", f"{output}/{prefix}_Comparing_partitions.log", "-t", f"{output}/{prefix}", "--keep-redundants"]

    print_log(f'\t\t\t{" ".join(cmd)}', log)

    completed = subprocess.run(cmd, capture_output = True, text = True)

    # The output is captured, so without this report a failure would leave no trace at all.
    if completed.returncode != 0:
        print_log(f'\t\t\tWarning: comparing_partitions_v2.py exited with code {completed.returncode}.', log)
        if completed.stderr:
            print_log(completed.stderr.strip(), log)

    file_stability = f'{output}/{prefix}_stableRegions.tsv'

    return file_stability
828
+
829
def _threshold_ticks(labels, max_ticks):
    """Return (tickvals, ticktext) for an axis, showing only the threshold part of each partition name.

    A label such as 'MST-7x1.0' is displayed as '7'; a label without '-' is displayed unchanged.
    With more than max_ticks labels only every step-th label is kept, and positions and texts
    are thinned together so both lists always have the same length.
    """
    ticktext = []
    for label in labels:
        pieces = str(label).split('-')
        ticktext.append(pieces[1].split('x')[0] if len(pieces) > 1 else str(label))

    step = math.ceil(len(ticktext) / max_ticks) if len(ticktext) > max_ticks else 1
    return list(range(0, len(ticktext), step)), ticktext[::step]


def get_heatmap(output, i1_prefix, i2_prefix, threshold, log):

    """
    Generates a heatmap figure representing the congruence score between two genomic pipelines,
    based on the first *_final_score.tsv file found in the output directory.

    Rows of the score table are drawn on the y-axis and columns on the x-axis. When a threshold
    range was applied (threshold != 'max'), the axis ticks are relabelled with the threshold
    taken from the partition names: the first column of the table for the y-axis and the
    column headers for the x-axis.

    Parameters
    ----------
    output: str
        Path to the directory where the *_final_score.tsv file is located and where the heatmap will be saved.
    i1_prefix : str
        Prefix of the first pipeline (y-axis).
    i2_prefix : str
        Prefix of the second pipeline (x-axis).
    threshold : str
        Threshold value used to filter the partition matrix ('max' if no filtering).
    log:
        Log handle (not used by this function).

    Return
    ------
    fig_heatmap: class plotly.graph_objs._figure.Figure
        Plotly figure object of the heatmap. Also saved as
        '<output>/<i1_prefix>_vs_<i2_prefix>_heatmap.png'.

    Raises
    ------
    IndexError
        If no *_final_score.tsv file exists in the output directory.
    """

    max_ticks = 16

    final_score = glob.glob(output +'/*_final_score.tsv' )[0]
    df = pd.read_csv(final_score, sep ='\t')

    # Scores only: drop the row-label column and index the columns by position.
    df_filtered = df.drop(df.columns[0], axis = 1)
    df_filtered.columns = range(len(df_filtered.columns))
    n_lines, n_column = df_filtered.shape

    fig_heatmap = px.imshow(df_filtered,
        labels = dict(x = f"Threshold <br> -{i2_prefix}-</br>", y = f"Threshold <br> -{i1_prefix}-</br>"))

    # Default ticks: positional indices, thinned so that at most ~max_ticks are drawn per axis.
    y_step = math.ceil(n_lines / max_ticks) if n_lines > max_ticks else 1
    x_step = math.ceil(n_column / max_ticks) if n_column > max_ticks else 1
    y_list = list(range(0, n_lines, y_step))
    x_list = list(range(0, n_column, x_step))

    fig_heatmap.update_layout(
        height = 500,
        width = 500,
        title_x = 0.5,
        xaxis = dict(scaleanchor = None, constrain='domain', tickvals = x_list),
        yaxis = dict(scaleanchor = None, constrain='domain', tickvals = y_list),
        coloraxis = dict(colorscale = 'Blues', cmin = 0, cmax = 3))

    # An axis with a single entry gets one tick labelled 0.
    if n_column == 1:
        fig_heatmap.update_layout(xaxis = dict(tickmode = 'array', tickvals = [0], ticktext = [0]))
    if n_lines == 1:
        fig_heatmap.update_layout(yaxis = dict(tickmode = 'array', tickvals = [0], ticktext = [0]))

    # Filtered partition matrix: positions no longer equal thresholds, so relabel the ticks.
    if threshold != 'max' and n_lines != 1:

        # Row labels live in the first column of the score table.
        y_vals, y_text = _threshold_ticks(df.iloc[:, 0], max_ticks)

        if n_column == 1:
            # Partition matrix (rows) against a single-column input.
            fig_heatmap.update_layout(
                xaxis = dict(tickmode = 'array', tickvals = [0], ticktext = [0]),
                yaxis = dict(tickmode = 'array', tickvals = y_vals, ticktext = y_text))
        else:
            # Two partition matrices: each axis is labelled from its own partition names.
            x_vals, x_text = _threshold_ticks(df.columns.tolist()[1:], max_ticks)
            fig_heatmap.update_layout(
                xaxis = dict(tickvals = x_vals, ticktext = x_text),
                yaxis = dict(tickvals = y_vals, ticktext = y_text))

    fig_heatmap.update_layout(margin=dict(l=0, r=0, t=20, b=0))
    fig_heatmap.write_image(f'{output}/{i1_prefix}_vs_{i2_prefix}_heatmap.png', format = "png")

    return fig_heatmap
945
+
946
def get_tendency(output, prefix_both, log):

    """
    Creates a scatter plot with trendline from the first *_All_correspondence.tsv file found,
    showing the best correspondence points between methods in each pipeline.

    Comparison names ending in '_rev' ('<A>_vs_<B>_rev') are renamed to '<B>_vs_<A>' before
    plotting. A table without any '_rev' row is plotted unchanged.

    Parameters
    ----------
    output: str
        Path to the directory where the *_All_correspondence.tsv file is located and where the figure will be saved.
    prefix_both: str
        Prefix added to the result files generated from both pipelines (e.g., XXX_vs_XXX)
    log:
        Log handle (not used by this function).

    Return
    ------
    fig_tendency: plotly.graph_objs._figure.Figure
        Plotly figure object of the scatter plot. Also saved as '<output>/<prefix_both>_tendency.png'.
    nr_point_method_1: int
        Number of rows whose comparison name does not contain '_rev'.
    nr_point_method_2: int
        Number of rows whose comparison name contains '_rev'.

    Raises
    ------
    IndexError
        If no *_All_correspondence.tsv file exists in the output directory.
    """

    all_correspondence = glob.glob(output + '/*_All_correspondence.tsv')[0]
    df = pd.read_csv(all_correspondence, sep = "\t")

    comparison_names = df.iloc[:, 0]
    values_rev = [string for string in comparison_names if '_rev' in string]

    nr_point = len(comparison_names)
    nr_point_method_2 = len(values_rev)
    nr_point_method_1 = nr_point - nr_point_method_2

    # Start from the unmodified table so the plot also works when there is no '_rev' row.
    df_modified = df
    for elem in dict.fromkeys(values_rev):  # each distinct reverse name once, in file order
        base_name = elem.split('_rev')[0]
        first, separator, second = base_name.partition('_vs_')
        if separator:
            # Splitting on '_vs_' keeps pipeline prefixes that contain underscores intact.
            reverse_prefix = second + '_vs_' + first
        else:
            string_r = elem.split('_')
            reverse_prefix = string_r[-2] + '_vs_' + string_r[0]

        df_modified = df_modified.replace(to_replace = elem, value = reverse_prefix)

    x_axes = df_modified.columns[1]
    y_axes = df_modified.columns[2]

    fig_tendency = px.scatter(df_modified, x = x_axes, y = y_axes, trendline = "ols", color_discrete_sequence = ["orange", "blue"], color = 'comparison')
    fig_tendency.update_layout(title_x = 0.5, legend=dict( orientation="h",yanchor="bottom", y=-0.35, xanchor="center", x=0.5), margin=dict(l=0, r=0, t=20, b=0))
    fig_tendency.write_image(f'{output}/{prefix_both}_tendency.png', format = "png")

    return fig_tendency, nr_point_method_1, nr_point_method_2
991
+
992
def join_inputs_variables(data_folder, data_files):

    """
    Join the input variables provided in the command line for the congruence analysis.

    The elements of data_folder are placed first, followed by the elements of data_files.
    The first item of each element is then inspected: the program exits when the only input
    has None as first item, or when both of two inputs have None as first item.

    Parameters
    ----------
    data_folder: list
        List of folder-related elements (may be None or empty).
    data_files: list
        List of file-related elements (may be None or empty).

    Return
    ------
    inputs_variables : list
        Combined list of all inputs.

    Raises
    ------
    SystemExit
        When the inputs described above are missing.
    """

    inputs_variables = []

    if data_folder:
        inputs_variables.extend(data_folder)

    if data_files:
        inputs_variables.extend(data_files)

    if len(inputs_variables) == 1:
        if inputs_variables[0][0] is None:
            sys.exit(f"\tError: It is impossible to proceed the analysis without a partition_matrix file.")

    if len(inputs_variables) == 2:
        i1, i2 = inputs_variables[0][0], inputs_variables[1][0]

        if i1 is None and i2 is None:
            sys.exit("\tError: It is impossible to proceed the analysis")
        else:
            print(f'\nChecking the command line:')
            print(f'\tThe provided arguments are all compatible. Everything is ready to run EvalTree.py :)\n')

    return inputs_variables
1037
+
1038
def load_and_prepare_data(file, log):

    """
    Load a partitions summary file, dropping a leading 'SAMPLE_OF_INTEREST' column if present.

    A *_SAMPLE_OF_INTEREST_partitions_summary.tsv file differs from a *_partitions_summary.tsv
    file by an extra first column named 'SAMPLE_OF_INTEREST'; that column is removed so both
    file types yield the same layout.

    Parameters
    ----------
    file: str
        Path to the tab-separated file that will be processed.
    log:
        Log handle (not used by this function).

    Returns
    -------
    df_data : pandas DataFrame
        The table without the 'SAMPLE_OF_INTEREST' column.
    """

    table = pd.read_table(file)
    starts_with_sample_column = table.columns[0] == 'SAMPLE_OF_INTEREST'

    return table.iloc[:, 1:] if starts_with_sample_column else table
1069
+
1070
def order_cluster_by_size(df_data, log):

    """
    Sort the cluster table by the 'cluster_length' column, largest clusters first.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Dataframe with cluster data to analyze.
    log:
        Log handle (not used by this function).

    Returns
    -------
    df_filtered : pandas.DataFrame or None
        Copy of the dataframe sorted by 'cluster_length' in descending order, or None when the
        dataframe has no 'cluster_length' column.
    """

    # Without the size column there is nothing to sort by.
    if 'cluster_length' not in df_data.columns:
        return None

    return df_data.sort_values(by = 'cluster_length', ascending=False)
1095
+
1096
def check_plot_threshold(plots_thresholds, df_filtered, log):

    """
    Keep only the requested plot thresholds that occur in the first column of the dataframe.

    Every requested threshold that is absent from the file is reported through print_log.

    Parameters
    ----------
    plots_thresholds: list
        Thresholds provided by the user, written as they appear in the file
        (e.g. 'MST-7x1.0').
    df_filtered: pandas.DataFrame
        Sorted dataframe whose first column holds the threshold (partition) names.
    log:
        Log handle forwarded to print_log.

    Return
    ------
    method: list
        The requested thresholds that exist in the file, in the order they were requested.
    """

    available = df_filtered.iloc[:,0].unique().tolist()

    method = []
    for candidate in plots_thresholds:
        if candidate not in available:
            print_log(f"\tThe plot threshold {candidate} does not exist in the file.", log)
            continue
        method.append(candidate)

    return method
1124
+
1125
def check_threshold_in_file(method, df_filtered, clustering_file, log):

    """
    Check which of the entered thresholds exist in the selected summary file
    (*_partitions_summary.tsv or *_SAMPLE_OF_INTEREST_partitions_summary.tsv).

    The unique values of the first dataframe column are taken as the thresholds available in
    the file. A message is printed for every entered threshold that is not among them.

    Parameters
    ----------
    method: list
        A list of thresholds in the format 'MST-{value}x1.0'.
    df_filtered: pandas.DataFrame
        A dataframe containing data from the selected file.
    clustering_file: str
        Path to the file (partitions_summary or sample_of_interest); only used in the message.
    log:
        Log handle (not used by this function).

    Return
    ------
    filtered_threshold: list
        Thresholds of `method` that exist in the file, in their original order.
    """

    known_thresholds = df_filtered.iloc[:,0].unique().tolist()

    filtered_threshold = []
    for candidate in method:
        if candidate in known_thresholds:
            filtered_threshold.append(candidate)
            continue
        print(f'\tThe plot threshold entered {candidate} does not exist in the {clustering_file}...')

    return filtered_threshold
1159
+
1160
def filter_df_by_plot_threshold(filtered_threshold, df_filtered, n_cluster, log):

    """
    Select, for every threshold, the first n_cluster rows of the dataframe.

    When a threshold has fewer rows than n_cluster, all of its rows are kept and the
    situation is reported through print_log.

    Parameters
    ----------
    filtered_threshold: list
        List of valid thresholds to be applied to df_filtered to generate cluster plots.
    df_filtered: pandas.DataFrame
        Dataframe whose first column holds the threshold names; its row order decides which
        clusters are kept.
    n_cluster: int
        The number of clusters (pie plot(s)) to be produced per threshold.
    log:
        Log handle forwarded to print_log.

    Return
    ------
    result_df: pandas.DataFrame or None
        Concatenation of the rows selected for each threshold, or None when
        filtered_threshold is empty.
    """

    name_partition = df_filtered.columns.tolist()[0]

    selections = []
    for threshold in filtered_threshold:
        rows_of_threshold = df_filtered[df_filtered[name_partition] == threshold]
        n_lines = len(rows_of_threshold)

        if n_cluster > n_lines:
            print_log(f"\t\tThe entered n_cluster value ({n_cluster}) is higher than the number of lines in the threshold {threshold} dataframe ({n_lines}).", log)
            print_log(f"\t\tIt will be produced plots according the number of lines available in the dataset for the {threshold}.", log)

        # Never ask for more rows than the threshold actually has.
        selections.append(rows_of_threshold.head(min(n_cluster, n_lines)))

    if not selections:
        return None

    return pd.concat(selections)
1208
+
1209
def filtering_df_threshold(filtered_threshold, df_filtered, log):

    """
    Filter the dataframe according to the provided threshold values.

    Rows are grouped threshold by threshold, following the order of filtered_threshold;
    thresholds that do not occur in the first column are ignored.

    Parameters
    ----------
    filtered_threshold : list
        List of valid thresholds to be applied to df_filtered.
    df_filtered: pandas.DataFrame
        DataFrame containing the clustering information, where the first column represents thresholds.
    log:
        Log handle (not used by this function).

    Returns
    -------
    df_filtered_threshold: pandas.DataFrame
        DataFrame filtered by the thresholds in filtered_threshold. When no threshold matches
        (or the list is empty) an empty DataFrame with the same columns is returned.
    """

    name_partition = df_filtered.columns.tolist()[0]

    results = []
    for threshold in filtered_threshold:
        threshold_df = df_filtered[df_filtered[name_partition] == threshold]
        if not threshold_df.empty:
            results.append(threshold_df)

    # pd.concat() raises ValueError on an empty list, so return an empty frame instead.
    if not results:
        return df_filtered.iloc[0:0]

    df_filtered_threshold = pd.concat(results)

    return df_filtered_threshold
1242
+
1243
def check_column_plots(user_columns_plots, result_df, log):

    """
    Validate whether the column name(s) selected for plotting exist in the dataframe.

    The user string is split on commas and surrounding whitespace is removed from each name.
    Names that are not columns of the dataframe are reported through print_log.

    Parameters
    ----------
    user_columns_plots: str
        Comma-separated column name(s) selected by the user for plotting.
    result_df: pandas.DataFrame
        Dataframe containing the data to be plotted.
    log:
        Log handle forwarded to print_log.

    Return
    ------
    check_columns: list
        The requested column names found in the dataframe, in the order they were given.
    """

    available_columns = result_df.columns.tolist()
    requested = (name.strip() for name in user_columns_plots.split(','))

    check_columns = []
    for col in requested:
        if col not in available_columns:
            print_log(f'\t\tInvalid column name for plot: {col}. It does not exist.', log)
            continue
        check_columns.append(col)

    return check_columns
1274
+
1275
def generate_pastel_color():

    """
    Generate a random colour for the cluster plots.

    Each of the red, green and blue channels is drawn, in that order, from the integers
    100 to 200 (inclusive).

    Return
    ------
    str
        Colour in '#RRGGBB' format with upper-case hexadecimal digits.
    """

    channels = [random.randint(100, 200) for _ in range(3)]

    return '#' + ''.join(f'{channel:02X}' for channel in channels)
1283
+
1284
def _parse_category_cell(cell_text):
    """Split 'A (60.0%), B (40.0%) (n = 5)' into (['A', 'B'], [60.0, 40.0]).

    Category labels may themselves contain commas or parentheses; an entry ends at its
    '(<number>%)' suffix.
    """
    body = cell_text.split(" (n =")[0]
    entries = re.findall(r'(.+?) \((\d+(?:\.\d+)?)%\)(?:, |$)', body)
    return [label for label, _ in entries], [float(value) for _, value in entries]


def _collapse_categories(category, values, plots_category_percentage, plots_category_number):
    """Group the minor categories of one cell into a single 'Others' slice.

    The percentage cut-off takes precedence over the maximum number of categories; when
    neither is given the categories are returned unchanged.
    """
    if plots_category_percentage is not None:
        minor_total = sum(value for value in values if value <= plots_category_percentage)
        if minor_total == 0:
            return list(category), list(values)
        # Select by value, not by position, so the result does not depend on the input order.
        major = [(label, value) for label, value in zip(category, values) if value > plots_category_percentage]
        return [label for label, _ in major] + ['Others'], [value for _, value in major] + [minor_total]

    if plots_category_number is not None:
        list_category = category[0:plots_category_number]
        list_values = values[0:plots_category_number]
        remaining_percentage = 100 - sum(list_values)
        # The tolerance keeps floating-point noise (or a sum slightly above 100) from
        # producing an empty or negative 'Others' slice.
        if remaining_percentage > 1e-9:
            list_category.append('Others')
            list_values.append(remaining_percentage)
        return list_category, list_values

    return list(category), list(values)


def check_structure_lines_column_plots(check_columns, result_df, plots_category_percentage, plots_category_number,output, prefix, plots_summary, category_colors, log):

    """
    Check if the rows of the valid column plots have the correct structure to perform the cluster characterization.
    Generate cluster characterization plots (one pie chart per row and column) based on validated columns.

    A cell is valid when it looks like '<label> (<x>%), <label> (<y>%) (n = <count>)'. Invalid
    cells are reported through print_log and skipped; the remaining cells are still plotted.

    Parameters
    ----------
    check_columns: list
        List of column names to be validated and plotted.
    result_df: pandas.DataFrame
        Dataframe containing the selected information to be processed. The first column is
        used as threshold, the second as cluster name, and 'cluster_length' must be present.
    plots_category_percentage: float
        Percentage threshold for aggregating smaller categories (values lower than or equal to
        it are grouped as 'Others'). Takes precedence over plots_category_number.
    plots_category_number: int
        Maximum number of categories to show; the rest is grouped as 'Others'.
    output: str
        Path to the output directory where the images will be saved.
    prefix: str
        Prefix to be added to the output file names.
    plots_summary: str
        Type of file selected by user; with 'sample_of_interest' the 'samples_increase' column
        is added to the plot annotation.
    category_colors: dict
        Dictionary to store and reuse colors for each category. It is updated in place.
    log:
        Log handle forwarded to print_log.

    Return
    ------
    results_list: list or None
        List of dictionaries containing:
            - A: the threshold
            - B: the column
            - C: the plotly figure object
        None is returned when at least one invalid cell was found.
    """

    pattern_line_column_plot = r'^(.+ \(\d+(\.\d+)?%\))(, .+ \(\d+(\.\d+)?%\))*( \(n = \d+\))$'
    results_list = []
    invalid_found = False
    progress_messages = {}  # used as an ordered set: each message is logged once

    for _,row in result_df.iterrows():

        for col in check_columns:

            mst = row.iloc[0]
            cluster = row.iloc[1]
            cluster_rename = cluster[0].upper() + cluster[1:]
            n_cluster_length = row['cluster_length']

            cell_text = str(row[col])

            if not re.match(pattern_line_column_plot, cell_text):
                print_log(f'\tError: INVALID values present in the line with the {col} column at the {mst}: {row[col]}.\n', log)
                # Remember the problem instead of discarding the list, so later valid rows
                # can still be processed.
                invalid_found = True
                continue

            if plots_summary == 'sample_of_interest':
                sample_increase = row['samples_increase']
            else:
                sample_increase = ''

            category, values = _parse_category_cell(cell_text)
            list_category, list_values = _collapse_categories(category, values, plots_category_percentage, plots_category_number)

            # Reuse the colour already assigned to a category so it looks the same in every plot.
            colors = []
            for cat in list_category:
                if cat not in category_colors:
                    category_colors[cat] = generate_pastel_color()
                colors.append(category_colors[cat])

            df = pd.DataFrame({'Category': list_category, 'Percentage': list_values})
            fig = px.pie(df, values = 'Percentage', names = 'Category', title = f'{cluster_rename}')
            fig.update_traces(marker = dict(colors = colors))

            fig.update_layout(title_x = 0.5, annotations = [dict(
                x = 0.5,
                y = -0.2,
                text = f'Number of samples: {n_cluster_length}<br>{sample_increase}',showarrow=False)])

            fig.write_image(f'{output}/{prefix}_{mst}_{col}_{cluster_rename}.png', format="png")
            results_list.append({'A': mst, 'B': col, 'C': fig})
            progress_messages[f"\t\tAnalyzing threshold {mst}, column {col}."] = None

    for message in progress_messages:
        print_log(message, log)

    print_log(f'\tSaving the cluster characterization plots.', log)

    if invalid_found:
        return None

    return results_list
1440
+
1441
+
1442
def select_nomenclature_change(df_filtered_threshold, log):

    """
    Select clusters with increase behavior in the 'nomenclature_change' column.

    If the 'nomenclature_change' column exists, the rows whose value is one of the
    increase/new tags are retained. The rows are grouped by tag, following the fixed order
    of the tag list, so the output order is the same on every run.

    Parameters
    ----------
    df_filtered_threshold: pd.DataFrame
        DataFrame containing filtered cluster data.
    log:
        Log handle forwarded to print_log.

    Return
    ------
    result_df: pd.DataFrame or None
        A new DataFrame containing only the rows where nomenclature_change indicates cluster increase,
        or None if the column is missing or no valid categories are found.
    """

    possibilities = ['kept (increase)','new','new (increase)', 'new (merge_increase)', 'new (split_increase)', 'new (split_merge_increase)']

    if 'nomenclature_change' not in df_filtered_threshold.columns:
        print_log(f'\tColumn nomenclature change not found in the selected file.', log)
        return None

    present = set(df_filtered_threshold['nomenclature_change'].values.tolist())

    # Iterate over the tag list (not over the set) to keep the group order deterministic.
    results = [
        df_filtered_threshold[df_filtered_threshold['nomenclature_change'] == tag]
        for tag in possibilities
        if tag in present
    ]

    if not results:
        print_log(f'\tNo information about the behavior of the “Cluster Nomenclature System” in some of the most common situations in a routine surveillance scenario.', log)
        return None

    result_df = pd.concat(results)

    return result_df
1483
+
1484
def get_nr_lines_threshold(partition_matrix, log):

    """
    Retrieve the number of samples (rows) and thresholds (columns) from a *_partitions.tsv file.

    Parameters
    ----------
    partition_matrix: str
        Path to the *_partitions.tsv file.
    log:
        Log handle (not used by this function).

    Returns
    -------
    nr_lines_df: int
        Number of samples (rows) present in the file.
    nr_columns_df: int
        Number of thresholds (columns/partitions) present in the file, i.e. every column
        except the first one.
    """

    table = pd.read_table(partition_matrix)
    n_rows, n_all_columns = len(table), len(table.columns)

    # The first column is not a threshold, so it is not counted.
    return n_rows, n_all_columns - 1
1508
+
1509
def get_file_partition_by_threshold (partition_matrix, prefix, output, log):

    """
    Write a TSV with the number of distinct partitions found in each threshold column.

    Parameters
    ----------
    partition_matrix: str
        Path to the *_partitions.tsv file.
    prefix: str
        Pipeline name; stored in the "pipeline" column and used in the output filename.
    output: str
        Path to the directory where the results will be saved.
    log:
        Log handle; not used by this function.

    Return
    ------
    file_partition_by_threshold: str
        Path to the generated file: <output>/<prefix>_clusters_partitions-filtered.tsv
        when the input path contains a '-', otherwise <output>/<prefix>_clusters_partitions.tsv.
    """

    matrix = pd.read_table(partition_matrix)

    # One record per column after the first; "threshold" is the column position,
    # not a value parsed from the column name.
    records = [
        (prefix, position, len(pd.unique(matrix.iloc[:, position])))
        for position in range(1, matrix.shape[1])
    ]
    cluster_partition_matrix = pd.DataFrame(records, columns = ["pipeline", "threshold", "partitions"])

    # NOTE(review): the '-' test looks at the whole path, so a hyphen in a
    # directory name also selects the "-filtered" filename.
    suffix = "-filtered" if '-' in partition_matrix else ""
    file_partition_by_threshold = f'{output}/{prefix}_clusters_partitions{suffix}.tsv'

    cluster_partition_matrix.to_csv(file_partition_by_threshold, index = False, header = True, sep = "\t")

    return file_partition_by_threshold
1551
+
1552
def get_graph_partition_by_threshold(file_partition_by_threshold, prefix, prefix_both, yes_prefix_both, output, log):

    """
    Plot the number of partitions against the threshold and save the plot as PNG.

    Parameters
    ----------
    file_partition_by_threshold: str
        Path to a TSV with "threshold", "partitions" and "pipeline" columns.
    prefix: str
        Filename prefix used when yes_prefix_both equals False.
    prefix_both: str
        Filename prefix used when yes_prefix_both equals True.
    yes_prefix_both: bool
        Selects which prefix names the PNG; any other value writes no image.
    output: str
        Directory where <prefix>_lineplot.png is written.
    log:
        Log handle; not used by this function.

    Returns
    -------
    fig_partition_vs_threshols: plotly.graph_objs._figure.Figure
        Line plot with one line per pipeline.
    """

    table = pd.read_csv(file_partition_by_threshold, sep = '\t')

    fig_partition_vs_threshols = px.line(
        table,
        x = "threshold",
        y = "partitions",
        color = "pipeline",
        labels = {'partitions': 'Partitions', 'threshold': 'Threshold'},
    )
    fig_partition_vs_threshols.update_layout(
        legend=dict(orientation="h", yanchor="bottom", y=-0.35, xanchor="center", x=0.5),
        margin=dict(l=0, r=0, t=20, b=0),
    )

    # Equality (not identity) comparison is kept on purpose so the accepted
    # flag values stay exactly the same.
    for flag_value, chosen_prefix in ((False, prefix), (True, prefix_both)):
        if yes_prefix_both == flag_value:
            fig_partition_vs_threshols.write_image(f'{output}/{chosen_prefix}_lineplot.png', format = "png")

    return fig_partition_vs_threshols
1588
+
1589
def concatenation_files(file1, file2, output, prefix_both):

    """
    Stack two TSV files (one per pipeline) and save the combined table.

    Parameters
    ----------
    file1 : str
        Path to the first TSV file.
    file2 : str
        Path to the second TSV file.
    output: str
        Directory where the combined file is saved.
    prefix_both : str
        Prefix for naming the output file.

    Returns
    -------
    path: str
        <output>/<prefix_both>_cluster_partitions.tsv
    """

    # Rows of file1 come first, followed by the rows of file2.
    frames = [pd.read_csv(tsv, sep = '\t') for tsv in (file1, file2)]
    df_combined = pd.concat(frames)

    path = f'{output}/{prefix_both}_cluster_partitions.tsv'
    df_combined.to_csv(path, index = False, header = True, sep = "\t")

    return path
1623
+
1624
def organize_clusters(results_list):

    """
    Group figure objects first by threshold/method and then by category.

    Parameters
    ----------
    results_list : list
        A list of dictionaries, each containing:
            - "A": threshold/method string (e.g., 'MST-4x1.0')
            - "B": category string (e.g., 'country' or 'source')
            - "C": the object to store (e.g., a Plotly figure)

    Returns
    -------
    method_groups : dict
        {method: {category: [objects, in input order]}}
    """

    method_groups = {}

    for entry in results_list:
        per_category = method_groups.setdefault(entry["A"], {})
        per_category.setdefault(entry["B"], []).append(entry["C"])

    return method_groups
1661
+
1662
def processing_block_names(file_stability, prefix, log):

    """
    Read the block names (first column) of a *_stableRegions.tsv file and prefix each one.

    Parameters
    -----------
    file_stability: str
        Path to the *_stableRegions.tsv file. Lines starting with '#' are ignored.
    prefix: str
        Prefix to prepend to each block name (joined with '_').
    log:
        Log handle; not used by this function.

    Returns
    -------
    name_block : list or None
        List of "<prefix>_<block name>" strings, or None if the file holds no data rows.
    """

    try:
        df = pd.read_csv(file_stability, sep = '\t', comment = "#", header = None)
    except pd.errors.EmptyDataError:
        # pandas raises instead of returning an empty frame when there is
        # nothing to parse; honour the documented "None if empty" contract.
        return None

    if df.empty:
        return None

    # f-string formatting also copes with block names parsed as numbers.
    return [f'{prefix}_{elem}' for elem in df[0]]
1693
+
1694
def processing_data(file, log):

    """
    Extract the start and end positions of stability blocks from a *_stableRegions.tsv file.

    For every row, the start is the integer in front of the 'x' in the fourth
    '-'-separated field of column 1, and the end is the integer in front of the
    'x' in the second '-'-separated field of column 2.
    NOTE(review): this presumably targets cells such as 'MST-7x1.0->MST-8x1.0';
    confirm against the file producer.

    Parameters
    ----------
    file: str
        Path to the *_stableRegions.tsv file. Lines starting with '#' are ignored.
    log:
        Log handle; not used by this function.

    Returns
    -------
    first_data: list of int
        Start value of each stability block.
    final_data: list of int
        End value of each stability block.
    """

    regions = pd.read_csv(file, sep="\t", comment='#', header=None)

    def _int_before_x(cell, field):
        # Integer that precedes the 'x' inside the requested '-'-separated field.
        return int(cell.split('-')[field].split('x')[0])

    first_data = [_int_before_x(cell, 3) for cell in regions[1]]
    final_data = [_int_before_x(cell, 1) for cell in regions[2]]

    return first_data, final_data
1737
+
1738
def change_processing_data(final_df, i1_prefix, i2_prefix, output, log):

    """
    Convert block boundaries to log2 scale, draw the stable-regions bar chart and save it as PNG.

    Parameters
    ----------
    final_df: pd.DataFrame
        Table with "Start", "Finish", "Block_id" and "Pipeline" columns
        (start and end of each block per pipeline). It is not modified.
    i1_prefix: str
        Prefix of pipeline i1.
    i2_prefix: str or None
        Prefix of pipeline i2; None when only one pipeline is analysed.
    output: str
        Directory where <prefix>_StableRegions.png is written.
    log:
        Log handle; not used by this function.

    Returns
    -------
    fig_st: plotly.graph_objs._figure.Figure
        Horizontal bar chart of the stable regions.
    """

    # rename() returns a copy, so the caller's frame is left untouched.
    log_df = final_df.rename(columns={"Finish": "temp"})

    log_df['temp'] = np.log2(log_df['temp'])
    log_df['Start'] = np.log2(log_df['Start'])
    # Bar length in log2 space: log2(end) - log2(start).
    log_df['Finish'] = log_df['temp'] - log_df['Start']
    reversed_df = log_df.iloc[::-1]

    # Largest end value converted back to the original scale.
    highest_threshold = int(round(2 ** reversed_df["temp"].max(), 0))
    tick_positions = range(1, highest_threshold + 1)
    list_tickvals = list(tick_positions)
    # A tick at log2 position x is labelled with the original value 2**x.
    list_ticktext = [str(2 ** position) for position in tick_positions]

    fig_st = px.bar(
        reversed_df,
        x="Finish",
        y="Block_id",
        base='Start',
        color="Pipeline",
        orientation="h",
    )

    fig_st.update_layout(
        xaxis_title="Threshold",
        yaxis_title='',
        xaxis=dict(tickvals=list_tickvals, ticktext=list_ticktext),
        yaxis=dict(showticklabels=False),
        legend=dict(orientation="h", yanchor="bottom", y=-0.35, xanchor="center", x=0.5),
        margin=dict(l=0, r=0, t=20, b=0),
    )

    prefix = f'{i1_prefix}' if i2_prefix is None else f'{i1_prefix}_vs_{i2_prefix}'

    fig_st.write_image(f'{output}/{prefix}_StableRegions.png', format='png')

    return fig_st
1794
+
1795
+ #################################################################### OUTBREAKS ###############################################################
1796
+
1797
def validate_combinations_outbreak(threshold_outbreak):

    """
    Validate the outbreak threshold combinations given by the user and tag each
    one with its comparison type.

    A combination is 'threshold_1,threshold_2' ("equal") or
    'threshold_1,<=threshold_2' ("lower_equal"). Several combinations are
    separated by ';', without spaces. Each threshold must look like
    letters-integerxfloat (e.g., 'MST-7x1.0').

    Parameters
    ----------
    threshold_outbreak: str
        One or more outbreak threshold combinations.

    Returns
    -------
    valid_combinations: list
        One [threshold_1, threshold_2, comparison_type] sublist per combination.
        threshold_2 is stored exactly as typed, i.e. it keeps a leading '<='.

    The program exits (sys.exit) with an explanatory message on the first
    malformed combination.
    """

    threshold_format = re.compile(r'^[A-Za-z]+-\d+x\d+\.\d+$')
    valid_combinations = []

    for combo in threshold_outbreak.split(';'):

        parts = combo.split(',')
        if len(parts) != 2:
            sys.exit(f"The combination '{combo}' must have 2 elements separated by a comma (e.g., 'MST-7x1.0,MST-7x1.0'). Multiple combinations must be separated by ; . Please, do not use spaces.")

        pattern1, second_raw = parts
        is_lower_equal = second_raw.startswith('<=')
        # Only the part after '<=' has to match the threshold format.
        pattern2 = second_raw[2:] if is_lower_equal else second_raw

        if not threshold_format.match(pattern1):
            sys.exit(f"Error: Pattern '{pattern1}' (part 1) is not in the correct format (e.g., 'MST-7x1.0'). Please, do not use spaces.")

        if not threshold_format.match(pattern2):
            sys.exit(f"Error: Pattern '{second_raw}' (part 2) is not in the correct format (e.g., 'MST-7x1.0' or '<=MST-10x1.0'). Please, do not use spaces.")

        comparison = 'lower_equal' if is_lower_equal else 'equal'
        valid_combinations.append([pattern1, second_raw, comparison])

    return valid_combinations
1856
+
1857
def extract_integer_part(valid_combinations, log):

    """
    Pull the integer threshold out of each string-formatted threshold.

    Parameters
    ----------
    valid_combinations: list
        Sublists of the form [threshold_1, threshold_2, type_comparison],
        e.g. ['MST-7x1.0', '<=MST-10x1.0', 'lower_equal'].
    log:
        Log handle; not used by this function.

    Return
    ------
    values_outbreak: list
        Tuples (integer_1, integer_2, type_comparison), where each integer is
        the number between the first '-' and the following 'x'.
    """

    def _threshold_number(label):
        # 'MST-7x1.0' -> 7
        return int(label.split('-')[1].split('x')[0])

    return [
        (_threshold_number(p1), _threshold_number(p2), comp)
        for p1, p2, comp in valid_combinations
    ]
1883
+
1884
def creation_tsv_stats_outbreak(clusterComposition_1, clusterComposition_2, output, prefix_both, log):

    """
    Write *_path_stats_outbreak.tsv, a header-less one-column file listing the
    two clusterComposition paths (one per line).

    Parameters
    ----------
    clusterComposition_1: str
        Path to the *clusterComposition file of pipeline 1.
    clusterComposition_2: str
        Path to the *clusterComposition file of pipeline 2.
    output: str
        Path to the directory where the results will be saved.
    prefix_both: str
        Prefix added to the result files generated from both pipelines (e.g., XXX_vs_XXX).
    log:
        Log handle; not used by this function.

    Return
    ------
    df: pandas.DataFrame
        DataFrame containing the two paths, one per row.
    path_stats_outbreak: str
        Path to the *_path_stats_outbreak.tsv file.
    """

    df = pd.DataFrame([[composition] for composition in (clusterComposition_1, clusterComposition_2)])

    path_stats_outbreak = f'{output}/{prefix_both}_path_stats_outbreak.tsv'
    df.to_csv(path_stats_outbreak, sep='\t', index=False, header=None)

    return df, path_stats_outbreak
1916
+
1917
+
1918
def calling_script_outbreak(stats_outbreak_script, path_stats_outbreak, output, prefix_both, values_outbreak, log):

    """
    Run stats_outbreak_analysis.py once per requested threshold comparison.

    Parameters:
    -----------
    stats_outbreak_script: str
        Path to the stats_outbreak_analysis.py script.
    path_stats_outbreak: str
        Path to the *_path_stats_outbreak.tsv file.
    output: str
        Path to the directory where the results will be saved.
    prefix_both: str
        Prefix added to the result files generated from both pipelines (e.g., XXX_vs_XXX)
    values_outbreak: list
        Tuples of the form (threshold1, threshold2, type_comparison).
    log:
        Log handle passed to print_log.

    Return:
    ------
    None. The called script is expected to produce files such as:
        -XX_vs_XX_stats_outbreak_missing_clusters_INTEGER_equal_INTEGER.tsv
        -XX_vs_XX_stats_outbreak_pairwise_comparison_INTEGER_equal_INTEGER.tsv
        -XX_vs_XX_stats_outbreak_pairwise_comparison_INTEGER_equal_INTEGER_pct.tsv
        -XX_vs_XX_stats_outbreak_summary_INTEGER_equal_INTEGER.tsv
    A failing run is reported in the log; it does not stop the remaining runs.
    """

    if not values_outbreak:
        print_log(f'\tImpossible to call the stats_outbreak_analysis.py.', log)
        return

    # Use the interpreter that is running EvalTree so the script sees the same
    # environment; "python" on PATH may be a different installation or missing.
    interpreter = sys.executable or "python"

    for th1, th2, type_comparison in values_outbreak:

        cmd = [interpreter, stats_outbreak_script, "-i", path_stats_outbreak, "-t1", str(th1), "-t2", str(th2),
               "-o", f"{output}/{prefix_both}_stats_outbreak", "-c", type_comparison]

        # Log before launching so the command is recorded even if the run hangs or fails.
        print_log(f"\tRunning stats_outbreak_analysis.py for {th1} {type_comparison} {th2}", log)
        print_log(f'\t\t{" ".join(cmd)}', log)

        completed = subprocess.run(cmd, capture_output=True, text=True)

        # Output is captured, so a failure would otherwise be invisible.
        if completed.returncode != 0:
            print_log(f"\tWarning: stats_outbreak_analysis.py exited with code {completed.returncode} for {th1} {type_comparison} {th2}.", log)
            if completed.stderr:
                print_log(f"\t\t{completed.stderr.strip()}", log)

    print_log(f"\tDone!", log)
1960
+
1961
def read_files_outbreak(output):

    """
    List the files in the output directory whose name ends with '_pct.tsv'
    (the *stats_outbreak_pairwise_comparison_*_pct.tsv tables).

    Parameters:
    ----------
    output: str
        Path to the directory where the results were saved.

    Return:
    ------
    process_files: list
        Resolved (os.path.realpath) paths of the matching files, in os.listdir order.
    """

    return [
        os.path.realpath(f'{output}/{entry}')
        for entry in os.listdir(output)
        if entry.endswith('_pct.tsv')
    ]
1988
+
1989
def creation_overlap_clusters(process_files, output, values_oubreak):

    """
    Draw one overlap heatmap per requested outbreak comparison and save each as PNG.

    Parameters:
    -----------
    process_files: list
        Paths of the *stats_outbreak_pairwise_comparison_*_pct.tsv files.
    output: str
        Path to the directory where the results will be saved.
    values_oubreak: list
        List of tuples [(integer, integer, type_comparison)].

    Return:
    ------
    fig_result: list
        Heatmaps (plotly.graph_objs._figure.Figure), one per matched file.
    thresholds: list
        (threshold_1, threshold_2, type_comparison) for each figure, in the same order.
    """

    # Pair each comparison with the files whose path carries its
    # "_<t1>_<comparison>_<t2>_" tag.
    matches = [
        (th1, thr2, type_compo, path)
        for th1, thr2, type_compo in values_oubreak
        for path in process_files
        if f'_{th1}_{type_compo}_{thr2}_' in path
    ]

    fig_result = []
    thresholds = []
    for thr1, thr2, type_com, file in matches:

        df = pd.read_table(file)
        # The first column is discarded before plotting.
        df_filtered = df.drop(df.columns[0], axis=1)

        # NOTE(review): if neither condition below holds, values_col1/values_col2
        # are unbound (or stale from the previous file) - confirm the intent.
        if df_filtered.shape[1] == 2:
            values_col1 = df_filtered.columns[0]
            values_col2 = df_filtered.columns[1]

        if df_filtered.shape[0] <= 2:
            # Adds an empty-named column filled with empty strings.
            df_filtered[''] = ''
            values_col1 = df_filtered.columns[0]
            values_col2 = df_filtered.columns[1]

        df_percentage = df_filtered * 100
        base = os.path.splitext(os.path.basename(file))[0]

        string1 = f'at {thr1} threshold'
        string2 = f'at {thr2} threshold' if type_com == 'equal' else f'at up {thr2} threshold'
        thresholds.append((thr1, thr2, type_com))

        # Values below 50 are drawn white; from 50 upwards yellow fading to blue.
        colors = [[0, 'white'], [0.5, 'white'], [0.5, '#FDFD96'], [1, '#89B6E3']]
        axis_labels = [f'{values_col1}', f'{values_col2}']

        fig = go.Figure(data=go.Heatmap(
            z=df_percentage.values,
            x=axis_labels,
            y=[f'{values_col1}', f'{values_col2}'],
            text=df_percentage.values,
            texttemplate="%{text:.2f}%",
            textfont=dict(size=11, color="black"),
            colorscale=colors,
            colorbar=dict(title="Overlap"),
            zmin=0, zmax=100
        ))

        fig.update_layout(
            xaxis_title=f"Cluster detected {string2}",
            yaxis_title=f"Cluster detected {string1}",
            plot_bgcolor='white',
            paper_bgcolor='white',
            margin=dict(l=0, r=0, t=20, b=0),
        )

        fig.write_image(f'{output}/{base}.png', format="png")
        fig_result.append(fig)

    return fig_result, thresholds
2074
+
2075
def get_plot_columns(file):

    """
    Print the columns of a summary file that can be used for cluster plots,
    i.e. every column that is not one of the default memory_columns.

    Parameters
    ----------
    file: str
        Path to the *_partitions_summary.tsv or *_SAMPLE_OF_INTEREST_partitions_summary.tsv file.

    Return
    ------
    None. The column names are printed to stdout, one per line.
    If no additional column is found, the program exits with an error message.
    """

    memory_columns = ['partition', 'cluster', 'nomenclature_change', 'n_increase', 'cluster_length', 'samples', 'samples_increase','SAMPLE_OF_INTEREST']

    df = pd.read_csv(file, sep = "\t")

    # Heading uses the first '/'-separated component of the given path.
    name_folder = file.split('/')[0]
    print(f"\nAvailable columns for {name_folder}:")

    attachement_list = [column for column in df.columns.tolist() if column not in memory_columns]

    if not attachement_list:
        sys.exit(f'Error: No additional columns found in {file}.')

    for column in attachement_list:
        print(f'\t- {column}')
2110
+
2111
def find_html_outbreak(output,prefix_both,log):

    """
    Check that both the second-run report (<prefix_both>_2ºRUN_report.html) and
    the initial report (<prefix_both>_report.html) exist in the output directory.

    Parameters
    ---------
    output: str
        Path to the directory where the results were saved.
    prefix_both: str
        Prefix added to the result files generated from both pipelines (e.g., XXX_vs_XXX)
    log:
        Log handle; not used by this function.

    Return
    ------
    final_files: list
        [second-run report name, initial report name] (file names, not full paths).
        The program exits with an error message if either file is missing.
    """

    present = os.listdir(output)

    expected_new_report = f"{prefix_both}_2ºRUN_report.html"
    expected_report = f"{prefix_both}_report.html"

    # The second-run report is checked first, so it is the one reported when both are missing.
    for required in (expected_new_report, expected_report):
        if required not in present:
            sys.exit(f"Error: {required} not found!")

    return [expected_new_report, expected_report]
2144
+
2145
def extration_section_original_file(output, final_files, log):

    """
    Copy part of the initial HTML report into a temporary text file so it can
    be reused in another report.

    The copied part starts at line 73 of the report and stops just before the
    first line containing the "Outbreak" accordion button.

    Parameters:
    ----------
    output: str
        Path to the directory where the results will be saved.
    final_files: list
        Report file names; the second element is the initial report.
    log:
        Log handle passed to print_log.

    Return:
    ------
    path_temp: str
        Path to the written file (<output>/exit.txt).
    """

    original_file = f'{output}/{final_files[1]}'
    path_temp = f'{output}/exit.txt'

    # 1-based line of the report where the copied section starts.
    first_line = 73
    end = '<button class="accordion">Outbreak</button>'

    with open(original_file, "r") as f:
        lines = f.readlines()

    section = []
    for current_line in lines[first_line - 1:]:
        if end in current_line:
            break
        section.append(current_line)

    with open(path_temp, "w") as f:
        f.writelines(section)

    print_log(f"Content saved in: {path_temp}", log)
    return path_temp
2184
+
2185
def transfer_info_to_html_content(path_temp, html_content, log):

    """
    Load the clustering and congruence section saved in a temporary .txt file.

    Parameters
    ----------
    path_temp: str
        Path to the *.txt file containing clustering and congruence information.
    html_content: str
        Current content of the new HTML report; returned unchanged when the
        temporary file does not exist.
    log:
        Log handle passed to print_log.

    Return:
    ------
    html_content: str
        The text read from path_temp (it replaces the html_content argument),
        or the html_content argument itself when path_temp is missing.
        The temporary file is not deleted.
    """

    if not os.path.exists(path_temp):
        print_log('There was a problem with the creation of the file containing information about clustering and congruence for the second HTML report.', log)
        # Nothing to load: hand back what the caller already had instead of
        # failing on an unassigned variable.
        return html_content

    with open(path_temp, 'r') as input_file:
        return input_file.read()
2216
+
2217
+ ################################################################ MODULE 1 ################################################################################
2218
+
2219
def reading_sequence_type(sequence_type_file, output, prefix_st, log):

    """
    Count how many samples fall in each group of the sequence type matrix and
    plot the counts as a bar chart (also saved as PNG).

    Parameters
    ----------
    sequence_type_file: str
        Path to the sequence type matrix (tab-separated); the second column
        holds the group of each sample.
    output: str
        Path to the directory where the results will be saved.
    prefix_st: str
        Pipeline prefix used in the plot title and in the PNG filename.
    log:
        Log handle; not used by this function.

    Returns
    -------
    fig: plotly.graph_objs._figure.Figure
        Bar chart with one bar per group, in order of first appearance.
    """

    df = pd.read_table(sequence_type_file)
    column = df.columns[1]

    # Single pass; dicts keep insertion order, so groups stay in order of
    # first appearance.
    counts = {}
    for elem in df[column]:
        counts[elem] = counts.get(elem, 0) + 1

    new_df = pd.DataFrame({"Cluster": list(counts), "Count": list(counts.values())})
    fig = px.bar(new_df, x="Cluster", y="Count", title=f"Most represented STs in the {prefix_st} pipeline", labels={"Cluster": "Cluster name", "Count": "Number of samples"})
    fig.update_layout(title_x=0.5)
    fig.write_image(f'{output}/{prefix_st}_pipeline_clusters.png', format='png')

    return fig
2261
+
2262
+
2263
+ ###########################################################################################################################################################
2264
+ ########################################################################## HTML ###########################################################################
2265
+ ###########################################################################################################################################################
2266
+
2267
def create_html(log, file_path_report):

    """
    Build the opening part of the HTML report: doctype, <head> (title, Plotly
    script tag, CSS) and the page header.

    Parameters
    ----------
    log: <class '_io.TextIOWrapper'>
        Log handle; not used by this function.
    file_path_report: str
        Full path to the HTML report; not used by this function.

    Return
    ------
    html_content: str
        HTML text from <!DOCTYPE html> up to and including </header>.
        The <body> and <html> tags are left open.
    """

    title = "Report EvalTree"

    # The number of lines in this template matters: another function of this
    # file copies a section of the finished report starting at a fixed line.
    html_content = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{title}</title>
<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
section {{ margin: 20px 0; }}


/* ------------------------- CLUSTERING images------------------------------------------- */
.image-row {{
display: flex;
flex-wrap: wrap;
justify-content: space-around;
margin-top: 20px;
}}

.image-item {{
flex: 1 1 calc(25% - 20px);
box-sizing: border-box;
margin: 10px;
max-width: calc(25% - 20px);
}}

/* --------------------------- START Accordion ------------------------------------------- */
.accordion {{
background-color: #eee;
color: #444;
cursor: pointer;
padding: 18px;
width: 100%;
border: none;
text-align: left;
outline: none;
font-size: 20px;
transition: 0.4s;
font-weight: bold;
}}
.active, .accordion:hover {{background-color: #ccc;}}

.panel {{ padding: 0 10px;
display: none;
background-color: white;
overflow: hidden;}}

/* --------------------------- END Accordion ------------------------------------------- */

.image-heatmap {{
display: flex;
justify-content: space-around;
margin-top: 20px;
flex-wrap: wrap;
}}

.compact {{margin: 2px 0; line-height: 1.2;}}
</style>
</head>
<body>
<header>
<h1>EvalTree Report</h1>
<p> Toolbox for comparative clustering evaluation of whole genome sequencing pipelines for bacteria routine surveillance</p>
</header>"""

    return html_content
2357
+
2358
def body_html(start, command_line,version):

    """
    Build the "Overview" accordion section of the report.

    Parameters
    ----------
    start:
        Value shown after "Report generated on:".
    command_line:
        Value shown after "Entered command line:" (inserted as is, without HTML escaping).
    version:
        Value shown after "Version:".

    Return
    ------
    str
        HTML of the Overview button and its panel.
    """

    return f"""<button class="accordion">Overview</button>
<div class="panel">
<p>Report generated on: {start}</p>
<p>Entered command line: {command_line}</p>
<p>Version: {version}</p>
</div>
"""
2368
+
2369
def get_sequence_type(prefix_st,samples_st,groups_st,sequence_type_file):

    """
    Build the opening of the "Pipeline characterization" section for a
    sequence type file. The panel <div> is left open.

    Parameters
    ----------
    prefix_st:
        Pipeline prefix shown in the accordion button.
    samples_st:
        Value shown as number of samples.
    groups_st:
        Value shown as number of groups.
    sequence_type_file:
        Value shown as name of file.

    Return
    ------
    str
        HTML of the accordion button and the start of its panel.
    """

    return f"""
<button class="accordion">Pipeline characterization: {prefix_st}</button>
<div class="panel">
<p>Number of samples: {samples_st} </p>
<p>Number of groups:{groups_st} </p>
<p> Name of file: {sequence_type_file}</p>
"""
2379
+
2380
def sequence_type_image(fig_html):
    """
    Wrap a figure in a <div> and emit one additional closing </div>.

    Parameters
    ----------
    fig_html: str
        HTML fragment of the figure.

    Return
    -----
    str
        The wrapped figure followed by an extra closing tag.
    """
    # NOTE(review): the second </div> presumably closes a panel opened by a
    # previous section (e.g. get_sequence_type) - confirm against the caller.
    wrapped_figure = f"""
<div>{fig_html}</div></div>
"""
    return wrapped_figure
2386
+
2387
def get_partitions_threshold(prefix_single, nr_lines_df, nr_columns_df, fig_partition_vs_threshols):
    """
    Open the "Pipeline characterization" section of a partition input.

    The figure margins are updated in place before it is converted to HTML.
    The panel <div> opened here is not closed by this function.

    Parameters
    ----------
    prefix_single: str
        Prefix shown in the accordion title and in the summary heading.
    nr_lines_df: int
        Value displayed as the number of samples.
    nr_columns_df: int
        Value displayed as the number of thresholds.
    fig_partition_vs_threshols: figure
        Figure of the number of partitions per threshold.

    Return
    -----
    str
        HTML of the accordion button and the opening part of its panel.
    """
    tight_margins = dict(l=0, r=0, t=20, b=0)
    fig_partition_vs_threshols.update_layout(margin=tight_margins)
    plot_div = pio.to_html(fig_partition_vs_threshols, include_plotlyjs='cdn', full_html=False)

    return f"""<button class="accordion">Pipeline characterization: {prefix_single}</button>
<div class="panel">
<h3> Summary: {prefix_single} </h3>
<p> Number of samples: {nr_lines_df} </p>
<p> Number of thresholds: {nr_columns_df} </p>
<h3> Number of partitions per threshold </h3>
<div> {plot_div} </div>
<p> This line plot shows the number of partitions (groups) at each threshold. </p>
"""
2402
+
2403
def get_clusters(mst_groups, prefix):
    """
    Display pie charts by threshold and category, generating CLUSTER HTML content for each pipeline.

    Parameters
    ----------
    mst_groups: dict
        Maps each threshold to a dict that maps each category to its figures.
    prefix: str
        Prefix of the file that is being processed (shown in the title).

    Return
    -----
    str
        HTML of the clustering visualization section.
    """
    pieces = [
        f'<button class="accordion"> ReporTree clustering visualization: pipeline {prefix} </button>\n',
        '<div class="panel">\n',
    ]

    for mst, categories in mst_groups.items():
        # One nested accordion per threshold.
        pieces.append(f'<button class="accordion">Threshold: {mst}</button>\n')
        pieces.append('<div class="panel">\n')

        for category, images in categories.items():
            pieces.append(f"<h4>Category: {category}</h4>\n")
            pieces.append('<div class="image-row">\n')

            for image in images:
                fig_html = pio.to_html(image, include_plotlyjs='cdn', full_html=False)
                pieces.append(f'<div class="image-item">{fig_html}</div>\n')

            pieces.append("</div>\n")  # closes the image-row of this category
        pieces.append('</div>\n')  # closes the panel of this threshold
    pieces.append('</div>\n')  # closes the visualization panel
    # NOTE(review): one more closing tag than this function opens; presumably
    # it closes a panel left open by an earlier section - confirm with caller.
    pieces.append('</div>\n')

    return ''.join(pieces)
2446
+
2447
def summary_congruence():
    """
    Open the "Inter-pipeline cluster congruence" section of the report.

    The panel <div> opened here is not closed by this function.

    Return
    -----
    str
        HTML of the accordion button, the opening of its panel and the
        introductory paragraph.
    """
    # No placeholders are needed, so a plain string is enough.
    introduction = """
<button class="accordion">Inter-pipeline cluster congruence</button>
<div class="panel" >
<p > This section evaluates the clustering congruence between two WGS-based pipelines by comparing their cluster compositon at all possible threshold levels.
The goal is to assess how similarly the pipelines group the isolates, by measuring the consistency of cluster assignments at each threshold.
This helps determine the level of agreement between the pipelines and identify the most comparable thresholds.
More detailed information is available on the
<a href="https://github.com/insapathogenomics/CENTAUR/tree/main/EvalTree" target="_blank" rel="noopener noreferrer">
EvalTree GitHub
</a>.
</p>

"""
    return introduction
2463
+
2464
def summary_partition_threshold(fig_html_partition_threshold, prefix_both):
    """
    Build the "Number of partitions per threshold" block.

    Parameters
    ----------
    fig_html_partition_threshold: str
        HTML fragment of the line plot.
    prefix_both: str
        Prefix used in the name of the *_cluster_partitions.tsv file.

    Return
    -----
    str
        HTML with the heading, the plot and its explanatory paragraphs.
    """
    return f"""
<h3> Number of partitions per threshold </h3>
<div> {fig_html_partition_threshold} </div>
<p class="compact"> The line plot shows the number of partitions at each threshold.</p>
<p class="compact"> Detailed information is available in the <code> {prefix_both}_cluster_partitions.tsv</code> file.</p>
"""
2473
+
2474
def congruence_stability(fig_html_st, prefix, prefix_2, n_stability, thr_stability):
    """
    Build the "Blocks of stability regions" block of the congruence section.

    Parameters
    ----------
    fig_html_st: str
        HTML fragment of the stability-regions figure.
    prefix: str
        Prefix of the first pipeline (used in the output file names).
    prefix_2: str or None
        Prefix of the second pipeline; when None only the files of the first
        pipeline are listed.
    n_stability: int
        Number of consecutive threshold pairs reported as defining a region.
    thr_stability: float
        nAWC value reported as the stability threshold.

    Return
    -----
    html_content: str
        HTML with the figure, its explanation and the list of output files.
    """
    html_content = f"""
<h3> Blocks of stability regions </h3>
<div>{fig_html_st}</div>
<p class="compact"> For each pipeline, clustering stability regions are defined as a range of thresholds e.g., {n_stability} with a nAWC of e.g., {thr_stability} which cluster composition remains stable/consistent. </p>
<p class="compact"> To better distinguish each region (represented by separated rectangle blocks), the blocks are vertically offset, starting on a different line. </p>
<p class="compact"> Distance thresholds (x axis) are presented in log2 scale. </p>
<p class="compact"> Detailed information is available in the following files: </p>
"""

    html_content += f"- <code>{prefix}_metrics.tsv"
    if prefix_2 is not None:
        html_content += f" and {prefix_2}_metrics.tsv"
    # The stability criterion is reported with the values received as
    # arguments; the sentence used to hard-code "five" and "0.99", which was
    # wrong whenever different n_stability/thr_stability values were passed.
    html_content += (
        ": summarizes all comparisons between consecutive pairs of thresholds (“n + 1” → “n”). "
        f"A region is stable when at least {n_stability} consecutive pairs of thresholds "
        f"yield an nAWC greater than {thr_stability}.</code>"
    )

    html_content += f"<br>- <code>{prefix}_StableRegions.tsv"
    if prefix_2 is not None:
        html_content += f" and {prefix_2}_StableRegions.tsv"
    html_content += ": lists the block names, their respective threshold range, and the length of each block.</code>"

    return html_content
2497
+
2498
def congruence_heatmap(fig_html_heatmap, prefix_both):
    """
    Build the "Congruence score" block with the pairwise heatmap.

    Parameters
    ----------
    fig_html_heatmap: str
        HTML fragment of the heatmap.
    prefix_both: str
        Combined prefix of the two pipelines; the pipeline names are taken
        from the text before and after "_vs_".

    Return
    -----
    html_content: str
        HTML with the heading, the heatmap and its explanatory paragraphs.
    """
    # Split on the "_vs_" separator (as congruence_tendency does) so pipeline
    # names that contain underscores are not truncated. Prefixes without the
    # separator keep the previous behaviour (first and last "_" segments).
    if '_vs_' in prefix_both:
        split_prefix = prefix_both.split('_vs_')
    else:
        split_prefix = prefix_both.split('_')
    first = split_prefix[0]
    second = split_prefix[-1]

    html_content = f"""
<h3> Congruence score </h3>
<div class='image-heatmap'>{fig_html_heatmap} </div>
<p class="compact"> The heatmap shows a pairwise comparison of clustering results from two pipelines, {first} and {second}, at all possible distance thresholds. </p>
<p class="compact"> The congruence score (CS) is a metric ranging from 0 (no congruence between methods) to 3 (absolute congruence).</p>
<p class="compact"> Detailed information is available in the <code> {prefix_both}_final_score.tsv </code> file.</p>
"""
    return html_content
2512
+
2513
def congruence_tendency(fig_tendency_html, score_value, prefix_both, nr_point_method_1, nr_point_method_2):
    """
    Build the "Corresponding points" block and close the enclosing <div>.

    Parameters
    ----------
    fig_tendency_html: str
        HTML fragment of the tendency plot.
    score_value: float
        Minimum congruence score displayed in the description.
    prefix_both: str
        Combined prefix; the pipeline names are the text before and after
        "_vs_".
    nr_point_method_1: int
        Number of points reported for the blue series.
    nr_point_method_2: int
        Number of points reported for the orange series.

    Return
    -----
    str
        HTML with the plot, its explanation and a final closing </div>.
    """
    names = prefix_both.split('_vs_')
    pipeline1, pipeline2 = names[0], names[-1]

    return f"""
<h3> Corresponding points </h3>
<div> {fig_tendency_html} </div>
<p class="compact"> This graph shows the corresponding points between the two pipelines in both directions above (CS >= {score_value}). </p>
<p class="compact"> When comparing a set of samples between two pipelines, the probability of two sample clustering together in one method/pipeline in a given threshold
may not to be the same in the other method/pipeline. Therefore:</p>
<p class="compact"> - First, the threshold in the {pipeline1} pipeline (method 1) that produces clustering results most similar to those in the {pipeline2} pipeline (method 2) is identified. </p>
<p class="compact"> - Then, the threshold in the {pipeline2} pipeline (method 1) that produces clustering results most similar to those in the {pipeline1} pipeline (method 2) is identified.</p>
<p class="compact"> Both methods produce similar clustering results when the tendency line has a slope near 1. </p>
<p class="compact">A linear tendency line supported by {nr_point_method_1} (blue) and {nr_point_method_2} (orange) points is presented. </p>
<p class="compact"> Detailed information is available in the <code> {prefix_both}_All_correspondence.tsv </code> file. </p>
<p style="margin-bottom: 8px;"></p>
</div>
"""
2533
+
2534
def congruence_st(fig_html_heatmap, prefix_both):
    """
    Build a complete "Congruence" accordion section with a heatmap.

    Parameters
    ----------
    fig_html_heatmap: str
        HTML fragment of the heatmap.
    prefix_both: str
        Text shown in the caption under the heatmap.

    Return
    -----
    str
        HTML of the accordion button followed by its (closed) panel.
    """
    section = f"""
<button class="accordion">Congruence</button>
<div class="panel">
<p> This section makes it possible to evaluate the congruence of the two genomic pipelines. </p>
<h3> Congruence score </h3>
<div class="image">{fig_html_heatmap} </div>
<p> Sequence type {prefix_both} pipelines</p>
</div>
"""
    return section
2546
+
2547
def html_tradicional_typing(n_samples, n_groups, prefix):
    """
    Build a complete "Sequence type" accordion section.

    Parameters
    ----------
    n_samples: int
        Value displayed as the number of samples.
    n_groups: int
        Value displayed as the number of groups.
    prefix: str
        Prefix shown in the accordion title.

    Return
    -----
    str
        HTML of the accordion button followed by its (closed) panel.
    """
    return f"""
<button class="accordion">Sequence type {prefix}</button>
<div class="panel">
<p>Number of samples: {n_samples}</p>
<p>Number of groups: {n_groups}</p>
</div>
"""
2557
+
2558
def image_outbreak(fig_result):
    """
    Open the "Outbreak" section and render its figures in one image row.

    The image row is closed here; the panel <div> is left open.

    Parameters
    ----------
    fig_result: iterable
        Figures to convert to HTML, one image item each.

    Return
    -----
    str
        HTML of the accordion button, the opening of its panel and the row
        of figures.
    """
    pieces = [
        '<button class="accordion">Outbreak</button>',
        '<div class="panel">',
        '<div class="image-row">\n',
    ]

    for fig in fig_result:
        fig_html = pio.to_html(fig, full_html=False, include_plotlyjs='cdn')
        pieces.append(f'<div class="image-item">{fig_html}</div>')

    pieces.append('</div>')  # closes the image row only

    return ''.join(pieces)
2573
+
2574
def summary_outbreak(prefix_both, thresholds):
    """
    Describe the outbreak analysis, list its output files and emit a closing </div>.

    Parameters
    ----------
    prefix_both: str
        Prefix used in the names of the outbreak statistics files.
    thresholds: iterable
        Triples (threshold 1, threshold 2, type of comparison), one per
        analysed pair of thresholds.

    Return
    -----
    str
        HTML with the description, two file references per triple and a
        final closing tag.
    """
    pieces = ["""
<p class="compact">Determines the percentage of clusters identified in a pipeline at a given threshold that could be detected with the same composition by another pipeline at a similar or even higher threshold.</p>
"""]

    for string1, string2, type_com in thresholds:
        pieces.append(f"""<p class="compact"> Detailed information is available in the <code> {prefix_both}_stats_outbreak_summary_{string1}_{type_com}_{string2} file.</code></p>""")
        pieces.append(f"""<p class="compact"> Detailed information is available in the <code> {prefix_both}_stats_outbreak_pairwise_comparison_{string1}_{type_com}_{string2} file.</code></p>""")

    # NOTE(review): presumably closes the panel opened by image_outbreak -
    # confirm against the caller.
    pieces.append("</div> ")

    return "".join(pieces)
2586
+
2587
def references():
    """
    Build the references and acknowledgement block of the report.

    Return
    -----
    str
        HTML with the list of publications to cite and the project credits.
    """
    # No placeholders are needed, so a plain string is enough.
    citation_block = """
<p style="font-size: 10pt;"> <strong> References:</strong> </p>
<p style="font-size: 8pt;"><a href="https://doi.org/10.1038/s41467-025-59246-8" target="_blank">Mixão V et al. (2025). Multi-country and intersectoral assessment of cluster congruence between pipelines for genomics surveillance of foodborne pathogens. <em>Nature Communications</em>, 16, Article 3961.</a></p>
<p style="font-size: 8pt;"> EvalTree relies on the work of other developers. So you must also cite: </p>
<p style="font-size: 8pt;"> -<a href="https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-023-01196-1"> Mixão V et al. (2023). ReporTree: a surveillance-oriented tool to strengthen the linkage between pathogen genetic clusters and epidemiological data.</a></p>
<p style="font-size: 8pt;"> -<a href="https://journals.asm.org/doi/10.1128/jcm.02536-05?permanently=true"> Carriço J et al. (2006). Illustration of a Common Framework for Relating Multiple Typing Methods by Application to Macrolide-Resistant Streptococcus pyogenes.</a></p>
<br></br>
<p style="text-align: center; max-width: 1000px; margin: 0 auto;">
<em>EvalTree.py</em> is a tool developed in the frame of the <strong>CENTAUR project</strong> (supported by the European ISIDORe initiative) at the
Genomics and Bioinformatics Unit of the Department of Infectious Diseases in the National Institute of Health Dr. Ricardo Jorge (INSA, Portugal).
</p>

"""
    return citation_block
2604
def javascript_function():
    """
    Build the <script> element that drives the accordions of the report.

    The script adds a click listener to every element of class "accordion";
    each click toggles the display of the element that follows the button
    between "block" and "none".

    Return
    -----
    str
        HTML of the <script> element.
    """
    # Plain string: the JavaScript braces need no escaping here.
    toggle_script = """
<script>
var acc = document.getElementsByClassName("accordion");
var i;

for (i = 0; i < acc.length; i++) {
acc[i].addEventListener("click", function() {
var panel = this.nextElementSibling;
if (panel.style.display === "block") {
panel.style.display = "none";
} else {
panel.style.display = "block";
}
});
}
</script>
"""

    return toggle_script
2625
+
2626
def write_html(html_content, file_path_report, log):
    """
    Write the HTML report to disk and log where it was saved.

    Parameters
    ----------
    html_content: str
        Code of HTML report.
    file_path_report: str
        Path of the report HTML file.
    log: file object
        Open log file that receives the confirmation message.

    Return
    -----
    None
        The report is written to file_path_report.
    """
    # The report contains non-ASCII characters, so the encoding is fixed
    # explicitly instead of depending on the locale of the machine (a
    # non-UTF-8 default would fail or produce a mis-encoded file).
    # NOTE(review): confirm the <head> of the report declares a UTF-8 charset.
    with open(file_path_report, "w", encoding="utf-8") as file:
        file.write(html_content)

    print_log(f"\nReport successfully generated in:\n {file_path_report}.\n", log)
2648
+
2649
def create_html_footer():
    """
    Build the closing tags of the HTML document.

    Return
    -----
    str
        The </body> and </html> closing tags.
    """
    footer = """
</body>
</html>
"""
    return footer
2654
+
2655
def close_painel(prefix, message=None):
    """
    Build a "Clusters" accordion section that only reports an optional error.

    Two closing </div> tags are emitted although a single panel is opened.

    Parameters
    ----------
    prefix: str
        Prefix shown in the accordion title.
    message: str, optional
        Error text; when empty or None no error paragraph is written.

    Return
    -----
    str
        HTML of the accordion button, its panel and the closing tags.
    """
    opening = f"""
<button class="accordion">Clusters {prefix}</button>
<div class="panel">
"""
    error_line = f'<p>Error: {message}</p>\n' if message else ''

    # NOTE(review): the second </div> presumably closes a panel opened by an
    # earlier section - confirm against the caller.
    return opening + error_line + "</div>\n" + "</div>\n"
2668
+
2669
+
2670
def print_log(message, log):
    """ Print a message in the terminal and in the log file """

    # file=None makes print() write to sys.stdout, so the terminal comes first
    # and the log file second.
    for destination in (None, log):
        print(message, file=destination)
2675
+
2676
+ #####################################################################################################################################
2677
+ ###################################################################***###############################################################
2678
+ ###############################################################***EvalTree***##########################################################
2679
+ ###################################################################***###############################################################
2680
+ #####################################################################################################################################
2681
+
2682
+ def main():
2683
+ """
2684
+ This function is instrumental in the tool.
2685
+ It manages the flow of the program, determining which functions to call and in what order.
2686
+
2687
+ Parameters
2688
+ ---------
2689
+ Without parameters, it will pass all arguments entered by the user.
2690
+
2691
+ Returns
2692
+ ---------
2693
+ None
2694
+
2695
+ """
2696
+ #-------------------------------------------------------------------------------------------------------------------------------------
2697
+ # Configures the parser for command line arguments
2698
+
2699
+ parser = argparse.ArgumentParser(description="Running EvalTree")
2700
+ parser = argparse.ArgumentParser(prog="EvalTree.py",
2701
+ formatter_class=argparse.RawDescriptionHelpFormatter,
2702
+ description=textwrap.dedent("""
2703
+ EvalTree.py
2704
+
2705
+ EvalTree was designed for comparing two genomic pipeline inputs (e.g., cg/wgMLST, traditional sequence-type matrix), with three main functionalities:
2706
+
2707
+ - Evaluates the congruence between the pipelines.
2708
+ - Characterizes genetic clusters.
2709
+ - Detects closely related outbreak clusters at given thresholds.
2710
+
2711
+ The EvalTree toolbox accepts two types of inputs: folders and files.
2712
+ - Folders must be derived from ReporTree outputs (highly recommended). Pipelines of cg/wgMLST should contain clustering data (clusters and/or singletons) for
2713
+ all possible thresholds in a partition.tsv file.
2714
+ - Files can be partition files or other types of files with classifications (e.g., sequence-type, serotypes).
2715
+ It will return an interactive HTML report based on the selected arguments.
2716
+
2717
+ The following arguments are used for specific analyses:
2718
+
2719
+ - plot_summary, plots_threshold, column_plot, n_cluster, plots_category_number, plots_category_percentage: These are used exclusively to characterize genetic clusters from
2720
+ ReporTree output files (e.g., *_partitions_summary.tsv).
2721
+ - score and threshold: These are used in the cg/wgMLST pipeline congruence analysis.
2722
+ - threshold_outbreak, repeat_threshold_outbreak: These are used in the outbreak analysis, utilizing the cluster_composition.tsv file produced by ReporTree."""))
2723
+
2724
+ # Mandatory arguments
2725
+ parser.add_argument("-i1", "--input1",
2726
+ action = "store",
2727
+ required = True,
2728
+ help = '[MANDATORY] Specifies the first input type (folder or file), requiring the full path. \
2729
+ The folder must contain the partition matrix file with clustering data, and is highly recommended to be a Reportree output folder.\
2730
+ Alternatively, the file can be a traditional sequence-type matrix or a partition matrix.\
2731
+ Using either of these input types enables the analysis.')
2732
+
2733
+ parser.add_argument("-i2", "--input2",
2734
+ action = "store",
2735
+ required = False,
2736
+ help = '[OPTIONAL] Specifies the second input type (folder or file), requiring the full path. \
2737
+ The folder must contain the partition matrix file with clustering data, and is highly recommended to be a Reportree output folder. \
2738
+ Alternatively, the file can be a traditional sequence-type matrix or a partition matrix. \
2739
+ Using either of these input types enables the analysis.')
2740
+
2741
+ parser.add_argument("-o", "--output",
2742
+ action = "store",
2743
+ help = '[MANDATORY] Specifies the output directory for storing all analysis results. \
2744
+ If no folder is provided, the program will automatically create one based on the prefix of the files.')
2745
+
2746
+ # Optional arguments
2747
+ parser.add_argument('-s', '--score',
2748
+ dest = 'score',
2749
+ default = '2.85',
2750
+ help = '[OPTIONAL] Define a minimum score to consider two partitions (one from each pipeline) as corresponding. The score accepts values between 0 and 3.\
2751
+ Partition - It refer to the number of identical clusters that exist at the same threshold.')
2752
+
2753
+ parser.add_argument('-t', '--threshold',
2754
+ dest = 'threshold',
2755
+ default = 'max',
2756
+ help = '[OPTIONAL] Defines an integer range to select or filter threshold columns from the partition matrix file. \
2757
+ A filtered partition matrix, containing only the selected columns, will be created and used for subsequent analysis. \
2758
+ Ranges are specified using a hyphen to separate the minimum and maximum values (e.g., 10-20). \
2759
+ If this option is not set, the script will perform clustering for all possible thresholds in the range 0 to the maximum threshold.')
2760
+
2761
+ parser.add_argument('-ps', '--plots_summary',
2762
+ dest = 'plots_summary',
2763
+ choices = ['partitions_summary','sample_of_interest'],
2764
+ default = 'partitions_summary',
2765
+ help = '[OPTIONAL] Specify the type of cluster characterization file (partitions_summary.tsv or SAMPLES_OF_INTEREST_partitions_summary.tsv), both of which are expected to be located within a Reportree results folder. \
2766
+ Using the partition_summary option, the largest clusters present in the file will be characterized. \
2767
+ Alternatively, the samples_of_interest option will characterize all clusters, including those resulting from the addition of new samples (kept increase, new, new (increase), new (merge_increase), new (split_increase), new (split_merge_increase)).')
2768
+
2769
+ parser.add_argument('-n', '--n_cluster',
2770
+ dest = 'n_cluster',
2771
+ type = int,
2772
+ default = 3,
2773
+ help = '[OPTIONAL] Specify the number of top clusters to be displayed from the partitions_summary.tsv file, which must be located within a Reportree results folder. \
2774
+ This argument is not applicable when using the samples_of_interest option.')
2775
+
2776
+ parser.add_argument('-cp', '--columns_plots',
2777
+ dest = 'columns_plots',
2778
+ help = '[OPTIONAL] Name(s) of the column(s) to process the characterization of the clustering data in the selected file (specified by the plots_summary argument). \
2779
+ For multiple column names, indicate them separated by commas without spaces (e.g., column1,column2).')
2780
+
2781
+ parser.add_argument('-pt','--plots_threshold',
2782
+ dest='plots_threshold',
2783
+ help='[OPTIONAL] Identify the integer threshold(s) to be applied to the file specified by the plots_summary argument. \
2784
+ For multiple thresholds, indicate them separated by commas without spaces (e.g., X,Y,Z). \
2785
+ This generates a pie chart showing the clustering data for the specified threshold(s), according to the columns_plot argument.')
2786
+
2787
+ parser.add_argument('-pcn','--plots_category_number',
2788
+ dest='plots_category_number',
2789
+ default=5,
2790
+ type=int,
2791
+ help='[OPTIONAL] Determines the number of plot categories in the partitions_summary.tsv or SAMPLES_OF_INTEREST_partitions_summary.tsv file\
2792
+ that are intended to be collapsed into the '"Other"' category for visualization in the cluster plots.\
2793
+ When there are more than 5 slices (default), they will be combined into one category named Other')
2794
+
2795
+ parser.add_argument('-pcp','--plots_category_percentage',
2796
+ dest='plots_category_percentage',
2797
+ type=float,
2798
+ help='[OPTIONAL] Determines the percentage of plot categories in the partitions_summary.tsv or SAMPLES_OF_INTEREST_partitions_summary.tsv file\
2799
+ that are intended to be collapse into the '"Other"'category for visualization in the cluster plots.\
2800
+ Slices plots with a lower percentage than the entered plots_category_percentage will be combined into one category named Others')
2801
+
2802
+ parser.add_argument('-to', '--threshold_outbreak',
2803
+ dest='threshold_outbreak',
2804
+ type=str,
2805
+ help='[OPTIONAL] Determine the number of clusters identified in one pipeline at a given threshold \
2806
+ that will exist with the same composition in another pipeline at the same or a higher threshold.\
2807
+ Full attention, this argument has its own structure: two threshold (strings-methods) and the type of comparison is \
2808
+ either equal (defined by , ) or lower_equal (defined by <= ) \
2809
+ Threshold1: Threshold at which the genetic clusters must be identified for the pipeline of interest.\
2810
+ Threshold2: Threshold at which the genetic clusters must be searched in the other pipelines.\
2811
+ Comparison (equal or lower equal): \
2812
+ - ''equal'': Used to assess whether a cluster is detected at a given threshold by another pipeline. \
2813
+ Use a comma '','' to separate threshold1,threshold2. Example of expression: MST-7x1.0,MST-7x1.0.\
2814
+ - ''lower_equal'': Used to assess whether a cluster is detected up to a given threshold in another pipeline. \
2815
+ Use <= between threshold1<=threshold2. Example of expression: MST-7x1.0,<=MST-9x1.0.\
2816
+ \
2817
+ For multiple pair of threshold values, use '';'' as a separator. Example of expression: "MST-7x1.0,MST-7x1.0;<=MST-7x1.0,MST-10x1.0" represents two pair of threshold values.')
2818
+
2819
+ parser.add_argument('-list', '--list',
2820
+ dest='list',
2821
+ choices=['partitions_summary','sample_of_interest'],
2822
+ help='[OPTIONAL] Specify the names of the columns present in the partitions_summary.tsv or SAMPLES_OF_INTEREST_partitions_summary.tsv file.')
2823
+
2824
+ parser.add_argument('-rto','--repeat_threshold_outbreak',
2825
+ dest='repeat_threshold_outbreak',
2826
+ action="store_true",
2827
+ help='[OPTIONAL] This argument can only be used after of a previous analysis of threshold_outbreak.')
2828
+
2829
+ parser.add_argument('-v', '--version',
2830
+ action='version',
2831
+ version='EvalTree 1.0.0, last update 2025-05-20',
2832
+ help='[OPTIONAL] Specify the version number of EvalTree.')
2833
+
2834
+ parser.add_argument('-n_stab', '--n_stability',
2835
+ dest = 'n_stability',
2836
+ default = 5,
2837
+ type = int,
2838
+ help = '[OPTIONAL] Range of threshold which the cluster composition can be conistent/stable.')
2839
+
2840
+ parser.add_argument('-thr_stab', '--thr_stability',
2841
+ dest = 'thr_stability',
2842
+ default = 0.99,
2843
+ type = float,
2844
+ help = '[OPTIONAL] The neighborhood Adjusted Wallace Coefficient (nAWC) threshold used to determine if a clustering threshold is considered consistent or stable.')
2845
+
2846
+ #------------------------------------------------------------------
2847
+ # INITIAL INFORMATIONS
2848
+ # Read the command line arguments and retrieve paths
2849
+
2850
+ args = parser.parse_args()
2851
+ path_toolbox_script, directory_toolbox = get_path_toolbox()
2852
+ comparing_partitions_script, get_best_part_correspondence_script, remove_hifen_script, stats_outbreak_script = get_path_other_scripts(directory_toolbox)
2853
+
2854
+ #------------------------------------------------------------------
2855
+ # I- Structural validation of the arguments
2856
+ # I1- Check the input argument(s) (-i1 and/or -i2)
2857
+
2858
+ input1 = None
2859
+ input2 = None
2860
+
2861
+ if args.input1:
2862
+ input1 = args.input1
2863
+
2864
+ if args.input2:
2865
+ input2=args.input2
2866
+
2867
+ folders, files = check_input_argument(input1, input2)
2868
+ print(f'Checking inputs:')
2869
+
2870
+ data_folder = []
2871
+ if folders !=[]:
2872
+
2873
+ for folder in folders:
2874
+ print(f"\tFolder: {folder}")
2875
+ partitions, partitions_summary, sample_interest, clusterComposition, prefix, input_path, stable_region = check_folder(folder)
2876
+ data_folder += [[partitions, prefix, input_path, partitions_summary, sample_interest, clusterComposition, stable_region]] #7
2877
+
2878
+ data_files = []
2879
+ for file in files:
2880
+ print(f"\n\tFile: {file}")
2881
+ file, prefix_file, path_directory, file_type, n_samples, n_groups = check_file(file)
2882
+ print(f"\t\tPrefix: {prefix_file}")
2883
+ print(f'\t\tDirectory: {path_directory}')
2884
+ data_files += [[file, prefix_file, path_directory, file_type, n_samples, n_groups]] #6
2885
+
2886
+ #----------------------------------------------------------------
2887
+ # I2- Check the output argument (-o)
2888
+ output, rename = check_output(args.output)
2889
+
2890
+ #----------------------------------------------------------------
2891
+ # I3- Check the list argument (-list)
2892
+
2893
+ list_column_plot = args.list
2894
+
2895
+ if list_column_plot is not None:
2896
+ if data_folder != []:
2897
+ for sub in data_folder:
2898
+
2899
+ if list_column_plot == 'partitions_summary':
2900
+ file_summary = sub[3]
2901
+ if file_summary is not None:
2902
+ get_plot_columns(file_summary)
2903
+ else:
2904
+ sys.exit(f"There is no partitions_summary file in the {sub[2]}. ")
2905
+ else:
2906
+ file_s_interest=sub[4]
2907
+ if file_s_interest is not None:
2908
+ get_plot_columns(file_s_interest)
2909
+ else:
2910
+ sys.exit(f"\nThere is no the sample_of_interest file in the {sub[2]}.")
2911
+ sys.exit()
2912
+ else:
2913
+ sys.exit('It is impossible to use the list argument (-list) when the input(s) (-i1, -i2) argument(s) have provided file(s).')
2914
+
2915
+ #----------------------------------------------------------------
2916
+ # I4- Check the column plot(s) (-cp) and plots thresholds (-pt) arguments
2917
+
2918
+ columns_plots = args.columns_plots
2919
+ plots_thresholds = args.plots_threshold
2920
+
2921
+ if plots_thresholds is not None:
2922
+ plots_thresholds = check_str_plots_threshold(plots_thresholds)
2923
+
2924
+ #----------------------------------------------------------------
2925
+ # I5- Check the threshold (-t) and score(-s) arguments
2926
+
2927
+ score_value = check_score(args.score)
2928
+ threshold = check_threshold(args.threshold)
2929
+
2930
+ if threshold != 'max':
2931
+
2932
+ for sub in data_files:
2933
+ file_matrix = sub[0]
2934
+ identify_matrix = sub[3]
2935
+ if identify_matrix == False:
2936
+ if len(data_files) == 1:
2937
+ print(f'\t\tWarning: The threshold argument (-t) is only applied to a partition matrix, so it is not applicable to the {file_matrix}.')
2938
+
2939
+ #----------------------------------------------------------------
2940
+ # I6- Arguments that do not require structural validation
2941
+
2942
+ n_cluster = args.n_cluster
2943
+ plots_summary_arg = args.plots_summary
2944
+ plots_category_percentage = args.plots_category_percentage
2945
+ plots_category_number = args.plots_category_number
2946
+ n_stability = args.n_stability
2947
+ thr_stability = args.thr_stability
2948
+
2949
+ #----------------------------------------------------------------
2950
+ # I7- Check the threshold outbreak (-to) and repeat_threshold_outbreak (-rto) arguments
2951
+
2952
+ threshold_outbreak = args.threshold_outbreak
2953
+ repeat_threshold_outbreak = args.repeat_threshold_outbreak
2954
+
2955
+ if threshold_outbreak is not None:
2956
+ valid_combinations = validate_combinations_outbreak(threshold_outbreak)
2957
+
2958
+ #----------------------------------------------------------------
2959
+ # II- Validation of the argument combination (clustering, outbreaks)
2960
+
2961
+ go_clustering, go_outbreaks = check_combinations_arguments(plots_summary_arg, data_folder, data_files)
2962
+
2963
+ #----------------------------------------------------------------
2964
+ # III- Validation of file prefixes provided in different inputs
2965
+
2966
+ data_folder, data_files, prefix_both = check_data_folders_file(data_folder, data_files)
2967
+
2968
+ #---------------------------------------------------------
2969
+ # IV- Validation of partition matrix FUNDAMENTAL
2970
+
2971
+ inputs_variables = join_inputs_variables(data_folder,data_files)
2972
+
2973
+ #---------------------------------------------------------
2974
+ # V- Validation of congruence
2975
+ go_congruence = False
2976
+
2977
+ if len(inputs_variables) == 2:
2978
+
2979
+ i1,i2 = inputs_variables[0][0], inputs_variables[1][0]
2980
+
2981
+ if i1 is not None and i2 is not None:
2982
+ go_congruence = True
2983
+
2984
+ else:
2985
+ print("Congruence analysis is not possible. It is necessary two *_partitions.tsv files.\n")
2986
+
2987
+
2988
+ #---------------------------------------------------------------------------------------------------
2989
+ # VI- Outbreaks (-rto)
2990
+
2991
+ if repeat_threshold_outbreak is not False:
2992
+
2993
+ if args.output is None:
2994
+ sys.exit('Error: Please specify the output folder with the -o argument. It should contain the previous results.')
2995
+
2996
+ file=glob.glob(os.path.join(output,'*_report.html'))
2997
+ if not file:
2998
+ sys.exit("Error: The expected *_report.html file was not found. Please run the program first with the -to argument, and then with the -rto argument.")
2999
+
3000
+ if not threshold_outbreak:
3001
+ print('\tDo not forget the double quotation marks!')
3002
+ sys.exit("Error: You must specify a new argument for the threshold_outbreak (-to).")
3003
+
3004
+ #---------------------------------------------------------------------------------------------------
3005
+ # VII - Stable Regions (-thr_stab)
3006
+
3007
+ if thr_stability != 0.99:
3008
+ if not (0 <= thr_stability <= 1):
3009
+ sys.exit("Error: thr_stability must be between 0 and 1.")
3010
+
3011
+ #--------------------------------------------------------------------------------------------------
3012
+ # Starting logs
3013
+
3014
+ if not repeat_threshold_outbreak:
3015
+ log_name = (f'{output}/{prefix_both}.log')
3016
+ log = open(log_name, "w+")
3017
+
3018
+ else:
3019
+ log_name = (f'{output}/{prefix_both}_reanalyse.log')
3020
+ log = open(log_name, "w+")
3021
+
3022
+ # -------------------------------------------------------------------------------------------------------------------------
3023
+ # INITIAL INFORMATION
3024
+
3025
+ print("---------------------------------------------- Running EvalTree.py ----------------------------------------------\n")
3026
+ print_log(f"Version " + str(version) + " last updated on " + str(last_updated)+"\n", log)
3027
+ command_line = " ".join(sys.argv)
3028
+ print_log(f"Running EvalTree with the following command: {command_line}\n", log)
3029
+ print_log(f'Log file name: {log_name}\n', log)
3030
+ start = datetime.datetime.now()
3031
+ print_log("Start: " + str(start)+"\n", log)
3032
+ print_log(f'Output directory: {output}\n', log)
3033
+
3034
+ #-----------------------------------------------
3035
+ # START HTML
3036
+
3037
+ if not repeat_threshold_outbreak:
3038
+ file_path_report = os.path.join(output, f'{prefix_both}_report.html')
3039
+ html_content = create_html(log, file_path_report)
3040
+ html_content += body_html(start, command_line,version)
3041
+ else:
3042
+ file_path_report = os.path.join(output, f'{prefix_both}_2ºRUN_report.html')
3043
+ html_content = create_html(log, file_path_report)
3044
+ html_content += body_html(start, command_line,version)
3045
+ html_report = write_html(html_content,file_path_report, log)
3046
+
3047
+ #-------------------------------------------------------------------------------------------------------------------------------
3048
+
3049
+ if not repeat_threshold_outbreak:
3050
+
3051
+ #-------------------------------------------------------------------------------------------------------------------------------
3052
+
3053
+ if inputs_variables:
3054
+ for sub in inputs_variables:
3055
+
3056
+ #-------------------------------------------------------------------------------------------------------------------------------
3057
+ # MODULE 1 - SEQUENCE TYPE
3058
+ if len(sub) == 6:
3059
+
3060
+ if sub[3] == False:
3061
+ samples_st = sub[4]
3062
+ groups_st = sub[5]
3063
+ sequence_type_file = sub[0]
3064
+ prefix_st = sub[1]
3065
+ html_content += get_sequence_type(prefix_st,samples_st,groups_st,sequence_type_file)
3066
+ fig_clusters = reading_sequence_type(sequence_type_file, output, prefix_st, log)
3067
+ fig_html = pio.to_html(fig_clusters, include_plotlyjs='cdn', full_html=False)
3068
+ html_content += sequence_type_image(fig_html)
3069
+
3070
+ #-------------------------------------------------------------------------------------------------------------------------------
3071
+ list_partition_by_threshold=[]
3072
+ category_colors = {'Others':'#000000'}
3073
+
3074
+ if inputs_variables:
3075
+ for sub in inputs_variables:
3076
+ partition_matrix = sub[0]
3077
+ prefix = sub[1]
3078
+ directory = sub[2]
3079
+
3080
+ #-------------------------------------------------------------------------------------------------------------------------------
3081
+ # MODULE 2 - Characterization of the ONE pipeline (Nr_partitions vs Nr_thresholds)
3082
+ if len(sub) == 7 or (len(sub) == 6 and sub[3] == True):
3083
+
3084
+ print_log(f'\nPipeline characterization: {directory}', log)
3085
+ print_log(f'\tPipeline name: {prefix}', log)
3086
+
3087
+ if partition_matrix is not None:
3088
+
3089
+ if threshold !='max':
3090
+ start_threshold, end_threshold = check_range_threshold(partition_matrix,threshold,log)
3091
+ input_filtered = filter_partition_matrix(partition_matrix, prefix, start_threshold, end_threshold, output, log)
3092
+ sub[0] = input_filtered
3093
+ partition_matrix = input_filtered
3094
+
3095
+ nr_lines_df, nr_columns_df = get_nr_lines_threshold(partition_matrix, log)
3096
+ file_partition_by_threshold = get_file_partition_by_threshold (partition_matrix, prefix, output, log)
3097
+ list_partition_by_threshold.append(file_partition_by_threshold)
3098
+ print_log(f'\tObtaining the number of partitions per threshold.', log)
3099
+ yes_prefix_both=False
3100
+ fig_partition_vs_threshols = get_graph_partition_by_threshold(file_partition_by_threshold, prefix, prefix_both, yes_prefix_both, output, log)
3101
+ html_content += get_partitions_threshold(prefix, nr_lines_df, nr_columns_df, fig_partition_vs_threshols)
3102
+
3103
+
3104
+ if go_clustering == False:
3105
+ html_content += f'</div>\n'
3106
+
3107
+ #-----------------------------------------------------------------------
3108
+ # MODULE 3 - REPORTREE
3109
+ if len(sub) == 7: #folder
3110
+ partitions_summary = sub[3]
3111
+ sample_interest = sub[4]
3112
+
3113
+ if go_clustering == True:
3114
+ plots_file = None
3115
+
3116
+ if plots_summary_arg == 'partitions_summary':
3117
+ if partitions_summary is not None:
3118
+ plots_file = partitions_summary
3119
+ else:
3120
+ if sample_interest is not None:
3121
+ plots_file = sample_interest
3122
+
3123
+ print_log(f'\tPlotting cluster characterization ...', log)
3124
+
3125
+ #-----------------------------------------------------------------------
3126
+ # Starting clustering
3127
+ df_data=load_and_prepare_data(plots_file, log)
3128
+ df_filtered=order_cluster_by_size(df_data, log)
3129
+
3130
+ #-----------------------------------------------------------------------
3131
+ if df_filtered is not None:
3132
+ method = check_plot_threshold(plots_thresholds, df_filtered, log)
3133
+
3134
+ if method != []:
3135
+ filtered_threshold = check_threshold_in_file(method, df_filtered, plots_file, log)
3136
+
3137
+ if filtered_threshold != []:
3138
+
3139
+ if plots_summary_arg == 'partitions_summary':
3140
+ result_df = filter_df_by_plot_threshold(filtered_threshold, df_filtered, n_cluster, log)
3141
+
3142
+ if plots_summary_arg == 'sample_of_interest':
3143
+ df_filtered_threshold = filtering_df_threshold(filtered_threshold, df_filtered, log)
3144
+ result_df = select_nomenclature_change(df_filtered_threshold, log)
3145
+
3146
+ if result_df is not None:
3147
+ check_columns = check_column_plots(columns_plots, result_df, log)
3148
+
3149
+ #-----------------------------------------------------------------------
3150
+ # Plots
3151
+ if check_columns != []:
3152
+
3153
+ results_list = check_structure_lines_column_plots(check_columns, result_df, plots_category_percentage, plots_category_number, output, prefix, plots_summary_arg, category_colors, log)
3154
+
3155
+ if results_list is not None:
3156
+ mst_groups = organize_clusters(results_list)
3157
+ html_content += get_clusters(mst_groups, prefix)
3158
+ else:
3159
+ html_content += close_painel(prefix,"Error: Impossible to produce cluster plots.")
3160
+ print_log(f'\tError: Impossible to produce cluster plots.', log)
3161
+ else:
3162
+ html_content += close_painel(prefix,"Error: Invalid column plots, without clustering analysis.")
3163
+ print_log(f'\tError: Invalid column plots, without clustering analysis ...', log)
3164
+ #-------------------------------------------------------------------------
3165
+
3166
+ else:
3167
+ html_content += close_painel(prefix,"Error: No data for processing, without clustering analysis.")
3168
+ print_log(f'\tError: No data for processing, without clustering analysis.', log)
3169
+ else:
3170
+ html_content += close_painel(prefix,"Error: The plot_threshold argument is invalid, without clustering analysis.")
3171
+ print_log(f'\tError: The plot_threshold argument is invalid, without clustering analysis. ', log)
3172
+ else:
3173
+ html_content += close_painel(prefix,"Error: No analysis method provided, without clustering analysis.")
3174
+ print_log(f"\tError: No analysis method provided, without clustering analysis.",log)
3175
+ else:
3176
+ html_content += close_painel(prefix,"Error: Impossible to order the Dataframe by cluster length, without clustering analysis.")
3177
+ print_log(f'\tError: Impossible to order the Dataframe by cluster length, without clustering analysis.')
3178
+ else:
3179
+ html_content += f'</div>\n'
3180
+
3181
+ #-------------------------------------------------------------------------------------------------------------------------------
3182
+ print_log(f"\nInter-pipeline cluster congruence analysis:\n", log)
3183
+ html_content += summary_congruence()
3184
+
3185
+ #-----------------------------------------------------------------------
3186
+ # MODULE 4.1 - Characterization of the BOTH pipelines (Nr_partitions vs Nr_thresholds)
3187
+ if list_partition_by_threshold:
3188
+ if len(list_partition_by_threshold) == 2:
3189
+ file1, file2 = list_partition_by_threshold
3190
+ path = concatenation_files(file1, file2, output, prefix_both)
3191
+ yes_prefix_both=True
3192
+ fig = get_graph_partition_by_threshold(path, prefix, prefix_both, yes_prefix_both, output, log)
3193
+ print_log(f"\tPlotting the number of partitions per threshold for the two pipelines ...", log)
3194
+ fig_html = pio.to_html(fig, include_plotlyjs='cdn', full_html=False)
3195
+ html_content += summary_partition_threshold(fig_html, prefix_both)
3196
+
3197
+ #-----------------------------------------------------------------------
3198
+ # MODULE 4.2 - Stability regions #or (len(sub)==6 and sub[3]==True):
3199
+ print_log(f"\tIdentifying cluster stability regions for each pipeline ...", log)
3200
+ print_log(f"\t\tRunning comparing_partitions_v2.py in “stability” mode.", log)
3201
+
3202
+ files_to_stability = []
3203
+
3204
+ #-----------------------------------------------------------------------
3205
+ for sub in inputs_variables:
3206
+ partition_matrix = sub[0]
3207
+ prefix = sub[1]
3208
+ directory = sub[2]
3209
+
3210
+ if len(sub) == 7: #folders
3211
+ stable_region = sub[6]
3212
+ if stable_region is None or threshold !='max':
3213
+ file_stability = stability_region(output, partition_matrix, prefix, comparing_partitions_script, n_stability, thr_stability, log)
3214
+ files_to_stability.append([file_stability,prefix])
3215
+ else:
3216
+ files_to_stability.append([stable_region,prefix])
3217
+
3218
+ if len(sub) == 6: #files
3219
+ type_file = sub[3]
3220
+ if type_file == True: #if true it is partition matrix
3221
+ file_stability = stability_region(output, partition_matrix, prefix, comparing_partitions_script, n_stability, thr_stability, log)
3222
+ files_to_stability.append([file_stability, prefix])
3223
+ else:
3224
+ go_stability = False
3225
+
3226
+ #-----------------------------------------------------------------------
3227
+ all_dfs = []
3228
+ prefix_df = []
3229
+ #list_values_block = []
3230
+
3231
+ if files_to_stability:
3232
+
3233
+ for file, prefix in files_to_stability:
3234
+
3235
+ try:
3236
+ name_block = processing_block_names(file, prefix, log)
3237
+ first_data, final_data = processing_data(file, log)
3238
+ #list_values_block.append(values_block)
3239
+
3240
+ df = pd.DataFrame({'Block_id': name_block, 'Start': first_data, 'Finish': final_data, 'Pipeline': prefix})
3241
+ all_dfs.append(df)
3242
+ prefix_df.append(prefix)
3243
+ go_stability = True
3244
+
3245
+ except Exception as e:
3246
+ print(f'\t\tWarning: without stability reagions in the file {file}.')
3247
+ go_stability = False
3248
+
3249
+ if all_dfs:
3250
+ df = pd.concat(all_dfs, ignore_index = True)
3251
+ if len(prefix_df) == 2:
3252
+ prefix=prefix_df[0]
3253
+ prefix_2=prefix_df[1]
3254
+
3255
+ else:
3256
+ prefix_2 = None
3257
+ prefix = prefix_df[0]
3258
+
3259
+
3260
+ if go_stability == True:
3261
+ fig_st = change_processing_data(df, prefix, prefix_2, output, log)
3262
+ print_log(f"\t\tDone.\n", log)
3263
+ fig_html_st = pio.to_html(fig_st, include_plotlyjs='cdn', full_html=False)
3264
+ html_content += congruence_stability(fig_html_st, prefix, prefix_2, n_stability, thr_stability)
3265
+
3266
+ #-----------------------------------------------------------------------
3267
+ # MODULE 4.3 - Congruence between pipelines
3268
+ if go_congruence == True:
3269
+
3270
+ i1_matrix=inputs_variables[0][0]
3271
+ i2_matrix=inputs_variables[1][0]
3272
+ i1_prefix=inputs_variables[0][1]
3273
+ i2_prefix=inputs_variables[1][1]
3274
+
3275
+ path_all_correspondence_lower = management_main_scripts(comparing_partitions_script, get_best_part_correspondence_script, remove_hifen_script, i1_matrix, i2_matrix, prefix_both, output, score_value, log)
3276
+
3277
+ #Final score
3278
+ fig_heatmap = get_heatmap(output, i1_prefix, i2_prefix, threshold, log)
3279
+ fig_html_heatmap = pio.to_html(fig_heatmap, include_plotlyjs='cdn',full_html=False)
3280
+ html_content += congruence_heatmap(fig_html_heatmap, prefix_both)
3281
+
3282
+ #-----------------------------------------------------------------------
3283
+ # Get best correspondence
3284
+
3285
+ if not any(len(elem) == 6 and elem[3] is False for elem in inputs_variables):
3286
+ fig_tendency, nr_point_method_1, nr_point_method_2 = get_tendency(output, prefix_both, log)
3287
+ fig_tendency_html = pio.to_html(fig_tendency,include_plotlyjs='cdn', full_html=False)
3288
+ html_content += congruence_tendency(fig_tendency_html, score_value, prefix_both, nr_point_method_1, nr_point_method_2)
3289
+ comparison = tendency_slop(path_all_correspondence_lower, i1_prefix, i2_prefix, output)
3290
+ #-------------------------------------------------------------------------------------------------------------------------------
3291
+ # MODULE 5 - OUTBREAK
3292
+
3293
+ if go_outbreaks == True:
3294
+
3295
+ #-----------------------------------------------------------------------
3296
+ # Variables
3297
+ clusterComposition_1 = inputs_variables[0][5]
3298
+ clusterComposition_2 = inputs_variables[1][5]
3299
+
3300
+ if valid_combinations != []:
3301
+
3302
+ print_log(f"\tThreshold outbreaks was validated successfully.", log)
3303
+ values_outbreak = extract_integer_part(valid_combinations, log)
3304
+ print_log(f"\tAssessing the overlap of cluster composition.\n", log)
3305
+ df_stats_outbreak, path_stats_outbreak = creation_tsv_stats_outbreak(clusterComposition_1, clusterComposition_2, output, prefix_both, log)
3306
+
3307
+ #-----------------------------------------------------------------------
3308
+ if values_outbreak:
3309
+ calling_script_outbreak(stats_outbreak_script, path_stats_outbreak, output, prefix_both, values_outbreak, log)
3310
+ process_files = read_files_outbreak(output)
3311
+ fig_result, thresholds = creation_overlap_clusters(process_files, output, values_outbreak)
3312
+ print_log(f"\tPlotting the matrices with the cluster overlap for each comparison", log)
3313
+
3314
+ if not repeat_threshold_outbreak:
3315
+ html_content += image_outbreak(fig_result)
3316
+ html_content += summary_outbreak(prefix_both, thresholds)
3317
+ else:
3318
+ final_files = find_html_outbreak(output, prefix_both, log)
3319
+ path_temp = extration_section_original_file(output, final_files, log)
3320
+ html_content += transfer_info_to_html_content(path_temp, html_content, log)
3321
+ html_content += image_outbreak(fig_result)
3322
+ html_content += summary_outbreak(prefix_both, thresholds)
3323
+ else:
3324
+ print_log(f'\tImpossible outbreaks analysis.', log)
3325
+
3326
+ #--------------------------------------------------------------------------------------------------------------------------
3327
+ #4 - END HTML report
3328
+ html_content += references()
3329
+ html_content += javascript_function()
3330
+ html_content += create_html_footer()
3331
+ html_report = write_html(html_content, file_path_report, log)
3332
+
3333
+ # path=f'{output}/html_all_modules.txt'
3334
+ # with open(path, 'w') as f:
3335
+ # f.write(html_content)
3336
+
3337
+ #----------------------------------------------------------------------------------------------------------------------------
3338
+ # END INFORMATION
3339
+
3340
+ #print_log("\nEND Running EvalTree.py ...\n", log)
3341
+ print_log('Evaltree is done! If you found any issue please contact us.\n', log)
3342
+ end = datetime.datetime.now()
3343
+ elapsed = end - start
3344
+ print_log("\nEnd: " + str(end), log)
3345
+ print_log("Time elapsed: " + str(elapsed), log)
3346
+ log.close()
3347
+
3348
+ #--------------------------------------------------------------------------------------------------
3349
+ # Rename output folder if it was automatically created
3350
+ if rename == True:
3351
+
3352
+ rename_folder = os.path.join(os.path.dirname(output), prefix_both)
3353
+ os.rename(output, rename_folder)
3354
+
3355
+
3356
+ if __name__ == "__main__":  # entry guard: call main() only when this file is executed as a script, not when it is imported
3357
+ main()
3358
+
3359
+