evaltree 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evaltree/EvalTree.py +3359 -0
- evaltree/__init__.py +0 -0
- evaltree/scripts/ComparingPartitions/.git/HEAD +1 -0
- evaltree/scripts/ComparingPartitions/.git/config +12 -0
- evaltree/scripts/ComparingPartitions/.git/description +1 -0
- evaltree/scripts/ComparingPartitions/.git/hooks/applypatch-msg.sample +15 -0
- evaltree/scripts/ComparingPartitions/.git/hooks/commit-msg.sample +24 -0
- evaltree/scripts/ComparingPartitions/.git/hooks/fsmonitor-watchman.sample +174 -0
- evaltree/scripts/ComparingPartitions/.git/hooks/post-update.sample +8 -0
- evaltree/scripts/ComparingPartitions/.git/hooks/pre-applypatch.sample +14 -0
- evaltree/scripts/ComparingPartitions/.git/hooks/pre-commit.sample +49 -0
- evaltree/scripts/ComparingPartitions/.git/hooks/pre-merge-commit.sample +13 -0
- evaltree/scripts/ComparingPartitions/.git/hooks/pre-push.sample +53 -0
- evaltree/scripts/ComparingPartitions/.git/hooks/pre-rebase.sample +169 -0
- evaltree/scripts/ComparingPartitions/.git/hooks/pre-receive.sample +24 -0
- evaltree/scripts/ComparingPartitions/.git/hooks/prepare-commit-msg.sample +42 -0
- evaltree/scripts/ComparingPartitions/.git/hooks/push-to-checkout.sample +78 -0
- evaltree/scripts/ComparingPartitions/.git/hooks/sendemail-validate.sample +77 -0
- evaltree/scripts/ComparingPartitions/.git/hooks/update.sample +128 -0
- evaltree/scripts/ComparingPartitions/.git/index +0 -0
- evaltree/scripts/ComparingPartitions/.git/info/exclude +6 -0
- evaltree/scripts/ComparingPartitions/.git/logs/HEAD +1 -0
- evaltree/scripts/ComparingPartitions/.git/logs/refs/heads/master +1 -0
- evaltree/scripts/ComparingPartitions/.git/logs/refs/remotes/origin/HEAD +1 -0
- evaltree/scripts/ComparingPartitions/.git/objects/pack/pack-b4866a13dc4806c75e665495ee6671b98a695802.idx +0 -0
- evaltree/scripts/ComparingPartitions/.git/objects/pack/pack-b4866a13dc4806c75e665495ee6671b98a695802.pack +0 -0
- evaltree/scripts/ComparingPartitions/.git/objects/pack/pack-b4866a13dc4806c75e665495ee6671b98a695802.rev +0 -0
- evaltree/scripts/ComparingPartitions/.git/packed-refs +5 -0
- evaltree/scripts/ComparingPartitions/.git/refs/heads/master +1 -0
- evaltree/scripts/ComparingPartitions/.git/refs/remotes/origin/HEAD +1 -0
- evaltree/scripts/ComparingPartitions/CP_demodata.txt +326 -0
- evaltree/scripts/ComparingPartitions/LICENSE +674 -0
- evaltree/scripts/ComparingPartitions/README.md +43 -0
- evaltree/scripts/ComparingPartitions/comparing_partitions.py +201 -0
- evaltree/scripts/ComparingPartitions/comparing_partitions_v2.py +481 -0
- evaltree/scripts/ComparingPartitions/metrics.py +193 -0
- evaltree/scripts/WGS_cluster_congruence/.git/HEAD +1 -0
- evaltree/scripts/WGS_cluster_congruence/.git/config +12 -0
- evaltree/scripts/WGS_cluster_congruence/.git/description +1 -0
- evaltree/scripts/WGS_cluster_congruence/.git/hooks/applypatch-msg.sample +15 -0
- evaltree/scripts/WGS_cluster_congruence/.git/hooks/commit-msg.sample +24 -0
- evaltree/scripts/WGS_cluster_congruence/.git/hooks/fsmonitor-watchman.sample +174 -0
- evaltree/scripts/WGS_cluster_congruence/.git/hooks/post-update.sample +8 -0
- evaltree/scripts/WGS_cluster_congruence/.git/hooks/pre-applypatch.sample +14 -0
- evaltree/scripts/WGS_cluster_congruence/.git/hooks/pre-commit.sample +49 -0
- evaltree/scripts/WGS_cluster_congruence/.git/hooks/pre-merge-commit.sample +13 -0
- evaltree/scripts/WGS_cluster_congruence/.git/hooks/pre-push.sample +53 -0
- evaltree/scripts/WGS_cluster_congruence/.git/hooks/pre-rebase.sample +169 -0
- evaltree/scripts/WGS_cluster_congruence/.git/hooks/pre-receive.sample +24 -0
- evaltree/scripts/WGS_cluster_congruence/.git/hooks/prepare-commit-msg.sample +42 -0
- evaltree/scripts/WGS_cluster_congruence/.git/hooks/push-to-checkout.sample +78 -0
- evaltree/scripts/WGS_cluster_congruence/.git/hooks/sendemail-validate.sample +77 -0
- evaltree/scripts/WGS_cluster_congruence/.git/hooks/update.sample +128 -0
- evaltree/scripts/WGS_cluster_congruence/.git/index +0 -0
- evaltree/scripts/WGS_cluster_congruence/.git/info/exclude +6 -0
- evaltree/scripts/WGS_cluster_congruence/.git/logs/HEAD +1 -0
- evaltree/scripts/WGS_cluster_congruence/.git/logs/refs/heads/main +1 -0
- evaltree/scripts/WGS_cluster_congruence/.git/logs/refs/remotes/origin/HEAD +1 -0
- evaltree/scripts/WGS_cluster_congruence/.git/objects/pack/pack-e440654b80466e9e939f543a53a1c0c50c2fc934.idx +0 -0
- evaltree/scripts/WGS_cluster_congruence/.git/objects/pack/pack-e440654b80466e9e939f543a53a1c0c50c2fc934.pack +0 -0
- evaltree/scripts/WGS_cluster_congruence/.git/objects/pack/pack-e440654b80466e9e939f543a53a1c0c50c2fc934.rev +0 -0
- evaltree/scripts/WGS_cluster_congruence/.git/packed-refs +3 -0
- evaltree/scripts/WGS_cluster_congruence/.git/refs/heads/main +1 -0
- evaltree/scripts/WGS_cluster_congruence/.git/refs/remotes/origin/HEAD +1 -0
- evaltree/scripts/WGS_cluster_congruence/LICENSE +661 -0
- evaltree/scripts/WGS_cluster_congruence/README.md +34 -0
- evaltree/scripts/WGS_cluster_congruence/comparison_outbreak_level.py +246 -0
- evaltree/scripts/WGS_cluster_congruence/congruence_plots.py +244 -0
- evaltree/scripts/WGS_cluster_congruence/get_best_part_correspondence.py +128 -0
- evaltree/scripts/WGS_cluster_congruence/get_stats_threshold.py +143 -0
- evaltree/scripts/WGS_cluster_congruence/heatmap_final_score.py +137 -0
- evaltree/scripts/WGS_cluster_congruence/poli_typing.py +111 -0
- evaltree/scripts/WGS_cluster_congruence/remove_hifen.py +46 -0
- evaltree/scripts/WGS_cluster_congruence/stats_outbreak_analysis.py +204 -0
- evaltree/scripts/WGS_cluster_congruence/stats_outbreak_analysis_snp_dists.py +150 -0
- evaltree/scripts/WGS_cluster_congruence/wgmlst_exercise.py +96 -0
- evaltree-0.1.0.dist-info/LICENSE +674 -0
- evaltree-0.1.0.dist-info/METADATA +219 -0
- evaltree-0.1.0.dist-info/RECORD +81 -0
- evaltree-0.1.0.dist-info/WHEEL +4 -0
- evaltree-0.1.0.dist-info/entry_points.txt +3 -0
evaltree/EvalTree.py
ADDED
|
@@ -0,0 +1,3359 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
EvalTree: toolbox for comparative clustering evaluation of whole genome sequencing (WGS) pipelines for bacteria routine surveillance
|
|
5
|
+
By Joana Gomes Pereira
|
|
6
|
+
@INSA
|
|
7
|
+
|
|
8
|
+
"""
|
|
9
|
+
# Version string declared by the EvalTree script itself.
# NOTE(review): the distribution this file ships in is published as 0.1.0 - confirm which value is authoritative.
version = "1.0.0"
|
|
10
|
+
# Date of the last update of this script, written as YYYY-MM-DD.
last_updated = "2025-05-20"
|
|
11
|
+
|
|
12
|
+
import datetime
|
|
13
|
+
import argparse
|
|
14
|
+
import os
|
|
15
|
+
import sys
|
|
16
|
+
import time
|
|
17
|
+
import textwrap
|
|
18
|
+
import pandas as pd
|
|
19
|
+
import glob
|
|
20
|
+
import fnmatch
|
|
21
|
+
import plotly.express as px
|
|
22
|
+
import plotly.io as pio
|
|
23
|
+
import plotly.graph_objects as go
|
|
24
|
+
import re
|
|
25
|
+
import random
|
|
26
|
+
import numpy as np
|
|
27
|
+
import subprocess
|
|
28
|
+
from scipy import stats
|
|
29
|
+
import math
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_path_toolbox():
    """
    Locate this script (EvalTree.py) on disk.

    The location is resolved with os.path.realpath, so symbolic links are
    followed. The result is used as the anchor for building paths to files
    that live next to the script.

    Parameters
    ----------
    None

    Returns
    -------
    path_toolbox_script: str
        Resolved absolute path to the current script.
    directory_toolbox: str
        Directory that contains the current script.
    """
    script_location = os.path.realpath(__file__)
    # os.path.split returns (head, tail); the head is the containing directory.
    containing_directory, _ = os.path.split(script_location)

    return script_location, containing_directory
|
|
55
|
+
|
|
56
|
+
def get_path_other_scripts(directory_toolbox):
    """
    Build the paths of the helper scripts bundled under the toolbox directory.

    The helper scripts are expected in '<directory_toolbox>/scripts':

    - ComparingPartitions/comparing_partitions_v2.py (Mixão et al., 2024):
        Offers two analyses: *between_methods* (congruence score between the
        methods of two pipelines) and *stability* (cluster stability of a
        given method).

    - WGS_cluster_congruence/get_best_part_correspondence.py (Mixão et al., 2024):
        For each pairwise pipeline comparison, identifies the threshold that
        provides the most similar clustering results in the other pipeline
        (the best "correspondence point"), based on CS scores.

    - WGS_cluster_congruence/remove_hifen.py (Pereira et al., 2025):
        Removes the rows without correspondence points from the output of
        get_best_part_correspondence.py and renames that file.

    - WGS_cluster_congruence/stats_outbreak_analysis.py (Mixão et al., 2024):
        Determines the percentage of clusters identified by one pipeline at a
        given threshold that are also detected, with the exact same
        composition, by another pipeline at a similar or higher threshold.

    No check is made here that the scripts actually exist.

    Parameters
    ----------
    directory_toolbox : str
        Path to the directory of the main toolbox script.

    Returns
    -------
    comparing_partitions_script: str
        Path to comparing_partitions_v2.py.
    get_best_part_correspondence_script: str
        Path to get_best_part_correspondence.py.
    remove_hifen_script: str
        Path to remove_hifen.py.
    stats_outbreak_script: str
        Path to stats_outbreak_analysis.py.
    """
    scripts_root = os.path.join(directory_toolbox, 'scripts')
    partitions_folder = os.path.join(scripts_root, 'ComparingPartitions')
    congruence_folder = os.path.join(scripts_root, 'WGS_cluster_congruence')

    return (
        os.path.join(partitions_folder, 'comparing_partitions_v2.py'),
        os.path.join(congruence_folder, 'get_best_part_correspondence.py'),
        os.path.join(congruence_folder, 'remove_hifen.py'),
        os.path.join(congruence_folder, 'stats_outbreak_analysis.py'),
    )
|
|
109
|
+
|
|
110
|
+
def check_input_argument(input1, input2):
    """
    Sort the two input arguments (-i1 and -i2) into folders and `.tsv` files.

    Each provided input must exist. Directories are collected in one list and
    files ending in `.tsv` in another. A missing input (None) only triggers a
    warning, while a non-existent path or a file with another extension stops
    the program.

    Parameters
    ----------
    input1: str
        Relative path to the first input argument.
    input2: str
        Relative path to the second input argument.

    Returns
    -------
    folders: list
        List of relative paths to input folders.
    files: list
        List of relative paths to input files.
    """
    folders, files = [], []

    for elem in (input1, input2):
        if elem is None:
            print(f"\tWarning: Only one input file was provided. Inter-pipeline cluster congruence analysis will not be performed.\n")
            continue

        if not os.path.exists(elem):
            sys.exit(f'Error: The input {elem} was not found.')

        if os.path.isdir(elem):
            folders.append(elem)
            continue

        # Existing paths that are neither a directory nor a regular file are ignored.
        if not os.path.isfile(elem):
            continue

        if not elem.endswith('.tsv'):
            sys.exit(f'\tError: Only files with the *.tsv extension are allowed. The extension of {elem} is not allowed.')
        files.append(elem)

    return folders, files
|
|
156
|
+
|
|
157
|
+
def check_folder(input_path):
    """
    Look for the expected result files inside an input folder.

    The path must be a directory (e.g., a ReporTree folder). Files are
    recognised by their suffix, and everything before the suffix is the file
    prefix. All recognised files must share the same prefix.

    Parameter
    ---------
    input_path: str
        Relative path to the directory.

    Returns
    -------
    partitions : str or None
        Path to the partition matrix file ('*_partitions.tsv', excluding
        '*_w_partitions.tsv').
    partitions_summary : str or None
        Path to the partitions summary file.
    sample_interest : str or None
        Path to the samples of interest partitions summary file.
    clusterComposition : str or None
        Path to the cluster composition file.
    final_prefix : str
        Prefix shared by all recognised files (without the folder part).
    input_path : str
        The input folder, as received.
    stable_region : str or None
        Path to the stable regions file.

    Raises
    ------
    SystemExit
        If no recognised file is present, or if the recognised files do not
        share a single prefix.
    """
    #print(f'\n---------------------------------------------- Function: check_folder ----------------------------------------------\n')

    if input_path == []:
        sys.exit(f"\tError: The folder {input_path} is empty or does not exist.")

    # (result slot, suffix identifying the file, longer suffix that must NOT match).
    # The exclusions keep '*_w_partitions.tsv' out of the partition matrix slot and
    # the samples-of-interest summary out of the plain partitions summary slot.
    expected_files = (
        ('clusterComposition', '_clusterComposition.tsv', None),
        ('partitions', '_partitions.tsv', '_w_partitions.tsv'),
        ('partitions_summary', '_partitions_summary.tsv', '_SAMPLES_OF_INTEREST_partitions_summary.tsv'),
        ('sample_interest', '_SAMPLES_OF_INTEREST_partitions_summary.tsv', None),
        ('stable_region', '_stableRegions.tsv', None),
    )

    found = {slot: None for slot, _, _ in expected_files}
    file_prefix_map = {}

    for entry in os.listdir(input_path):
        file = os.path.join(input_path, entry)
        for slot, suffix, excluded_suffix in expected_files:
            if not fnmatch.fnmatch(file, '*' + suffix):
                continue
            if excluded_suffix is not None and fnmatch.fnmatch(file, '*' + excluded_suffix):
                continue
            found[slot] = file
            file_prefix_map[file] = file[:-len(suffix)]
            break

    # dict.fromkeys de-duplicates while keeping first-seen order, so messages are stable.
    unique_prefixes = list(dict.fromkeys(file_prefix_map.values()))
    list_prefixes = [os.path.basename(p) for p in unique_prefixes]

    if not unique_prefixes:
        # Previously this case crashed with an IndexError when reading the first prefix.
        sys.exit(f"\tError: None of the expected files (e.g., *_partitions.tsv) were found in the {input_path} folder.")

    if len(unique_prefixes) > 1:
        print(f"\nError: Multiple prefixes were found in the {input_path} folder: {' and '.join(list_prefixes)}. Please revise the structure of your input folder.")
        for prefix in unique_prefixes:
            for file, file_prefix in file_prefix_map.items():
                if file_prefix == prefix:
                    print(f" - {file}")
        # Exit with a non-zero status: a bare sys.exit() would report success.
        sys.exit(1)

    final_prefix = list_prefixes[0]

    partitions = found['partitions']
    partitions_summary = found['partitions_summary']
    sample_interest = found['sample_interest']
    clusterComposition = found['clusterComposition']
    stable_region = found['stable_region']

    print(f'\tFiles of {input_path}:')
    print(f'\t\tPrefix: {final_prefix}')
    print(f'\t\tPartition matrix: {partitions}')
    print(f'\t\tPartition summary: {partitions_summary}')
    print(f'\t\tSample of interest: {sample_interest}')
    print(f'\t\tCluster composition: {clusterComposition}')
    print(f'\t\tStable regions: {stable_region}')

    return partitions, partitions_summary, sample_interest, clusterComposition, final_prefix, input_path, stable_region
|
|
255
|
+
|
|
256
|
+
def check_output(output):
    """
    Resolve the output directory.

    When no output is given, the default folder 'pipeline1_vs_pipeline2' is
    created in the current working directory (if it does not exist yet).
    When an output is given, it must already be an existing directory;
    otherwise the program stops with an error message.

    Parameter
    ---------
    output: str or None
        Relative path to the directory where the results will be saved.

    Returns
    -------
    full_path_output: str
        Absolute path to the output directory.
    rename: bool
        True when the default folder name was used because no output was
        given, False otherwise.

    Raises
    ------
    SystemExit
        If `output` is given but is not an existing directory.
    """
    #print(f'\n---------------------------------------------- Function: check_output ----------------------------------------------\n')

    rename = False
    if output is None:
        output = 'pipeline1_vs_pipeline2'
        rename = True
        os.makedirs(output, exist_ok=True)

    elif not os.path.isdir(output):
        sys.exit(f'\tError: The specified {output} is not a valid directory.')

    full_path_output = os.path.abspath(output)

    return full_path_output, rename
|
|
285
|
+
|
|
286
|
+
def check_threshold(threshold):
    """
    Validate the threshold argument used to filter the partition matrix.

    The value is accepted when it is the literal "max" or when it has the
    form "X-Y", with X and Y made only of digits. Any other value stops the
    program with an error message.

    Parameter
    ---------
    threshold: str
        Range of thresholds to apply in the partition matrix file.

    Return
    ------
    threshold: str
        The validated threshold string, unchanged.
    """
    if threshold == 'max':
        return threshold

    bounds = threshold.split('-')
    well_formed = len(bounds) == 2 and bounds[0].isdigit() and bounds[1].isdigit()

    if not well_formed:
        sys.exit(f"\tError: The threshold argument (-t) must be in the format 'X-Y', where X and Y are positive integers.")

    return threshold
|
|
313
|
+
|
|
314
|
+
def check_score(score):
    """
    Convert the score to a float and make sure it lies between 0 and 3.

    The program stops with an error message when the value cannot be
    converted to a float or when it falls outside the inclusive range 0 to 3.

    Parameter
    ---------
    score: str
        Score value as string.

    Return
    ------
    score_value: float
        Validated score.
    """
    try:
        score_value = float(score)
    except ValueError:
        sys.exit(f"\tError: The score value {score} is not a float.")

    # Written as a negated chained comparison so that NaN is rejected as well.
    inside_range = 0 <= score_value <= 3
    if not inside_range:
        sys.exit(f"\tError: The score value {float(score)} is out of the allowed range (0 to 3).")

    return score_value
|
|
340
|
+
|
|
341
|
+
def check_file(file):
    """
    Check the structure of the input file (either a sequence-type matrix or a partition matrix).

    The file is read as a tab-separated table. A table with exactly two
    columns is treated as a sequence-type matrix, and a table with more than
    two columns as a partition matrix.

    Parameter
    ---------
    file: str
        Relative path to the input matrix file.

    Returns
    -------
    filename : str
        Absolute path to the input file.
    prefix : str
        File name up to the first occurrence of '.tsv'.
    path_directory : str
        Name of the directory where the file is located.
    file_type : bool
        False for sequence-type matrix; True for partition matrix.
    n_samples : int
        Number of data rows in the file.
    n_groups : int or None
        Number of distinct values in the second column (only for
        sequence-type matrix).

    Raises
    ------
    SystemExit
        If the table has fewer than two columns.
    """
    #print(f'\n---------------------------------------------- Function: check_file ----------------------------------------------\n')

    df = pd.read_table(file)
    n_samples, nr_columns = df.shape

    if nr_columns < 2:
        # Without this guard the function failed later with an UnboundLocalError.
        sys.exit(f"\tError: The input file {file} must contain at least two tab-separated columns.")

    filename = os.path.abspath(file)
    prefix = os.path.basename(file).split('.tsv')[0]
    # Parent folder name, taken with os.path so it does not depend on '/' being the separator.
    path_directory = os.path.basename(os.path.dirname(filename))

    if nr_columns == 2:
        n_groups = len(df.iloc[:, 1].unique())
        file_type = False
    else:
        n_groups = None
        file_type = True

    return filename, prefix, path_directory, file_type, n_samples, n_groups
|
|
388
|
+
|
|
389
|
+
def check_str_plots_threshold(plots_thresholds):
    """
    Split the plot thresholds string and validate the format of each item.

    The string is split on commas. Every item must look like 'METHOD-NxM.M':
    letters, a hyphen, digits, an 'x', and a decimal number. The first item
    that does not follow this format stops the program with an error message.

    Parameter
    ---------
    plots_thresholds: str
        One or more thresholds provided by the user, separated by commas and without spaces.

    Return
    ------
    values: list
        Valid list of plot thresholds in the format 'METHOD-NxM.M' (e.g., MST-4x1.0)
    """
    expected_format = re.compile(r'^[A-Za-z]+-\d+x\d+\.\d+$')

    values = plots_thresholds.split(",")
    for th in values:
        if expected_format.match(th) is None:
            sys.exit(f"\tError: The value '{th}' does not follow the expected format (e.g., METHOD-NxM.M). Multiple plots must be separated by commas and without spaces.")

    return values
|
|
417
|
+
|
|
418
|
+
def check_combinations_arguments(plots_summary_arg, data_folder, data_files):
    """
    Check the argument combinations provided by the user.

    The presence of each command-line flag is detected directly in
    ``sys.argv``. Incompatible or incomplete combinations are collected and
    reported together, after which the program stops.

    Parameters
    ----------
    plots_summary_arg: str
        Type of file to be used for clustering characterization. The values
        handled here are 'sample_of_interest' and 'partitions_summary'.

    data_folder: list
        One entry per input folder. Entries are read by position:
        [2] input path, [3] partitions summary file, [4] samples of interest
        file, [5] cluster composition file, [6] stable regions file.
        A file position holds None when the file was not found.

    data_files: list
        One entry per input file.

    Returns
    -------
    go_clustering: boolean
        Indicates whether clustering characterization can be executed.
    go_outbreaks: boolean
        Indicates whether outbreak analysis can be executed.

    Raises
    ------
    SystemExit
        If -pcn and -pcp are given together, or if any invalid combination
        was found.
    """
    #print(f'\n---------------------------------------------- Function: check_combinations_arguments ----------------------------------------------\n')

    type_file = plots_summary_arg

    args = sys.argv
    if '-pcn' in args and '-pcp' in args:
        sys.exit(f"\tError: It is not possible to provide the plots_category_number (-pcn) and plots_category_percentage (-pcp) at the same time.")

    errors = []
    go_clustering = False
    go_outbreaks = False

    if data_folder:
        for elem in data_folder:
            partitions_summary = elem[3]
            sample_of_interest = elem[4]
            cluster_composition = elem[5]
            input_path = elem[2]
            stable_regions = elem[6]

            if type_file == 'sample_of_interest':
                if '-cp' not in args:
                    errors.append(f'\tError: For clustering analysis you must specify the column plots (-cp) argument with SAMPLE_OF_INTEREST_partitions_summary file.\n')

                if '-n' in args:
                    errors.append(f'\tError: It is impossible to use -n argument with SAMPLE_OF_INTEREST_partitions_summary file.\n')

                if '-pt' not in args:
                    errors.append(f'\tError: For clustering analyis you must specify the plots threshold (-pt) argument with SAMPLE_OF_INTEREST_partitions_summary file.\n')

                if sample_of_interest is None:
                    errors.append(f'\tError: The file SAMPLE_OF_INTEREST_partitions_summary does not exist in {input_path}.\n')
                # Only observable when no error was recorded, since errors stop the program below.
                go_clustering = True

            if type_file == 'partitions_summary':

                if '-cp' in args and '-pt' in args and partitions_summary is not None:
                    go_clustering = True

                else:
                    if '-cp' in args and not '-pt' in args:
                        errors.append(f'\tError: For clustering analyis you must specify the plots threshold (-pt) argument.\n')

                    if '-pt' in args and not '-cp' in args:
                        errors.append(f'\tError: For clustering analysis you must specify the column plots (-cp) argument.\n')

                    if partitions_summary is None:
                        errors.append(f'\tError: The file partitions_summary does not exist in {input_path}.\n')

            if stable_regions is None:
                if '-n_stab' in args:
                    errors.append(f'\tError: It is impossible to use the -n_stab argument when the file stableRegions does not exist.\n')
                if '-n_thr' in args:
                    errors.append(f'\tError: It is impossible to use the -n_thr argument when the file stableRegions does not exist.\n')

            if '-to' in args:
                if cluster_composition is None:
                    errors.append(f'\tError: It is impossible to use the -to argument when the cluster_composition file is not in {input_path}.\n')

        # NOTE(review): the three checks below are assumed to belong to the
        # `if data_folder:` branch - confirm the intended nesting level.
        if '-to' in args and len(data_folder) == 1:
            errors.append(f'\tError: It is impossible to use the -to argument only with one folder.\n')

        if '-to' in args and len(data_files) == 1:
            errors.append(f'\tError: It is impossible to use the -to argument only with input files.\n')

        if len(data_folder) == 2:
            if '-to' in args:
                # Both folder entries must have the full 7-element layout
                # (the second length was previously only tested for truthiness).
                if len(data_folder[0]) == 7 and len(data_folder[1]) == 7:
                    if data_folder[0][5] is not None and data_folder[1][5] is not None:
                        go_outbreaks = True

    if data_files:
        if len(data_files) == 2:
            cluster_args = ['-cp', '-pt', '-n', '-ps', '-pcn', '-pcp']
            for elem in cluster_args:
                if elem in args:
                    errors.append(f'\tError: It is impossible to use the {elem} argument when input file(s) are provided.\n')

    if errors:
        # De-duplicate while keeping the order in which problems were found,
        # so the report is the same on every run.
        unique_errors = list(dict.fromkeys(errors))
        sys.exit(f"\nThe following problems were found:\n {' '.join(unique_errors)}")

    return go_clustering, go_outbreaks
|
|
526
|
+
|
|
527
|
+
def check_data_folders_file(data_folder, data_files):
    """
    Validates if the prefixes in data_folder and data_files are different.
    If valid, concatenates the prefixes into one string.

    Prefixes are collected from the folder entries first and from the file
    entries afterwards.

    Parameters
    ----------
    data_folder: list
        List of folder-related elements. Prefix in the second position.
    data_files: list
        List of file-related elements. Prefix in the second position.

    Returns
    -------
    data_folder : list
        The same input data_folder, unchanged.
    data_files : list
        The same input data_files, unchanged.
    prefix_both: str
        Concatenated prefix inputs (e.g., 'HC_vs_GT') when exactly two
        prefixes are present; otherwise the first prefix.

    Raises
    ------
    SystemExit
        If no prefix is available at all, or if the two inputs share the
        same prefix.
    """
    #print(f'\n---------------------------------------------- Function: check_data_folders_file ----------------------------------------------\n')

    check_prefixes = [elem[1] for elem in (data_folder or [])]
    check_prefixes.extend(elem[1] for elem in (data_files or []))

    if not check_prefixes:
        # Previously this case crashed with an IndexError when reading the first prefix.
        sys.exit("Error: No valid input was provided, so no prefix could be determined.")

    if len(check_prefixes) == 2:
        if check_prefixes[0] == check_prefixes[1]:
            sys.exit(f"Error: Impossible to analyse inputs with the same prefix {check_prefixes[0]} and {check_prefixes[1]}.")
        prefix_both = check_prefixes[0] + '_vs_' + check_prefixes[1]
    else:
        prefix_both = check_prefixes[0]

    return data_folder, data_files, prefix_both
|
|
570
|
+
|
|
571
|
+
def check_range_threshold(partition_matrix, threshold, log):
    """
    Split a 'start-end' threshold and compare it with the size of the partition matrix.

    For 'max' nothing is checked and both values are None. Otherwise the two
    bounds are converted to integers and compared with the range
    (0, number of columns of the partition matrix):

    - start greater than end stops the program;
    - start outside the column range stops the program;
    - end outside the column range only records a warning through print_log.

    Parameters
    ----------
    partition_matrix: str
        Relative path to the partition matrix file.

    threshold: str
        Range threshold in the format 'start-end' or 'max'.

    log:
        Log handle forwarded to print_log.

    Returns
    -------
    start_threshold : int or None
        The start of the threshold range. None if 'max' is used.

    end_threshold : int or None
        The end of the threshold range. None if 'max' is used.
    """
    if threshold == 'max':
        return None, None

    bounds = threshold.split('-')
    start_threshold = int(bounds[0])
    end_threshold = int(bounds[1])

    column_range = (0, len(pd.read_table(partition_matrix).columns))
    lowest, highest = column_range

    if start_threshold > end_threshold:
        sys.exit(f"\tError: Start threshold {start_threshold} is greater than end threshold {end_threshold}.")

    if start_threshold < lowest or start_threshold > highest:
        sys.exit(f"\tError: Start threshold {start_threshold} is outside the valid column range {column_range}.")

    if end_threshold < lowest or end_threshold > highest:
        print_log(f'\t\tWarning: The final threshold ({end_threshold}) is higher than the available number of columns.',log)

    return start_threshold, end_threshold
|
|
619
|
+
|
|
620
|
+
def management_main_scripts(comparing_partitions_script, get_best_part_correspondence_script, remove_hifen_script, input1, input2, prefix_both, output, score, log):
    """
    Run the three congruence scripts in sequence and return the final correspondence file.

    Parameters
    ----------
    comparing_partitions_script : str
        Path to comparing_partitions_v2.py.
    get_best_part_correspondence_script : str
        Path to get_best_part_correspondence.py.
    remove_hifen_script : str
        Path to remove_hifen.py.
    input1 : str
        Path to the first input file, passed as -i1.
    input2 : str
        Path to the second input file, passed as -i2.
    prefix_both: str
        Prefix used for the result files (e.g., XXX_vs_XXX).
    output : str
        Directory where the results are written.
    score : str
        Score passed to get_best_part_correspondence.py through -s.
    log :
        Log handle forwarded to print_log.

    Returns
    -------
    path_all_correspondence_lower : str
        Path to '{output}/{prefix_both}_All_correspondence.tsv'.

    Notes
    -----
    A non-zero exit code from any script is reported in the log as a warning.
    The program is terminated through sys.exit when the filtered
    correspondence file is missing after the last step.
    """

    def _run_step(cmd, script_name):
        # Log the command line, run it, and report a failing exit status.
        print_log(f'\t\t\t{" ".join(cmd)}', log)
        completed = subprocess.run(cmd)
        if completed.returncode != 0:
            print_log(f"\t\tWarning: {script_name} exited with code {completed.returncode}.", log)
        print_log(f"\t\tDone.\n", log)

    print_log(f'\tObtaining the cluster congruence score ...', log)

    # 1- comparing_partitions_v2.py with the user's inputs
    print_log(f"\t\tRunning comparing_partitions_v2.py in “between_methods” mode.", log)
    cmd = ["python", comparing_partitions_script, "-o1", "0", "-o2", "0", "-a", "between_methods",
           "-log", f"{output}/{prefix_both}_Comparing_partitions.log", "-t", f"{output}/{prefix_both}",
           "-i1", input1, "-i2", input2, "--keep-redundants"]
    _run_step(cmd, "comparing_partitions_v2.py")

    # 2- get_best_part_correspondence.py over the output directory
    print_log(f'\tIdentifying the inter-pipeline “corresponding points”', log)
    print_log(f'\t\tRunning get_best_part_correspondence.py with a score of {score}.', log)
    cmd = ["python", get_best_part_correspondence_script, "-i", output, "-s", str(score)]
    _run_step(cmd, "get_best_part_correspondence.py")

    # 3- remove_hifen.py filters ALL_CORRESPONDENCE.tsv into a prefixed copy
    print_log("\t\tFiltering output file with remove_hifen.py.", log)
    cmd = ["python", remove_hifen_script, "-i", f"{output}/ALL_CORRESPONDENCE.tsv", "-o", f"{output}/{prefix_both}_ALL_CORRESPONDENCE.tsv"]
    _run_step(cmd, "remove_hifen.py")

    original_file = f"{output}/{prefix_both}_ALL_CORRESPONDENCE.tsv"
    path_all_correspondence_lower = f"{output}/{prefix_both}_All_correspondence.tsv"
    if not os.path.isfile(original_file):
        # Without this check the failure shows up as a bare FileNotFoundError from os.rename.
        sys.exit(f"\tError: The expected file {original_file} was not generated. Please check the previous steps in the log.")

    os.rename(original_file, path_all_correspondence_lower)
    # The unprefixed intermediate file is no longer needed.
    path_all_correspondence = f"{output}/ALL_CORRESPONDENCE.tsv"
    os.remove(path_all_correspondence)

    return path_all_correspondence_lower
|
|
702
|
+
|
|
703
|
+
def tendency_slop(correspondence, pipeline1, pipeline2, output_folder): #correspondence=ALL_CORRESPONDENCE.tsv
    """
    Write the *_slope.tsv file with the linear regression of each comparison.

    Adapted from heatmap_final_score.py (Mixão et. al., 2024). For every
    comparison label in the first column that matches '<pipeline1>_vs_<pipeline2>'
    (in either order) and has a '<label>_rev...' counterpart, a linear
    regression of 'method2' on 'method1' is computed for both labels.

    Parameters
    ----------
    correspondence: str
        Path to the All_correspondence.tsv file (tab-separated, with
        'method1' and 'method2' columns).
    pipeline1: str
        Prefix of the first pipeline.
    pipeline2: str
        Prefix of the second pipeline.
    output_folder: str
        Directory where '<label>_slope.tsv' is written.

    Return
    ------
    comparison: str
        Always an empty string; the result of the function is the file written.
    """

    header = "#comp1\tcomp2\tslope\tintercept\tr_value\tp_value\tstd_err"
    accepted_names = [pipeline1 + "_vs_" + pipeline2, pipeline2 + "_vs_" + pipeline1]

    table = pd.read_table(correspondence)
    label_column = table.columns[0]
    labels = pd.unique(table[label_column])

    comparison = ""
    for forward in labels:
        # Only forward (non "_rev") labels of the requested pair open a file.
        if "_rev" in forward or forward not in accepted_names:
            continue

        slope_path = output_folder + "/" + forward + "_slope.tsv"
        with open(slope_path, "w+") as out:
            print(header, file = out)

            for backward in labels:
                if "_rev" not in backward or backward.split("_rev")[0] != forward:
                    continue

                pair_rows = table.loc[table[label_column].isin([forward, backward])]
                if len(pair_rows[pair_rows.columns[0]].values.tolist()) == 0:
                    print("No trend line will be provided as no congruence point was found!!")
                    continue

                # One regression per direction: forward first, then its "_rev" label.
                for first, second in ((forward, backward), (backward, forward)):
                    subset = pair_rows[pair_rows[pair_rows.columns[0]] == first]
                    if len(subset["method1"].values.tolist()) == 0:
                        continue
                    slope, intercept, r_value, p_value, std_err = stats.linregress(subset["method1"], subset["method2"])
                    fields = [first, second, str(slope), str(intercept), str(r_value), str(p_value), str(std_err)]
                    print("\t".join(fields), file = out)

    return comparison
|
|
756
|
+
|
|
757
|
+
def filter_partition_matrix(partition_matrix, prefix_single, start_threshold, end_threshold, output, log):
    """
    Write a copy of the partition matrix restricted to a range of threshold columns.

    The first column of the matrix is always kept, followed by the columns
    at positions start_threshold + 1 up to and including end_threshold + 1.

    Parameters
    ----------
    partition_matrix: str
        Path to the tab-separated partition matrix.
    prefix_single: str
        Prefix used in the name of the filtered partition matrix file.
    start_threshold: int
        First threshold of the range to keep.
    end_threshold: int
        Last threshold of the range to keep.
    output: str
        Directory where the filtered matrix is saved.
    log:
        Log handle forwarded to print_log.

    Return
    ------
    input_filtered: str
        Path to '{output}/{prefix_single}_partitions-filtered.tsv'.
    """

    matrix = pd.read_table(partition_matrix)
    header = matrix.columns

    # Position 0 is always kept, so threshold t sits at position t + 1;
    # the slice end is exclusive, hence end_threshold + 2.
    selected_columns = [header[0]] + list(header[start_threshold + 1 : end_threshold + 2])

    input_filtered = f"{output}/{prefix_single}_partitions-filtered.tsv"
    matrix[selected_columns].to_csv(input_filtered, sep = '\t', index = False)

    print_log(f'\tFiltering the partitions table for the range {start_threshold}-{end_threshold}...',log)

    return input_filtered
|
|
794
|
+
|
|
795
|
+
def stability_region(output, partition_matrix, prefix, comparing_partitions_script, n_stability, thr_stability, log):
    """
    Run comparing_partitions_v2.py in 'stability' mode for one partition matrix.

    The command is logged and then executed with its stdout/stderr captured.
    This function does not check whether the result file already exists or
    whether the command succeeded.

    Parameters
    ----------
    output: str
        Directory where the results are saved.
    partition_matrix: str
        Path to the partition matrix passed as -i1.
    prefix: str
        Prefix of the files written to the output directory.
    comparing_partitions_script: str
        Path to the comparing_partitions_v2.py script.
    n_stability:
        Value passed through -n (converted to str).
    thr_stability:
        Value passed through -thr (converted to str).
    log:
        Log handle forwarded to print_log.

    Return
    ------
    file_stability : str
        Expected path of the result file, '{output}/{prefix}_stableRegions.tsv'.
    """

    cmd = ["python", comparing_partitions_script]
    cmd += ["-i1", partition_matrix, "-o1", "0", "-a", "stability"]
    cmd += ["-n", str(n_stability), "-thr", str(thr_stability)]
    cmd += ["-log", f"{output}/{prefix}_Comparing_partitions.log"]
    cmd += ["-t", f"{output}/{prefix}", "--keep-redundants"]

    print_log(f'\t\t\t{" ".join(cmd)}', log)

    # Output of the child process is captured and discarded.
    subprocess.run(cmd, capture_output = True, text = True)

    return f'{output}/{prefix}_stableRegions.tsv'
|
|
828
|
+
|
|
829
|
+
def get_heatmap(output, i1_prefix, i2_prefix, threshold, log):

    """
    Generates a heatmap figure representing the congruence score between two genomic pipelines,
    based on the *_final_score.tsv file.

    Parameters
    ----------
    output: str
        Path to the directory where the *_final_score.tsv file is located and where the heatmap will be saved.
    i1_prefix : str
        Prefix of the first pipeline (y-axis).
    i2_prefix : str
        Prefix of the second pipeline (x-axis).
    threshold : str
        Threshold value used to filter the partition matrix ('max' if no filtering).
    log :
        Log handle; only referenced by the commented-out debug line below.

    Return
    ------
    fig_heatmap: class plotly.graph_objs._figure.Figure
        Plotly figure object of the heatmap. Also saved as PNG in the output folder
        as '{i1_prefix}_vs_{i2_prefix}_heatmap.png'.
    """
    #print_log(f'\n---------------------------------------------- Function: get_heatmap ----------------------------------------------', log)

    # Upper bound on the number of tick labels drawn on each axis.
    max_ticks = 16

    # First matching file is used.
    # NOTE(review): raises IndexError when no *_final_score.tsv exists in output.
    final_score = glob.glob(output +'/*_final_score.tsv' )[0]
    df = pd.read_csv(final_score, sep ='\t')
    # The first column is dropped from the plotted values; it is reused below as row labels.
    df_filtered = df.drop(df.columns[0], axis = 1)
    # Column headers are replaced by their positions 0..n-1.
    df_filtered.columns = range(len(df_filtered.columns))
    n_lines, n_column = df_filtered.shape

    #---------------------------------
    fig_heatmap = px.imshow(df_filtered,
                            labels = dict(x = f"Threshold <br> -{i2_prefix}-</br>", y = f"Threshold <br> -{i1_prefix}-</br>"))

    # Default tick positions: every row/column, subsampled with a fixed step
    # once an axis holds more than max_ticks entries.
    if n_lines > max_ticks:
        step = math.ceil(n_lines/ max_ticks)
        y_list = list(range(0, n_lines, step))
    else:
        y_list=list(range(0,n_lines))

    if n_column > max_ticks:
        step = math.ceil(n_column / max_ticks)
        x_list = list(range(0, n_column, step))
    else:
        x_list = list(range(0, n_column))

    fig_heatmap.update_layout(
        height = 500,
        width = 500,
        title_x = 0.5,
        xaxis = dict(scaleanchor = None, constrain='domain', tickvals = x_list),
        yaxis = dict(scaleanchor = None, constrain='domain', tickvals = y_list),
        coloraxis = dict(colorscale = 'Blues', cmin = 0, cmax = 3))

    #--------------------------------------------------------------------------
    # For both other matrices:
    # An axis with a single entry gets one explicit tick labelled 0.
    if n_column == 1:
        fig_heatmap.update_layout(xaxis = dict(tickmode = 'array', tickvals = [0], ticktext = [0]))
    if n_lines == 1:
        fig_heatmap.update_layout(yaxis = dict(tickmode = 'array', tickvals = [0], ticktext = [0]))

    #--------------------------------------------------------------------------
    #For the partitions filtered matrix:

    # With a filtered range the positional ticks are replaced by labels parsed
    # from the file itself.
    if threshold != 'max':

        #--------------------------------------------------------------------------
        # Partition matrix and sequence type

        if n_column == 1 and n_lines != 1:
            columns_y = df.iloc[:, 0]
            # Keeps the text between the first '-' and the following 'x' of each row label
            # (presumably labels shaped like 'MST-7x1.0' -> '7' — TODO confirm).
            string_columns_y = [s.split('-')[1].split('x')[0] for s in columns_y]
            len_y = len(columns_y)

            if len_y <= max_ticks:
                index_y = list(range(len_y))
                fig_heatmap.update_layout(xaxis = dict(tickmode ='array', tickvals = [0], ticktext = [0]))
                fig_heatmap.update_layout(yaxis = dict(tickmode ='array', tickvals = index_y, ticktext = string_columns_y))
            else:
                step_y = math.ceil(len_y / max_ticks)
                # NOTE(review): list_index_y always holds max_ticks positions, while
                # list_strings_y can be shorter; the trailing positions may exceed len_y.
                list_index_y = [i * step_y for i in range(max_ticks)]
                list_strings_y = string_columns_y [::step_y]

                fig_heatmap.update_layout(
                    xaxis = dict(tickvals = [0], ticktext = [0]),
                    yaxis = dict(tickvals = list_index_y, ticktext = list_strings_y))

        #--------------------------------------------------------------------------
        # Both Partition matrix

        if n_column != 1 and n_lines != 1:

            # Labels are parsed from the column headers (first header excluded).
            columns = df.columns.tolist()[1:]
            string_columns = [s.split('-')[1].split('x')[0] for s in columns]
            len_x = len(columns)

            # NOTE(review): the y axis reuses the positions and labels derived from the
            # columns; this matches the rows only when both axes cover the same thresholds.
            if len_x > max_ticks:

                step_x = math.ceil(len_x / max_ticks)
                # NOTE(review): same length mismatch as above (max_ticks positions vs. sliced labels).
                list_index = [i * step_x for i in range(max_ticks)]
                list_strings = string_columns[::step_x]

                fig_heatmap.update_layout(
                    xaxis = dict(tickvals = list_index, ticktext = list_strings),
                    yaxis = dict(tickvals = list_index, ticktext = list_strings))
            else:

                fig_heatmap.update_layout(
                    xaxis = dict(tickvals = list(range(len_x)), ticktext = string_columns),
                    yaxis = dict(tickvals = list(range(len_x)), ticktext = string_columns))

    fig_heatmap.update_layout(margin=dict(l=0, r=0, t=20, b=0))
    fig_heatmap.write_image(f'{output}/{i1_prefix}_vs_{i2_prefix}_heatmap.png', format = "png")

    return fig_heatmap
|
|
945
|
+
|
|
946
|
+
def get_tendency(output, prefix_both, log):
    """
    Create a scatter plot with an OLS trendline from *_All_correspondence.tsv.

    Labels of the first column containing '_rev' are rewritten as the reversed
    comparison (e.g. 'A_vs_B_rev' becomes 'B_vs_A') before plotting, so that
    each direction gets its own colour in the figure.

    Parameters
    ----------
    output: str
        Directory where the *_All_correspondence.tsv file is located and where the figure is saved.
    prefix_both: str
        Prefix of the result files generated from both pipelines (e.g., XXX_vs_XXX).
    log:
        Log handle; not used by this function.

    Return
    ------
    fig_tendency: plotly.graph_objs._figure.Figure
        Plotly figure object of the scatter plot. Also saved as
        '{output}/{prefix_both}_tendency.png'.
    nr_point_method_1: int
        Number of rows whose label does not contain '_rev'.
    nr_point_method_2: int
        Number of rows whose label contains '_rev'.
    """

    def _reversed_label(label):
        # 'A_vs_B_rev' -> 'B_vs_A'; also handles prefixes that contain underscores.
        if label.endswith('_rev'):
            base = label[:-len('_rev')]
            if base.count('_vs_') == 1:
                first, second = base.split('_vs_')
                return second + '_vs_' + first
        # Fallback for labels that do not follow '<p1>_vs_<p2>_rev'.
        pieces = label.split('_')
        return pieces[-2] + '_vs_' + pieces[0]

    # First matching file is used; IndexError is raised when none exists.
    all_correspondence = glob.glob(output + '/*_All_correspondence.tsv')[0]
    df = pd.read_csv(all_correspondence, sep = "\t")

    labels = df.iloc[:, 0]
    values_rev = [string for string in labels if '_rev' in string]

    nr_point = len(labels)
    nr_point_method_2 = len(values_rev)
    nr_point_method_1 = nr_point - nr_point_method_2

    # Start from the unmodified table so the figure can still be built when
    # there is no '_rev' row, and apply every distinct replacement cumulatively.
    df_modified = df
    for elem in dict.fromkeys(values_rev):
        df_modified = df_modified.replace(to_replace = elem, value = _reversed_label(elem))

    x_axes = df_modified.columns[1]
    y_axes = df_modified.columns[2]

    fig_tendency = px.scatter(df_modified, x = x_axes, y = y_axes, trendline = "ols", color_discrete_sequence = ["orange", "blue"], color = 'comparison')
    fig_tendency.update_layout(title_x = 0.5, legend=dict( orientation="h",yanchor="bottom", y=-0.35, xanchor="center", x=0.5), margin=dict(l=0, r=0, t=20, b=0))
    fig_tendency.write_image(f'{output}/{prefix_both}_tendency.png', format = "png")

    return fig_tendency, nr_point_method_1, nr_point_method_2
|
|
991
|
+
|
|
992
|
+
def join_inputs_variables(data_folder, data_files):
    """
    Join the input variables provided in the command line for the congruence analysis.

    Parameters
    ----------
    data_folder: list or None
        Folder-related inputs; the first item of each element is checked against None.
    data_files: list or None
        File-related inputs, with the same element layout as data_folder.

    Return
    ------
    inputs_variables : list
        Elements of data_folder followed by the elements of data_files.

    Notes
    -----
    The program is terminated through sys.exit when a single input has None as
    its first item, or when two inputs both have None as their first item.
    """

    inputs_variables = []
    for group in (data_folder, data_files):
        if group:
            inputs_variables.extend(group)

    if len(inputs_variables) == 1:
        if inputs_variables[0][0] is None:
            sys.exit(f"\tError: It is impossible to proceed the analysis without a partition_matrix file.")

    if len(inputs_variables) == 2:
        i1, i2 = inputs_variables[0][0], inputs_variables[1][0]

        if i1 is None and i2 is None:
            sys.exit("\tError: It is impossible to proceed the analysis")
        else:
            print(f'\nChecking the command line:')
            print(f'\tThe provided arguments are all compatible. Everything is ready to run EvalTree.py :)\n')

    return inputs_variables
|
|
1037
|
+
|
|
1038
|
+
def load_and_prepare_data(file, log):
    """
    Load a partition summary table, dropping a leading 'SAMPLE_OF_INTEREST' column.

    Compared with *_partitions_summary.tsv, a *_SAMPLE_OF_INTEREST_partitions_summary.tsv
    file has an extra first column named 'SAMPLE_OF_INTEREST'. It is removed
    when present so both file types yield the same layout.

    Parameters
    ----------
    file: str
        Path to the tab-separated file to load.
    log:
        Log handle; not used by this function.

    Returns
    -------
    df_data : pandas DataFrame
        The table without its first column when that column is named
        'SAMPLE_OF_INTEREST'; otherwise the table as loaded.
    """

    table = pd.read_table(file)
    has_sample_column = table.columns[0] == 'SAMPLE_OF_INTEREST'

    return table.iloc[:, 1:] if has_sample_column else table
|
|
1069
|
+
|
|
1070
|
+
def order_cluster_by_size(df_data, log):
    """
    Sort the dataframe by its 'cluster_length' column, largest values first.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Dataframe with the cluster data.
    log :
        Log handle; not used by this function.

    Returns
    -------
    df_filtered : pandas.DataFrame or None
        A copy sorted by 'cluster_length' in descending order, or None when
        the column does not exist.
    """

    if 'cluster_length' not in df_data.columns:
        return None

    return df_data.sort_values(by = 'cluster_length', ascending=False)
|
|
1095
|
+
|
|
1096
|
+
def check_plot_threshold(plots_thresholds, df_filtered, log):
    """
    Keep the requested plot thresholds that occur in the first column of the dataframe.

    Requested thresholds that are absent are reported through print_log and dropped.

    Parameters
    ----------
    plots_thresholds: list
        Thresholds requested by the user.
    df_filtered: pandas.DataFrame
        Dataframe whose first column holds the threshold names.
    log:
        Log handle forwarded to print_log.

    Return
    ------
    method: list
        The requested thresholds found in the dataframe, in the order requested.
    """

    available = df_filtered.iloc[:,0].unique().tolist()

    method = []
    for candidate in plots_thresholds:
        if candidate not in available:
            print_log(f"\tThe plot threshold {candidate} does not exist in the file.", log)
            continue
        method.append(candidate)

    return method
|
|
1124
|
+
|
|
1125
|
+
def check_threshold_in_file(method, df_filtered, clustering_file, log):
    """
    Keep the thresholds that occur in the first column of the dataframe.

    Thresholds that are absent are reported on standard output, naming the
    clustering file, and dropped.

    Parameters
    ----------
    method: list
        Thresholds to look for.
    df_filtered: pandas.DataFrame
        Dataframe whose first column holds the threshold names.
    clustering_file: str
        Path to the file the dataframe comes from; only used in the message.
    log:
        Log handle; not used by this function.

    Return
    ------
    filtered_threshold: list
        The thresholds found in the dataframe, in their original order.
    """

    known_thresholds = df_filtered.iloc[:,0].unique().tolist()

    for elem in method:
        if elem not in known_thresholds:
            print(f'\tThe plot threshold entered {elem} does not exist in the {clustering_file}...')

    return [elem for elem in method if elem in known_thresholds]
|
|
1159
|
+
|
|
1160
|
+
def filter_df_by_plot_threshold(filtered_threshold, df_filtered, n_cluster, log):
    """
    Select, for each threshold, at most n_cluster rows of the dataframe.

    Rows are matched on the first column and taken in their existing order.
    When a threshold has fewer rows than n_cluster, all of them are kept and
    the situation is reported through print_log.

    Parameters
    ----------
    filtered_threshold: list
        Thresholds to select.
    df_filtered: pandas.DataFrame
        Dataframe whose first column holds the threshold names.
    n_cluster: int
        Maximum number of rows (clusters) kept per threshold.
    log:
        Log handle forwarded to print_log.

    Return
    ------
    result_df: pandas.DataFrame or None
        Concatenation of the per-threshold selections, or None when
        filtered_threshold is empty.
    """

    partition_column = df_filtered.columns.tolist()[0]

    selections = []
    for threshold in filtered_threshold:
        subset = df_filtered[df_filtered[partition_column] == threshold]
        n_lines = len(subset)

        if n_cluster <= n_lines:
            selections.append(subset.head(n_cluster))
            continue

        # Fewer rows than requested: keep all of them and tell the user.
        print_log(f"\t\tThe entered n_cluster value ({n_cluster}) is higher than the number of lines in the threshold {threshold} dataframe ({n_lines}).", log)
        selections.append(subset.head(n_lines))
        print_log(f"\t\tIt will be produced plots according the number of lines available in the dataset for the {threshold}.", log)

    return pd.concat(selections) if selections else None
|
|
1208
|
+
|
|
1209
|
+
def filtering_df_threshold(filtered_threshold, df_filtered, log):
    """
    Filter the dataframe according to the provided threshold values.

    Parameters
    ----------
    filtered_threshold : list
        Thresholds to select.
    df_filtered: pandas.DataFrame
        DataFrame whose first column holds the threshold names.
    log:
        Log handle; not used by this function.

    Returns
    -------
    df_filtered_threshold: pandas.DataFrame
        Rows of df_filtered matching the thresholds, grouped in the order of
        filtered_threshold. An empty DataFrame with the same columns is
        returned when no threshold matches.
    """

    name_partition = df_filtered.columns.tolist()[0]
    # Looked up once instead of on every iteration.
    first_column_values = df_filtered.iloc[:,0].values

    results = [
        df_filtered[df_filtered[name_partition] == threshold]
        for threshold in filtered_threshold
        if threshold in first_column_values
    ]

    if not results:
        # pd.concat raises ValueError on an empty list.
        return df_filtered.iloc[0:0]

    return pd.concat(results)
|
|
1242
|
+
|
|
1243
|
+
def check_column_plots(user_columns_plots, result_df, log):
    """
    Keep the requested plot columns that exist in the dataframe.

    Names are split on commas and stripped of surrounding whitespace. Names
    that are not columns of the dataframe are reported through print_log
    and dropped.

    Parameters
    ----------
    user_columns_plots: str
        Comma-separated column name(s) selected by the user for plotting.
    result_df: pandas.DataFrame
        Dataframe containing the data to be plotted.
    log:
        Log handle forwarded to print_log.

    Return
    ------
    check_columns: list
        The requested column names found in the dataframe, in the order requested.
    """

    existing_columns = result_df.columns.tolist()
    check_columns = []

    for raw_name in user_columns_plots.split(','):
        col = raw_name.strip()
        if col not in existing_columns:
            print_log(f'\t\tInvalid column name for plot: {col}. It does not exist.', log)
            continue
        check_columns.append(col)

    return check_columns
|
|
1274
|
+
|
|
1275
|
+
def generate_pastel_color():
    """Return a random '#RRGGBB' colour for the cluster plots.

    Each channel is drawn (red, then green, then blue) from the inclusive
    range 100-200.
    """
    channels = [random.randint(100, 200) for _ in range(3)]
    return '#' + ''.join(f'{channel:02X}' for channel in channels)
|
|
1283
|
+
|
|
1284
|
+
def _split_category_cell(cell):
    """Parse 'label (x%), label (y%) (n = k)' into parallel label / percentage lists."""
    summary = cell.rsplit(" (n =", 1)[0]
    labels = []
    percentages = []
    # Anchor every item on its trailing "(<number>%)" so that labels which
    # themselves contain ", " or " (" are kept whole instead of breaking the split.
    for item in re.finditer(r'(.+?) \((\d+(?:\.\d+)?)%\)(?:, |$)', summary):
        labels.append(item.group(1))
        percentages.append(float(item.group(2)))
    return labels, percentages


def _collapse_minor_categories(labels, percentages, max_categories, min_percentage):
    """Fold the less relevant categories into a single 'Others' slice.

    min_percentage (when given) wins over max_categories; with neither option
    every category is kept unchanged.
    """
    if min_percentage is not None:
        minor_total = sum(value for value in percentages if value <= min_percentage)
        if minor_total == 0:
            return list(labels), list(percentages)
        # Select by value, not by position, so unsorted cells are grouped correctly.
        kept = [(label, value) for label, value in zip(labels, percentages) if value > min_percentage]
        out_labels = [label for label, _ in kept]
        out_values = [value for _, value in kept]
        out_labels.append('Others')
        out_values.append(minor_total)
        return out_labels, out_values

    if max_categories is not None:
        out_labels = list(labels[0:max_categories])
        out_values = list(percentages[0:max_categories])
        # Rounding removes float noise (e.g. 100 - 100.00000000000001) that
        # would otherwise create a spurious or negative 'Others' slice.
        remaining_percentage = round(100 - sum(out_values), 6)
        if remaining_percentage > 0:
            out_labels.append('Others')
            out_values.append(remaining_percentage)
        return out_labels, out_values

    return list(labels), list(percentages)


def check_structure_lines_column_plots(check_columns, result_df, plots_category_percentage, plots_category_number,output, prefix, plots_summary, category_colors, log):
    """
    Check that each cell of the selected columns has the expected structure
    and generate one pie chart (cluster characterization plot) per valid cell.

    Parameters
    ----------
    check_columns: list
        List of column names to be validated and plotted.
    result_df: pandas.DataFrame
        Dataframe with the selected information. The first column is used as
        the threshold, the second as the cluster name, and a 'cluster_length'
        column is read for every row ('samples_increase' too when
        plots_summary is 'sample_of_interest').
    plots_category_percentage: float or None
        Categories with a percentage lower than or equal to this value are
        aggregated into 'Others'. Takes precedence over plots_category_number.
    plots_category_number: int or None
        Maximum number of categories to show; the remainder up to 100% is
        shown as 'Others'.
    output: str
        Path to the output directory where the images will be saved.
    prefix: str
        Prefix to be added to the output file names.
    plots_summary: str
        Type of file selected by user.
    category_colors: dict
        Dictionary to store and reuse colors for each category. It is updated
        in place with the colors of newly seen categories.
    log:
        Log handle forwarded to print_log.

    Return
    ------
    results_list: list or None
        List of dictionaries containing:
        - A: the threshold
        - B: the column
        - C: the plotly figure object
        None is returned when at least one cell has an invalid structure.
    """
    pattern_line_column_plot = r'^(.+ \(\d+(\.\d+)?%\))(, .+ \(\d+(\.\d+)?%\))*( \(n = \d+\))$'
    results_list = []
    strings = []
    # Remember invalid cells instead of overwriting results_list with None,
    # which made the next valid row fail on results_list.append.
    invalid_line_found = False

    # The percentage threshold takes precedence over the number of categories.
    if plots_category_percentage is not None:
        plots_category_number = None

    for _, row in result_df.iterrows():

        for col in check_columns:

            mst = row.iloc[0]
            cluster = str(row.iloc[1])
            cluster_rename = cluster[:1].upper() + cluster[1:]
            n_cluster_length = row['cluster_length']
            cell = str(row[col])

            if not re.match(pattern_line_column_plot, cell):
                print_log(f'\tError: INVALID values present in the line with the {col} column at the {mst}: {row[col]}.\n', log)
                invalid_line_found = True
                continue

            if plots_summary == 'sample_of_interest':
                sample_increase = row['samples_increase']
            else:
                sample_increase = ''

            category, values = _split_category_cell(cell)
            list_category, list_values = _collapse_minor_categories(
                category, values, plots_category_number, plots_category_percentage)

            # Reuse the color already assigned to a category so that it looks
            # the same in every plot.
            colors = []
            for cat in list_category:
                if cat not in category_colors:
                    category_colors[cat] = generate_pastel_color()
                colors.append(category_colors[cat])

            df = pd.DataFrame({'Category': list_category, 'Percentage': list_values})
            fig = px.pie(df, values = 'Percentage', names = 'Category', title = f'{cluster_rename}')
            fig.update_traces(marker = dict(colors = colors))

            fig.update_layout(title_x = 0.5, annotations = [dict(
                x = 0.5,
                y = -0.2,
                text = f'Number of samples: {n_cluster_length}<br>{sample_increase}',showarrow=False)])

            fig.write_image(f'{output}/{prefix}_{mst}_{col}_{cluster_rename}.png', format="png")
            results_list.append({'A': mst, 'B': col, 'C': fig})
            strings.append(f"\t\tAnalyzing threshold {mst}, column {col}.")

    # dict.fromkeys removes duplicates while keeping the processing order.
    for elem in dict.fromkeys(strings):
        print_log(elem, log)

    print_log(f'\tSaving the cluster characterization plots.', log)
    return None if invalid_line_found else results_list
|
|
1440
|
+
|
|
1441
|
+
|
|
1442
|
+
def select_nomenclature_change(df_filtered_threshold, log):
    """
    Select clusters with increase behavior in the 'nomenclature_change' column.
    If the 'nomenclature_change' column exists, this function filters the DataFrame to retain
    only the clusters with specific increase or new related tags.

    Parameters
    ----------
    df_filtered_threshold: pd.DataFrame
        DataFrame containing filtered cluster data.
    log:
        Log handle forwarded to print_log.

    Return
    ------
    result_df: pd.DataFrame or None
        A new DataFrame containing only the rows where nomenclature_change indicates cluster increase,
        in their original row order, or None if the column is missing or no valid categories are found.
    """
    possibilities = ['kept (increase)','new','new (increase)', 'new (merge_increase)', 'new (split_increase)', 'new (split_merge_increase)']

    if 'nomenclature_change' not in df_filtered_threshold.columns:
        print_log(f'\tColumn nomenclature change not found in the selected file.', log)
        return None

    # A single isin() filter keeps the input row order. Concatenating one
    # sub-frame per value of a set made the row order depend on set iteration
    # order, which is not stable between runs for strings.
    keep = df_filtered_threshold['nomenclature_change'].isin(possibilities)
    result_df = df_filtered_threshold[keep].copy()

    if result_df.empty:
        print_log(f'\tNo information about the behavior of the “Cluster Nomenclature System” in some of the most common situations in a routine surveillance scenario.', log)
        return None

    return result_df
|
|
1483
|
+
|
|
1484
|
+
def get_nr_lines_threshold(partition_matrix, log):
    """
    Count the rows and threshold columns of a *_partitions.tsv file.

    Parameters
    ----------
    partition_matrix: str
        Path to the *_partitions.tsv file.
    log:
        Unused here; kept for a uniform signature.

    Returns
    -------
    nr_lines_df: int
        Number of data rows (samples) in the file.
    nr_columns_df: int
        Number of columns minus one, i.e. every column after the first is
        counted as a threshold/partition.
    """
    n_rows, n_cols = pd.read_table(partition_matrix).shape
    return n_rows, n_cols - 1
|
|
1508
|
+
|
|
1509
|
+
def get_file_partition_by_threshold (partition_matrix, prefix, output, log):
    """
    Write a table with the number of distinct clusters found in each
    threshold column of a partition matrix.

    Parameters
    ----------
    partition_matrix: str
        Path to the *_partitions.tsv file.
    prefix: str
        Value stored in the "pipeline" column and used in the output filename.
    output: str
        Path to the directory where the results will be saved.
    log:
        Unused here; kept for a uniform signature.

    Return
    ------
    file_partition_by_threshold: str
        Path to the newly generated file
        (<prefix>_clusters_partitions.tsv or <prefix>_clusters_partitions-filtered.tsv).
    """
    partitions = pd.read_table(partition_matrix)

    # The first column is skipped; each remaining column is one threshold and
    # is recorded by its 1-based position, not by its header name.
    cluster_counts = [len(pd.unique(partitions[name])) for name in partitions.columns[1:]]

    info_partitions = {
        "pipeline": [prefix] * len(cluster_counts),
        "threshold": list(range(1, len(cluster_counts) + 1)),
        "partitions": cluster_counts,
    }
    cluster_partition_matrix = pd.DataFrame(data = info_partitions, columns = ["pipeline", "threshold", "partitions"])

    # NOTE(review): a '-' anywhere in the path (directories included) selects
    # the "-filtered" name - confirm this matches how callers name their inputs.
    if '-' not in partition_matrix:
        file_partition_by_threshold = (f'{output}/{prefix}_clusters_partitions.tsv')
    else:
        file_partition_by_threshold = (f'{output}/{prefix}_clusters_partitions-filtered.tsv')

    cluster_partition_matrix.to_csv(file_partition_by_threshold, index = False, header = True, sep = "\t")

    return file_partition_by_threshold
|
|
1551
|
+
|
|
1552
|
+
def get_graph_partition_by_threshold(file_partition_by_threshold, prefix, prefix_both, yes_prefix_both, output, log):
    """
    Draw the number of partitions against the threshold as a line plot,
    one line per value of the "pipeline" column, and save it as PNG.

    Parameters
    ----------
    file_partition_by_threshold: str
        Path to a tab-separated file with "threshold", "partitions" and
        "pipeline" columns.
    prefix: str
        Output name prefix used when yes_prefix_both is False.
    prefix_both: str
        Output name prefix used when yes_prefix_both is True.
    yes_prefix_both: bool
        Selects which prefix names the image. Any value equal to neither
        True nor False writes no image.
    output: str
        Directory where the image is saved.
    log:
        Unused here; kept for a uniform signature.

    Returns
    -------
    fig_partition_vs_threshols: plotly.graph_objs._figure.Figure
        Plotly figure object showing the number of partitions by threshold.
    """
    table = pd.read_csv(file_partition_by_threshold, sep = '\t')

    axis_labels = {'partitions': 'Partitions', 'threshold': 'Threshold'}
    legend_style = dict (orientation="h",yanchor="bottom",y=-0.35,xanchor="center", x=0.5)
    plot_margin = dict(l=0, r=0, t=20, b=0)

    figure = px.line(table, x = "threshold", y = "partitions", color = "pipeline", labels = axis_labels)
    figure.update_layout(legend=legend_style, margin=plot_margin)

    # Explicit comparisons are kept on purpose: a non-boolean flag such as
    # None matches neither branch and nothing is written.
    if yes_prefix_both == False:
        figure.write_image(f'{output}/{prefix}_lineplot.png', format = "png")
    elif yes_prefix_both == True:
        figure.write_image(f'{output}/{prefix_both}_lineplot.png', format = "png")

    return figure
|
|
1588
|
+
|
|
1589
|
+
def concatenation_files(file1, file2, output, prefix_both):
    """
    Stack the rows of two tab-separated files (file1 first, then file2) and
    save the result as <prefix_both>_cluster_partitions.tsv.

    Parameters
    ----------
    file1 : str
        Path to the first TSV file.
    file2 : str
        Path to the second TSV file.
    output: str
        The directory where the combined file will be saved.
    prefix_both : str
        Prefix for naming the output file.

    Returns
    -------
    path: str
        Path to the saved concatenated file.
    """
    frames = [pd.read_csv(source, sep = '\t') for source in (file1, file2)]

    path = f'{output}/{prefix_both}_cluster_partitions.tsv'
    pd.concat(frames).to_csv(path, index = False, header = True, sep = "\t")

    return path
|
|
1623
|
+
|
|
1624
|
+
def organize_clusters(results_list):
    """
    Group figure objects first by threshold and then by category.

    Parameters
    ----------
    results_list : list
        A list of dictionaries, each containing:
        - "A": threshold/method string (e.g., 'MST-4x1.0')
        - "B": category string (e.g., 'country' or 'source')
        - "C": the object to store (a Plotly figure in the callers seen here)

    Returns
    -------
    method_groups : dict
        {threshold: {category: [figure, ...]}}; figures keep the order in
        which they appear in results_list.
    """
    method_groups = {}

    for entry in results_list:
        by_category = method_groups.setdefault(entry["A"], {})
        by_category.setdefault(entry["B"], []).append(entry["C"])

    return method_groups
|
|
1661
|
+
|
|
1662
|
+
def processing_block_names(file_stability, prefix, log):
    """
    Identifies the names of stability blocks in a *_stableRegions.tsv file and adds a prefix to each.

    Parameters
    -----------
    file_stability: str
        Path to the *_stableRegions.tsv file. Lines starting with '#' are
        ignored and the file is read without a header row.
    prefix: str
        Prefix to prepend to each block name.
    log:
        Unused here; kept for a uniform signature.

    Returns
    -------
    name_block : list or None
        List of '<prefix>_<block name>' strings built from the first column,
        or None if the file holds no data rows.
    """
    try:
        df = pd.read_csv(file_stability, sep = '\t', comment = "#", header = None)
    except pd.errors.EmptyDataError:
        # pandas raises instead of returning an empty frame when the file has
        # nothing left to parse (empty, or only '#' comment lines).
        return None

    if df.empty:
        return None

    # f-string formatting also works when the block names were parsed as numbers.
    return [f'{prefix}_{elem}' for elem in df[0]]
|
|
1693
|
+
|
|
1694
|
+
def processing_data(file, log):
    """
    Read the integer limits of each stability block from a *_stableRegions.tsv file.

    Parameters
    ----------
    file: str
        Path to the *_stableRegions.tsv file. Lines starting with '#' are
        ignored and the file is read without a header row.
    log:
        Unused here; kept for a uniform signature.

    Returns
    -------
    first_data: list of int
        One integer per row, taken from the second column: the text before
        'x' in the fourth '-'-separated field.
    final_data: list of int
        One integer per row, taken from the third column: the text before
        'x' in the second '-'-separated field.
    """
    df = pd.read_csv(file, sep="\t", comment='#', header=None)

    def _threshold_at(cell, position):
        # NOTE(review): the field positions (3 for the second column, 1 for
        # the third) mirror the layout the file is expected to have - confirm
        # against the tool that writes *_stableRegions.tsv.
        return int(cell.split('-')[position].split('x')[0])

    first_data = [_threshold_at(cell, 3) for cell in df[1]]
    final_data = [_threshold_at(cell, 1) for cell in df[2]]

    return first_data, final_data
|
|
1737
|
+
|
|
1738
|
+
def change_processing_data(final_df, i1_prefix, i2_prefix, output, log):
    """
    Convert the block limits to log2 scale, draw the stability regions as a
    horizontal bar chart and save it as <prefix>_StableRegions.png.

    Parameters
    ----------
    final_df: pd.DataFrame
        Dataframe with the start and end of each block per pipeline. The
        columns "Start", "Finish", "Block_id" and "Pipeline" are read.
    i1_prefix: str
        Prefix added to the result in pipeline i1
    i2_prefix: str or None
        Prefix added to the result in pipeline i2. When None only i1_prefix
        names the image, otherwise '<i1_prefix>_vs_<i2_prefix>' is used.
    output: str
        Directory where the image is saved.
    log:
        Unused here; kept for a uniform signature.

    Return
    ------
    fig_st: plotly.graph_objs._figure.Figure
        The bar chart that was written to disk.
    """
    df_final = final_df.rename(columns={"Finish": "temp"})

    # "temp" holds log2(end); "Finish" becomes the bar length on the log2 axis.
    df_final['temp'] = np.log2(df_final['temp'])
    df_final['Start'] = np.log2(df_final['Start'])
    df_final['Finish'] = df_final['temp'] - df_final['Start']
    df1_inverted = df_final.iloc[::-1]

    # The axis is in log2 units, so ticks are only needed up to log2 of the
    # largest end value (plus one spare tick for axis padding). Sizing the
    # tick list by the raw threshold instead builds 2**x strings for every x
    # up to that threshold, which is wasteful and exceeds Python's integer
    # to string digit limit for large thresholds.
    max_val = df1_inverted["temp"].max()
    last_tick = int(np.ceil(max_val)) + 1 if np.isfinite(max_val) else 0
    list_tickvals = list(range(1, last_tick + 1))
    list_ticktext = [str(2 ** x) for x in list_tickvals]

    fig_st = px.bar(df1_inverted,
                    x="Finish",
                    y="Block_id",
                    base='Start',
                    color="Pipeline",
                    orientation="h")

    fig_st.update_layout(
        xaxis_title="Threshold",
        yaxis_title='',
        xaxis=dict(
            tickvals=list_tickvals,
            ticktext=list_ticktext),
        yaxis=dict(showticklabels=False), legend=dict( orientation="h",yanchor="bottom",y=-0.35,xanchor="center", x=0.5), margin=dict(l=0, r=0, t=20, b=0))

    if i2_prefix is None:
        prefix = f'{i1_prefix}'
    else:
        prefix = f'{i1_prefix}_vs_{i2_prefix}'

    fig_st.write_image(f'{output}/{prefix}_StableRegions.png', format='png')

    return fig_st
|
|
1794
|
+
|
|
1795
|
+
#################################################################### OUTBREAKS ###############################################################
|
|
1796
|
+
|
|
1797
|
+
def validate_combinations_outbreak(threshold_outbreak):
    """
    Validate the user-supplied outbreak threshold combinations and tag each
    one with its comparison type.

    Combinations are separated by ';'. Each combination holds exactly two
    thresholds separated by ','. A second threshold starting with '<=' gives
    the comparison type "lower_equal", otherwise it is "equal". Both
    thresholds (the second one without its '<=') must look like
    letters-integerxfloat, e.g. 'MST-7x1.0'. The program exits with a message
    on the first malformed combination.

    Parameters
    ----------
    threshold_outbreak: str
        One or more outbreak threshold combinations provided by the user.

    Returns
    -------
    valid_combinations: list
        One [threshold_1, threshold_2, comparison_type] sublist per
        combination. threshold_2 is stored exactly as typed, so it keeps a
        leading '<=' when present.
    """
    regex = r'^[A-Za-z]+-\d+x\d+\.\d+$'
    valid_combinations = []

    for combo in threshold_outbreak.split(';'):
        parts = combo.split(',')

        if len(parts) != 2:
            sys.exit(f"The combination '{combo}' must have 2 elements separated by a comma (e.g., 'MST-7x1.0,MST-7x1.0'). Multiple combinations must be separated by ; . Please, do not use spaces.")

        pattern1 = parts[0]
        is_lower_equal = parts[1].startswith('<=')
        pattern2 = parts[1][2:] if is_lower_equal else parts[1]

        if re.match(regex, pattern1) is None:
            sys.exit(f"Error: Pattern '{pattern1}' (part 1) is not in the correct format (e.g., 'MST-7x1.0'). Please, do not use spaces.")

        if re.match(regex, pattern2) is None:
            sys.exit(f"Error: Pattern '{parts[1]}' (part 2) is not in the correct format (e.g., 'MST-7x1.0' or '<=MST-10x1.0'). Please, do not use spaces.")

        comparison = 'lower_equal' if is_lower_equal else 'equal'
        valid_combinations.append([parts[0], parts[1], comparison])

    return valid_combinations
|
|
1856
|
+
|
|
1857
|
+
def extract_integer_part(valid_combinations, log):
    """
    Pull the integer threshold out of each string-formatted threshold.

    Parameters
    ----------
    valid_combinations: list
        List of [threshold_1, threshold_2, type_comparison] sublists, with
        thresholds such as 'MST-7x1.0' (a leading '<=' on the name is harmless).
    log:
        Unused here; kept for a uniform signature.

    Return
    ------
    values_outbreak: list
        List of tuples [(integer, integer, type_comparison)], one per input
        combination, in the same order.
    """
    def _integer_of(threshold):
        # 'MST-7x1.0' -> '7x1.0' -> 7
        return int(threshold.split('-')[1].split('x')[0])

    return [(_integer_of(p1), _integer_of(p2), comp) for p1, p2, comp in valid_combinations]
|
|
1883
|
+
|
|
1884
|
+
def creation_tsv_stats_outbreak(clusterComposition_1, clusterComposition_2, output, prefix_both, log):
    """
    Write <prefix_both>_path_stats_outbreak.tsv, a header-less one-column file
    listing the two *clusterComposition.tsv paths (one per line).

    Parameters
    ----------
    clusterComposition_1: str
        Path to the *clusterComposition file of pipeline 1.
    clusterComposition_2: str
        Path to the *clusterComposition file of pipeline 2.
    output: str
        Path to the directory where the results will be saved.
    prefix_both: str
        Prefix added to the result files generated from both pipelines (e.g., XXX_vs_XXX).
    log:
        Unused here; kept for a uniform signature.

    Return
    ------
    df: pandas.DataFrame
        Two-row, one-column DataFrame containing the paths.
    path_stats_outbreak: str
        Path to the *_path_stats_outbreak.tsv file.
    """
    df = pd.DataFrame([[composition] for composition in (clusterComposition_1, clusterComposition_2)])

    path_stats_outbreak = f'{output}/{prefix_both}_path_stats_outbreak.tsv'
    df.to_csv(path_stats_outbreak, sep='\t', index=False, header=None)

    return df, path_stats_outbreak
|
|
1916
|
+
|
|
1917
|
+
|
|
1918
|
+
def calling_script_outbreak(stats_outbreak_script, path_stats_outbreak, output, prefix_both, values_outbreak, log):
    """
    Run the outbreak script once per threshold combination.

    Parameters:
    -----------
    stats_outbreak_script: str
        Path to the stats_outbreak_analysis.py script.
    path_stats_outbreak: str
        Path to the *_path_stats_outbreak.tsv file.
    output: str
        Path to the directory where the results will be saved.
    prefix_both: str
        Prefix added to the result files generated from both pipelines (e.g., XXX_vs_XXX)
    values_outbreak: list
        List of tuples, i.e. [(threshold1, threshold2, type_comparison)]
    log:
        Log handle forwarded to print_log.

    Return:
    ------
    None. The script is called with the output prefix
    '<output>/<prefix_both>_stats_outbreak'. A non-zero exit status is
    reported in the log and the remaining combinations are still run.
    """
    if not values_outbreak:
        print_log(f'\tImpossible to call the stats_outbreak_analysis.py.', log)
        return

    # Use the interpreter that is running this program, so the script sees the
    # same environment and the call does not depend on a "python" executable
    # being present on PATH.
    interpreter = sys.executable or "python"

    for th1, th2, type_comparison in values_outbreak:

        cmd = [interpreter, stats_outbreak_script, "-i", path_stats_outbreak, "-t1", str(th1), "-t2", str(th2),
               "-o", f"{output}/{prefix_both}_stats_outbreak", "-c", type_comparison]

        # Log before running so the command is recorded even if the call hangs or fails.
        print_log(f"\tRunning stats_outbreak_analysis.py for {th1} {type_comparison} {th2}", log)
        print_log(f'\t\t{" ".join(cmd)}', log)

        completed = subprocess.run(cmd, capture_output=True, text=True)

        # The output is captured, so a failure would otherwise go unnoticed.
        if completed.returncode != 0:
            print_log(f"\tWarning: stats_outbreak_analysis.py exited with code {completed.returncode} for {th1} {type_comparison} {th2}.", log)
            if completed.stderr:
                print_log(f"\t\t{completed.stderr.strip()}", log)

    print_log(f"\tDone!", log)
|
|
1960
|
+
|
|
1961
|
+
def read_files_outbreak(output):
    """
    List the files in the output directory whose name ends with '_pct.tsv'
    (the *stats_outbreak_pairwise_comparison_*_pct.tsv files are expected there).

    Parameters:
    ----------
    output: str
        Path to the directory that is scanned (not recursively).

    Return:
    ------
    process_files: list
        Resolved real paths of the matching files, in directory listing order.
    """
    return [
        os.path.realpath(f'{output}/{entry}')
        for entry in os.listdir(output)
        if entry.endswith('_pct.tsv')
    ]
|
|
1988
|
+
|
|
1989
|
+
def creation_overlap_clusters(process_files, output, values_oubreak):
    """
    Produce the heatmaps with the overlap of genetic clusters for each outbreak threshold comparison.

    For every (threshold 1, threshold 2, comparison type) tuple, each path in
    process_files containing '_{thr1}_{type}_{thr2}_' is read, its first column
    is dropped, the remaining values are multiplied by 100 and plotted as a
    heatmap. Every figure is also written to '<output>/<table name>.png'.

    Parameters:
    -----------
    process_files: list
        List with the full path of *stats_outbreak_pairwise_comparison_*_pct.tsv.

    output: str
        Path to the directory where the results will be saved.

    values_oubreak: list
        List of tuples [(integer, integer, type_comparison)]

    Return:
    ------
    fig_result: list
        List of images (plotly.graph_objs._figure.Figure).

    thresholds: list
        List of tuples (thr1, thr2, type_comparison), one per figure and in the same order.
    """
    fig_result = []
    thresholds = []

    for thr1, thr2, type_com in values_oubreak:
        tag = f'_{thr1}_{type_com}_{thr2}_'

        for file in process_files:
            if tag not in file:
                continue

            df = pd.read_table(file)
            df_filtered = df.drop(df.columns[0], axis=1)

            if df_filtered.shape[0] <= 2:
                # Small tables are padded with an empty, unnamed column (kept from the original code).
                df_filtered[''] = ''

            # Axis labels are always taken from the current table. Previously they were only
            # assigned inside conditionals, so some table shapes raised NameError or silently
            # reused the labels of the previously processed file.
            labels = [f'{column}' for column in df_filtered.columns[:2]]

            df_percentage = df_filtered * 100
            base = os.path.splitext(os.path.basename(file))[0]

            string1 = f'at {thr1} threshold'
            if type_com == 'equal':
                string2 = f'at {thr2} threshold'
            else:
                string2 = f'at up {thr2} threshold'
            thresholds.append((thr1, thr2, type_com))

            # White below 50 %, then a yellow-to-blue gradient up to 100 %.
            colors = [[0, 'white'], [0.5, 'white'], [0.5, '#FDFD96'], [1, '#89B6E3']]

            fig = go.Figure(data=go.Heatmap(
                z=df_percentage.values,
                x=labels,
                y=labels,
                text=df_percentage.values,
                texttemplate="%{text:.2f}%",
                textfont=dict(size=11, color="black"),
                colorscale=colors,
                colorbar=dict(title="Overlap"),
                zmin=0, zmax=100
            ))

            fig.update_layout(
                xaxis_title=f"Cluster detected {string2}",
                yaxis_title=f"Cluster detected {string1}",
                plot_bgcolor='white',
                paper_bgcolor='white', margin=dict(l=0, r=0, t=20, b=0))

            fig.write_image(f'{output}/{base}.png', format="png")
            fig_result.append(fig)

    return fig_result, thresholds
|
|
2074
|
+
|
|
2075
|
+
def get_plot_columns(file):
    """
    Get the names of the available columns for cluster plots in a given summary file
    (*_partition_summary or *_sample_of_interest), to perform the cluster characterization.
    The names are printed and also returned.

    Parameters
    ----------
    file: str
        Path to the *_partitions_summary.tsv or *_SAMPLE_OF_INTEREST_partitions_summary.tsv file.

    Return
    ------
    attachement_list: list
        Columns present in the file that are not part of the default memory_columns.
        If none are found, the program exits with an error message (SystemExit).
    """
    memory_columns = ['partition', 'cluster', 'nomenclature_change', 'n_increase', 'cluster_length', 'samples', 'samples_increase','SAMPLE_OF_INTEREST']

    df = pd.read_csv(file, sep="\t")
    # First component of the path as given (empty for an absolute POSIX path).
    name_folder = file.split('/')[0]
    print(f"\nAvailable columns for {name_folder}:")

    attachement_list = [elem for elem in df.columns.tolist() if elem not in memory_columns]

    if not attachement_list:
        sys.exit(f'Error: No additional columns found in {file}.')

    for elem in attachement_list:
        print(f'\t- {elem}')

    # The docstring always promised this list; it used to be dropped (the function returned None).
    return attachement_list
|
|
2110
|
+
|
|
2111
|
+
def find_html_outbreak(output, prefix_both, log):
    """
    Make sure both HTML reports exist in the output directory: the report of the
    reanalysis of the threshold outbreak (-rto argument) and the initial report.

    Parameters
    ---------
    output: str
        Path to the directory where the results will be saved.
    prefix_both: str
        Prefix added to the result files generated from both pipelines (e.g., XXX_vs_XXX)
    log:
        Not used by this function.

    Return
    ------
    final_files: <class 'list'>
        [second-run report name, initial report name], both relative to output.
        The program exits with an error message when either file is missing.
    """
    present = os.listdir(output)
    final_files = [f"{prefix_both}_2ºRUN_report.html", f"{prefix_both}_report.html"]

    # The second-run report is checked first, then the initial one.
    for report in final_files:
        if report not in present:
            sys.exit(f"Error: {report} not found!")

    return final_files
|
|
2144
|
+
|
|
2145
|
+
def extration_section_original_file(output, final_files, log):
    """
    Copy part of the original HTML report (final_files[1]) into '<output>/exit.txt'
    so that it can later be merged into another report.

    The copy starts at line 73 of the report and stops right before the first line
    containing the 'Outbreak' accordion button (or at the end of the file).

    Parameters:
    ----------
    output: str
        Path to the directory where the results will be saved.
    final_files: list
        List with relative paths to the HTML reports.
    log:
        Log handle passed on to print_log.

    Return:
    ------
    path_temp: str
        Path to the *.txt file with the extracted section.
    """
    first_line = 73
    stop_marker = '<button class="accordion">Outbreak</button>'

    with open(f'{output}/{final_files[1]}', "r") as report:
        tail = report.readlines()[first_line - 1:]

    kept = []
    for row in tail:
        if stop_marker in row:
            break
        kept.append(row)

    path_temp = f'{output}/exit.txt'
    with open(path_temp, "w") as target:
        target.writelines(kept)

    print_log(f"Content saved in: {path_temp}", log)
    return path_temp
|
|
2184
|
+
|
|
2185
|
+
def transfer_info_to_html_content(path_temp, html_content, log):
    """
    Read the section previously extracted from the initial report (temporary .txt file)
    and return it.

    The value passed as html_content is not merged into the result: the function
    returns only the text read from path_temp. The temporary file is left on disk.

    Parameters
    ----------
    path_temp: str
        Path to the *.txt file containing clustering and congruence information.

    html_content: str
        Initial content of the new HTML report (e.g., header section). Not used to build the result.

    log:
        Log handle passed on to print_log.

    Return:
    ------
    str
        Content of path_temp. When the file does not exist the problem is logged and an
        empty string is returned (this situation used to end in an UnboundLocalError).
    """
    if not os.path.exists(path_temp):
        print_log('There was a problem with the creation of the file containing information about clustering and congruence for the second HTML report.', log)
        return ''

    with open(path_temp, 'r') as input_file:
        return input_file.read()
|
|
2216
|
+
|
|
2217
|
+
################################################################ MODULE 1 ################################################################################
|
|
2218
|
+
|
|
2219
|
+
def reading_sequence_type(sequence_type_file, output, prefix_st, log):
    """
    Read the sequence type matrix and draw a bar chart with the number of rows
    assigned to each distinct value of its second column.

    Parameters
    ----------
    sequence_type_file: str
        Full path to the sequence type matrix.

    output: str
        Path to the directory where the results will be saved.

    prefix_st: str
        The prefix that will be added to the file.

    log:
        Not used by this function.

    Returns
    -------
    fig: plotly.graph_objs._figure.Figure
        Bar chart, also written to '<output>/<prefix_st>_pipeline_clusters.png'.
    """
    table = pd.read_table(sequence_type_file)
    assignments = table[table.columns[1]]
    values = assignments.tolist()

    labels = []
    sizes = []
    # Distinct values are kept in order of first appearance.
    for value in assignments:
        if value in labels:
            continue
        labels.append(value)
        sizes.append(values.count(value))

    counts = pd.DataFrame({"Cluster": labels, "Count": sizes})
    fig = px.bar(
        counts,
        x="Cluster",
        y="Count",
        title=f"Most represented STs in the {prefix_st} pipeline",
        labels={"Cluster": "Cluster name", "Count": "Number of samples"},
    )
    fig.update_layout(title_x=0.5)
    fig.write_image(f'{output}/{prefix_st}_pipeline_clusters.png', format='png')

    return fig
|
|
2261
|
+
|
|
2262
|
+
|
|
2263
|
+
###########################################################################################################################################################
|
|
2264
|
+
########################################################################## HTML ###########################################################################
|
|
2265
|
+
###########################################################################################################################################################
|
|
2266
|
+
|
|
2267
|
+
def create_html(log, file_path_report):
    """
    Build the opening part of the HTML report: doctype, <head> (page title, Plotly
    script tag and the CSS used by the accordions and image grids) and the <header>
    of the <body>. The <body> and <html> elements are left open.

    Parameters
    ----------
    log: <class '_io.TextIOWrapper'>
        Not used by this function.

    file_path_report: str
        Full path to the HTML file with all analysis results. Not used by this function.

    Return
    ------
    html_content: str
        Opening part of the HTML document. The number of lines of this template
        should not be changed lightly: another function of this file copies a
        report starting from a fixed line number.
    """
    # The base name of the report path used to be computed here and never used;
    # that dead statement was removed.
    title = "Report EvalTree"

    html_content = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{title}</title>
<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
section {{ margin: 20px 0; }}


/* ------------------------- CLUSTERING images------------------------------------------- */
.image-row {{
display: flex;
flex-wrap: wrap;
justify-content: space-around;
margin-top: 20px;
}}

.image-item {{
flex: 1 1 calc(25% - 20px);
box-sizing: border-box;
margin: 10px;
max-width: calc(25% - 20px);
}}

/* --------------------------- START Accordion ------------------------------------------- */
.accordion {{
background-color: #eee;
color: #444;
cursor: pointer;
padding: 18px;
width: 100%;
border: none;
text-align: left;
outline: none;
font-size: 20px;
transition: 0.4s;
font-weight: bold;
}}
.active, .accordion:hover {{background-color: #ccc;}}

.panel {{ padding: 0 10px;
display: none;
background-color: white;
overflow: hidden;}}

/* --------------------------- END Accordion ------------------------------------------- */

.image-heatmap {{
display: flex;
justify-content: space-around;
margin-top: 20px;
flex-wrap: wrap;
}}

.compact {{margin: 2px 0; line-height: 1.2;}}
</style>
</head>
<body>
<header>
<h1>EvalTree Report</h1>
<p> Toolbox for comparative clustering evaluation of whole genome sequencing pipelines for bacteria routine surveillance</p>
</header>"""

    return html_content
|
|
2357
|
+
|
|
2358
|
+
def body_html(start, command_line, version):
    """
    Build the 'Overview' accordion of the report.

    Parameters
    ----------
    start:
        Value shown after 'Report generated on:'.
    command_line:
        Value shown after 'Entered command line:'.
    version:
        Value shown after 'Version:'.

    Return
    ------
    html_content: str
        Accordion button followed by a complete (closed) panel.
    """
    rows = [
        '<button class="accordion">Overview</button>',
        '<div class="panel">',
        f'<p>Report generated on: {start}</p>',
        f'<p>Entered command line: {command_line}</p>',
        f'<p>Version: {version}</p>',
        '</div>',
        '',
    ]
    return "\n".join(rows)
|
|
2368
|
+
|
|
2369
|
+
def get_sequence_type(prefix_st, samples_st, groups_st, sequence_type_file):
    """
    Open the 'Pipeline characterization' accordion of a sequence-type input.

    The panel <div> is left open; sequence_type_image emits the matching closing tag.

    Parameters
    ----------
    prefix_st:
        Pipeline name shown in the accordion title.
    samples_st:
        Value shown as number of samples.
    groups_st:
        Value shown as number of groups.
    sequence_type_file:
        Value shown as name of the file.

    Return
    ------
    html_content: str
    """
    rows = [
        '',
        f'<button class="accordion">Pipeline characterization: {prefix_st}</button>',
        '<div class="panel">',
        f'<p>Number of samples: {samples_st} </p>',
        f'<p>Number of groups:{groups_st} </p>',
        f'<p> Name of file: {sequence_type_file}</p>',
        '',
    ]
    return "\n".join(rows)
|
|
2379
|
+
|
|
2380
|
+
def sequence_type_image(fig_html):
    """
    Wrap an already rendered figure in a <div> and close one additional, previously
    opened <div> (presumably the panel opened by get_sequence_type - confirm against the caller).

    Parameters
    ----------
    fig_html:
        HTML fragment of the figure.

    Return
    ------
    html_content: str
    """
    return f'\n<div>{fig_html}</div></div>\n'
|
|
2386
|
+
|
|
2387
|
+
def get_partitions_threshold(prefix_single, nr_lines_df, nr_columns_df, fig_partition_vs_threshols):
    """
    Open the 'Pipeline characterization' accordion of a pipeline, with a short summary
    and the line plot of the number of partitions per threshold.

    The margins of the figure are updated in place before it is converted to HTML.
    The panel <div> is left open.

    Parameters
    ----------
    prefix_single:
        Pipeline name shown in the titles.
    nr_lines_df:
        Value shown as number of samples.
    nr_columns_df:
        Value shown as number of thresholds.
    fig_partition_vs_threshols:
        Figure object exposing update_layout, convertible with plotly.io.to_html.

    Return
    ------
    html_content: str
    """
    fig_partition_vs_threshols.update_layout(margin={'l': 0, 'r': 0, 't': 20, 'b': 0})
    embedded_plot = pio.to_html(fig_partition_vs_threshols, include_plotlyjs='cdn', full_html=False)

    rows = (
        f'<button class="accordion">Pipeline characterization: {prefix_single}</button>',
        '<div class="panel">',
        f'<h3> Summary: {prefix_single} </h3>',
        f'<p> Number of samples: {nr_lines_df} </p>',
        f'<p> Number of thresholds: {nr_columns_df} </p>',
        '<h3> Number of partitions per threshold </h3>',
        f'<div> {embedded_plot} </div>',
        '<p> This line plot shows the number of partitions (groups) at each threshold. </p>',
        '',
    )
    return "\n".join(rows)
|
|
2402
|
+
|
|
2403
|
+
def get_clusters(mst_groups, prefix):
    """
    Display pie charts by threshold and category, generating CLUSTER HTML content for each pipeline.

    Parameters
    ----------
    mst_groups: dict
        Maps each threshold to a dict {category: list of figures}.
    prefix: str
        Prefix the name of the file that is being processed.

    Return
    -----
    html_content: str
        One outer accordion with a nested accordion per threshold; inside each, one
        row of figures per category.
    """
    # NOTE(review): the loop nesting of the closing </div> statements was reconstructed
    # from the HTML balance (image row -> threshold panel -> outer panel -> one extra
    # close, as close_painel also emits) - confirm against the original indentation.
    parts = [
        f'<button class="accordion"> ReporTree clustering visualization: pipeline {prefix} </button>\n',
        '<div class="panel">\n',
    ]

    for mst, categories in mst_groups.items():
        parts.append(f'<button class="accordion">Threshold: {mst}</button>\n')
        parts.append('<div class="panel">\n')

        for category, images in categories.items():
            parts.append(f"<h4>Category: {category}</h4>\n")
            parts.append('<div class="image-row">\n')

            for image in images:
                fig_html = pio.to_html(image, include_plotlyjs='cdn', full_html=False)
                parts.append(f'<div class="image-item">{fig_html}</div>\n')

            parts.append("</div>\n")  # image row
        parts.append('</div>\n')  # threshold panel
    parts.append('</div>\n')  # outer panel
    parts.append('</div>\n')  # presumably the enclosing panel opened by an earlier section

    return ''.join(parts)
|
|
2446
|
+
|
|
2447
|
+
def summary_congruence():
    """
    Open the 'Inter-pipeline cluster congruence' accordion with its introductory paragraph.

    The panel <div> is left open.

    Return
    ------
    html_content: str
    """
    return """
<button class="accordion">Inter-pipeline cluster congruence</button>
<div class="panel" >
<p > This section evaluates the clustering congruence between two WGS-based pipelines by comparing their cluster compositon at all possible threshold levels.
The goal is to assess how similarly the pipelines group the isolates, by measuring the consistency of cluster assignments at each threshold.
This helps determine the level of agreement between the pipelines and identify the most comparable thresholds.
More detailed information is available on the
<a href="https://github.com/insapathogenomics/CENTAUR/tree/main/EvalTree" target="_blank" rel="noopener noreferrer">
EvalTree GitHub
</a>.
</p>

"""
|
|
2463
|
+
|
|
2464
|
+
def summary_partition_threshold(fig_html_partition_threshold, prefix_both):
    """
    Build the 'Number of partitions per threshold' block of the congruence section.

    Parameters
    ----------
    fig_html_partition_threshold:
        HTML fragment of the line plot.
    prefix_both: str
        Prefix of the files generated from both pipelines; used to name the
        *_cluster_partitions.tsv file mentioned in the text.

    Return
    ------
    html_content: str
    """
    rows = [
        '',
        '<h3> Number of partitions per threshold </h3>',
        f'<div> {fig_html_partition_threshold} </div>',
        '<p class="compact"> The line plot shows the number of partitions at each threshold.</p>',
        f'<p class="compact"> Detailed information is available in the <code> {prefix_both}_cluster_partitions.tsv</code> file.</p>',
        '',
    ]
    return "\n".join(rows)
|
|
2473
|
+
|
|
2474
|
+
def congruence_stability(fig_html_st, prefix, prefix_2, n_stability, thr_stability):
    """
    Build the 'Blocks of stability regions' block of the congruence section.

    Parameters
    ----------
    fig_html_st:
        HTML fragment of the stability figure.
    prefix: str
        Prefix of the first pipeline, used to name its *_metrics.tsv and *_StableRegions.tsv files.
    prefix_2: str or None
        Prefix of the second pipeline; when None only the files of the first pipeline are listed.
    n_stability:
        Value shown as the number of consecutive threshold pairs defining a stable region.
    thr_stability:
        Value shown as the nAWC threshold defining a stable region.

    Return
    ------
    html_content: str
    """
    html_content = f"""
<h3> Blocks of stability regions </h3>
<div>{fig_html_st}</div>
<p class="compact"> For each pipeline, clustering stability regions are defined as a range of thresholds e.g., {n_stability} with a nAWC of e.g., {thr_stability} which cluster composition remains stable/consistent. </p>
<p class="compact"> To better distinguish each region (represented by separated rectangle blocks), the blocks are vertically offset, starting on a different line. </p>
<p class="compact"> Distance thresholds (x axis) are presented in log2 scale. </p>
<p class="compact"> Detailed information is available in the following files: </p>
"""

    html_content += f"- <code>{prefix}_metrics.tsv"
    if prefix_2 is not None:
        html_content += f" and {prefix_2}_metrics.tsv"
    # The stability criterion is taken from the arguments; the sentence used to state
    # "five" and "0.99" regardless of the values actually passed in.
    html_content += (
        ": summarizes all comparisons between consecutive pairs of thresholds (“n + 1” → “n”). "
        f"A region is stable when at least {n_stability} consecutive pairs of thresholds "
        f"yield an nAWC greater than {thr_stability}.</code>"
    )

    html_content += f"<br>- <code>{prefix}_StableRegions.tsv"
    if prefix_2 is not None:
        html_content += f" and {prefix_2}_StableRegions.tsv"
    html_content += ": lists the block names, their respective threshold range, and the length of each block.</code>"

    return html_content
|
|
2497
|
+
|
|
2498
|
+
def congruence_heatmap(fig_html_heatmap, prefix_both):
    """
    Build the 'Congruence score' block of the congruence section.

    Parameters
    ----------
    fig_html_heatmap:
        HTML fragment of the heatmap.
    prefix_both: str
        Prefix of the files generated from both pipelines (e.g., XXX_vs_XXX). The
        pipeline names shown in the text are the parts before and after '_vs_'.

    Return
    ------
    html_content: str
    """
    # Split on '_vs_' (as congruence_tendency does) so that pipeline names containing
    # underscores are shown in full; fall back to the former '_' split when the
    # prefix has no '_vs_' separator.
    separator = '_vs_' if '_vs_' in prefix_both else '_'
    pieces = prefix_both.split(separator)
    first = pieces[0]
    second = pieces[-1]

    html_content = f"""
<h3> Congruence score </h3>
<div class='image-heatmap'>{fig_html_heatmap} </div>
<p class="compact"> The heatmap shows a pairwise comparison of clustering results from two pipelines, {first} and {second}, at all possible distance thresholds. </p>
<p class="compact"> The congruence score (CS) is a metric ranging from 0 (no congruence between methods) to 3 (absolute congruence).</p>
<p class="compact"> Detailed information is available in the <code> {prefix_both}_final_score.tsv </code> file.</p>
"""
    return html_content
|
|
2512
|
+
|
|
2513
|
+
def congruence_tendency(fig_tendency_html, score_value, prefix_both, nr_point_method_1, nr_point_method_2):
    """
    Build the 'Corresponding points' block and close one previously opened <div>
    (presumably the congruence panel - confirm against the caller).

    Parameters
    ----------
    fig_tendency_html:
        HTML fragment of the tendency plot.
    score_value:
        Congruence score cut-off shown in the text.
    prefix_both: str
        Prefix of the files generated from both pipelines; the pipeline names are the
        first and last parts obtained by splitting it on '_vs_'.
    nr_point_method_1:
        Number of points shown for the blue series.
    nr_point_method_2:
        Number of points shown for the orange series.

    Return
    ------
    html_content: str
    """
    names = prefix_both.split('_vs_')
    pipeline1, pipeline2 = names[0], names[-1]

    rows = [
        '',
        '<h3> Corresponding points </h3>',
        f'<div> {fig_tendency_html} </div>',
        f'<p class="compact"> This graph shows the corresponding points between the two pipelines in both directions above (CS >= {score_value}). </p>',
        '<p class="compact"> When comparing a set of samples between two pipelines, the probability of two sample clustering together in one method/pipeline in a given threshold',
        'may not to be the same in the other method/pipeline. Therefore:</p>',
        f'<p class="compact"> - First, the threshold in the {pipeline1} pipeline (method 1) that produces clustering results most similar to those in the {pipeline2} pipeline (method 2) is identified. </p>',
        f'<p class="compact"> - Then, the threshold in the {pipeline2} pipeline (method 1) that produces clustering results most similar to those in the {pipeline1} pipeline (method 2) is identified.</p>',
        '<p class="compact"> Both methods produce similar clustering results when the tendency line has a slope near 1. </p>',
        f'<p class="compact">A linear tendency line supported by {nr_point_method_1} (blue) and {nr_point_method_2} (orange) points is presented. </p>',
        f'<p class="compact"> Detailed information is available in the <code> {prefix_both}_All_correspondence.tsv </code> file. </p>',
        '<p style="margin-bottom: 8px;"></p>',
        '</div>',
        '',
    ]
    return "\n".join(rows)
|
|
2533
|
+
|
|
2534
|
+
def congruence_st(fig_html_heatmap, prefix_both):
    """
    Build the complete 'Congruence' accordion used for sequence-type inputs.

    Parameters
    ----------
    fig_html_heatmap:
        HTML fragment of the heatmap.
    prefix_both:
        Text shown in the caption below the heatmap.

    Return
    ------
    html_content: str
        Accordion button followed by a complete (closed) panel.
    """
    rows = [
        '',
        '<button class="accordion">Congruence</button>',
        '<div class="panel">',
        '<p> This section makes it possible to evaluate the congruence of the two genomic pipelines. </p>',
        '<h3> Congruence score </h3>',
        f'<div class="image">{fig_html_heatmap} </div>',
        f'<p> Sequence type {prefix_both} pipelines</p>',
        '</div>',
        '',
    ]
    return "\n".join(rows)
|
|
2546
|
+
|
|
2547
|
+
def html_tradicional_typing(n_samples, n_groups, prefix):
    """
    Build the complete 'Sequence type' accordion with the number of samples and groups.

    Parameters
    ----------
    n_samples:
        Value shown as number of samples.
    n_groups:
        Value shown as number of groups.
    prefix:
        Name shown in the accordion title.

    Return
    ------
    html_content: str
        Accordion button followed by a complete (closed) panel.
    """
    rows = [
        '',
        f'<button class="accordion">Sequence type {prefix}</button>',
        '<div class="panel">',
        f'<p>Number of samples: {n_samples}</p>',
        f'<p>Number of groups: {n_groups}</p>',
        '</div>',
        '',
    ]
    return "\n".join(rows)
|
|
2557
|
+
|
|
2558
|
+
def image_outbreak(fig_result):
    """
    Open the 'Outbreak' accordion and lay out the overlap heatmaps in one row.

    The image row is closed; the panel <div> is left open.

    Parameters
    ----------
    fig_result: list
        Figures convertible with plotly.io.to_html.

    Return
    ------
    html_content: str
    """
    # NOTE(review): the closing </div> is placed after the loop (it closes the image
    # row once); reconstructed from the HTML balance - confirm against the original indentation.
    parts = [
        '<button class="accordion">Outbreak</button>',
        '<div class="panel">',
        '<div class="image-row">\n',
    ]

    for fig in fig_result:
        fig_html = pio.to_html(fig, full_html=False, include_plotlyjs='cdn')
        parts.append(f'<div class="image-item">{fig_html}</div>')

    parts.append('</div>')
    return ''.join(parts)
|
|
2573
|
+
|
|
2574
|
+
def summary_outbreak(prefix_both, thresholds):
    """
    Build the explanatory text of the outbreak section and close one previously
    opened <div> (presumably the panel opened by image_outbreak - confirm against the caller).

    Parameters
    ----------
    prefix_both: str
        Prefix of the files generated from both pipelines.
    thresholds: list
        Tuples (threshold 1, threshold 2, comparison type); two file references
        are written for each of them.

    Return
    ------
    html_content: str
    """
    # NOTE(review): the closing </div> is emitted once, after the loop; reconstructed
    # from the HTML balance - confirm against the original indentation.
    parts = [
        '\n<p class="compact">Determines the percentage of clusters identified in a pipeline at a given threshold that could be detected with the same composition by another pipeline at a similar or even higher threshold.</p>\n'
    ]

    for string1, string2, type_com in thresholds:
        for table in ('summary', 'pairwise_comparison'):
            parts.append(
                f'<p class="compact"> Detailed information is available in the <code> {prefix_both}_stats_outbreak_{table}_{string1}_{type_com}_{string2} file.</code></p>'
            )

    parts.append('</div> ')
    return ''.join(parts)
|
|
2586
|
+
|
|
2587
|
+
def references():
    """
    Build the references and acknowledgement footer of the report.

    Return
    ------
    html_content: str
    """
    rows = (
        '',
        '<p style="font-size: 10pt;"> <strong> References:</strong> </p>',
        '<p style="font-size: 8pt;"><a href="https://doi.org/10.1038/s41467-025-59246-8" target="_blank">Mixão V et al. (2025). Multi-country and intersectoral assessment of cluster congruence between pipelines for genomics surveillance of foodborne pathogens. <em>Nature Communications</em>, 16, Article 3961.</a></p>',
        '<p style="font-size: 8pt;"> EvalTree relies on the work of other developers. So you must also cite: </p>',
        '<p style="font-size: 8pt;"> -<a href="https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-023-01196-1"> Mixão V et al. (2023). ReporTree: a surveillance-oriented tool to strengthen the linkage between pathogen genetic clusters and epidemiological data.</a></p>',
        '<p style="font-size: 8pt;"> -<a href="https://journals.asm.org/doi/10.1128/jcm.02536-05?permanently=true"> Carriço J et al. (2006). Illustration of a Common Framework for Relating Multiple Typing Methods by Application to Macrolide-Resistant Streptococcus pyogenes.</a></p>',
        '<br></br>',
        '<p style="text-align: center; max-width: 1000px; margin: 0 auto;">',
        '<em>EvalTree.py</em> is a tool developed in the frame of the <strong>CENTAUR project</strong> (supported by the European ISIDORe initiative) at the',
        'Genomics and Bioinformatics Unit of the Department of Infectious Diseases in the National Institute of Health Dr. Ricardo Jorge (INSA, Portugal).',
        '</p>',
        '',
        '',
    )
    return "\n".join(rows)
|
|
2604
|
+
def javascript_function():
    """
    Build the <script> element that makes the accordions interactive: a click on an
    element of class 'accordion' toggles the display of the element that follows it.

    Return
    ------
    html_content: str
    """
    return """
<script>
var acc = document.getElementsByClassName("accordion");
var i;

for (i = 0; i < acc.length; i++) {
acc[i].addEventListener("click", function() {
var panel = this.nextElementSibling;
if (panel.style.display === "block") {
panel.style.display = "none";
} else {
panel.style.display = "block";
}
});
}
</script>
"""
|
|
2625
|
+
|
|
2626
|
+
def write_html(html_content, file_path_report, log):
    """
    Write the HTML report to disk and log where it was saved.

    Parameters
    ----------
    html_content: str
        Code of HTML report.

    file_path_report: str
        Path of the report HTML file.

    log:
        Log handle passed on to print_log.

    Return
    -----
    None
    """
    # The page declares <meta charset="utf-8"> and contains non-ASCII text, so the file
    # is written explicitly as UTF-8 instead of the locale-dependent default encoding.
    with open(file_path_report, "w", encoding="utf-8") as file:
        file.write(html_content)
    print_log(f"\nReport successfully generated in:\n {file_path_report}.\n", log)
|
|
2648
|
+
|
|
2649
|
+
def create_html_footer():
    """
    Return the closing tags of the HTML document (</body> and </html>).
    """
    closing_tags = ['', '</body>', '</html>', '']
    return "\n".join(closing_tags)
|
|
2654
|
+
|
|
2655
|
+
def close_painel(prefix, message=None):
    """
    Build a 'Clusters' accordion that only carries an optional error message.

    Two closing </div> tags are emitted: one for the panel opened here and one more
    (presumably for an enclosing panel opened by an earlier section - confirm against the caller).

    Parameters
    ----------
    prefix:
        Name shown in the accordion title.
    message: optional
        When truthy, shown inside the panel as 'Error: <message>'.

    Return
    ------
    html_content: str
    """
    pieces = [
        '\n',
        f'<button class="accordion">Clusters {prefix}</button>\n',
        '<div class="panel">\n',
    ]
    if message:
        pieces.append(f'<p>Error: {message}</p>\n')
    pieces.extend(["</div>\n"] * 2)

    return ''.join(pieces)
|
|
2668
|
+
|
|
2669
|
+
|
|
2670
|
+
def print_log(message, log):
    """ Print a message in the terminal and in the log file. """
    # file=None makes print fall back to the standard output.
    for stream in (None, log):
        print(message, file=stream)
|
|
2675
|
+
|
|
2676
|
+
#####################################################################################################################################
|
|
2677
|
+
###################################################################***###############################################################
|
|
2678
|
+
###############################################################***EvalTree***##########################################################
|
|
2679
|
+
###################################################################***###############################################################
|
|
2680
|
+
#####################################################################################################################################
|
|
2681
|
+
|
|
2682
|
+
def main():
|
|
2683
|
+
"""
|
|
2684
|
+
This function is instrumental in the tool.
|
|
2685
|
+
It manages the flow of the program, determining which functions to call and in what order.
|
|
2686
|
+
|
|
2687
|
+
Parameters
|
|
2688
|
+
---------
|
|
2689
|
+
Without parameters, it will pass all arguments entered by the user.
|
|
2690
|
+
|
|
2691
|
+
Returns
|
|
2692
|
+
---------
|
|
2693
|
+
None
|
|
2694
|
+
|
|
2695
|
+
"""
|
|
2696
|
+
#-------------------------------------------------------------------------------------------------------------------------------------
|
|
2697
|
+
# Configures the parser for command line arguments
|
|
2698
|
+
|
|
2699
|
+
parser = argparse.ArgumentParser(description="Running EvalTree")
|
|
2700
|
+
parser = argparse.ArgumentParser(prog="EvalTree.py",
|
|
2701
|
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
2702
|
+
description=textwrap.dedent("""
|
|
2703
|
+
EvalTree.py
|
|
2704
|
+
|
|
2705
|
+
EvalTree was designed for comparing two genomic pipeline inputs (e.g., cg/wgMLST, traditional sequence-type matrix), with three main functionalities:
|
|
2706
|
+
|
|
2707
|
+
- Evaluates the congruence between the pipelines.
|
|
2708
|
+
- Characterizes genetic clusters.
|
|
2709
|
+
- Detects closely related outbreak clusters at given thresholds.
|
|
2710
|
+
|
|
2711
|
+
The EvalTree toolbox accepts two types of inputs: folders and files.
|
|
2712
|
+
- Folders must be derived from ReporTree outputs (highly recommended). Pipelines of cg/wgMLST should contain clustering data (clusters and/or singletons) for
|
|
2713
|
+
all possible thresholds in a partition.tsv file.
|
|
2714
|
+
- Files can be partition files or other types of files with classifications (e.g., sequence-type, serotypes).
|
|
2715
|
+
It will return an interactive HTML report based on the selected arguments.
|
|
2716
|
+
|
|
2717
|
+
The following arguments are used for specific analyses:
|
|
2718
|
+
|
|
2719
|
+
- plot_summary, plots_threshold, column_plot, n_cluster, plots_category_number, plots_category_percentage: These are used exclusively to characterize genetic clusters from
|
|
2720
|
+
ReporTree output files (e.g., *_partitions_summary.tsv).
|
|
2721
|
+
- score and threshold: These are used in the cg/wgMLST pipeline congruence analysis.
|
|
2722
|
+
- threshold_outbreak, repeat_threshold_outbreak: These are used in the outbreak analysis, utilizing the cluster_composition.tsv file produced by ReporTree."""))
|
|
2723
|
+
|
|
2724
|
+
# Mandatory arguments
|
|
2725
|
+
parser.add_argument("-i1", "--input1",
|
|
2726
|
+
action = "store",
|
|
2727
|
+
required = True,
|
|
2728
|
+
help = '[MANDATORY] Specifies the first input type (folder or file), requiring the full path. \
|
|
2729
|
+
The folder must contain the partition matrix file with clustering data, and is highly recommended to be a Reportree output folder.\
|
|
2730
|
+
Alternatively, the file can be a traditional sequence-type matrix or a partition matrix.\
|
|
2731
|
+
Using either of these input types enables the analysis.')
|
|
2732
|
+
|
|
2733
|
+
parser.add_argument("-i2", "--input2",
|
|
2734
|
+
action = "store",
|
|
2735
|
+
required = False,
|
|
2736
|
+
help = '[OPTIONAL] Specifies the second input type (folder or file), requiring the full path. \
|
|
2737
|
+
The folder must contain the partition matrix file with clustering data, and is highly recommended to be a Reportree output folder. \
|
|
2738
|
+
Alternatively, the file can be a traditional sequence-type matrix or a partition matrix. \
|
|
2739
|
+
Using either of these input types enables the analysis.')
|
|
2740
|
+
|
|
2741
|
+
parser.add_argument("-o", "--output",
|
|
2742
|
+
action = "store",
|
|
2743
|
+
help = '[MANDATORY] Specifies the output directory for storing all analysis results. \
|
|
2744
|
+
If no folder is provided, the program will automatically create one based on the prefix of the files.')
|
|
2745
|
+
|
|
2746
|
+
# Optional arguments
|
|
2747
|
+
parser.add_argument('-s', '--score',
|
|
2748
|
+
dest = 'score',
|
|
2749
|
+
default = '2.85',
|
|
2750
|
+
help = '[OPTIONAL] Define a minimum score to consider two partitions (one from each pipeline) as corresponding. The score accepts values between 0 and 3.\
|
|
2751
|
+
Partition - It refer to the number of identical clusters that exist at the same threshold.')
|
|
2752
|
+
|
|
2753
|
+
parser.add_argument('-t', '--threshold',
|
|
2754
|
+
dest = 'threshold',
|
|
2755
|
+
default = 'max',
|
|
2756
|
+
help = '[OPTIONAL] Defines an integer range to select or filter threshold columns from the partition matrix file. \
|
|
2757
|
+
A filtered partition matrix, containing only the selected columns, will be created and used for subsequent analysis. \
|
|
2758
|
+
Ranges are specified using a hyphen to separate the minimum and maximum values (e.g., 10-20). \
|
|
2759
|
+
If this option is not set, the script will perform clustering for all possible thresholds in the range 0 to the maximum threshold.')
|
|
2760
|
+
|
|
2761
|
+
parser.add_argument('-ps', '--plots_summary',
|
|
2762
|
+
dest = 'plots_summary',
|
|
2763
|
+
choices = ['partitions_summary','sample_of_interest'],
|
|
2764
|
+
default = 'partitions_summary',
|
|
2765
|
+
help = '[OPTIONAL] Specify the type of cluster characterization file (partitions_summary.tsv or SAMPLES_OF_INTEREST_partitions_summary.tsv), both of which are expected to be located within a Reportree results folder. \
|
|
2766
|
+
Using the partition_summary option, the largest clusters present in the file will be characterized. \
|
|
2767
|
+
Alternatively, the samples_of_interest option will characterize all clusters, including those resulting from the addition of new samples (kept increase, new, new (increase), new (merge_increase), new (split_increase), new (split_merge_increase)).')
|
|
2768
|
+
|
|
2769
|
+
parser.add_argument('-n', '--n_cluster',
|
|
2770
|
+
dest = 'n_cluster',
|
|
2771
|
+
type = int,
|
|
2772
|
+
default = 3,
|
|
2773
|
+
help = '[OPTIONAL] Specify the number of top clusters to be displayed from the partitions_summary.tsv file, which must be located within a Reportree results folder. \
|
|
2774
|
+
This argument is not applicable when using the samples_of_interest option.')
|
|
2775
|
+
|
|
2776
|
+
parser.add_argument('-cp', '--columns_plots',
|
|
2777
|
+
dest = 'columns_plots',
|
|
2778
|
+
help = '[OPTIONAL] Name(s) of the column(s) to process the characterization of the clustering data in the selected file (specified by the plots_summary argument). \
|
|
2779
|
+
For multiple column names, indicate them separated by commas without spaces (e.g., column1,column2).')
|
|
2780
|
+
|
|
2781
|
+
parser.add_argument('-pt','--plots_threshold',
|
|
2782
|
+
dest='plots_threshold',
|
|
2783
|
+
help='[OPTIONAL] Identify the integer threshold(s) to be applied to the file specified by the plots_summary argument. \
|
|
2784
|
+
For multiple thresholds, indicate them separated by commas without spaces (e.g., X,Y,Z). \
|
|
2785
|
+
This generates a pie chart showing the clustering data for the specified threshold(s), according to the columns_plot argument.')
|
|
2786
|
+
|
|
2787
|
+
parser.add_argument('-pcn','--plots_category_number',
|
|
2788
|
+
dest='plots_category_number',
|
|
2789
|
+
default=5,
|
|
2790
|
+
type=int,
|
|
2791
|
+
help='[OPTIONAL] Determines the number of plot categories in the partitions_summary.tsv or SAMPLES_OF_INTEREST_partitions_summary.tsv file\
|
|
2792
|
+
that are intended to be collapsed into the '"Other"' category for visualization in the cluster plots.\
|
|
2793
|
+
When there are more than 5 slices (default), they will be combined into one category named Other')
|
|
2794
|
+
|
|
2795
|
+
parser.add_argument('-pcp','--plots_category_percentage',
|
|
2796
|
+
dest='plots_category_percentage',
|
|
2797
|
+
type=float,
|
|
2798
|
+
help='[OPTIONAL] Determines the percentage of plot categories in the partitions_summary.tsv or SAMPLES_OF_INTEREST_partitions_summary.tsv file\
|
|
2799
|
+
that are intended to be collapse into the '"Other"'category for visualization in the cluster plots.\
|
|
2800
|
+
Slices plots with a lower percentage than the entered plots_category_percentage will be combined into one category named Others')
|
|
2801
|
+
|
|
2802
|
+
parser.add_argument('-to', '--threshold_outbreak',
|
|
2803
|
+
dest='threshold_outbreak',
|
|
2804
|
+
type=str,
|
|
2805
|
+
help='[OPTIONAL] Determine the number of clusters identified in one pipeline at a given threshold \
|
|
2806
|
+
that will exist with the same composition in another pipeline at the same or a higher threshold.\
|
|
2807
|
+
Full attention, this argument has its own structure: two threshold (strings-methods) and the type of comparison is \
|
|
2808
|
+
either equal (defined by , ) or lower_equal (defined by <= ) \
|
|
2809
|
+
Threshold1: Threshold at which the genetic clusters must be identified for the pipeline of interest.\
|
|
2810
|
+
Threshold2: Threshold at which the genetic clusters must be searched in the other pipelines.\
|
|
2811
|
+
Comparison (equal or lower equal): \
|
|
2812
|
+
- ''equal'': Used to assess whether a cluster is detected at a given threshold by another pipeline. \
|
|
2813
|
+
Use a comma '','' to separate threshold1,threshold2. Example of expression: MST-7x1.0,MST-7x1.0.\
|
|
2814
|
+
- ''lower_equal'': Used to assess whether a cluster is detected up to a given threshold in another pipeline. \
|
|
2815
|
+
Use <= between threshold1<=threshold2. Example of expression: MST-7x1.0,<=MST-9x1.0.\
|
|
2816
|
+
\
|
|
2817
|
+
For multiple pair of threshold values, use '';'' as a separator. Example of expression: "MST-7x1.0,MST-7x1.0;<=MST-7x1.0,MST-10x1.0" represents two pair of threshold values.')
|
|
2818
|
+
|
|
2819
|
+
parser.add_argument('-list', '--list',
|
|
2820
|
+
dest='list',
|
|
2821
|
+
choices=['partitions_summary','sample_of_interest'],
|
|
2822
|
+
help='[OPTIONAL] Specify the names of the columns present in the partitions_summary.tsv or SAMPLES_OF_INTEREST_partitions_summary.tsv file.')
|
|
2823
|
+
|
|
2824
|
+
parser.add_argument('-rto','--repeat_threshold_outbreak',
|
|
2825
|
+
dest='repeat_threshold_outbreak',
|
|
2826
|
+
action="store_true",
|
|
2827
|
+
help='[OPTIONAL] This argument can only be used after of a previous analysis of threshold_outbreak.')
|
|
2828
|
+
|
|
2829
|
+
parser.add_argument('-v', '--version',
|
|
2830
|
+
action='version',
|
|
2831
|
+
version='EvalTree 1.0.0, last update 2025-05-20',
|
|
2832
|
+
help='[OPTIONAL] Specify the version number of EvalTree.')
|
|
2833
|
+
|
|
2834
|
+
parser.add_argument('-n_stab', '--n_stability',
|
|
2835
|
+
dest = 'n_stability',
|
|
2836
|
+
default = 5,
|
|
2837
|
+
type = int,
|
|
2838
|
+
help = '[OPTIONAL] Range of threshold which the cluster composition can be conistent/stable.')
|
|
2839
|
+
|
|
2840
|
+
parser.add_argument('-thr_stab', '--thr_stability',
|
|
2841
|
+
dest = 'thr_stability',
|
|
2842
|
+
default = 0.99,
|
|
2843
|
+
type = float,
|
|
2844
|
+
help = '[OPTIONAL] The neighborhood Adjusted Wallace Coefficient (nAWC) threshold used to determine if a clustering threshold is considered consistent or stable.')
|
|
2845
|
+
|
|
2846
|
+
#------------------------------------------------------------------
|
|
2847
|
+
# INITIAL INFORMATIONS
|
|
2848
|
+
# Read the command line arguments and retrieve paths
|
|
2849
|
+
|
|
2850
|
+
args = parser.parse_args()
|
|
2851
|
+
path_toolbox_script, directory_toolbox = get_path_toolbox()
|
|
2852
|
+
comparing_partitions_script, get_best_part_correspondence_script, remove_hifen_script, stats_outbreak_script = get_path_other_scripts(directory_toolbox)
|
|
2853
|
+
|
|
2854
|
+
#------------------------------------------------------------------
|
|
2855
|
+
# I- Structural validation of the arguments
|
|
2856
|
+
# I1- Check the input argument(s) (-i1 and/or -i2)
|
|
2857
|
+
|
|
2858
|
+
input1 = None
|
|
2859
|
+
input2 = None
|
|
2860
|
+
|
|
2861
|
+
if args.input1:
|
|
2862
|
+
input1 = args.input1
|
|
2863
|
+
|
|
2864
|
+
if args.input2:
|
|
2865
|
+
input2=args.input2
|
|
2866
|
+
|
|
2867
|
+
folders, files = check_input_argument(input1, input2)
|
|
2868
|
+
print(f'Checking inputs:')
|
|
2869
|
+
|
|
2870
|
+
data_folder = []
|
|
2871
|
+
if folders !=[]:
|
|
2872
|
+
|
|
2873
|
+
for folder in folders:
|
|
2874
|
+
print(f"\tFolder: {folder}")
|
|
2875
|
+
partitions, partitions_summary, sample_interest, clusterComposition, prefix, input_path, stable_region = check_folder(folder)
|
|
2876
|
+
data_folder += [[partitions, prefix, input_path, partitions_summary, sample_interest, clusterComposition, stable_region]] #7
|
|
2877
|
+
|
|
2878
|
+
data_files = []
|
|
2879
|
+
for file in files:
|
|
2880
|
+
print(f"\n\tFile: {file}")
|
|
2881
|
+
file, prefix_file, path_directory, file_type, n_samples, n_groups = check_file(file)
|
|
2882
|
+
print(f"\t\tPrefix: {prefix_file}")
|
|
2883
|
+
print(f'\t\tDirectory: {path_directory}')
|
|
2884
|
+
data_files += [[file, prefix_file, path_directory, file_type, n_samples, n_groups]] #6
|
|
2885
|
+
|
|
2886
|
+
#----------------------------------------------------------------
|
|
2887
|
+
# I2- Check the output argument (-o)
|
|
2888
|
+
output, rename = check_output(args.output)
|
|
2889
|
+
|
|
2890
|
+
#----------------------------------------------------------------
|
|
2891
|
+
# I3- Check the list argument (-list)
|
|
2892
|
+
|
|
2893
|
+
list_column_plot = args.list
|
|
2894
|
+
|
|
2895
|
+
if list_column_plot is not None:
|
|
2896
|
+
if data_folder != []:
|
|
2897
|
+
for sub in data_folder:
|
|
2898
|
+
|
|
2899
|
+
if list_column_plot == 'partitions_summary':
|
|
2900
|
+
file_summary = sub[3]
|
|
2901
|
+
if file_summary is not None:
|
|
2902
|
+
get_plot_columns(file_summary)
|
|
2903
|
+
else:
|
|
2904
|
+
sys.exit(f"There is no partitions_summary file in the {sub[2]}. ")
|
|
2905
|
+
else:
|
|
2906
|
+
file_s_interest=sub[4]
|
|
2907
|
+
if file_s_interest is not None:
|
|
2908
|
+
get_plot_columns(file_s_interest)
|
|
2909
|
+
else:
|
|
2910
|
+
sys.exit(f"\nThere is no the sample_of_interest file in the {sub[2]}.")
|
|
2911
|
+
sys.exit()
|
|
2912
|
+
else:
|
|
2913
|
+
sys.exit('It is impossible to use the list argument (-list) when the input(s) (-i1, -i2) argument(s) have provided file(s).')
|
|
2914
|
+
|
|
2915
|
+
#----------------------------------------------------------------
|
|
2916
|
+
# I4- Check the column plot(s) (-cp) and plots thresholds (-pt) arguments
|
|
2917
|
+
|
|
2918
|
+
columns_plots = args.columns_plots
|
|
2919
|
+
plots_thresholds = args.plots_threshold
|
|
2920
|
+
|
|
2921
|
+
if plots_thresholds is not None:
|
|
2922
|
+
plots_thresholds = check_str_plots_threshold(plots_thresholds)
|
|
2923
|
+
|
|
2924
|
+
#----------------------------------------------------------------
|
|
2925
|
+
# I5- Check the threshold (-t) and score(-s) arguments
|
|
2926
|
+
|
|
2927
|
+
score_value = check_score(args.score)
|
|
2928
|
+
threshold = check_threshold(args.threshold)
|
|
2929
|
+
|
|
2930
|
+
if threshold != 'max':
|
|
2931
|
+
|
|
2932
|
+
for sub in data_files:
|
|
2933
|
+
file_matrix = sub[0]
|
|
2934
|
+
identify_matrix = sub[3]
|
|
2935
|
+
if identify_matrix == False:
|
|
2936
|
+
if len(data_files) == 1:
|
|
2937
|
+
print(f'\t\tWarning: The threshold argument (-t) is only applied to a partition matrix, so it is not applicable to the {file_matrix}.')
|
|
2938
|
+
|
|
2939
|
+
#----------------------------------------------------------------
|
|
2940
|
+
# I6- Arguments that do not require structural validation
|
|
2941
|
+
|
|
2942
|
+
n_cluster = args.n_cluster
|
|
2943
|
+
plots_summary_arg = args.plots_summary
|
|
2944
|
+
plots_category_percentage = args.plots_category_percentage
|
|
2945
|
+
plots_category_number = args.plots_category_number
|
|
2946
|
+
n_stability = args.n_stability
|
|
2947
|
+
thr_stability = args.thr_stability
|
|
2948
|
+
|
|
2949
|
+
#----------------------------------------------------------------
|
|
2950
|
+
# I7- Check the threshold outbreak (-to) and repeat_threshold_outbreak (-rto) arguments
|
|
2951
|
+
|
|
2952
|
+
threshold_outbreak = args.threshold_outbreak
|
|
2953
|
+
repeat_threshold_outbreak = args.repeat_threshold_outbreak
|
|
2954
|
+
|
|
2955
|
+
if threshold_outbreak is not None:
|
|
2956
|
+
valid_combinations = validate_combinations_outbreak(threshold_outbreak)
|
|
2957
|
+
|
|
2958
|
+
#----------------------------------------------------------------
|
|
2959
|
+
# II- Validation of the argument combination (clustering, outbreaks)
|
|
2960
|
+
|
|
2961
|
+
go_clustering, go_outbreaks = check_combinations_arguments(plots_summary_arg, data_folder, data_files)
|
|
2962
|
+
|
|
2963
|
+
#----------------------------------------------------------------
|
|
2964
|
+
# III- Validation of file prefixes provided in different inputs
|
|
2965
|
+
|
|
2966
|
+
data_folder, data_files, prefix_both = check_data_folders_file(data_folder, data_files)
|
|
2967
|
+
|
|
2968
|
+
#---------------------------------------------------------
|
|
2969
|
+
# IV- Validation of partition matrix FUNDAMENTAL
|
|
2970
|
+
|
|
2971
|
+
inputs_variables = join_inputs_variables(data_folder,data_files)
|
|
2972
|
+
|
|
2973
|
+
#---------------------------------------------------------
|
|
2974
|
+
# V- Validation of congruence
|
|
2975
|
+
go_congruence = False
|
|
2976
|
+
|
|
2977
|
+
if len(inputs_variables) == 2:
|
|
2978
|
+
|
|
2979
|
+
i1,i2 = inputs_variables[0][0], inputs_variables[1][0]
|
|
2980
|
+
|
|
2981
|
+
if i1 is not None and i2 is not None:
|
|
2982
|
+
go_congruence = True
|
|
2983
|
+
|
|
2984
|
+
else:
|
|
2985
|
+
print("Congruence analysis is not possible. It is necessary two *_partitions.tsv files.\n")
|
|
2986
|
+
|
|
2987
|
+
|
|
2988
|
+
#---------------------------------------------------------------------------------------------------
|
|
2989
|
+
# VI- Outbreaks (-rto)
|
|
2990
|
+
|
|
2991
|
+
if repeat_threshold_outbreak is not False:
|
|
2992
|
+
|
|
2993
|
+
if args.output is None:
|
|
2994
|
+
sys.exit('Error: Please specify the output folder with the -o argument. It should contain the previous results.')
|
|
2995
|
+
|
|
2996
|
+
file=glob.glob(os.path.join(output,'*_report.html'))
|
|
2997
|
+
if not file:
|
|
2998
|
+
sys.exit("Error: The expected *_report.html file was not found. Please run the program first with the -to argument, and then with the -rto argument.")
|
|
2999
|
+
|
|
3000
|
+
if not threshold_outbreak:
|
|
3001
|
+
print('\tDo not forget the double quotation marks!')
|
|
3002
|
+
sys.exit("Error: You must specify a new argument for the threshold_outbreak (-to).")
|
|
3003
|
+
|
|
3004
|
+
#---------------------------------------------------------------------------------------------------
|
|
3005
|
+
# VII - Stable Regions (-thr_stab)
|
|
3006
|
+
|
|
3007
|
+
if thr_stability != 0.99:
|
|
3008
|
+
if not (0 <= thr_stability <= 1):
|
|
3009
|
+
sys.exit("Error: thr_stability must be between 0 and 1.")
|
|
3010
|
+
|
|
3011
|
+
#--------------------------------------------------------------------------------------------------
|
|
3012
|
+
# Starting logs
|
|
3013
|
+
|
|
3014
|
+
if not repeat_threshold_outbreak:
|
|
3015
|
+
log_name = (f'{output}/{prefix_both}.log')
|
|
3016
|
+
log = open(log_name, "w+")
|
|
3017
|
+
|
|
3018
|
+
else:
|
|
3019
|
+
log_name = (f'{output}/{prefix_both}_reanalyse.log')
|
|
3020
|
+
log = open(log_name, "w+")
|
|
3021
|
+
|
|
3022
|
+
# -------------------------------------------------------------------------------------------------------------------------
|
|
3023
|
+
# INITIAL INFORMATIONS
|
|
3024
|
+
|
|
3025
|
+
print("---------------------------------------------- Running EvalTree.py ----------------------------------------------\n")
|
|
3026
|
+
print_log(f"Version " + str(version) + " last updated on " + str(last_updated)+"\n", log)
|
|
3027
|
+
command_line = " ".join(sys.argv)
|
|
3028
|
+
print_log(f"Running EvalTree with the following command: {command_line}\n", log)
|
|
3029
|
+
print_log(f'Log file name: {log_name}\n', log)
|
|
3030
|
+
start = datetime.datetime.now()
|
|
3031
|
+
print_log("Start: " + str(start)+"\n", log)
|
|
3032
|
+
print_log(f'Output directory: {output}\n', log)
|
|
3033
|
+
|
|
3034
|
+
#-----------------------------------------------
|
|
3035
|
+
# START HTML
|
|
3036
|
+
|
|
3037
|
+
if not repeat_threshold_outbreak:
|
|
3038
|
+
file_path_report = os.path.join(output, f'{prefix_both}_report.html')
|
|
3039
|
+
html_content = create_html(log, file_path_report)
|
|
3040
|
+
html_content += body_html(start, command_line,version)
|
|
3041
|
+
else:
|
|
3042
|
+
file_path_report = os.path.join(output, f'{prefix_both}_2ºRUN_report.html')
|
|
3043
|
+
html_content = create_html(log, file_path_report)
|
|
3044
|
+
html_content += body_html(start, command_line,version)
|
|
3045
|
+
html_report = write_html(html_content,file_path_report, log)
|
|
3046
|
+
|
|
3047
|
+
#-------------------------------------------------------------------------------------------------------------------------------
|
|
3048
|
+
|
|
3049
|
+
if not repeat_threshold_outbreak:
|
|
3050
|
+
|
|
3051
|
+
#-------------------------------------------------------------------------------------------------------------------------------
|
|
3052
|
+
|
|
3053
|
+
if inputs_variables:
|
|
3054
|
+
for sub in inputs_variables:
|
|
3055
|
+
|
|
3056
|
+
#-------------------------------------------------------------------------------------------------------------------------------
|
|
3057
|
+
# MODULE 1 - SEQUENCE TYPE
|
|
3058
|
+
if len(sub) == 6:
|
|
3059
|
+
|
|
3060
|
+
if sub[3] == False:
|
|
3061
|
+
samples_st = sub[4]
|
|
3062
|
+
groups_st = sub[5]
|
|
3063
|
+
sequence_type_file = sub[0]
|
|
3064
|
+
prefix_st = sub[1]
|
|
3065
|
+
html_content += get_sequence_type(prefix_st,samples_st,groups_st,sequence_type_file)
|
|
3066
|
+
fig_clusters = reading_sequence_type(sequence_type_file, output, prefix_st, log)
|
|
3067
|
+
fig_html = pio.to_html(fig_clusters, include_plotlyjs='cdn', full_html=False)
|
|
3068
|
+
html_content += sequence_type_image(fig_html)
|
|
3069
|
+
|
|
3070
|
+
#-------------------------------------------------------------------------------------------------------------------------------
|
|
3071
|
+
list_partition_by_threshold=[]
|
|
3072
|
+
category_colors = {'Others':'#000000'}
|
|
3073
|
+
|
|
3074
|
+
if inputs_variables:
|
|
3075
|
+
for sub in inputs_variables:
|
|
3076
|
+
partition_matrix = sub[0]
|
|
3077
|
+
prefix = sub[1]
|
|
3078
|
+
directory = sub[2]
|
|
3079
|
+
|
|
3080
|
+
#-------------------------------------------------------------------------------------------------------------------------------
|
|
3081
|
+
# MODULE 2 - Characterization of the ONE pipeline (Nr_partitions vs Nr_thresholds)
|
|
3082
|
+
if len(sub) == 7 or (len(sub) == 6 and sub[3] == True):
|
|
3083
|
+
|
|
3084
|
+
print_log(f'\nPipeline characterization: {directory}', log)
|
|
3085
|
+
print_log(f'\tPipeline name: {prefix}', log)
|
|
3086
|
+
|
|
3087
|
+
if partition_matrix is not None:
|
|
3088
|
+
|
|
3089
|
+
if threshold !='max':
|
|
3090
|
+
start_threshold, end_threshold = check_range_threshold(partition_matrix,threshold,log)
|
|
3091
|
+
input_filtered = filter_partition_matrix(partition_matrix, prefix, start_threshold, end_threshold, output, log)
|
|
3092
|
+
sub[0] = input_filtered
|
|
3093
|
+
partition_matrix = input_filtered
|
|
3094
|
+
|
|
3095
|
+
nr_lines_df, nr_columns_df = get_nr_lines_threshold(partition_matrix, log)
|
|
3096
|
+
file_partition_by_threshold = get_file_partition_by_threshold (partition_matrix, prefix, output, log)
|
|
3097
|
+
list_partition_by_threshold.append(file_partition_by_threshold)
|
|
3098
|
+
print_log(f'\tObtaining the number of partitions per threshold.', log)
|
|
3099
|
+
yes_prefix_both=False
|
|
3100
|
+
fig_partition_vs_threshols = get_graph_partition_by_threshold(file_partition_by_threshold, prefix, prefix_both, yes_prefix_both, output, log)
|
|
3101
|
+
html_content += get_partitions_threshold(prefix, nr_lines_df, nr_columns_df, fig_partition_vs_threshols)
|
|
3102
|
+
|
|
3103
|
+
|
|
3104
|
+
if go_clustering == False:
|
|
3105
|
+
html_content += f'</div>\n'
|
|
3106
|
+
|
|
3107
|
+
#-----------------------------------------------------------------------
|
|
3108
|
+
# MODULE 3 - REPORTREE
|
|
3109
|
+
if len(sub) == 7: #folder
|
|
3110
|
+
partitions_summary = sub[3]
|
|
3111
|
+
sample_interest = sub[4]
|
|
3112
|
+
|
|
3113
|
+
if go_clustering == True:
|
|
3114
|
+
plots_file = None
|
|
3115
|
+
|
|
3116
|
+
if plots_summary_arg == 'partitions_summary':
|
|
3117
|
+
if partitions_summary is not None:
|
|
3118
|
+
plots_file = partitions_summary
|
|
3119
|
+
else:
|
|
3120
|
+
if sample_interest is not None:
|
|
3121
|
+
plots_file = sample_interest
|
|
3122
|
+
|
|
3123
|
+
print_log(f'\tPlotting cluster characterization ...', log)
|
|
3124
|
+
|
|
3125
|
+
#-----------------------------------------------------------------------
|
|
3126
|
+
# Starting clustering
|
|
3127
|
+
df_data=load_and_prepare_data(plots_file, log)
|
|
3128
|
+
df_filtered=order_cluster_by_size(df_data, log)
|
|
3129
|
+
|
|
3130
|
+
#-----------------------------------------------------------------------
|
|
3131
|
+
if df_filtered is not None:
|
|
3132
|
+
method = check_plot_threshold(plots_thresholds, df_filtered, log)
|
|
3133
|
+
|
|
3134
|
+
if method != []:
|
|
3135
|
+
filtered_threshold = check_threshold_in_file(method, df_filtered, plots_file, log)
|
|
3136
|
+
|
|
3137
|
+
if filtered_threshold != []:
|
|
3138
|
+
|
|
3139
|
+
if plots_summary_arg == 'partitions_summary':
|
|
3140
|
+
result_df = filter_df_by_plot_threshold(filtered_threshold, df_filtered, n_cluster, log)
|
|
3141
|
+
|
|
3142
|
+
if plots_summary_arg == 'sample_of_interest':
|
|
3143
|
+
df_filtered_threshold = filtering_df_threshold(filtered_threshold, df_filtered, log)
|
|
3144
|
+
result_df = select_nomenclature_change(df_filtered_threshold, log)
|
|
3145
|
+
|
|
3146
|
+
if result_df is not None:
|
|
3147
|
+
check_columns = check_column_plots(columns_plots, result_df, log)
|
|
3148
|
+
|
|
3149
|
+
#-----------------------------------------------------------------------
|
|
3150
|
+
# Plots
|
|
3151
|
+
if check_columns != []:
|
|
3152
|
+
|
|
3153
|
+
results_list = check_structure_lines_column_plots(check_columns, result_df, plots_category_percentage, plots_category_number, output, prefix, plots_summary_arg, category_colors, log)
|
|
3154
|
+
|
|
3155
|
+
if results_list is not None:
|
|
3156
|
+
mst_groups = organize_clusters(results_list)
|
|
3157
|
+
html_content += get_clusters(mst_groups, prefix)
|
|
3158
|
+
else:
|
|
3159
|
+
html_content += close_painel(prefix,"Error: Impossible to produce cluster plots.")
|
|
3160
|
+
print_log(f'\tError: Impossible to produce cluster plots.', log)
|
|
3161
|
+
else:
|
|
3162
|
+
html_content += close_painel(prefix,"Error: Invalid column plots, without clustering analysis.")
|
|
3163
|
+
print_log(f'\tError: Invalid column plots, without clustering analysis ...', log)
|
|
3164
|
+
#-------------------------------------------------------------------------
|
|
3165
|
+
|
|
3166
|
+
else:
|
|
3167
|
+
html_content += close_painel(prefix,"Error: No data for processing, without clustering analysis.")
|
|
3168
|
+
print_log(f'\tError: No data for processing, without clustering analysis.', log)
|
|
3169
|
+
else:
|
|
3170
|
+
html_content += close_painel(prefix,"Error: The plot_threshold argument is invalid, without clustering analysis.")
|
|
3171
|
+
print_log(f'\tError: The plot_threshold argument is invalid, without clustering analysis. ', log)
|
|
3172
|
+
else:
|
|
3173
|
+
html_content += close_painel(prefix,"Error: No analysis method provided, without clustering analysis.")
|
|
3174
|
+
print_log(f"\tError: No analysis method provided, without clustering analysis.",log)
|
|
3175
|
+
else:
|
|
3176
|
+
html_content += close_painel(prefix,"Error: Impossible to order the Dataframe by cluster length, without clustering analysis.")
|
|
3177
|
+
print_log(f'\tError: Impossible to order the Dataframe by cluster length, without clustering analysis.')
|
|
3178
|
+
else:
|
|
3179
|
+
html_content += f'</div>\n'
|
|
3180
|
+
|
|
3181
|
+
#-------------------------------------------------------------------------------------------------------------------------------
|
|
3182
|
+
print_log(f"\nInter-pipeline cluster congruence analysis:\n", log)
|
|
3183
|
+
html_content += summary_congruence()
|
|
3184
|
+
|
|
3185
|
+
#-----------------------------------------------------------------------
|
|
3186
|
+
# MODULE 4.1 - Characterization of the BOTH pipelines (Nr_partitions vs Nr_thresholds)
|
|
3187
|
+
if list_partition_by_threshold:
|
|
3188
|
+
if len(list_partition_by_threshold) == 2:
|
|
3189
|
+
file1, file2 = list_partition_by_threshold
|
|
3190
|
+
path = concatenation_files(file1, file2, output, prefix_both)
|
|
3191
|
+
yes_prefix_both=True
|
|
3192
|
+
fig = get_graph_partition_by_threshold(path, prefix, prefix_both, yes_prefix_both, output, log)
|
|
3193
|
+
print_log(f"\tPlotting the number of partitions per threshold for the two pipelines ...", log)
|
|
3194
|
+
fig_html = pio.to_html(fig, include_plotlyjs='cdn', full_html=False)
|
|
3195
|
+
html_content += summary_partition_threshold(fig_html, prefix_both)
|
|
3196
|
+
|
|
3197
|
+
#-----------------------------------------------------------------------
|
|
3198
|
+
# MODULE 4.2 - Stability regions #or (len(sub)==6 and sub[3]==True):
|
|
3199
|
+
print_log(f"\tIdentifying cluster stability regions for each pipeline ...", log)
|
|
3200
|
+
print_log(f"\t\tRunning comparing_partitions_v2.py in “stability” mode.", log)
|
|
3201
|
+
|
|
3202
|
+
files_to_stability = []
|
|
3203
|
+
|
|
3204
|
+
#-----------------------------------------------------------------------
|
|
3205
|
+
for sub in inputs_variables:
|
|
3206
|
+
partition_matrix = sub[0]
|
|
3207
|
+
prefix = sub[1]
|
|
3208
|
+
directory = sub[2]
|
|
3209
|
+
|
|
3210
|
+
if len(sub) == 7: #folders
|
|
3211
|
+
stable_region = sub[6]
|
|
3212
|
+
if stable_region is None or threshold !='max':
|
|
3213
|
+
file_stability = stability_region(output, partition_matrix, prefix, comparing_partitions_script, n_stability, thr_stability, log)
|
|
3214
|
+
files_to_stability.append([file_stability,prefix])
|
|
3215
|
+
else:
|
|
3216
|
+
files_to_stability.append([stable_region,prefix])
|
|
3217
|
+
|
|
3218
|
+
if len(sub) == 6: #files
|
|
3219
|
+
type_file = sub[3]
|
|
3220
|
+
if type_file == True: #if true it is partition matrix
|
|
3221
|
+
file_stability = stability_region(output, partition_matrix, prefix, comparing_partitions_script, n_stability, thr_stability, log)
|
|
3222
|
+
files_to_stability.append([file_stability, prefix])
|
|
3223
|
+
else:
|
|
3224
|
+
go_stability = False
|
|
3225
|
+
|
|
3226
|
+
#-----------------------------------------------------------------------
|
|
3227
|
+
all_dfs = []
|
|
3228
|
+
prefix_df = []
|
|
3229
|
+
#list_values_block = []
|
|
3230
|
+
|
|
3231
|
+
if files_to_stability:
|
|
3232
|
+
|
|
3233
|
+
for file, prefix in files_to_stability:
|
|
3234
|
+
|
|
3235
|
+
try:
|
|
3236
|
+
name_block = processing_block_names(file, prefix, log)
|
|
3237
|
+
first_data, final_data = processing_data(file, log)
|
|
3238
|
+
#list_values_block.append(values_block)
|
|
3239
|
+
|
|
3240
|
+
df = pd.DataFrame({'Block_id': name_block, 'Start': first_data, 'Finish': final_data, 'Pipeline': prefix})
|
|
3241
|
+
all_dfs.append(df)
|
|
3242
|
+
prefix_df.append(prefix)
|
|
3243
|
+
go_stability = True
|
|
3244
|
+
|
|
3245
|
+
except Exception as e:
|
|
3246
|
+
print(f'\t\tWarning: without stability reagions in the file {file}.')
|
|
3247
|
+
go_stability = False
|
|
3248
|
+
|
|
3249
|
+
if all_dfs:
|
|
3250
|
+
df = pd.concat(all_dfs, ignore_index = True)
|
|
3251
|
+
if len(prefix_df) == 2:
|
|
3252
|
+
prefix=prefix_df[0]
|
|
3253
|
+
prefix_2=prefix_df[1]
|
|
3254
|
+
|
|
3255
|
+
else:
|
|
3256
|
+
prefix_2 = None
|
|
3257
|
+
prefix = prefix_df[0]
|
|
3258
|
+
|
|
3259
|
+
|
|
3260
|
+
if go_stability == True:
|
|
3261
|
+
fig_st = change_processing_data(df, prefix, prefix_2, output, log)
|
|
3262
|
+
print_log(f"\t\tDone.\n", log)
|
|
3263
|
+
fig_html_st = pio.to_html(fig_st, include_plotlyjs='cdn', full_html=False)
|
|
3264
|
+
html_content += congruence_stability(fig_html_st, prefix, prefix_2, n_stability, thr_stability)
|
|
3265
|
+
|
|
3266
|
+
#-----------------------------------------------------------------------
|
|
3267
|
+
# MODULE 4.3 - Congruence between pipelines
|
|
3268
|
+
if go_congruence == True:
|
|
3269
|
+
|
|
3270
|
+
i1_matrix=inputs_variables[0][0]
|
|
3271
|
+
i2_matrix=inputs_variables[1][0]
|
|
3272
|
+
i1_prefix=inputs_variables[0][1]
|
|
3273
|
+
i2_prefix=inputs_variables[1][1]
|
|
3274
|
+
|
|
3275
|
+
path_all_correspondence_lower = management_main_scripts(comparing_partitions_script, get_best_part_correspondence_script, remove_hifen_script, i1_matrix, i2_matrix, prefix_both, output, score_value, log)
|
|
3276
|
+
|
|
3277
|
+
#Final score
|
|
3278
|
+
fig_heatmap = get_heatmap(output, i1_prefix, i2_prefix, threshold, log)
|
|
3279
|
+
fig_html_heatmap = pio.to_html(fig_heatmap, include_plotlyjs='cdn',full_html=False)
|
|
3280
|
+
html_content += congruence_heatmap(fig_html_heatmap, prefix_both)
|
|
3281
|
+
|
|
3282
|
+
#-----------------------------------------------------------------------
|
|
3283
|
+
# Get best correspondence
|
|
3284
|
+
|
|
3285
|
+
if not any(len(elem) == 6 and elem[3] is False for elem in inputs_variables):
|
|
3286
|
+
fig_tendency, nr_point_method_1, nr_point_method_2 = get_tendency(output, prefix_both, log)
|
|
3287
|
+
fig_tendency_html = pio.to_html(fig_tendency,include_plotlyjs='cdn', full_html=False)
|
|
3288
|
+
html_content += congruence_tendency(fig_tendency_html, score_value, prefix_both, nr_point_method_1, nr_point_method_2)
|
|
3289
|
+
comparison = tendency_slop(path_all_correspondence_lower, i1_prefix, i2_prefix, output)
|
|
3290
|
+
#-------------------------------------------------------------------------------------------------------------------------------
|
|
3291
|
+
# MODULE 5 - OUTBREAK
|
|
3292
|
+
|
|
3293
|
+
if go_outbreaks == True:
|
|
3294
|
+
|
|
3295
|
+
#-----------------------------------------------------------------------
|
|
3296
|
+
# Variables
|
|
3297
|
+
clusterComposition_1 = inputs_variables[0][5]
|
|
3298
|
+
clusterComposition_2 = inputs_variables[1][5]
|
|
3299
|
+
|
|
3300
|
+
if valid_combinations != []:
|
|
3301
|
+
|
|
3302
|
+
print_log(f"\tThreshold outbreaks was validated successfully.", log)
|
|
3303
|
+
values_outbreak = extract_integer_part(valid_combinations, log)
|
|
3304
|
+
print_log(f"\tAssessing the overlap of cluster composition.\n", log)
|
|
3305
|
+
df_stats_outbreak, path_stats_outbreak = creation_tsv_stats_outbreak(clusterComposition_1, clusterComposition_2, output, prefix_both, log)
|
|
3306
|
+
|
|
3307
|
+
#-----------------------------------------------------------------------
|
|
3308
|
+
if values_outbreak:
|
|
3309
|
+
calling_script_outbreak(stats_outbreak_script, path_stats_outbreak, output, prefix_both, values_outbreak, log)
|
|
3310
|
+
process_files = read_files_outbreak(output)
|
|
3311
|
+
fig_result, thresholds = creation_overlap_clusters(process_files, output, values_outbreak)
|
|
3312
|
+
print_log(f"\tPlotting the matrices with the cluster overlap for each comparison", log)
|
|
3313
|
+
|
|
3314
|
+
if not repeat_threshold_outbreak:
|
|
3315
|
+
html_content += image_outbreak(fig_result)
|
|
3316
|
+
html_content += summary_outbreak(prefix_both, thresholds)
|
|
3317
|
+
else:
|
|
3318
|
+
final_files = find_html_outbreak(output, prefix_both, log)
|
|
3319
|
+
path_temp = extration_section_original_file(output, final_files, log)
|
|
3320
|
+
html_content += transfer_info_to_html_content(path_temp, html_content, log)
|
|
3321
|
+
html_content += image_outbreak(fig_result)
|
|
3322
|
+
html_content += summary_outbreak(prefix_both, thresholds)
|
|
3323
|
+
else:
|
|
3324
|
+
print_log(f'\tImpossible outbreaks analysis.', log)
|
|
3325
|
+
|
|
3326
|
+
#--------------------------------------------------------------------------------------------------------------------------
|
|
3327
|
+
#4 - END HTML report
|
|
3328
|
+
html_content += references()
|
|
3329
|
+
html_content += javascript_function()
|
|
3330
|
+
html_content += create_html_footer()
|
|
3331
|
+
html_report = write_html(html_content, file_path_report, log)
|
|
3332
|
+
|
|
3333
|
+
# path=f'{output}/html_all_modules.txt'
|
|
3334
|
+
# with open(path, 'w') as f:
|
|
3335
|
+
# f.write(html_content)
|
|
3336
|
+
|
|
3337
|
+
#----------------------------------------------------------------------------------------------------------------------------
|
|
3338
|
+
# END INFORMATION
|
|
3339
|
+
|
|
3340
|
+
#print_log("\nEND Running EvalTree.py ...\n", log)
|
|
3341
|
+
print_log('Evaltree is done! If you found any issue please contact us.\n', log)
|
|
3342
|
+
end = datetime.datetime.now()
|
|
3343
|
+
elapsed = end - start
|
|
3344
|
+
print_log("\nEnd: " + str(end), log)
|
|
3345
|
+
print_log("Time elapsed: " + str(elapsed), log)
|
|
3346
|
+
log.close()
|
|
3347
|
+
|
|
3348
|
+
#--------------------------------------------------------------------------------------------------
|
|
3349
|
+
# Rename output folder if it was automatically created
|
|
3350
|
+
if rename == True:
|
|
3351
|
+
|
|
3352
|
+
rename_folder = os.path.join(os.path.dirname(output), prefix_both)
|
|
3353
|
+
os.rename(output, rename_folder)
|
|
3354
|
+
|
|
3355
|
+
|
|
3356
|
+
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
3358
|
+
|
|
3359
|
+
|