genelastic 0.6.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/api/cli_start_api.py +18 -0
- genelastic/api/extends/example.py +2 -3
- genelastic/api/extends/example.yml +20 -0
- genelastic/api/routes.py +160 -23
- genelastic/api/server.py +42 -31
- genelastic/api/settings.py +5 -8
- genelastic/api/specification.yml +350 -0
- genelastic/common/__init__.py +41 -9
- genelastic/common/cli.py +103 -23
- genelastic/common/elastic.py +80 -49
- genelastic/common/exceptions.py +0 -2
- genelastic/common/server.py +51 -0
- genelastic/common/types.py +20 -15
- genelastic/import_data/__init__.py +23 -5
- genelastic/import_data/analyses.py +17 -20
- genelastic/import_data/analysis.py +69 -65
- genelastic/import_data/bi_process.py +7 -5
- genelastic/import_data/bi_processes.py +8 -8
- genelastic/import_data/cli_gen_data.py +143 -0
- genelastic/import_data/cli_import.py +379 -0
- genelastic/import_data/{info.py → cli_info.py} +104 -75
- genelastic/import_data/cli_integrity.py +384 -0
- genelastic/import_data/cli_validate.py +54 -0
- genelastic/import_data/constants.py +11 -32
- genelastic/import_data/data_file.py +23 -20
- genelastic/import_data/filename_pattern.py +26 -32
- genelastic/import_data/import_bundle.py +56 -47
- genelastic/import_data/import_bundle_factory.py +166 -158
- genelastic/import_data/logger.py +22 -18
- genelastic/import_data/random_bundle.py +425 -0
- genelastic/import_data/tags.py +46 -26
- genelastic/import_data/wet_process.py +8 -4
- genelastic/import_data/wet_processes.py +13 -8
- genelastic/ui/__init__.py +0 -0
- genelastic/ui/cli_start_ui.py +18 -0
- genelastic/ui/routes.py +86 -0
- genelastic/ui/server.py +14 -0
- genelastic/ui/settings.py +7 -0
- genelastic/ui/templates/analyses.html +11 -0
- genelastic/ui/templates/bi_processes.html +11 -0
- genelastic/ui/templates/home.html +4 -0
- genelastic/ui/templates/layout.html +34 -0
- genelastic/ui/templates/version.html +9 -0
- genelastic/ui/templates/wet_processes.html +11 -0
- genelastic-0.8.0.dist-info/METADATA +109 -0
- genelastic-0.8.0.dist-info/RECORD +52 -0
- {genelastic-0.6.1.dist-info → genelastic-0.8.0.dist-info}/WHEEL +1 -1
- genelastic-0.8.0.dist-info/entry_points.txt +8 -0
- genelastic/import_data/gen_data.py +0 -194
- genelastic/import_data/import_data.py +0 -292
- genelastic/import_data/integrity.py +0 -290
- genelastic/import_data/validate_data.py +0 -43
- genelastic-0.6.1.dist-info/METADATA +0 -41
- genelastic-0.6.1.dist-info/RECORD +0 -36
- genelastic-0.6.1.dist-info/entry_points.txt +0 -6
- {genelastic-0.6.1.dist-info → genelastic-0.8.0.dist-info}/top_level.txt +0 -0
genelastic/import_data/logger.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
# pylint: disable=missing-module-docstring
|
|
2
1
|
import logging
|
|
3
2
|
import typing
|
|
4
3
|
|
|
@@ -8,32 +7,37 @@ import colorlog
|
|
|
8
7
|
def configure_logging(verbose: int, log_file: str | None = None) -> None:
|
|
9
8
|
"""Configure logging for both import and gen-data scripts."""
|
|
10
9
|
# Define TRACE level
|
|
11
|
-
logging.TRACE = 5 # type: ignore
|
|
12
|
-
logging.addLevelName(logging.TRACE, "TRACE") # type: ignore
|
|
10
|
+
logging.TRACE = 5 # type: ignore[attr-defined]
|
|
11
|
+
logging.addLevelName(logging.TRACE, "TRACE") # type: ignore[attr-defined]
|
|
13
12
|
|
|
14
|
-
def trace(
|
|
15
|
-
|
|
16
|
-
|
|
13
|
+
def trace(
|
|
14
|
+
self: logging.Logger,
|
|
15
|
+
message: object,
|
|
16
|
+
*args: typing.Any, # noqa: ANN401
|
|
17
|
+
**kws: typing.Any, # noqa: ANN401
|
|
18
|
+
) -> None:
|
|
19
|
+
if self.isEnabledFor(logging.TRACE): # type: ignore[attr-defined]
|
|
20
|
+
self._log(logging.TRACE, message, args, **kws) # type: ignore[attr-defined]
|
|
17
21
|
|
|
18
|
-
logging.Logger.trace = trace # type: ignore
|
|
22
|
+
logging.Logger.trace = trace # type: ignore[attr-defined]
|
|
19
23
|
|
|
20
24
|
# Get root logger
|
|
21
25
|
root = logging.getLogger()
|
|
22
26
|
|
|
23
27
|
# Define formatter for file logging.
|
|
24
|
-
fmt = logging.Formatter(
|
|
28
|
+
fmt = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
|
|
25
29
|
|
|
26
30
|
# Define formatter for colored console logging.
|
|
27
31
|
color_fmt = colorlog.ColoredFormatter(
|
|
28
|
-
|
|
32
|
+
"%(log_color)s%(asctime)s %(levelname)-8s %(message)s",
|
|
29
33
|
log_colors={
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
}
|
|
34
|
+
"TRACE": "light_cyan",
|
|
35
|
+
"DEBUG": "light_yellow",
|
|
36
|
+
"INFO": "light_green",
|
|
37
|
+
"WARNING": "light_purple",
|
|
38
|
+
"ERROR": "light_red",
|
|
39
|
+
"CRITICAL": "light_red",
|
|
40
|
+
},
|
|
37
41
|
)
|
|
38
42
|
|
|
39
43
|
# Define console handler
|
|
@@ -50,7 +54,7 @@ def configure_logging(verbose: int, log_file: str | None = None) -> None:
|
|
|
50
54
|
level_map = {
|
|
51
55
|
0: logging.WARNING, # quiet mode
|
|
52
56
|
1: logging.INFO, # default
|
|
53
|
-
2: logging.DEBUG # verbose mode
|
|
57
|
+
2: logging.DEBUG, # verbose mode
|
|
54
58
|
}
|
|
55
59
|
# If verbose is greater than 2, set level to TRACE.
|
|
56
|
-
root.setLevel(level_map.get(verbose, logging.TRACE)) # type: ignore
|
|
60
|
+
root.setLevel(level_map.get(verbose, logging.TRACE)) # type: ignore[attr-defined]
|
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import random
|
|
3
|
+
import sys
|
|
4
|
+
import tempfile
|
|
5
|
+
import typing
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import yaml
|
|
10
|
+
from biophony import (
|
|
11
|
+
BioSeqGen,
|
|
12
|
+
CovGen,
|
|
13
|
+
Elements,
|
|
14
|
+
FastaWriter,
|
|
15
|
+
MutSim,
|
|
16
|
+
MutSimParams,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
from genelastic.common import (
|
|
20
|
+
RandomAnalysisData,
|
|
21
|
+
RandomBiProcessData,
|
|
22
|
+
RandomWetProcessData,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class RandomBundleItem(ABC):
    """Abstract base class representing a randomly generated bundle item.

    Concrete subclasses implement :meth:`to_dict` to expose their generated
    data as a plain dictionary, ready for YAML serialization.
    """

    def _random_alphanum_str(
        self, chars: str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", n: int = 4
    ) -> str:
        """Generate a random alphanumerical string.

        :param chars: Alphabet the characters are drawn from.
        :param n: Length of the returned string.
        :return: A string of `n` characters drawn (with repetition) from `chars`.
        """
        # random.choices draws with replacement: characters may repeat and
        # `n` may exceed len(chars). The previous random.sample-based
        # implementation raised ValueError for n > len(chars) and could
        # never repeat a character, needlessly shrinking the ID space.
        return "".join(random.choices(chars, k=n))

    @abstractmethod
    def to_dict(self) -> typing.Any:  # noqa: ANN401
        """Return the randomly generated item data as a dict."""
|
|
39
|
+
|
|
40
|
+
class RandomWetProcess(RandomBundleItem):
    """Generate a random wet lab process.

    :param seed: Set a seed for data reproducibility.
    """

    # Coherent (generic, library, sequencing) kit triplets; one triplet is
    # picked as a whole so the three kit attributes never contradict each other.
    KITS: typing.ClassVar = [
        {
            "generic_kit": "truseq-illumina",
            "library_kit": "truseq-illumina",
            "sequencing_kit": "truseq-illumina",
        },
        {
            "generic_kit": "smrtbellprepkit3.0",
            "library_kit": "smrtbellprepkit3.0",
            "sequencing_kit": "revio_polymerase_sequencing",
        },
        {
            "generic_kit": "sqk-lsk114",
            "library_kit": "sqk-lsk114",
            "sequencing_kit": "sqk-lsk114",
        },
    ]

    def __init__(self, seed: int | None = None) -> None:
        """Draw all wet process attributes from the random generator."""
        # Only reseed when a seed is explicitly given: `random.seed(None)`
        # reseeds the global RNG from OS entropy and would destroy any
        # global seed a caller set for reproducibility.
        if seed is not None:
            random.seed(seed)

        self._proc_id = self._random_alphanum_str(n=8)
        self._manufacturer = random.choice(["illumina", "ont", "pacbio"])
        self._sequencer = random.choice(
            ["novaseqxplus", "promethion", "novaseq6000", "revio"]
        )

        kit: dict[str, str] = random.choice(self.KITS)
        self._generic_kit = kit["generic_kit"]
        self._library_kit = kit["library_kit"]
        self._sequencing_kit = kit["sequencing_kit"]

        self._fragmentation = random.choice(range(100, 401, 50))
        self._reads_size = random.choice(range(100, 401, 50))
        self._flowcell_type = f"{random.choice(range(10, 101, 10))}b"
        self._sequencing_type = "wgs" + random.choice(["", "-iclr", "-lowpass"])
        self._error_rate_expected = round(random.uniform(0.01, 0.1), 2)

    def to_dict(self) -> RandomWetProcessData:
        """Return the generated wet lab process as a dictionary."""
        return {
            "proc_id": self._proc_id,
            "manufacturer": self._manufacturer,
            "sequencer": self._sequencer,
            "generic_kit": self._generic_kit,
            "library_kit": self._library_kit,
            "sequencing_kit": self._sequencing_kit,
            "fragmentation": self._fragmentation,
            "reads_size": self._reads_size,
            "input_type": "gdna",
            "amplification": "pcr-free",
            "flowcell_type": self._flowcell_type,
            "sequencing_type": self._sequencing_type,
            "error_rate_expected": self._error_rate_expected,
        }
|
102
|
+
|
|
103
|
+
class RandomBiProcess(RandomBundleItem):
    """Generate a random bioinformatics process.

    :param seed: Set a seed for data reproducibility.
    """

    # Candidate pipeline steps; each `cmd` lists the possible tools for a
    # step, one of which is picked when the step is instantiated.
    STEPS: typing.ClassVar = [
        {"name": "basecalling", "cmd": ["bclconvert", "dorado", "smrtlink"]},
        {"name": "mapping", "cmd": ["bwa", "dragmap", "minimap", "pbmm"]},
        {"name": "postmapping", "cmd": ["bqsr", "dragen"]},
        {
            "name": "smallvarcalling",
            "cmd": [
                "gatk_haplotypecaller",
                "octopus",
                "glimpse",
                "dragen",
                "deepvariant",
                "clair",
            ],
            "output": "smallvar",
        },
        {
            "name": "svcalling",
            "cmd": ["manta", "dragen", "sniffles", "cutesv", "pbsv"],
            "output": "sv",
        },
        {"name": "secondary_qc", "cmd": ["genomx", "dragen", "lrqc"]},
        {"name": "trimming", "cmd": ["dragen", "seqfiltering"]},
        {"name": "phasing", "cmd": ["whatshap"]},
    ]

    def __init__(self, seed: int | None = None) -> None:
        """Draw all bioinformatics process attributes from the random generator."""
        # Only reseed when a seed is explicitly given: `random.seed(None)`
        # reseeds the global RNG from OS entropy and would destroy any
        # global seed a caller set for reproducibility.
        if seed is not None:
            random.seed(seed)

        self._proc_id = self._random_alphanum_str(n=8)

        version_str_len = random.choice(range(1, 5))
        self._pipeline_version = self._generate_version(version_str_len)
        self._name = random.choice(
            ["varscope", "glimpse", "dragen", "vacana", "pbvaria"]
        )

        self._steps: list[dict[str, str]] = []
        self._generate_steps()

    @staticmethod
    def _generate_version(count: int) -> str:
        """Generate a random version string.

        :param count: Count of numbers present in the version string.
        :raises ValueError: If count is less than 1.
        :return: A random version string with the specified count of numbers.
        """
        if count < 1:
            msg = "Count of numbers present in the version string must be > 0."
            raise ValueError(msg)

        # Single-number versions never start at 0 (a bare "0" version would
        # be meaningless); multi-number versions may contain zeros.
        lower_bound = 0
        if count == 1:
            lower_bound = 1

        version_parts = [
            str(random.randint(lower_bound, 9)) for _ in range(count)
        ]
        return ".".join(version_parts)

    def _generate_steps(self) -> None:
        """Pick 1 to 5 random steps and give each a tool and a version."""
        steps_count = random.randint(1, 5)
        # Deep-copy the sampled templates so the class-level STEPS
        # definitions are never mutated.
        random_steps = copy.deepcopy(random.sample(self.STEPS, steps_count))
        for random_step in random_steps:
            random_step["version"] = self._generate_version(
                random.choice(range(1, 5))
            )
            # Replace the list of candidate tools with a single choice.
            random_step["cmd"] = random.choice(random_step["cmd"])
            self._steps.append(random_step)

    def to_dict(self) -> RandomBiProcessData:
        """Return the generated bioinformatics process as a dictionary."""
        return {
            "proc_id": self._proc_id,
            "name": self._name,
            "pipeline_version": self._pipeline_version,
            "steps": self._steps,
            "sequencing_type": "wgs",
        }
|
191
|
+
|
|
192
|
+
class RandomAnalysis(RandomBundleItem):
    """Generate a random analysis.

    :param fasta_dir: Directory where to create the FASTA file used as a basis to generate the analysis VCF file.
    :param output_dir: Directory where the analysis VCF file
        (and coverage file if `do_gen_coverage` is set to True) is generated.

    :raises RuntimeError: Could not generate a VCF file with the given simulation parameters.
    """

    def __init__(  # noqa: PLR0913
        self,
        fasta_dir: Path,
        output_dir: Path,
        seq_len: int,
        nb_chrom: int,
        wet_proc_id: str,
        bi_proc_id: str,
        sim_params: MutSimParams,
        *,
        do_gen_coverage: bool,
    ) -> None:
        """Create the analysis and generate its data files on disk."""
        self._fasta_dir = fasta_dir
        self._output_dir = output_dir
        self._seq_len = seq_len
        self._nb_chrom = nb_chrom
        self._wet_process_id = wet_proc_id
        self._bi_process_id = bi_proc_id

        # The sample name is propagated into the simulation parameters so
        # the VCF sample column matches the analysis metadata.
        self._sample_name = "HG000" + str(random.randint(1, 9))
        sim_params.sample_name = self._sample_name
        self._sim_params = sim_params

        self._source = "CNRGH"
        self._barcode = self._random_alphanum_str(n=6)
        self._reference_genome = "hg38"
        # File-name prefix shared by every file of this analysis.
        self._prefix = "_".join(
            (
                self._sample_name,
                self._source,
                self._wet_process_id,
                self._bi_process_id,
                self._barcode,
                self._reference_genome,
            )
        )

        self._gen_vcf_file()
        if do_gen_coverage:
            self.gen_cov_file()

    def _gen_vcf_file(self) -> None:
        """Generate a dummy VCF file.

        :raises RuntimeError: The call to `mutation-simulator` returned a non-zero exit status.
        """
        fasta_path = self._fasta_dir / "seq.fasta"
        vcf_path = self._output_dir / f"{self._prefix}.vcf"

        # Step 1: write a random FASTA sequence into the temporary directory.
        sequences = BioSeqGen(
            elements=Elements(), seqlen=self._seq_len, count=self._nb_chrom
        )
        with fasta_path.open("w", encoding="utf-8") as fasta_stream:
            FastaWriter(fasta_stream, header=False).write_seqs(sequences)

        # Step 2: simulate mutations on that FASTA to produce the VCF.
        simulator = MutSim(
            fasta_file=str(fasta_path),
            vcf_file=str(vcf_path),
            sim_params=self._sim_params,
        )
        simulator.run()

    def gen_cov_file(self) -> None:
        """Generate a dummy coverage file."""
        last_position = self._seq_len - 1
        cov_path = self._output_dir / f"{self._prefix}.cov.tsv"

        with cov_path.open("w", encoding="utf-8") as cov_stream:
            # One coverage track per chromosome, appended to the same file.
            for chrom_idx in range(1, self._nb_chrom + 1):
                track = CovGen(
                    chrom=str(chrom_idx),
                    min_pos=0,
                    max_pos=last_position,
                    min_depth=5,
                    max_depth=15,
                    depth_offset=0,
                    depth_change_rate=0.1,
                )
                cov_stream.writelines(
                    entry.to_bed_line() + "\n" for entry in track
                )

    def to_dict(self) -> RandomAnalysisData:
        """Return the generated analysis as a dictionary."""
        return {
            "file_prefix": "%S_%F_%W_%B_%A_%R",
            "sample_name": self._sample_name,
            "source": self._source,
            "barcode": self._barcode,
            "wet_process": self._wet_process_id,
            "bi_process": self._bi_process_id,
            "reference_genome": self._reference_genome,
            "flowcell": self._random_alphanum_str(n=8),
            "lanes": [random.randint(1, 10)],
            "seq_indices": [
                "DUAL219",
                "DUAL222",
                "DUAL225",
                "DUAL228",
                "DUAL289",
            ],
            "data_path": str(self._output_dir),
        }
301
|
+
|
|
302
|
+
class RandomBundle(RandomBundleItem):
    """Generate a random analyses bundle."""

    def __init__(  # noqa: PLR0913
        self,
        output_dir: Path,
        analyses_count: int,
        processes_count: int,
        nb_chrom: int,
        seq_len: int,
        sim_params: MutSimParams,
        *,
        do_gen_coverage: bool,
    ) -> None:
        """Generate the processes and analyses making up the bundle."""
        self._output_dir = output_dir
        self._analyses_count = analyses_count
        self._processes_count = processes_count
        self._nb_chrom = nb_chrom
        self._seq_len = seq_len
        self._do_gen_coverage = do_gen_coverage
        self._analyses: list[RandomAnalysisData] = []

        # Generate the wet lab processes, then map them onto analyses.
        self._wet_processes: list[RandomWetProcessData] = []
        for _ in range(self._processes_count):
            self._wet_processes.append(RandomWetProcess().to_dict())
        self._assigned_wet_processes = self._assign_processes(
            self._wet_processes, self._analyses_count
        )

        # Same for the bioinformatics processes.
        self._bi_processes: list[RandomBiProcessData] = []
        for _ in range(self._processes_count):
            self._bi_processes.append(RandomBiProcess().to_dict())
        self._assigned_bi_processes = self._assign_processes(
            self._bi_processes, self._analyses_count
        )

        # The intermediate FASTA files only live for the duration of the
        # generation; the VCF (and coverage) files go to `output_dir`.
        with tempfile.TemporaryDirectory() as fasta_dir:
            try:
                # Both assigned lists have exactly `analyses_count` items.
                for wet, bi in zip(
                    self._assigned_wet_processes, self._assigned_bi_processes
                ):
                    analysis = RandomAnalysis(
                        Path(fasta_dir),
                        self._output_dir,
                        self._seq_len,
                        self._nb_chrom,
                        str(wet["proc_id"]),
                        str(bi["proc_id"]),
                        sim_params,
                        do_gen_coverage=self._do_gen_coverage,
                    )
                    self._analyses.append(analysis.to_dict())
            except RuntimeError as e:
                msg = f"VCF file generation for one analysis failed. {e}"
                raise SystemExit(msg) from None

    @staticmethod
    def _assign_processes(
        random_processes: list[dict[str, typing.Any]],
        analyses_count: int,
    ) -> list[dict[str, typing.Any]]:
        """Assigns a specified number of processes to analyses.

        The returned list always contains exactly `analyses_count` processes:
        a surplus is reduced to a random subset, a shortage is padded with
        processes drawn at random (with repetition) from the available ones,
        and an exact match is returned as-is.

        :param random_processes: A list of available processes.
        :param analyses_count: The number of processes required.
        :raises ValueError: If the input list `random_processes` is empty.
        :returns: A list of processes with a length of `analyses_count`.
        """
        if not random_processes:
            msg = "Random processes list is empty."
            raise ValueError(msg)

        if len(random_processes) > analyses_count:
            # Surplus: keep a random subset of the required size.
            return random.sample(random_processes, analyses_count)

        # Shortage (or exact match): pad a copy with random picks until the
        # required size is reached.
        assigned = random_processes.copy()
        missing = analyses_count - len(assigned)
        assigned.extend(
            random.choice(random_processes) for _ in range(missing)
        )
        return assigned

    def to_yaml(self, output_file: Path | None) -> None:
        """Export the generated bundle in YAML format to a file or stdout."""
        if output_file:
            # Write to the requested file.
            with output_file.open("w", encoding="utf-8") as stream:
                stream.write("---\n")
                yaml.dump(self.to_dict(), stream)
        else:
            # No file given: write to standard output.
            sys.stdout.write("---\n")
            yaml.dump(self.to_dict(), sys.stdout)

    def to_dict(
        self,
    ) -> dict[
        str,
        int
        | list[RandomAnalysisData]
        | list[RandomBiProcessData]
        | list[RandomWetProcessData],
    ]:
        """Return the generated bundle as a dictionary."""
        return {
            "version": 3,
            "analyses": self._analyses,
            "bi_processes": self._bi_processes,
            "wet_processes": self._wet_processes,
        }
|
genelastic/import_data/tags.py
CHANGED
|
@@ -1,24 +1,24 @@
|
|
|
1
|
-
# pylint: disable=missing-module-docstring
|
|
2
1
|
import logging
|
|
3
2
|
import re
|
|
4
3
|
import typing
|
|
5
4
|
|
|
6
5
|
from genelastic.common import BundleDict
|
|
7
6
|
|
|
8
|
-
from .constants import DEFAULT_TAG2FIELD,
|
|
7
|
+
from .constants import DEFAULT_TAG2FIELD, DEFAULT_TAG_PREFIX, DEFAULT_TAG_SUFFIX
|
|
9
8
|
|
|
10
|
-
logger = logging.getLogger(
|
|
9
|
+
logger = logging.getLogger("genelastic")
|
|
11
10
|
|
|
12
|
-
TagsDefinition: typing.TypeAlias =
|
|
11
|
+
TagsDefinition: typing.TypeAlias = dict[str, dict[str, str | dict[str, str]]]
|
|
13
12
|
|
|
14
13
|
|
|
15
14
|
class Tags:
|
|
16
|
-
"""
|
|
17
|
-
This class handles the definition of default and custom tags.
|
|
15
|
+
"""This class handles the definition of default and custom tags.
|
|
18
16
|
Tags are used to extract custom metadata from files belonging to an analysis.
|
|
19
17
|
"""
|
|
20
|
-
|
|
21
|
-
|
|
18
|
+
|
|
19
|
+
def __init__(self, documents: typing.Sequence[BundleDict] | None) -> None:
|
|
20
|
+
"""Create a Tag instance."""
|
|
21
|
+
self._tags: dict[str, dict[str, str]] = DEFAULT_TAG2FIELD
|
|
22
22
|
self._tag_prefix: str = DEFAULT_TAG_PREFIX
|
|
23
23
|
self._tag_suffix: str = DEFAULT_TAG_SUFFIX
|
|
24
24
|
|
|
@@ -30,11 +30,15 @@ class Tags:
|
|
|
30
30
|
|
|
31
31
|
if redefined_tags:
|
|
32
32
|
self._build_tags(redefined_tags)
|
|
33
|
-
logger.info(
|
|
34
|
-
|
|
33
|
+
logger.info(
|
|
34
|
+
"The following tags will be used to extract metadata from filenames : %s",
|
|
35
|
+
self._tags,
|
|
36
|
+
)
|
|
35
37
|
else:
|
|
36
|
-
logger.info(
|
|
37
|
-
|
|
38
|
+
logger.info(
|
|
39
|
+
"Using the default tags to extract metadata from filenames : %s",
|
|
40
|
+
self._tags,
|
|
41
|
+
)
|
|
38
42
|
|
|
39
43
|
def _build_tags(self, redefined_tags: TagsDefinition) -> None:
|
|
40
44
|
# Erase the tags defined by defaults.
|
|
@@ -53,22 +57,31 @@ class Tags:
|
|
|
53
57
|
|
|
54
58
|
for tag_name, tag_attrs in redefined_tags["match"].items():
|
|
55
59
|
if isinstance(tag_attrs, dict): # extra type check for mypy
|
|
56
|
-
self._tags[
|
|
60
|
+
self._tags[
|
|
61
|
+
f"{self._tag_prefix}{tag_name}{self._tag_suffix}"
|
|
62
|
+
] = tag_attrs
|
|
57
63
|
|
|
58
64
|
@staticmethod
|
|
59
|
-
def _search_redefined_tags(
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
65
|
+
def _search_redefined_tags(
|
|
66
|
+
documents: typing.Sequence[BundleDict],
|
|
67
|
+
) -> TagsDefinition | None:
|
|
68
|
+
documents_with_redefined_tags: list[BundleDict] = [
|
|
69
|
+
d for d in documents if "tags" in d
|
|
70
|
+
]
|
|
71
|
+
bundle_paths = [d["bundle_file"] for d in documents_with_redefined_tags]
|
|
64
72
|
|
|
65
73
|
# If there are more than one 'tags' redefinition across the documents, raise an error.
|
|
66
74
|
if len(documents_with_redefined_tags) > 1:
|
|
67
|
-
|
|
68
|
-
|
|
75
|
+
msg = (
|
|
76
|
+
f"Only one 'tags' key should be defined across all documents, "
|
|
77
|
+
f"but multiple were found : {', '.join(bundle_paths)}"
|
|
78
|
+
)
|
|
79
|
+
raise RuntimeError(msg)
|
|
69
80
|
|
|
70
81
|
if len(documents_with_redefined_tags) == 1:
|
|
71
|
-
redefined_tags: TagsDefinition = documents_with_redefined_tags[0][
|
|
82
|
+
redefined_tags: TagsDefinition = documents_with_redefined_tags[0][
|
|
83
|
+
"tags"
|
|
84
|
+
]
|
|
72
85
|
return redefined_tags
|
|
73
86
|
|
|
74
87
|
return None
|
|
@@ -84,9 +97,8 @@ class Tags:
|
|
|
84
97
|
return self._tag_suffix
|
|
85
98
|
|
|
86
99
|
@property
|
|
87
|
-
def items(self) -> typing.ItemsView[str,
|
|
88
|
-
"""
|
|
89
|
-
Returns the tag items : the key is the tag name,
|
|
100
|
+
def items(self) -> typing.ItemsView[str, dict[str, str]]:
|
|
101
|
+
"""Returns the tag items : the key is the tag name,
|
|
90
102
|
and the value is the tag attributes (a dict containing the 'field' and 'regex' keys).
|
|
91
103
|
"""
|
|
92
104
|
return self._tags.items()
|
|
@@ -94,10 +106,18 @@ class Tags:
|
|
|
94
106
|
@property
|
|
95
107
|
def search_regex(self) -> str:
|
|
96
108
|
"""Returns a regex to search for a tag inside a string."""
|
|
97
|
-
return
|
|
109
|
+
return (
|
|
110
|
+
r"("
|
|
111
|
+
+ re.escape(self._tag_prefix)
|
|
112
|
+
+ r"\w+"
|
|
113
|
+
+ re.escape(self._tag_suffix)
|
|
114
|
+
+ r")"
|
|
115
|
+
)
|
|
98
116
|
|
|
99
117
|
def __len__(self) -> int:
|
|
118
|
+
"""Return the number of registered tags."""
|
|
100
119
|
return len(self._tags)
|
|
101
120
|
|
|
102
|
-
def __getitem__(self, key: str) ->
|
|
121
|
+
def __getitem__(self, key: str) -> dict[str, str]:
|
|
122
|
+
"""Return a tag by its key."""
|
|
103
123
|
return self._tags[key]
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
# pylint: disable=missing-module-docstring
|
|
2
1
|
import copy
|
|
3
2
|
|
|
4
3
|
from genelastic.common import WetProcessesData
|
|
@@ -6,9 +5,14 @@ from genelastic.common import WetProcessesData
|
|
|
6
5
|
|
|
7
6
|
class WetProcess:
|
|
8
7
|
"""Class WetProcess that represents a wet process."""
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
8
|
+
|
|
9
|
+
def __init__(
|
|
10
|
+
self,
|
|
11
|
+
proc_id: str,
|
|
12
|
+
bundle_file: str | None = None,
|
|
13
|
+
**data: str | float,
|
|
14
|
+
) -> None:
|
|
15
|
+
"""Create a WetProcess instance."""
|
|
12
16
|
self._proc_id = proc_id
|
|
13
17
|
self._bundle_file = bundle_file
|
|
14
18
|
self._data: WetProcessesData = data
|