XspecT 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of XspecT might be problematic. Click here for more details.

xspect/classify.py CHANGED
@@ -1,11 +1,13 @@
1
+ """Classification module"""
2
+
1
3
  from pathlib import Path
2
- from xspect.mlst_feature.mlst_helper import pick_scheme_from_models_dir
4
+ from importlib import import_module
3
5
  import xspect.model_management as mm
4
- from xspect.models.probabilistic_filter_mlst_model import (
5
- ProbabilisticFilterMlstSchemeModel,
6
- )
7
6
  from xspect.file_io import prepare_input_output_paths
8
7
 
8
+ # inline imports lead to "invalid name" issues
9
+ # pylint: disable=invalid-name
10
+
9
11
 
10
12
  def classify_genus(
11
13
  model_genus: str, input_path: Path, output_path: Path, step: int = 1
@@ -22,7 +24,12 @@ def classify_genus(
22
24
  output_path (Path): The path to the output file where results will be saved.
23
25
  step (int): The amount of kmers to be skipped.
24
26
  """
25
- model = mm.get_genus_model(model_genus)
27
+ ProbabilisticSingleFilterModel = import_module(
28
+ "xspect.models.probabilistic_single_filter_model"
29
+ ).ProbabilisticSingleFilterModel
30
+
31
+ model_path = mm.get_genus_model_path(model_genus)
32
+ model = ProbabilisticSingleFilterModel.load(model_path)
26
33
  input_paths, get_output_path = prepare_input_output_paths(input_path)
27
34
 
28
35
  for idx, current_path in enumerate(input_paths):
@@ -34,7 +41,12 @@ def classify_genus(
34
41
 
35
42
 
36
43
  def classify_species(
37
- model_genus: str, input_path: Path, output_path: Path, step: int = 1
44
+ model_genus: str,
45
+ input_path: Path,
46
+ output_path: Path,
47
+ step: int = 1,
48
+ display_name: bool = False,
49
+ validation: bool = False,
38
50
  ):
39
51
  """
40
52
  Classify the species of sequences.
@@ -47,12 +59,24 @@ def classify_species(
47
59
  input_path (Path): The path to the input file/directory containing sequences.
48
60
  output_path (Path): The path to the output file where results will be saved.
49
61
  step (int): The amount of kmers to be skipped.
62
+ display_name (bool): Includes a display name for each tax_ID.
63
+ validation (bool): Sorts out misclassified reads.
50
64
  """
51
- model = mm.get_species_model(model_genus)
65
+ ProbabilisticFilterSVMModel = import_module(
66
+ "xspect.models.probabilistic_filter_svm_model"
67
+ ).ProbabilisticFilterSVMModel
68
+
69
+ model_path = mm.get_species_model_path(model_genus)
70
+ model = ProbabilisticFilterSVMModel.load(model_path)
52
71
  input_paths, get_output_path = prepare_input_output_paths(input_path)
53
72
 
54
73
  for idx, current_path in enumerate(input_paths):
55
- result = model.predict(current_path, step=step)
74
+ result = model.predict(
75
+ current_path,
76
+ step=step,
77
+ display_name=display_name,
78
+ validation=validation,
79
+ )
56
80
  result.input_source = current_path.name
57
81
  cls_path = get_output_path(idx, output_path)
58
82
  result.save(cls_path)
@@ -68,6 +92,12 @@ def classify_mlst(input_path: Path, output_path: Path, limit: bool):
68
92
  output_path (Path): The path to the output file where results will be saved.
69
93
  limit (bool): A limit for the highest allele_id results that are shown.
70
94
  """
95
+ pick_scheme_from_models_dir = import_module(
96
+ "xspect.mlst_feature.mlst_helper"
97
+ ).pick_scheme_from_models_dir
98
+ ProbabilisticFilterMlstSchemeModel = import_module(
99
+ "xspect.models.probabilistic_filter_mlst_model"
100
+ ).ProbabilisticFilterMlstSchemeModel
71
101
 
72
102
  scheme_path = pick_scheme_from_models_dir()
73
103
  model = ProbabilisticFilterMlstSchemeModel.load(scheme_path)
xspect/definitions.py CHANGED
@@ -11,8 +11,9 @@ def get_xspect_root_path() -> Path:
11
11
  """
12
12
  Return the root path for XspecT data.
13
13
 
14
- Returns the path to the XspecT data directory, which can be located either in the user's home directory or in the current working directory.
15
- If neither exists, it creates the directory in the user's home directory.
14
+ Returns the path to the XspecT data directory, which can be located either in the user's home
15
+ directory or in the current working directory. If neither exists, it creates the directory in
16
+ the user's home directory.
16
17
 
17
18
  Returns:
18
19
  Path: The path to the XspecT data directory.
@@ -34,8 +35,8 @@ def get_xspect_model_path() -> Path:
34
35
  """
35
36
  Return the path to the XspecT models.
36
37
 
37
- Returns the path to the XspecT models directory, which is located within the XspecT data directory.
38
- If the directory does not exist, it creates the directory.
38
+ Returns the path to the XspecT models directory, which is located within the XspecT data
39
+ directory. If the directory does not exist, it creates the directory.
39
40
 
40
41
  Returns:
41
42
  Path: The path to the XspecT models directory.
@@ -49,8 +50,8 @@ def get_xspect_upload_path() -> Path:
49
50
  """
50
51
  Return the path to the XspecT upload directory.
51
52
 
52
- Returns the path to the XspecT uploads directory, which is located within the XspecT data directory.
53
- If the directory does not exist, it creates the directory.
53
+ Returns the path to the XspecT uploads directory, which is located within the XspecT data
54
+ directory. If the directory does not exist, it creates the directory.
54
55
 
55
56
  Returns:
56
57
  Path: The path to the XspecT uploads directory.
@@ -64,8 +65,8 @@ def get_xspect_runs_path() -> Path:
64
65
  """
65
66
  Return the path to the XspecT runs directory.
66
67
 
67
- Returns the path to the XspecT runs directory, which is located within the XspecT data directory.
68
- If the directory does not exist, it creates the directory.
68
+ Returns the path to the XspecT runs directory, which is located within the XspecT data
69
+ directory. If the directory does not exist, it creates the directory.
69
70
 
70
71
  Returns:
71
72
  Path: The path to the XspecT runs directory.
@@ -79,8 +80,8 @@ def get_xspect_mlst_path() -> Path:
79
80
  """
80
81
  Return the path to the XspecT MLST directory.
81
82
 
82
- Returns the path to the XspecT MLST directory, which is located within the XspecT data directory.
83
- If the directory does not exist, it creates the directory.
83
+ Returns the path to the XspecT MLST directory, which is located within the XspecT data
84
+ directory. If the directory does not exist, it creates the directory.
84
85
 
85
86
  Returns:
86
87
  Path: The path to the XspecT MLST directory.
@@ -88,3 +89,22 @@ def get_xspect_mlst_path() -> Path:
88
89
  mlst_path = get_xspect_root_path() / "mlst"
89
90
  mlst_path.mkdir(exist_ok=True, parents=True)
90
91
  return mlst_path
92
+
93
+
94
def get_xspect_misclassification_path() -> Path:
    """
    Return the path to the XspecT Misclassification directory.

    The directory is named "misclassification" and lives directly inside the XspecT data
    directory. It is created, together with any missing parent directories, when it does
    not exist yet.

    Notes:
        Developed by Oemer Cetin as part of a BSc thesis at Goethe University Frankfurt am Main
        (2025). (An Integration of Alignment-Free and Alignment-Based Approaches for Bacterial
        Taxon Assignment)

    Returns:
        Path: The path to the XspecT Misclassification directory.
    """
    target_dir = get_xspect_root_path().joinpath("misclassification")
    # Creating the directory is idempotent: an existing directory is left untouched.
    target_dir.mkdir(parents=True, exist_ok=True)
    return target_dir
xspect/file_io.py CHANGED
@@ -113,7 +113,8 @@ def concatenate_metagenome(fasta_dir: Path, meta_path: Path) -> None:
113
113
  Concatenate all fasta files in a directory into one file.
114
114
 
115
115
  This function searches for all fasta files in the specified directory and writes their contents
116
- into a single output file. The output file will contain the concatenated sequences from all fasta files.
116
+ into a single output file. The output file will contain the concatenated sequences from all
117
+ fasta files.
117
118
 
118
119
  Args:
119
120
  fasta_dir (Path): Path to the directory with the fasta files.
@@ -1,7 +1,13 @@
1
+ """Sequence filtering module"""
2
+
1
3
  from pathlib import Path
2
- from xspect.model_management import get_genus_model, get_species_model
4
+ from importlib import import_module
5
+ from xspect.model_management import get_genus_model_path, get_species_model_path
3
6
  from xspect.file_io import filter_sequences, prepare_input_output_paths
4
7
 
8
+ # inline imports lead to "invalid name" issues
9
+ # pylint: disable=invalid-name
10
+
5
11
 
6
12
  def filter_species(
7
13
  model_genus: str,
@@ -31,7 +37,12 @@ def filter_species(
31
37
  available species scores.
32
38
  sparse_sampling_step (int): The step size for sparse sampling. Defaults to 1.
33
39
  """
34
- species_model = get_species_model(model_genus)
40
+ ProbabilisticFilterSVMModel = import_module(
41
+ "xspect.models.probabilistic_filter_svm_model"
42
+ ).ProbabilisticFilterSVMModel
43
+
44
+ species_model_path = get_species_model_path(model_genus)
45
+ species_model = ProbabilisticFilterSVMModel.load(species_model_path)
35
46
  input_paths, get_output_path = prepare_input_output_paths(input_path)
36
47
 
37
48
  for idx, current_path in enumerate(input_paths):
@@ -82,11 +93,16 @@ def filter_genus(
82
93
  sparse_sampling_step (int): The step size for sparse sampling. Defaults to 1.
83
94
 
84
95
  """
85
- model = get_genus_model(model_genus)
96
+ ProbabilisticSingleFilterModel = import_module(
97
+ "xspect.models.probabilistic_single_filter_model"
98
+ ).ProbabilisticSingleFilterModel
99
+
100
+ genus_model_path = get_genus_model_path(model_genus)
101
+ genus_model = ProbabilisticSingleFilterModel.load(genus_model_path)
86
102
  input_paths, get_output_path = prepare_input_output_paths(input_path)
87
103
 
88
104
  for idx, current_path in enumerate(input_paths):
89
- result = model.predict(current_path, step=sparse_sampling_step)
105
+ result = genus_model.predict(current_path, step=sparse_sampling_step)
90
106
  result.input_source = current_path.name
91
107
 
92
108
  if classification_output_path:
xspect/main.py CHANGED
@@ -2,25 +2,12 @@
2
2
 
3
3
  from pathlib import Path
4
4
  from uuid import uuid4
5
+ from importlib import import_module
5
6
  import click
6
- import uvicorn
7
- from xspect import classify
8
- from xspect.web import app
9
- from xspect.download_models import download_test_models
10
- from xspect import filter_sequences
11
- from xspect.train import train_from_directory, train_from_ncbi
12
- from xspect.definitions import (
13
- get_xspect_model_path,
14
- )
15
- from xspect.mlst_feature.mlst_helper import pick_scheme
16
- from xspect.mlst_feature.pub_mlst_handler import PubMLSTHandler
17
- from xspect.models.probabilistic_filter_mlst_model import (
18
- ProbabilisticFilterMlstSchemeModel,
19
- )
20
- from xspect.model_management import (
21
- get_model_metadata,
22
- get_models,
23
- )
7
+ from xspect.model_management import get_models
8
+
9
+ # inline imports lead to "invalid name" issues
10
+ # pylint: disable=invalid-name
24
11
 
25
12
 
26
13
  @click.group()
@@ -32,7 +19,10 @@ def cli():
32
19
  @cli.command()
33
20
  def web():
34
21
  """Open the XspecT web application."""
35
- uvicorn.run(app, host="0.0.0.0", port=8000)
22
+ app = import_module("xspect.web").app
23
+ run = import_module("uvicorn").run
24
+
25
+ run(app, host="0.0.0.0", port=8000)
36
26
 
37
27
 
38
28
  # # # # # # # # # # # # # # #
@@ -49,6 +39,8 @@ def models():
49
39
  def download():
50
40
  """Download models."""
51
41
  click.echo("Downloading models, this may take a while...")
42
+ download_test_models = import_module("xspect.download_models").download_test_models
43
+
52
44
  download_test_models(
53
45
  "https://assets.adrianromberg.com/science/xspect-models-07-08-2025.zip"
54
46
  )
@@ -64,7 +56,6 @@ def list_models():
64
56
  if not available_models:
65
57
  click.echo("No models found.")
66
58
  return
67
- # todo: make this machine readable
68
59
  click.echo("Models found:")
69
60
  click.echo("--------------")
70
61
  for model_type, names in available_models.items():
@@ -96,11 +87,62 @@ def train():
96
87
  help="Email of the author.",
97
88
  default=None,
98
89
  )
99
- def train_ncbi(model_genus, svm_steps, author, author_email):
90
+ @click.option(
91
+ "--min-n50",
92
+ type=int,
93
+ help="Minimum contig N50 to filter the accessions (default: 10000).",
94
+ default=10000,
95
+ )
96
+ @click.option(
97
+ "--include-atypical/--exclude-atypical",
98
+ help="Include or exclude atypical accessions (default: exclude).",
99
+ default=False,
100
+ )
101
+ @click.option(
102
+ "--allow-inconclusive",
103
+ is_flag=True,
104
+ help="Allow the use of accessions with inconclusive taxonomy check status for training.",
105
+ default=False,
106
+ )
107
+ @click.option(
108
+ "--allow-candidatus",
109
+ is_flag=True,
110
+ help="Allow the use of Candidatus species for training.",
111
+ default=False,
112
+ )
113
+ @click.option(
114
+ "--allow-sp",
115
+ is_flag=True,
116
+ help="Allow the use of species with 'sp.' in their names for training.",
117
+ default=False,
118
+ )
119
+ def train_ncbi(
120
+ model_genus,
121
+ svm_steps,
122
+ author,
123
+ author_email,
124
+ min_n50,
125
+ include_atypical,
126
+ allow_inconclusive,
127
+ allow_candidatus,
128
+ allow_sp,
129
+ ):
100
130
  """Train a species and a genus model based on NCBI data."""
101
131
  click.echo(f"Training {model_genus} species and genus metagenome model.")
102
132
  try:
103
- train_from_ncbi(model_genus, svm_steps, author, author_email)
133
+ train_from_ncbi = import_module("xspect.train").train_from_ncbi
134
+
135
+ train_from_ncbi(
136
+ model_genus,
137
+ svm_steps,
138
+ author,
139
+ author_email,
140
+ min_n50=min_n50,
141
+ exclude_atypical=not include_atypical,
142
+ allow_inconclusive=allow_inconclusive,
143
+ allow_candidatus=allow_candidatus,
144
+ allow_sp=allow_sp,
145
+ )
104
146
  except ValueError as e:
105
147
  click.echo(f"Error: {e}")
106
148
  return
@@ -143,6 +185,8 @@ def train_ncbi(model_genus, svm_steps, author, author_email):
143
185
  def train_directory(model_genus, input_path, svm_steps, meta, author, author_email):
144
186
  """Train a model based on data from a directory for a given genus."""
145
187
  click.echo(f"Training {model_genus} model with {svm_steps} SVM steps.")
188
+ train_from_directory = import_module("xspect.train").train_from_directory
189
+
146
190
  train_from_directory(
147
191
  model_genus,
148
192
  Path(input_path),
@@ -167,12 +211,28 @@ def train_directory(model_genus, input_path, svm_steps, meta, author, author_ema
167
211
  def train_mlst(choose_schemes):
168
212
  """Download alleles and train bloom filters."""
169
213
  click.echo("Updating alleles")
214
+ mlst_helper = import_module("xspect.mlst_feature.mlst_helper")
215
+ pick_scheme = mlst_helper.pick_scheme
216
+
217
+ pub_mlst_handler = import_module("xspect.mlst_feature.pub_mlst_handler")
218
+ PubMLSTHandler = pub_mlst_handler.PubMLSTHandler
219
+
220
+ probabilistic_filter_mlst_model = import_module(
221
+ "xspect.models.probabilistic_filter_mlst_model"
222
+ )
223
+ ProbabilisticFilterMlstSchemeModel = (
224
+ probabilistic_filter_mlst_model.ProbabilisticFilterMlstSchemeModel
225
+ )
226
+
227
+ definitions = import_module("xspect.definitions")
228
+ get_xspect_model_path = definitions.get_xspect_model_path
229
+
170
230
  handler = PubMLSTHandler()
171
231
  handler.download_alleles(choose_schemes)
172
232
  click.echo("Download finished")
173
233
  scheme_path = pick_scheme(handler.get_scheme_paths())
174
234
  species_name = str(scheme_path).split("/")[-2]
175
- scheme_name = str(scheme_path).split("/")[-1]
235
+ scheme_name = str(scheme_path).rsplit("/", maxsplit=1)[-1]
176
236
  scheme_url = handler.scheme_mapping[str(scheme_path)]
177
237
  model = ProbabilisticFilterMlstSchemeModel(
178
238
  31, f"{species_name}:{scheme_name}", get_xspect_model_path(), scheme_url
@@ -230,6 +290,8 @@ def classify_seqs():
230
290
  def classify_genus(model_genus, input_path, output_path, sparse_sampling_step):
231
291
  """Classify samples using a genus model."""
232
292
  click.echo("Classifying...")
293
+ classify = import_module("xspect.classify")
294
+
233
295
  classify.classify_genus(
234
296
  model_genus, Path(input_path), Path(output_path), sparse_sampling_step
235
297
  )
@@ -268,11 +330,37 @@ def classify_genus(model_genus, input_path, output_path, sparse_sampling_step):
268
330
  help="Sparse sampling step (e. g. only every 500th kmer for '--sparse-sampling-step 500').",
269
331
  default=1,
270
332
  )
271
- def classify_species(model_genus, input_path, output_path, sparse_sampling_step):
333
+ @click.option(
334
+ "-n",
335
+ "--display-names",
336
+ help="Includes the display names next to taxonomy-IDs.",
337
+ is_flag=True,
338
+ )
339
+ @click.option(
340
+ "-v",
341
+ "--validation",
342
+ help="Detects misclassification for small reads or contigs.",
343
+ is_flag=True,
344
+ )
345
+ def classify_species(
346
+ model_genus,
347
+ input_path,
348
+ output_path,
349
+ sparse_sampling_step,
350
+ display_names,
351
+ validation,
352
+ ):
272
353
  """Classify samples using a species model."""
273
354
  click.echo("Classifying...")
355
+ classify = import_module("xspect.classify")
356
+
274
357
  classify.classify_species(
275
- model_genus, Path(input_path), Path(output_path), sparse_sampling_step
358
+ model_genus,
359
+ Path(input_path),
360
+ Path(output_path),
361
+ sparse_sampling_step,
362
+ display_names,
363
+ validation,
276
364
  )
277
365
 
278
366
 
@@ -301,6 +389,8 @@ def classify_species(model_genus, input_path, output_path, sparse_sampling_step)
301
389
  def classify_mlst(input_path, output_path, limit):
302
390
  """MLST classify a sample."""
303
391
  click.echo("Classifying...")
392
+ classify = import_module("xspect.classify")
393
+
304
394
  classify.classify_mlst(Path(input_path), Path(output_path), limit)
305
395
 
306
396
 
@@ -372,6 +462,7 @@ def filter_genus(
372
462
  ):
373
463
  """Filter samples using a genus model."""
374
464
  click.echo("Filtering...")
465
+ filter_sequences = import_module("xspect.filter_sequences")
375
466
 
376
467
  filter_sequences.filter_genus(
377
468
  model_genus,
@@ -426,14 +517,16 @@ def filter_genus(
426
517
  "-t",
427
518
  "--threshold",
428
519
  type=float,
429
- help="Threshold for filtering (default: 0.7). Use -1 to filter for the highest scoring species.",
520
+ help="Threshold for filtering (default: 0.7). Use -1 to filter for the highest scoring "
521
+ "species.",
430
522
  default=0.7,
431
523
  prompt=True,
432
524
  )
433
525
  @click.option(
434
526
  "--sparse-sampling-step",
435
527
  type=int,
436
- help="Sparse sampling step (e. g. only every 500th kmer for '--sparse-sampling-step 500').",
528
+ help="Sparse sampling step (e. g. only every 500th kmer for "
529
+ "'--sparse-sampling-step 500').",
437
530
  default=1,
438
531
  )
439
532
  def filter_species(
@@ -449,9 +542,12 @@ def filter_species(
449
542
 
450
543
  if threshold != -1 and (threshold < 0 or threshold > 1):
451
544
  raise click.BadParameter(
452
- "Threshold must be between 0 and 1, or -1 for filtering by the highest scoring species."
545
+ "Threshold must be between 0 and 1, or -1 for filtering by the highest "
546
+ "scoring species."
453
547
  )
454
548
 
549
+ get_model_metadata = import_module("xspect.model_management").get_model_metadata
550
+
455
551
  available_species = get_model_metadata(f"{model_genus}-species")["display_names"]
456
552
  available_species = {
457
553
  id: name.replace(f"{model_genus} ", "")
@@ -476,6 +572,8 @@ def filter_species(
476
572
  ][0]
477
573
 
478
574
  click.echo("Filtering...")
575
+ filter_sequences = import_module("xspect.filter_sequences")
576
+
479
577
  filter_sequences.filter_species(
480
578
  model_genus,
481
579
  model_species,
File without changes
@@ -0,0 +1,168 @@
1
+ """
2
+ Mapping handler for the alignment-based misclassification detection.
3
+
4
+ Notes:
5
+ Developed by Oemer Cetin as part of a BSc thesis at Goethe University Frankfurt am Main (2025).
6
+ (An Integration of Alignment-Free and Alignment-Based Approaches for Bacterial Taxon Assignment)
7
+ """
8
+
9
+ import mappy, pysam, os, csv
10
+ from Bio import SeqIO
11
+ from xspect.definitions import fasta_endings
12
+
13
+ __author__ = "Cetin, Oemer"
14
+
15
+
16
+ class MappingHandler:
17
+ """Handler class for all mapping related procedures."""
18
+
19
+ def __init__(self, ref_genome_path: str, reads_path: str) -> None:
20
+ """
21
+ Initialise the mapping handler.
22
+
23
+ This method sets up the paths to the reference genome and query sequences.
24
+ Additionally, the paths to the output formats (SAM, BAM and TSV) are generated.
25
+
26
+ Args:
27
+ ref_genome_path (str): The path to the reference genome.
28
+ reads_path (str): The path to the query sequences.
29
+ """
30
+ if not os.path.isfile(ref_genome_path):
31
+ raise ValueError("The path to the reference genome does not exist.")
32
+
33
+ if not os.path.isfile(reads_path):
34
+ raise ValueError("The path to the reads does not exist.")
35
+
36
+ if not ref_genome_path.endswith(tuple(fasta_endings)) and reads_path.endswith(
37
+ tuple(fasta_endings)
38
+ ):
39
+ raise ValueError("The files must be FASTA-files!")
40
+
41
+ stem = reads_path.rsplit(".", 1)[0] + "_mapped"
42
+ self.ref_genome_path = ref_genome_path
43
+ self.reads_path = reads_path
44
+ self.sam = stem + ".sam"
45
+ self.bam = stem + ".sorted.bam"
46
+ self.tsv = stem + ".start_coordinates.tsv"
47
+
48
+ def map_reads_onto_reference(self) -> None:
49
+ """
50
+ A Method that maps reads against the respective reference genome.
51
+
52
+ This function creates a SAM file via Mappy and converts it into a BAM file.
53
+ """
54
+ # create header (entry = sequences of the reference genome)
55
+ ref_seq = [
56
+ {"SN": rec.id, "LN": len(rec.seq)}
57
+ for rec in SeqIO.parse(self.ref_genome_path, "fasta")
58
+ ]
59
+ header = {"HD": {"VN": "1.0"}, "SQ": ref_seq}
60
+ target_id = {sequence["SN"]: number for number, sequence in enumerate(ref_seq)}
61
+
62
+ reads = list(SeqIO.parse(self.reads_path, "fasta"))
63
+ if not reads:
64
+ raise ValueError("Reads file is empty.")
65
+
66
+ read_length = len(reads[0].seq)
67
+ preset = "map-ont" if read_length > 150 else "sr"
68
+ # create SAM-file
69
+ aln = mappy.Aligner(self.ref_genome_path, preset=preset)
70
+ with pysam.AlignmentFile(self.sam, "w", header=header) as out:
71
+ for read in reads:
72
+ read_seq = str(read.seq)
73
+ for hit in aln.map(read_seq):
74
+ if hit.cigar_str is None:
75
+ continue
76
+ # add soft-clips so CIGAR length == len(read_seq) IMPORTANT!!
77
+ leftS = hit.q_st
78
+ rightS = len(read_seq) - hit.q_en
79
+ cigar = (
80
+ (f"{leftS}S" if leftS > 0 else "")
81
+ + hit.cigar_str
82
+ + (f"{rightS}S" if rightS > 0 else "")
83
+ )
84
+
85
+ mapped_region = pysam.AlignedSegment()
86
+ mapped_region.query_name = read.id
87
+ mapped_region.query_sequence = read_seq
88
+ mapped_region.flag = 16 if hit.strand == -1 else 0
89
+ mapped_region.reference_id = target_id[hit.ctg]
90
+ mapped_region.reference_start = hit.r_st
91
+ mapped_region.mapping_quality = (
92
+ hit.mapq or 255
93
+ ) # 0-60 (255 means unavailable)
94
+ mapped_region.cigarstring = cigar
95
+ out.write(mapped_region)
96
+ break # keep only primary
97
+
98
+ # create BAM-file
99
+ pysam.sort("-o", self.bam, self.sam)
100
+ pysam.index(self.bam)
101
+
102
+ def get_total_genome_length(self) -> int:
103
+ """
104
+ Get the genome length from a BAM-file.
105
+
106
+ This function opens a BAM-file and extracts the genome length information.
107
+
108
+ Returns:
109
+ int: The genome length.
110
+ """
111
+ with pysam.AlignmentFile(self.bam, "rb") as bam:
112
+ return sum(bam.lengths)
113
+
114
+ def extract_starting_coordinates(self) -> None:
115
+ """
116
+ Extract starting coordinates of mapped regions from a BAM-file.
117
+
118
+ This function scans through a BAM-file and creates a TSV-file.
119
+ The information that is extracted is the starting coordinate for each mapped read.
120
+ """
121
+ # create tsv-file with all start positions
122
+ with open(self.tsv, "w") as tsv:
123
+ tsv.write("reference_genome\tread\tmapped_starting_coordinate\n")
124
+ try:
125
+ with pysam.AlignmentFile(self.bam, "rb") as bam:
126
+ entry = {
127
+ i: seq["SN"] for i, seq in enumerate(bam.header.to_dict()["SQ"])
128
+ }
129
+ seen = set()
130
+ for ref_seq in bam.references:
131
+ for hit in bam.fetch(ref_seq):
132
+ if (
133
+ hit.is_unmapped
134
+ or hit.is_secondary
135
+ or hit.is_supplementary
136
+ ):
137
+ continue
138
+ key = (hit.reference_id, hit.reference_start)
139
+ if key in seen:
140
+ continue
141
+ seen.add(key)
142
+ tsv.write(
143
+ f"{entry[hit.reference_id]}\t{hit.query_name}\t{hit.reference_start}\n"
144
+ )
145
+ except ValueError:
146
+ tsv.write("dummy_reference\tdummy_read\t1000\n")
147
+
148
+ def get_start_coordinates(self) -> list[int]:
149
+ """
150
+ Get the coordinates of a TSV-file.
151
+
152
+ This function opens a TSV-file and saves all starting coordinates in a list.
153
+
154
+ Returns:
155
+ list[int]: The list containing all starting coordinates.
156
+
157
+ Raises:
158
+ ValueError: If no column with starting coordinates is found.
159
+ """
160
+ coordinates = []
161
+ with open(self.tsv, "r", newline="") as f:
162
+ reader = csv.DictReader(f, delimiter="\t")
163
+ for row in reader:
164
+ val = row.get("mapped_starting_coordinate")
165
+ if val is None:
166
+ raise ValueError("Column with starting coordinates not found.")
167
+ coordinates.append(int(val))
168
+ return coordinates