XspecT 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of XspecT might be problematic.

@@ -0,0 +1,102 @@
+ """
+ Point pattern density analysis tool for alignment-based misclassification detection.
+
+ Notes:
+     Developed by Oemer Cetin as part of a BSc thesis (2025), Goethe University Frankfurt am Main.
+     (An Integration of Alignment-Free and Alignment-Based Approaches for Bacterial Taxon Assignment)
+ """
+
+ import numpy
+
+ __author__ = "Cetin, Oemer"
+
+
+ class PointPatternAnalysis:
+     """Class for all point pattern density analysis procedures."""
+
+     def __init__(self, points: list[int], length: int):
+         """
+         Initialise the class for point pattern analysis.
+
+         This method sets up the required list of data points (sorted) and the length of the reference genome.
+         The intensity required for the statistics is also calculated.
+
+         Args:
+             points (list): The start coordinates of mapped regions on the genome.
+             length (int): The length of the reference genome.
+         """
+         if len(points) < 2:
+             raise ValueError("Need at least 2 points.")
+         self.sorted_points = numpy.sort(numpy.asarray(points, dtype=float))
+         self.n = len(points)
+         self.length = float(length)
+
+     def ripleys_k(self) -> tuple[bool, float, float]:
+         """
+         Calculates the K-function for the given point distribution.
+
+         This method calculates the K-function to describe the point distribution.
+         The result is then compared with what would be expected under a completely random distribution.
+         (Under complete randomness the K-function result is 2*r.)
+
+         Returns:
+             tuple: Whether the points are clustered, the observed K value, and the expected value 2*r.
+         """
+         r = 0.01 * self.length
+         left = 0
+         right = 0
+         total_neighbors = 0
+
+         for i in range(self.n):
+             while self.sorted_points[i] - self.sorted_points[left] > r:
+                 left += 1
+             if right < i:
+                 right = i
+             while (
+                 right + 1 < self.n
+                 and self.sorted_points[right + 1] - self.sorted_points[i] <= r
+             ):
+                 right += 1
+             total_neighbors += right - left
+         k = (self.length / (self.n * (self.n - 1))) * total_neighbors
+         return (k > 2 * r), k, 2 * r
+
+     def ripleys_k_edge_corrected(self) -> tuple[bool, float, float]:
+         """
+         Calculates the K-function for the given point distribution with an edge correction factor.
+
+         This method calculates the K-function to describe the point distribution.
+         This time an additional factor is applied to each data point to account for edge effects.
+         The result is then compared with what would be expected under a completely random distribution.
+         (Under complete randomness the K-function result is 2*r.)
+
+         Returns:
+             tuple: Whether the points are clustered, the observed K value, and the expected value 2*r.
+         """
+         r = 0.01 * self.length
+         left = 0
+         right = 0
+         total_weighted = 0
+
+         for i in range(self.n):
+             while self.sorted_points[i] - self.sorted_points[left] > r:
+                 left += 1
+             if right < i:
+                 right = i
+             while (
+                 right + 1 < self.n
+                 and self.sorted_points[right + 1] - self.sorted_points[i] <= r
+             ):
+                 right += 1
+
+             neighbors = right - left
+             if neighbors > 0:
+                 a = max(0, self.sorted_points[i] - r)
+                 b = min(self.length, self.sorted_points[i] + r)
+                 overlap = b - a
+                 weight = (2 * r) / overlap if overlap > 0 else 0
+
+                 total_weighted += weight * neighbors
+
+         k = (self.length / (self.n * (self.n - 1))) * total_weighted
+         return (bool(k > 2 * r)), float(k), 2 * r
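
The two methods above implement a one-dimensional Ripley's K test: for each mapped start coordinate they count how many other coordinates fall within a radius r of 1% of the genome length, then compare the normalised count against the 2*r expected under complete randomness. A minimal usage sketch follows; the module path and the coordinates are illustrative assumptions, not shown in this diff:

    # Hypothetical usage; module path and coordinates are invented for illustration.
    from xspect.point_pattern_analysis import PointPatternAnalysis  # assumed path

    # Five tightly grouped mapping start coordinates plus one outlier
    # on a 1 Mbp reference genome.
    points = [10_000, 10_150, 10_400, 10_500, 11_000, 500_000]
    analysis = PointPatternAnalysis(points, length=1_000_000)

    clustered, k, expected = analysis.ripleys_k_edge_corrected()
    print(f"K = {k:.0f}, expected under randomness = {expected:.0f}")
    if clustered:
        print("Mapped regions are clustered - possible misclassification signal.")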
@@ -0,0 +1,55 @@
+ """
+ Read simulation for alignment-based misclassification detection (used for testing purposes).
+
+ Notes:
+     Developed by Oemer Cetin as part of a BSc thesis at Goethe University Frankfurt am Main (2025).
+     (An Integration of Alignment-Free and Alignment-Based Approaches for Bacterial Taxon Assignment)
+ """
+
+ import random
+ from Bio import SeqIO
+
+ __author__ = "Cetin, Oemer"
+
+
+ def extract_random_reads(
+     fasta_file, output_fasta, read_length=150, num_reads=1000, seed=42
+ ) -> None:
+     """
+     Uniformly extracts reads from a genome and writes them to a FASTA file.
+
+     Args:
+         fasta_file (str): Path to the input FASTA file.
+         output_fasta (str): Output FASTA file to write simulated reads to.
+         read_length (int): Length of each read to extract.
+         num_reads (int): Total number of reads to extract.
+         seed (int): A seed for reproducibility.
+
+     Raises:
+         ValueError: If no sequence is at least as long as the chosen read length.
+     """
+     random.seed(seed)
+     sequences = [
+         record
+         for record in SeqIO.parse(fasta_file, "fasta")
+         if len(record.seq) >= read_length
+     ]
+     if not sequences:
+         raise ValueError("No sequences long enough for the desired read length.")
+
+     # The probability of extracting reads from large contigs is higher
+     seq_lengths = [len(rec.seq) for rec in sequences]
+     total_length = sum(seq_lengths)
+     weights = [single_length / total_length for single_length in seq_lengths]
+
+     with open(output_fasta, "w") as o:
+         for i in range(num_reads):
+             # random.choices() returns a list!
+             selected = random.choices(sequences, weights=weights, k=1)[0]
+             seq_length = len(selected.seq)
+             start = random.randint(0, seq_length - read_length)
+             read_seq = selected.seq[start : start + read_length]
+             o.write(
+                 f">read_{i}_{selected.id}_{start}-{start + read_length}\n{read_seq}\n"
+             )
+     print("The reads have been simulated successfully.")
@@ -2,10 +2,10 @@
 
  __author__ = "Cetin, Oemer"
 
- import requests
  import json
- from io import StringIO
  from pathlib import Path
+ from io import StringIO
+ import requests
  from Bio import SeqIO
  from xspect.definitions import get_xspect_model_path
 
@@ -29,7 +29,7 @@ def create_fasta_files(locus_path: Path, fasta_batch: str) -> None:
          output_fasta_file = locus_path / f"Allele_ID_{number}.fasta"
          if output_fasta_file.exists():
              continue  # Ignore existing ones
-         with open(output_fasta_file, "w") as allele:
+         with open(output_fasta_file, "w", encoding="utf-8") as allele:
              SeqIO.write(record, allele, "fasta")
 
 
@@ -59,10 +59,9 @@ def pick_species_number_from_db(available_species: dict) -> str:
              if int(choice) in available_species.keys():
                  chosen_species = available_species.get(int(choice))
                  return chosen_species
-             else:
-                 print(
-                     "Wrong input! Try again with a number that is available in the list above."
-                 )
+             print(
+                 "Wrong input! Try again with a number that is available in the list above."
+             )
          except ValueError:
              print(
                  "Wrong input! Try again with a number that is available in the list above."
@@ -95,10 +94,9 @@ def pick_scheme_number_from_db(available_schemes: dict) -> str:
              if int(choice) in available_schemes.keys():
                  chosen_scheme = available_schemes.get(int(choice))[1]
                  return chosen_scheme
-             else:
-                 print(
-                     "Wrong input! Try again with a number that is available in the above list."
-                 )
+             print(
+                 "Wrong input! Try again with a number that is available in the above list."
+             )
          except ValueError:
              print(
                  "Wrong input! Try again with a number that is available in the above list."
@@ -162,12 +160,12 @@ def pick_scheme(available_schemes: dict) -> Path:
      for counter, scheme in available_schemes.items():
          # For Strain Typing with an API-POST Request to the db
          if str(scheme).startswith("http"):
-             scheme_json = requests.get(scheme).json()
+             scheme_json = requests.get(scheme, timeout=10).json()
              print(str(counter) + ":" + scheme_json["description"])
 
          # To pick a scheme after download for fitting
          else:
-             print(str(counter) + ":" + str(scheme).split("/")[-1])
+             print(str(counter) + ":" + str(scheme).rsplit("/", maxsplit=1)[-1])
 
      print("\nPick a scheme for strain type prediction")
      while True:
@@ -176,10 +174,9 @@ def pick_scheme(available_schemes: dict) -> Path:
              if int(choice) in available_schemes.keys():
                  chosen_scheme = available_schemes.get(int(choice))
                  return chosen_scheme
-             else:
-                 print(
-                     "Wrong input! Try again with a number that is available in the above list."
-                 )
+             print(
+                 "Wrong input! Try again with a number that is available in the above list."
+             )
          except ValueError:
              print(
                  "Wrong input! Try again with a number that is available in the above list."
@@ -209,8 +206,7 @@ class MlstResult:
          Returns:
              dict: The result dictionary with a sequence ID as key and the strain type as value.
          """
-         results = {seq_id: result for seq_id, result in self.hits.items()}
-         return results
+         return dict(self.hits.items())
 
      def to_dict(self) -> dict:
          """
@@ -2,8 +2,8 @@
 
  __author__ = "Cetin, Oemer"
 
- import requests
  import json
+ import requests
  from xspect.mlst_feature.mlst_helper import (
      create_fasta_files,
      pick_species_number_from_db,
@@ -51,7 +51,7 @@ class PubMLSTHandler:
          counter = 1
          # retrieve all available species
          species_url = PubMLSTHandler.base_url
-         for species_databases in requests.get(species_url).json():
+         for species_databases in requests.get(species_url, timeout=10).json():
              for database in species_databases["databases"]:
                  if database["name"].endswith("seqdef"):
                      available_species[counter] = database["name"]
@@ -61,7 +61,7 @@ class PubMLSTHandler:
 
          counter = 1
          scheme_url = f"{species_url}/{chosen_species}/schemes"
-         for scheme in requests.get(scheme_url).json()["schemes"]:
+         for scheme in requests.get(scheme_url, timeout=10).json()["schemes"]:
              # scheme["description"] stores the name of a scheme.
              # scheme["scheme"] stores the URL that is needed for downloading all loci.
              available_schemes[counter] = [scheme["description"], scheme["scheme"]]
@@ -70,11 +70,8 @@ class PubMLSTHandler:
          # Selection of available schemes of a species for download (duplicates are caught!)
          while True:
              chosen_scheme = pick_scheme_number_from_db(available_schemes)
-             (
+             if chosen_scheme not in chosen_schemes:
                  chosen_schemes.append(chosen_scheme)
-                 if chosen_scheme not in chosen_schemes
-                 else None
-             )
              choice = input(
                  "Do you want to pick another scheme to download? (y/n):"
              ).lower()
@@ -97,7 +94,7 @@ class PubMLSTHandler:
              self.choose_schemes()  # changes the scheme_list attribute
 
          for scheme in self.scheme_list:
-             scheme_json = requests.get(scheme).json()
+             scheme_json = requests.get(scheme, timeout=10).json()
              # We only want the name and the respective featured loci of a scheme
              scheme_name = scheme_json["description"]
              locus_list = scheme_json["loci"]
@@ -117,7 +114,7 @@ class PubMLSTHandler:
              if not locus_path.exists():
                  locus_path.mkdir(exist_ok=True, parents=True)
 
-             alleles = requests.get(f"{locus_url}/alleles_fasta").text
+             alleles = requests.get(f"{locus_url}/alleles_fasta", timeout=10).text
              create_fasta_files(locus_path, alleles)
 
      def assign_strain_type_by_db(self) -> None:
@@ -132,13 +129,15 @@ class PubMLSTHandler:
              str(pick_scheme(scheme_list_to_dict(self.scheme_list))) + "/sequence"
          )
          fasta_file = get_xspect_upload_path() / "Test.fna"
-         with open(fasta_file, "r") as file:
+         with open(fasta_file, "r", encoding="utf-8") as file:
              data = file.read()
          payload = {  # Essential API-POST-Body
              "sequence": data,
              "filetype": "fasta",
          }
-         response = requests.post(scheme_url, data=json.dumps(payload)).json()
+         response = requests.post(
+             scheme_url, data=json.dumps(payload), timeout=10
+         ).json()
 
          for locus, meta_data in response["exact_matches"].items():
              # meta_data is a list containing a dictionary, therefore [0] and then key value.
@@ -170,18 +169,16 @@ class PubMLSTHandler:
              }
          }
 
-         response = requests.post(post_url + "/designations", json=payload)
+         response = requests.post(post_url + "/designations", json=payload, timeout=10)
 
          if response.status_code == 200:
              data = response.json()
              if "fields" in data:
                  post_response = data["fields"]
                  return post_response
-             else:
-                 post_response = "No matching Strain Type found in the database. "
-                 post_response += "Possibly a novel Strain Type."
-                 return post_response
-         else:
-             post_response = "Error:" + str(response.status_code)
-             post_response += response.text
+             post_response = "No matching Strain Type found in the database. "
+             post_response += "Possibly a novel Strain Type."
              return post_response
+         post_response = "Error:" + str(response.status_code)
+         post_response += response.text
+         return post_response
@@ -2,45 +2,41 @@
 
  from json import loads, dumps
  from pathlib import Path
- from xspect.models.probabilistic_single_filter_model import (
-     ProbabilisticSingleFilterModel,
- )
- from xspect.models.probabilistic_filter_svm_model import ProbabilisticFilterSVMModel
  from xspect.definitions import get_xspect_model_path
 
 
- def get_genus_model(genus) -> ProbabilisticSingleFilterModel:
+ def get_genus_model_path(genus) -> Path:
      """
-     Get a genus model for the specified genus.
+     Get a genus model path for the specified genus.
 
-     This function retrieves a pre-trained genus classification model based on the provided genus name.
+     This function retrieves the path of a pre-trained genus classification model based on the
+     provided genus name.
 
      Args:
          genus (str): The genus name for which the model is to be retrieved.
 
      Returns:
-         ProbabilisticSingleFilterModel: An instance of the genus classification model.
+         Path: The file path of the genus classification model.
      """
      genus_model_path = get_xspect_model_path() / (genus.lower() + "-genus.json")
-     genus_filter_model = ProbabilisticSingleFilterModel.load(genus_model_path)
-     return genus_filter_model
+     return genus_model_path
 
 
- def get_species_model(genus) -> ProbabilisticFilterSVMModel:
+ def get_species_model_path(genus) -> Path:
      """
-     Get a species classification model for the specified genus.
+     Get a species model path for the specified genus.
 
-     This function retrieves a pre-trained species classification model based on the provided genus name.
+     This function retrieves the path of a pre-trained species classification model based on the
+     provided genus name.
 
      Args:
          genus (str): The genus name for which the species model is to be retrieved.
 
      Returns:
-         ProbabilisticFilterSVMModel: An instance of the species classification model.
+         Path: The file path of the species classification model.
      """
      species_model_path = get_xspect_model_path() / (genus.lower() + "-species.json")
-     species_filter_model = ProbabilisticFilterSVMModel.load(species_model_path)
-     return species_filter_model
+     return species_model_path
 
 
  def get_model_metadata(model: str | Path) -> dict:
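
Since both helpers now return a Path instead of a loaded model, callers are expected to perform the load step themselves. A minimal sketch under that assumption, reusing the load classmethod and import path visible in the removed lines above; the genus value is a placeholder:

    from xspect.models.probabilistic_single_filter_model import (
        ProbabilisticSingleFilterModel,
    )

    # Hypothetical caller: resolve the path via get_genus_model_path (defined
    # above), then load the model explicitly.
    genus_model_path = get_genus_model_path("Acinetobacter")  # placeholder genus
    genus_model = ProbabilisticSingleFilterModel.load(genus_model_path)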
@@ -121,7 +117,8 @@ def get_models() -> dict[str, list[dict]]:
      This function scans the model directory for JSON files and organizes them by their model type.
 
      Returns:
-         dict[str, list[dict]]: A dictionary where keys are model types and values are lists of model display names.
+         dict[str, list[dict]]: A dictionary where keys are model types and values are lists of
+             model display names.
      """
      model_dict = {}
      for model_file in get_xspect_model_path().glob("*.json"):
@@ -2,14 +2,14 @@
 
  __author__ = "Cetin, Oemer"
 
- import cobs_index
  import json
  from pathlib import Path
+ from collections import defaultdict
+ import cobs_index
+ from cobs_index import DocumentList
  from Bio import SeqIO
  from Bio.Seq import Seq
  from Bio.SeqRecord import SeqRecord
- from cobs_index import DocumentList
- from collections import defaultdict
  from xspect.file_io import get_record_iterator
  from xspect.mlst_feature.mlst_helper import MlstResult
  from xspect.mlst_feature.pub_mlst_handler import PubMLSTHandler
@@ -100,11 +100,11 @@ class ProbabilisticFilterMlstSchemeModel:
              "Scheme not found. Please make sure to download the schemes prior!"
          )
 
-         scheme = str(scheme_path).split("/")[-1]
+         scheme = str(scheme_path).rsplit("/", maxsplit=1)[-1]
          cobs_path = ""
          # COBS structure for every locus (default = 7 for Oxford or Pasteur scheme)
          for locus_path in sorted(scheme_path.iterdir()):
-             locus = str(locus_path).split("/")[-1]
+             locus = str(locus_path).rsplit("/", maxsplit=1)[-1]
              # counts all fasta files that belong to a locus
              self.loci[locus] = sum(
                  (1 for _ in locus_path.iterdir() if not str(_).endswith("cache"))
@@ -112,7 +112,7 @@ class ProbabilisticFilterMlstSchemeModel:
 
              # determine the avg base pair size of alleles
              fasta_file = next(locus_path.glob("*.fasta"), None)
-             with open(fasta_file, "r") as handle:
+             with open(fasta_file, "r", encoding="utf-8") as handle:
                  record = next(SeqIO.parse(handle, "fasta"))
              self.avg_locus_bp_size.append(len(record.seq))
 
@@ -134,7 +134,8 @@ class ProbabilisticFilterMlstSchemeModel:
 
      def save(self) -> None:
          """Saves the model to disk"""
-         scheme = str(self.scheme_path).split("/")[-1]  # [-1] contains the scheme name
+         # [-1] contains the scheme name
+         scheme = str(self.scheme_path).rsplit("/", maxsplit=1)[-1]
          json_path = self.base_path / scheme / f"{scheme}.json"
          json_object = json.dumps(self.to_dict(), indent=4)
 
@@ -152,7 +153,7 @@ class ProbabilisticFilterMlstSchemeModel:
          Returns:
              ProbabilisticFilterMlstSchemeModel: A trained model from the disk in JSON format.
          """
-         scheme_name = str(scheme_path).split("/")[-1]
+         scheme_name = str(scheme_path).rsplit("/", maxsplit=1)[-1]
          json_path = scheme_path / f"{scheme_name}.json"
          with open(json_path, "r", encoding="utf-8") as file:
              json_object = file.read()
@@ -221,7 +222,7 @@ class ProbabilisticFilterMlstSchemeModel:
          for entry in sorted(cobs_path.iterdir()):
              if str(entry).endswith(".json"):
                  continue
-             file_name = str(entry).split("/")[-1]  # file_name = locus
+             file_name = str(entry).rsplit("/", maxsplit=1)[-1]  # file_name = locus
              scheme_path_list.append(file_name.split(".")[0])  # without the file ending
 
          result_dict = {}
@@ -442,7 +443,7 @@ class ProbabilisticFilterMlstSchemeModel:
          Returns:
              bool: True if any locus score >= 0.5 * its avg base pair size, False otherwise.
          """
-         for i, (locus, allele_score_dict) in enumerate(highest_results.items()):
+         for i, (_, allele_score_dict) in enumerate(highest_results.items()):
              if not allele_score_dict:
                  continue  # skip empty values