XspecT 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,287 @@
+ """Probabilistic filter MLST model for sequence data"""
+
+ __author__ = "Cetin, Oemer"
+
+ import cobs_index
+ import json
+ from pathlib import Path
+ from Bio import SeqIO
+ from Bio.Seq import Seq
+ from Bio.SeqRecord import SeqRecord
+ from cobs_index import DocumentList
+ from collections import defaultdict
+ from xspect.file_io import get_record_iterator
+ from xspect.mlst_feature.mlst_helper import MlstResult
+
+
+ class ProbabilisticFilterMlstSchemeModel:
+     """Probabilistic filter MLST scheme model for sequence data"""
+
+     def __init__(
+         self,
+         k: int,
+         model_display_name: str,
+         base_path: Path,
+         fpr: float = 0.001,
+     ) -> None:
+         if k < 1:
+             raise ValueError("Invalid k value, must be greater than 0")
+         if not isinstance(base_path, Path):
+             raise ValueError("Invalid base path, must be a pathlib.Path object")
+
+         self.k = k
+         self.model_display_name = model_display_name
+         self.base_path = base_path / "MLST"
+         self.fpr = fpr
+         self.model_type = "Strain"
+         self.loci = {}
+         self.scheme_path = ""
+         self.cobs_path = ""
+         self.avg_locus_bp_size = []
+         self.indices = []
+
+     def to_dict(self) -> dict:
+         """Returns a dictionary representation of the model"""
+         return {
+             "k": self.k,
+             "model_display_name": self.model_display_name,
+             "model_type": self.model_type,
+             "fpr": self.fpr,
+             "scheme_path": str(self.scheme_path),
+             "cobs_path": str(self.cobs_path),
+             "average_locus_base_pair_size": self.avg_locus_bp_size,
+             "loci": self.loci,
+         }
+
+     def get_cobs_index_path(self, scheme: str, locus: str) -> Path:
+         """Returns the path to the cobs index"""
+         # To differentiate from genus and species models
+         cobs_path = self.base_path / f"{scheme}"
+         cobs_path.mkdir(exist_ok=True, parents=True)
+         return cobs_path / f"{locus}.cobs_compact"
+
+     def fit(self, scheme_path: Path) -> None:
+         """Trains a COBS structure for every locus with all its alleles"""
+         if not scheme_path.exists():
+             raise ValueError(
+                 "Scheme not found. Please make sure to download the schemes prior!"
+             )
+
+         scheme = str(scheme_path).split("/")[-1]
+         cobs_path = ""
+         # COBS structure for every locus (default = 7 for Oxford or Pasteur scheme)
+         for locus_path in sorted(scheme_path.iterdir()):
+             locus = str(locus_path).split("/")[-1]
+             # counts all fasta files that belong to a locus
+             self.loci[locus] = sum(
+                 (1 for _ in locus_path.iterdir() if not str(_).endswith("cache"))
+             )
+
+             # determine the avg base pair size of alleles
+             fasta_file = next(locus_path.glob("*.fasta"), None)
+             with open(fasta_file, "r") as handle:
+                 record = next(SeqIO.parse(handle, "fasta"))
+                 self.avg_locus_bp_size.append(len(record.seq))
+
+             # COBS only accepts strings as paths
+             doclist = DocumentList(str(locus_path))
+             index_params = cobs_index.CompactIndexParameters()
+             index_params.term_size = self.k  # k-mer size
+             index_params.clobber = True  # overwrite output and temporary files
+             index_params.false_positive_rate = self.fpr
+
+             # Creates COBS data structure for each locus
+             cobs_path = self.get_cobs_index_path(scheme, locus)
+             cobs_index.compact_construct_list(doclist, str(cobs_path), index_params)
+             # Saves COBS-file inside the "indices" attribute
+             self.indices.append(cobs_index.Search(str(cobs_path)))
+
+         self.scheme_path = scheme_path
+         self.cobs_path = cobs_path.parent
+
+     def save(self) -> None:
+         """Saves the model to disk"""
+         scheme = str(self.scheme_path).split("/")[
+             -1
+         ]  # [-1] -> contains the scheme name
+         json_path = self.base_path / scheme / f"{scheme}.json"
+         json_object = json.dumps(self.to_dict(), indent=4)
+
+         with open(json_path, "w", encoding="utf-8") as file:
+             file.write(json_object)
+
+     @staticmethod
+     def load(scheme_path: Path) -> "ProbabilisticFilterMlstSchemeModel":
+         """Loads the model from a JSON-file"""
+         scheme_name = str(scheme_path).split("/")[-1]
+         json_path = scheme_path / f"{scheme_name}.json"
+         with open(json_path, "r", encoding="utf-8") as file:
+             json_object = file.read()
+         model_json = json.loads(json_object)
+         model = ProbabilisticFilterMlstSchemeModel(
+             model_json["k"],
+             model_json["model_display_name"],
+             json_path.parent,
+             model_json["fpr"],
+         )
+         model.scheme_path = model_json["scheme_path"]
+         model.cobs_path = model_json["cobs_path"]
+         model.avg_locus_bp_size = model_json["average_locus_base_pair_size"]
+         model.loci = model_json["loci"]
+
+         for entry in sorted(json_path.parent.iterdir()):
+             if not entry.exists():
+                 raise FileNotFoundError(f"Index file not found at {entry}")
+             if str(entry).endswith(".json"):  # only COBS-files
+                 continue
+             model.indices.append(cobs_index.Search(str(entry), False))
+         return model
+
+     def calculate_hits(self, path: Path, sequence: Seq, step: int = 1) -> list[dict]:
+         """Calculates the hits for a sequence"""
+         if not isinstance(sequence, Seq):
+             raise ValueError("Invalid sequence, must be a Bio.Seq object")
+
+         if not len(sequence) > self.k:
+             raise ValueError("Invalid sequence, must be longer than k")
+
+         if not self.indices:
+             raise ValueError("The Model has not been trained yet")
+
+         scheme_path_list = []
+         for entry in sorted(path.iterdir()):
+             if str(entry).endswith(".json"):
+                 continue
+             file_name = str(entry).split("/")[-1]  # file_name = locus
+             scheme_path_list.append(file_name.split(".")[0])  # without the file ending
+
+         result_dict = {}
+         highest_results = {}
+         counter = 0
+         # split the sequence in parts based on sequence length
+         if len(sequence) >= 10000:
+             for index in self.indices:
+                 cobs_results = []
+                 allele_len = self.avg_locus_bp_size[counter]
+                 split_sequence = self.sequence_splitter(str(sequence), allele_len)
+                 for split in split_sequence:
+                     res = index.search(split, step=step)
+                     split_result = self.get_cobs_result(res)
+                     if not split_result:
+                         continue
+                     cobs_results.append(split_result)
+
+                 all_counts = defaultdict(int)
+                 for result in cobs_results:
+                     for name, value in result.items():
+                         all_counts[name] += value
+
+                 sorted_counts = dict(
+                     sorted(all_counts.items(), key=lambda item: -item[1])
+                 )
+                 first_key = next(iter(sorted_counts))
+                 highest_result = sorted_counts[first_key]
+                 result_dict[scheme_path_list[counter]] = sorted_counts
+                 highest_results[scheme_path_list[counter]] = {first_key: highest_result}
+                 counter += 1
+         else:
+             for index in self.indices:
+                 res = index.search(
+                     str(sequence), step=step
+                 )  # COBS can't handle Seq-Objects
+                 result_dict[scheme_path_list[counter]] = self.get_cobs_result(res)
+                 highest_results[scheme_path_list[counter]] = (
+                     self.get_highest_cobs_result(res)
+                 )
+                 counter += 1
+         return [{"Strain type": highest_results}, {"All results": result_dict}]
+
+     def predict(
+         self,
+         cobs_path: Path,
+         sequence_input: (
+             SeqRecord
+             | list[SeqRecord]
+             | SeqIO.FastaIO.FastaIterator
+             | SeqIO.QualityIO.FastqPhredIterator
+             | Path
+         ),
+         step: int = 1,
+     ) -> MlstResult:
+         """Returns scores for the sequence(s) based on the filters in the model"""
+         if isinstance(sequence_input, SeqRecord):
+             if sequence_input.id == "<unknown id>":
+                 sequence_input.id = "test"
+             hits = {
+                 sequence_input.id: self.calculate_hits(cobs_path, sequence_input.seq)
+             }
+             return MlstResult(self.model_display_name, step, hits)
+
+         if isinstance(sequence_input, Path):
+             return ProbabilisticFilterMlstSchemeModel.predict(
+                 self, cobs_path, get_record_iterator(sequence_input), step=step
+             )
+
+         if isinstance(
+             sequence_input,
+             (SeqIO.FastaIO.FastaIterator, SeqIO.QualityIO.FastqPhredIterator),
+         ):
+             hits = {}
+             # individual_seq is a SeqRecord-Object
+             for individual_seq in sequence_input:
+                 individual_hits = self.calculate_hits(cobs_path, individual_seq.seq)
+                 hits[individual_seq.id] = individual_hits
+             return MlstResult(self.model_display_name, step, hits)
+
+         raise ValueError(
+             "Invalid sequence input, must be a Seq object, a list of Seq objects, a"
+             " SeqIO FastaIterator, or a SeqIO FastqPhredIterator"
+         )
+
+     def get_highest_cobs_result(self, cobs_result: cobs_index.SearchResult) -> dict:
+         """Returns the first entry in a COBS search result."""
+         # counter = 1
+         # dictio = {}
+         for individual_result in cobs_result:
+             # COBS already sorts the result in descending order
+             # The first doc_name has the highest result which is needed to determine the allele
+             return {individual_result.doc_name: individual_result.score}
+
+     def get_cobs_result(self, cobs_result: cobs_index.SearchResult) -> dict:
+         """Returns all entries in a COBS search result."""
+         return {
+             individual_result.doc_name: individual_result.score
+             for individual_result in cobs_result
+             if individual_result.score > 50
+         }
+
+     def sequence_splitter(self, input_sequence: str, allele_len: int) -> list[str]:
+         """Returns an equally divided sequence in form of a list."""
+         # An input sequence will have 10000 or more base pairs.
+         sequence_len = len(input_sequence)
+
+         if sequence_len < 100000:
+             substring_length = allele_len // 10
+         elif 100000 <= sequence_len < 1000000:
+             substring_length = allele_len
+         elif 1000000 <= sequence_len < 10000000:
+             substring_length = allele_len * 10
+         else:
+             substring_length = allele_len * 100
+
+         substring_list = []
+         start = 0
+
+         while start + substring_length <= sequence_len:
+             substring_list.append(input_sequence[start : start + substring_length])
+             start += substring_length - self.k + 1  # To not lose kmers when dividing
+
+         # The remaining string is either appended to the list or added to the last entry.
+         if start < len(input_sequence):
+             remaining_substring = input_sequence[start:]
+             # A substring needs to be at least of size k for COBS.
+             if len(remaining_substring) < self.k:
+                 substring_list[-1] += remaining_substring
+             else:
+                 substring_list.append(remaining_substring)
+         return substring_list
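
The hunk above adds a new module (its path is not shown in this diff) defining `ProbabilisticFilterMlstSchemeModel`, which builds one COBS compact index per MLST locus and queries them to type sequences. A minimal usage sketch follows; the import path, the `k` value, and the directory layout (one subdirectory per locus, one FASTA file per allele) are illustrative assumptions:

```python
from pathlib import Path

# Hypothetical import path; the diff does not show where the new file lives.
from xspect.mlst_feature.mlst_scheme_model import ProbabilisticFilterMlstSchemeModel

base_path = Path("xspect-data")
scheme_path = base_path / "schemes" / "Oxford"  # one subdirectory per locus

model = ProbabilisticFilterMlstSchemeModel(21, "Oxford", base_path)  # k=21 is illustrative
model.fit(scheme_path)  # builds one .cobs_compact index per locus
model.save()            # writes <scheme>.json next to the indices

# Reload later and type an assembly; predict() accepts a Path, a SeqRecord,
# or a Biopython FASTA/FASTQ iterator.
model = ProbabilisticFilterMlstSchemeModel.load(base_path / "MLST" / "Oxford")
result = model.predict(Path(model.cobs_path), Path("assembly.fasta"))
```

Note that `fit()` and `calculate_hits()` both rely on `sorted(...)` directory ordering to pair each COBS index with its locus.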
@@ -8,6 +8,7 @@ from Bio.SeqRecord import SeqRecord
  from Bio import SeqIO
  from slugify import slugify
  import cobs_index as cobs
+ from xspect.definitions import fasta_endings, fastq_endings
  from xspect.file_io import get_record_iterator
  from xspect.models.result import ModelResult
 
@@ -64,10 +65,6 @@ class ProbabilisticFilterModel:
              "num_hashes": self.num_hashes,
          }
 
-     def __dict__(self) -> dict:
-         """Returns a dictionary representation of the model"""
-         return self.to_dict()
-
      def slug(self) -> str:
          """Returns a slug representation of the model"""
          return slugify(self.model_display_name + "-" + str(self.model_type))
@@ -89,13 +86,7 @@ class ProbabilisticFilterModel:
 
          doclist = cobs.DocumentList()
          for file in dir_path.iterdir():
-             if file.is_file() and file.suffix in [
-                 ".fasta",
-                 ".fna",
-                 ".fa",
-                 ".fastq",
-                 ".fq",
-             ]:
+             if file.is_file() and file.suffix[1:] in fasta_endings + fastq_endings:
                  # cobs only uses the file name to the first "." as the document name
                  if file.name in display_names:
                      self.display_names[file.name.split(".")[0]] = display_names[
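
The replacement above centralizes the accepted file endings in `xspect.definitions` instead of a hard-coded list; `file.suffix[1:]` drops the leading dot before the membership test. A small sketch of the check, with the ending lists assumed for illustration:

```python
from pathlib import Path

# Assumed contents; the real lists live in xspect.definitions.
fasta_endings = ["fasta", "fna", "fa"]
fastq_endings = ["fastq", "fq"]

for name in ["genome.fna", "reads.fq", "notes.txt"]:
    print(name, Path(name).suffix[1:] in fasta_endings + fastq_endings)
# genome.fna True / reads.fq True / notes.txt False
```

Note that `Path.suffix` only considers the final extension, so a compressed file such as `reads.fastq.gz` would not match either form of the check.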
@@ -65,8 +65,11 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
      ) -> None:
          """Fit the SVM to the sequences and labels"""
 
+         # Since the SVM works with score data, we need to train
+         # the underlying data structure for score generation first
          super().fit(dir_path, display_names=display_names)
 
+         # calculate scores for SVM training
          score_list = []
          for file in svm_path.iterdir():
              if not file.is_file():
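
The new comments document the two-stage training: the parent filter model is fitted first, then per-file filter scores are computed as the SVM's training features. A toy illustration of that idea with made-up scores (XspecT's actual SVM setup is not shown in this hunk; scikit-learn is used here purely for illustration):

```python
from sklearn.svm import SVC

# Made-up per-species filter scores for four training files
train_scores = [[0.95, 0.10], [0.12, 0.88], [0.91, 0.15], [0.08, 0.93]]
train_labels = ["A. baumannii", "A. pittii", "A. baumannii", "A. pittii"]

svm = SVC(kernel="linear")
svm.fit(train_scores, train_labels)
print(svm.predict([[0.90, 0.20]]))  # ['A. baumannii']
```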
@@ -1,4 +1,4 @@
- """Probabilistic filter SVM model for sequence data"""
+ """Base probabilistic filter model for sequence data"""
 
  # pylint: disable=no-name-in-module, too-many-instance-attributes
 
@@ -14,7 +14,7 @@ from xspect.file_io import get_record_iterator
 
 
  class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
-     """Probabilistic filter SVM model for sequence data"""
+     """Base probabilistic filter model for sequence data"""
 
      def __init__(
          self,
@@ -25,7 +25,6 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
          model_type: str,
          base_path: Path,
          fpr: float = 0.01,
-         num_hashes: int = 7,
      ) -> None:
          super().__init__(
              k=k,
@@ -35,12 +34,12 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
              model_type=model_type,
              base_path=base_path,
              fpr=fpr,
-             num_hashes=num_hashes,
+             num_hashes=1,
          )
          self.bf = None
 
      def fit(self, file_path: Path, display_name: str) -> None:
-         """Fit the SVM to the sequences and labels"""
+         """Fit the cobs classic index to the sequences and labels"""
          # estimate number of kmers
          total_length = 0
          for record in get_record_iterator(file_path):
@@ -89,7 +88,6 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
              model_json["model_type"],
              path.parent,
              fpr=model_json["fpr"],
-             num_hashes=model_json["num_hashes"],
          )
          model.display_names = model_json["display_names"]
          bloom_path = model.base_path / model.slug() / "filter.bloom"
xspect/models/result.py CHANGED
@@ -1,14 +1,15 @@
- """ Module for storing the results of XspecT models. """
+ """Module for storing the results of XspecT models."""
 
  from enum import Enum
 
 
  def get_last_processing_step(result: "ModelResult") -> "ModelResult":
      """Get the last subprocessing step of the result. First path only."""
-     last_step = result
-     while last_step.subprocessing_steps:
-         last_step = last_step.subprocessing_steps[-1].result
-     return last_step
+
+     # traverse result tree to get last step
+     while result.subprocessing_steps:
+         result = result.subprocessing_steps[-1].result
+     return result
 
 
  class StepType(Enum):
@@ -82,9 +83,9 @@ class ModelResult:
          scores = {
              subsequence: {
                  label: round(hits / self.num_kmers[subsequence], 2)
-                 for label, hits in subseuqence_hits.items()
+                 for label, hits in subsequence_hits.items()
              }
-             for subsequence, subseuqence_hits in self.hits.items()
+             for subsequence, subsequence_hits in self.hits.items()
          }
 
          # calculate total scores
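
Besides the docstring cleanup and the `subseuqence` → `subsequence` typo fix, the scoring logic itself is unchanged: each label's hit count is divided by the number of k-mers in the subsequence. A standalone sketch with made-up numbers:

```python
hits = {"read_1": {"A. baumannii": 95, "A. pittii": 12}}
num_kmers = {"read_1": 100}

scores = {
    subsequence: {
        label: round(label_hits / num_kmers[subsequence], 2)
        for label, label_hits in subsequence_hits.items()
    }
    for subsequence, subsequence_hits in hits.items()
}
print(scores)  # {'read_1': {'A. baumannii': 0.95, 'A. pittii': 0.12}}
```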
xspect/pipeline.py CHANGED
@@ -1,4 +1,4 @@
- """ Module for defining the Pipeline class. """
+ """Module for defining the Pipeline class."""
 
  import json
  from pathlib import Path
xspect/run.py CHANGED
@@ -1,4 +1,4 @@
- """ Module with XspecT global run class, which summarizes individual model results. """
+ """Module with XspecT global run class, which summarizes individual model results."""
 
  import json
  from pathlib import Path
xspect/train.py CHANGED
@@ -22,7 +22,6 @@ from xspect.train_filter.ncbi_api import (
  )
  from xspect.train_filter import (
      create_svm,
-     html_scrap,
      extract_and_concatenate,
  )
 
@@ -40,7 +39,7 @@ def check_user_input(user_input: str):
      rank = metadata["rank"]
      lineage = metadata["lineage"]
      bacteria_id = 2
-     if not sci_name == user_input and not tax_id == user_input:
+     if user_input not in (sci_name, tax_id):
          print(
              f"{get_current_time()}| The given genus: {user_input} was found as"
              f" genus: {sci_name} ID: {tax_id}"
@@ -60,38 +59,6 @@ def check_user_input(user_input: str):
          sys.exit()
 
 
- def copy_custom_data(bf_path: str, svm_path: str, dir_name: str):
-     """
-
-     :param bf_path:
-     :param svm_path:
-     :param dir_name:
-     :return:
-     """
-     path = Path(os.getcwd()) / "genus_metadata" / dir_name
-     new_bf_path = path / "concatenate"
-     new_svm_path = path / "training_data"
-
-     # Make the new directories.
-     path.mkdir(exist_ok=True)
-     new_bf_path.mkdir(exist_ok=True)
-     new_svm_path.mkdir(exist_ok=True)
-
-     # Move bloomfilter files.
-     bf_files = os.listdir(bf_path)
-     for file in bf_files:
-         file_path = Path(bf_path) / file
-         new_file_path = new_bf_path / file
-         shutil.copy2(file_path, new_file_path)
-
-     # Move svm files.
-     svm_files = os.listdir(svm_path)
-     for file in svm_files:
-         file_path = Path(svm_path) / file
-         new_file_path = new_svm_path / file
-         shutil.copy2(file_path, new_file_path)
-
-
  def set_logger(dir_name: str):
      """Sets the logger parameters.
 
@@ -168,14 +135,10 @@ def train_ncbi(genus: str, svm_step: int = 1):
      children_ids = ncbi_children_tree.NCBIChildrenTree(genus).children_ids()
      species_dict = ncbi_taxon_metadata.NCBITaxonMetadata(children_ids).get_metadata()
 
-     # Get all gcf accessions that have Taxonomy check result OK.
-     logger.info("Checking ANI data for updates")
-     ani_gcf = html_scrap.TaxonomyCheck().ani_gcf()
-
      # Look for up to 8 assembly accessions per species.
      logger.info("Getting assembly metadata")
      all_metadata = ncbi_assembly_metadata.NCBIAssemblyMetadata(
-         all_metadata=species_dict, ani_gcf=ani_gcf, count=8, contig_n50=10000
+         all_metadata=species_dict, count=8, contig_n50=10000
      )
      all_metadata = all_metadata.get_all_metadata()
 
@@ -1,4 +1,4 @@
- """ Module for extracting and concatenating assemblies. """
+ """Module for extracting and concatenating assemblies."""
 
  __author__ = "Berger, Phillip"
 
@@ -23,9 +23,9 @@ def download_assemblies(accessions, dir_name, target_folder, zip_file_name):
      """
 
      path = get_xspect_tmp_path() / dir_name / target_folder / zip_file_name
-     api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v1/genome/accession/{','.join(accessions)}/download"
+     api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{','.join(accessions)}/download"
      parameters = {"include_annotation_type": "GENOME_FASTA", "filename": zip_file_name}
      os.makedirs(os.path.dirname(path), exist_ok=True)
-     genome_download = requests.get(api_url, params=parameters, timeout=20)
+     genome_download = requests.get(api_url, params=parameters, timeout=30)
      with open(path, "wb") as f:
          f.write(genome_download.content)
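
This bumps the NCBI Datasets download endpoint from v1 to v2 and raises the request timeout from 20 s to 30 s. A self-contained sketch of the same call, with a placeholder accession and filename:

```python
import requests

accessions = ["GCF_000069245.1"]  # placeholder accession
api_url = (
    "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/"
    f"{','.join(accessions)}/download"
)
parameters = {"include_annotation_type": "GENOME_FASTA", "filename": "genomes.zip"}
response = requests.get(api_url, params=parameters, timeout=30)
with open("genomes.zip", "wb") as f:
    f.write(response.content)
```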
@@ -1,4 +1,4 @@
- """ Collects metadata of assemblies from NCBI API """
+ """Collects metadata of assemblies from NCBI API"""
 
  __author__ = "Berger, Phillip"
 
@@ -14,16 +14,14 @@ class NCBIAssemblyMetadata:
 
      _all_metadata: dict
      _count: int
-     _ani_gcf: list
      _parameters: dict
      _accessions: list[str]
      _contig_n50: int
      _all_metadata_complete: dict
 
-     def __init__(self, all_metadata: dict, ani_gcf: list, count=8, contig_n50=10000):
+     def __init__(self, all_metadata: dict, count=8, contig_n50=10000):
          self._all_metadata = all_metadata
          self._count = count
-         self._ani_gcf = ani_gcf
          self._contig_n50 = contig_n50
 
          self._set_parameters()
@@ -72,7 +70,7 @@ class NCBIAssemblyMetadata:
          }
 
      def _make_request(self, taxon: str):
-         api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v1/genome/taxon/{taxon}"
+         api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/taxon/{taxon}/dataset_report"
          accessions = []
          count = 0
          for request_type, parameters in self._parameters.items():
@@ -80,17 +78,19 @@ class NCBIAssemblyMetadata:
              response = raw_response.json()
              if response:
                  try:
-                     assemblies = response["assemblies"]
-                     for assembly in assemblies:
-                         curr_assembly = assembly["assembly"]
-                         curr_accession = curr_assembly["assembly_accession"]
-                         curr_contig_n50 = curr_assembly["contig_n50"]
+                     reports = response["reports"]
+                     for report in reports:
+                         accession = report["accession"]
+                         contig_n50 = report["assembly_stats"]["contig_n50"]
+                         taxonomy_check_status = report["average_nucleotide_identity"][
+                             "taxonomy_check_status"
+                         ]
                          if count < self._count:
                              if (
-                                 curr_accession in self._ani_gcf
-                                 and curr_contig_n50 > self._contig_n50
+                                 taxonomy_check_status == "OK"
+                                 and contig_n50 > self._contig_n50
                              ):
-                                 accessions.append(curr_accession)
+                                 accessions.append(accession)
                                  count += 1
                              else:
                                  break
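
The v2 `dataset_report` response carries the taxonomy check result inline, which removes the need for the scraped `ani_gcf` whitelist (and the `html_scrap` dependency dropped in `xspect/train.py` above). A sketch of the fields the new code reads, with the JSON shape trimmed to the accessed keys and made-up values:

```python
report = {
    "accession": "GCF_000069245.1",
    "assembly_stats": {"contig_n50": 45000},
    "average_nucleotide_identity": {"taxonomy_check_status": "OK"},
}

contig_n50_threshold = 10000
if (
    report["average_nucleotide_identity"]["taxonomy_check_status"] == "OK"
    and report["assembly_stats"]["contig_n50"] > contig_n50_threshold
):
    print(f"keep {report['accession']}")
```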
@@ -24,7 +24,7 @@ class NCBIChildrenTree:
 
      def _request_tree(self):
          """Make the request for the children tree at the NCBI Datasets API."""
-         api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/{self._taxon}/filtered_subtree"
+         api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{self._taxon}/filtered_subtree"
          raw_response = requests.get(api_url, timeout=5)
          self._response = raw_response.json()["edges"]
          self._parent_taxon_id = str(self._response["1"]["visible_children"][0])
@@ -1,4 +1,4 @@
- """ This module is used to retrieve metadata from the NCBI taxonomy database. """
+ """This module is used to retrieve metadata from the NCBI taxonomy database."""
 
  __author__ = "Berger, Phillip"
 
@@ -21,7 +21,7 @@ class NCBITaxonMetadata:
          self._collect_all_metadata()
 
      def _request_metadata(self):
-         api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/{str(self._taxon)}"
+         api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{str(self._taxon)}"
          raw_response = requests.get(api_url, timeout=5)
          self._response = raw_response.json()["taxonomy_nodes"]
 
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.2
  Name: XspecT
- Version: 0.2.5
+ Version: 0.2.7
  Summary: Tool to monitor and characterize pathogens using Bloom filters.
  License: MIT License
 
@@ -46,14 +46,14 @@ Requires-Dist: fastapi
  Requires-Dist: uvicorn
  Requires-Dist: python-multipart
  Provides-Extra: docs
- Requires-Dist: sphinx ; extra == 'docs'
- Requires-Dist: furo ; extra == 'docs'
- Requires-Dist: myst-parser ; extra == 'docs'
- Requires-Dist: sphinx-copybutton ; extra == 'docs'
- Requires-Dist: sphinx-autobuild ; extra == 'docs'
+ Requires-Dist: sphinx; extra == "docs"
+ Requires-Dist: furo; extra == "docs"
+ Requires-Dist: myst-parser; extra == "docs"
+ Requires-Dist: sphinx-copybutton; extra == "docs"
+ Requires-Dist: sphinx-autobuild; extra == "docs"
  Provides-Extra: test
- Requires-Dist: pytest ; extra == 'test'
- Requires-Dist: pytest-cov ; extra == 'test'
+ Requires-Dist: pytest; extra == "test"
+ Requires-Dist: pytest-cov; extra == "test"
 
  # XspecT - Acinetobacter Species Assignment Tool
  ![Test](https://github.com/bionf/xspect2/actions/workflows/test.yml/badge.svg)
@@ -63,7 +63,7 @@ Requires-Dist: pytest-cov ; extra == 'test'
  <img src="/docs/img/logo.png" height="50%" width="50%">
 
  <!-- start intro -->
- XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or sub-type level using [Bloom Filters] and a [Support Vector Machine]. It also identifies existing [blaOxa-genes] and provides a list of relevant research papers for further information.
+ XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or MLST level using [Bloom Filters] and a [Support Vector Machine].
  <br/><br/>
 
  XspecT utilizes the uniqueness of kmers and compares extracted kmers from the input-data to a reference database. Bloom Filter ensure a fast lookup in this process. For a final prediction the results are classified using a Support Vector Machine.
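
The README's intro paragraph describes k-mer extraction and lookup; a minimal sketch of what "extracted kmers" means here (the `k` and `step` values are illustrative):

```python
def kmers(sequence: str, k: int = 21, step: int = 1) -> list[str]:
    """Return overlapping k-mers, the units looked up in the Bloom filters."""
    return [sequence[i : i + k] for i in range(0, len(sequence) - k + 1, step)]

print(kmers("ACGTACGTACGT", k=8))
# ['ACGTACGT', 'CGTACGTA', 'GTACGTAC', 'TACGTACG', 'ACGTACGT']
```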
@@ -88,14 +88,14 @@ pip install xspect
  Please note that Windows and Alpine Linux is currently not supported.
 
  ## Usage
- ### Get the Bloomfilters
- To download basic pre-trained filters, you can use the built-in command:
+ ### Get the models
+ To download basic pre-trained models, you can use the built-in command:
  ```
- xspect download-filters
+ xspect download-models
  ```
- Additional species filters can be trained using:
+ Additional species models can be trained using:
  ```
- xspect train you-ncbi-genus-name
+ xspect train-species you-ncbi-genus-name
  ```
 
  ### How to run the web app
@@ -107,7 +107,7 @@ xspect api
  ### How to use the XspecT command line interface
  Run xspect with the configuration you want to run it with as arguments.
  ```
- xspect classify your-genus path/to/your/input-set
+ xspect classify-species your-genus path/to/your/input-set
  ```
  For further instructions on how to use the command line interface, please refer to the [documentation] or execute:
  ```