XspecT 0.4.0-py3-none-any.whl → 0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of XspecT might be problematic.
- xspect/classify.py +32 -0
- xspect/file_io.py +3 -9
- xspect/filter_sequences.py +56 -0
- xspect/main.py +52 -30
- xspect/mlst_feature/mlst_helper.py +102 -13
- xspect/mlst_feature/pub_mlst_handler.py +32 -6
- xspect/model_management.py +1 -15
- xspect/models/probabilistic_filter_mlst_model.py +160 -32
- xspect/models/probabilistic_filter_model.py +1 -0
- xspect/models/result.py +18 -6
- xspect/ncbi.py +8 -6
- xspect/train.py +13 -5
- xspect/web.py +173 -0
- xspect/xspect-web/.gitignore +24 -0
- xspect/xspect-web/README.md +54 -0
- xspect/xspect-web/components.json +21 -0
- xspect/xspect-web/dist/assets/index-CMG4V7fZ.js +290 -0
- xspect/xspect-web/dist/assets/index-jIKg1HIy.css +1 -0
- xspect/xspect-web/dist/index.html +14 -0
- xspect/xspect-web/dist/vite.svg +1 -0
- xspect/xspect-web/eslint.config.js +28 -0
- xspect/xspect-web/index.html +13 -0
- xspect/xspect-web/package-lock.json +6865 -0
- xspect/xspect-web/package.json +58 -0
- xspect/xspect-web/pnpm-lock.yaml +4317 -0
- xspect/xspect-web/public/vite.svg +1 -0
- xspect/xspect-web/src/App.tsx +29 -0
- xspect/xspect-web/src/api.tsx +62 -0
- xspect/xspect-web/src/assets/react.svg +1 -0
- xspect/xspect-web/src/components/classification-form.tsx +284 -0
- xspect/xspect-web/src/components/classify.tsx +18 -0
- xspect/xspect-web/src/components/data-table.tsx +78 -0
- xspect/xspect-web/src/components/dropdown-checkboxes.tsx +63 -0
- xspect/xspect-web/src/components/dropdown-slider.tsx +42 -0
- xspect/xspect-web/src/components/filter-form.tsx +423 -0
- xspect/xspect-web/src/components/filter.tsx +15 -0
- xspect/xspect-web/src/components/header.tsx +46 -0
- xspect/xspect-web/src/components/landing.tsx +7 -0
- xspect/xspect-web/src/components/models-details.tsx +138 -0
- xspect/xspect-web/src/components/models.tsx +53 -0
- xspect/xspect-web/src/components/result-chart.tsx +44 -0
- xspect/xspect-web/src/components/result.tsx +155 -0
- xspect/xspect-web/src/components/spinner.tsx +30 -0
- xspect/xspect-web/src/components/ui/accordion.tsx +64 -0
- xspect/xspect-web/src/components/ui/button.tsx +59 -0
- xspect/xspect-web/src/components/ui/card.tsx +92 -0
- xspect/xspect-web/src/components/ui/chart.tsx +351 -0
- xspect/xspect-web/src/components/ui/command.tsx +175 -0
- xspect/xspect-web/src/components/ui/dialog.tsx +135 -0
- xspect/xspect-web/src/components/ui/dropdown-menu.tsx +255 -0
- xspect/xspect-web/src/components/ui/file-upload.tsx +1459 -0
- xspect/xspect-web/src/components/ui/form.tsx +165 -0
- xspect/xspect-web/src/components/ui/input.tsx +21 -0
- xspect/xspect-web/src/components/ui/label.tsx +24 -0
- xspect/xspect-web/src/components/ui/navigation-menu.tsx +168 -0
- xspect/xspect-web/src/components/ui/popover.tsx +46 -0
- xspect/xspect-web/src/components/ui/select.tsx +183 -0
- xspect/xspect-web/src/components/ui/separator.tsx +26 -0
- xspect/xspect-web/src/components/ui/slider.tsx +61 -0
- xspect/xspect-web/src/components/ui/switch.tsx +29 -0
- xspect/xspect-web/src/components/ui/table.tsx +113 -0
- xspect/xspect-web/src/components/ui/tabs.tsx +64 -0
- xspect/xspect-web/src/index.css +120 -0
- xspect/xspect-web/src/lib/utils.ts +6 -0
- xspect/xspect-web/src/main.tsx +10 -0
- xspect/xspect-web/src/types.tsx +34 -0
- xspect/xspect-web/src/utils.tsx +6 -0
- xspect/xspect-web/src/vite-env.d.ts +1 -0
- xspect/xspect-web/tsconfig.app.json +32 -0
- xspect/xspect-web/tsconfig.json +13 -0
- xspect/xspect-web/tsconfig.node.json +24 -0
- xspect/xspect-web/vite.config.ts +24 -0
- {xspect-0.4.0.dist-info → xspect-0.5.0.dist-info}/METADATA +7 -8
- xspect-0.5.0.dist-info/RECORD +85 -0
- {xspect-0.4.0.dist-info → xspect-0.5.0.dist-info}/WHEEL +1 -1
- xspect/fastapi.py +0 -102
- xspect-0.4.0.dist-info/RECORD +0 -24
- {xspect-0.4.0.dist-info → xspect-0.5.0.dist-info}/entry_points.txt +0 -0
- {xspect-0.4.0.dist-info → xspect-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {xspect-0.4.0.dist-info → xspect-0.5.0.dist-info}/top_level.txt +0 -0
xspect/models/probabilistic_filter_mlst_model.py
CHANGED

@@ -24,6 +24,7 @@ class ProbabilisticFilterMlstSchemeModel:
         base_path: Path,
         fpr: float = 0.001,
     ) -> None:
+        """Initialise a ProbabilisticFilterMlstSchemeModel object."""
         if k < 1:
             raise ValueError("Invalid k value, must be greater than 0")
         if not isinstance(base_path, Path):
@@ -41,7 +42,12 @@ class ProbabilisticFilterMlstSchemeModel:
         self.indices = []

     def to_dict(self) -> dict:
-        """
+        """
+        Returns a dictionary representation of the model.
+
+        Returns:
+            dict: The dictionary containing all metadata of an object.
+        """
         return {
             "k": self.k,
             "model_display_name": self.model_display_name,
@@ -54,14 +60,37 @@ class ProbabilisticFilterMlstSchemeModel:
         }

     def get_cobs_index_path(self, scheme: str, locus: str) -> Path:
-        """
+        """
+        Get the path to the cobs indices.
+
+        This function creates a directory based on the scheme name, if it does not exist.
+        A COBS-Index file is created for every locus in a scheme.
+
+        Args:
+            scheme (str): The name of the scheme.
+            locus (str): The name of the locus.
+
+        Returns:
+            Path: The path to the COBS indices.
+        """
         # To differentiate from genus and species models
         cobs_path = self.base_path / f"{scheme}"
         cobs_path.mkdir(exist_ok=True, parents=True)
         return cobs_path / f"{locus}.cobs_compact"

     def fit(self, scheme_path: Path) -> None:
-        """
+        """
+        Trains a COBS structure for every locus with all its alleles.
+
+        This function creates COBS-indices.
+        Many attributes of an object are set in this function.
+
+        Args:
+            scheme_path (Path): The path to the scheme directory with all loci.
+
+        Raises:
+            ValueError: If the scheme alleles have not been downloaded prior.
+        """
         if not scheme_path.exists():
             raise ValueError(
                 "Scheme not found. Please make sure to download the schemes prior!"
@@ -112,7 +141,15 @@ class ProbabilisticFilterMlstSchemeModel:

     @staticmethod
     def load(scheme_path: Path) -> "ProbabilisticFilterMlstSchemeModel":
-        """
+        """
+        Loads the model from a JSON-file.
+
+        Args:
+            scheme_path (Path): The path of the scheme model.
+
+        Returns:
+            ProbabilisticFilterMlstSchemeModel: A trained model from the disk in JSON format.
+        """
         scheme_name = str(scheme_path).split("/")[-1]
         json_path = scheme_path / f"{scheme_name}.json"
         with open(json_path, "r", encoding="utf-8") as file:
@@ -137,8 +174,30 @@ class ProbabilisticFilterMlstSchemeModel:
             model.indices.append(cobs_index.Search(str(entry), False))
         return model

-    def calculate_hits(
-
+    def calculate_hits(
+        self, cobs_path: Path, sequence: Seq, step: int = 1
+    ) -> list[dict]:
+        """
+        Calculates the hits for a sequence.
+
+        This function has two ways of identifying strain types.
+        Sequences with a length of up to 10000 base pairs are handled without preprocessing.
+        Sequences with a length >= 10000 base pairs are divided into substrings.
+        The results of each substring are added up to find the strain type.
+
+        Args:
+            cobs_path (Path): The path of the COBS-structure directory.
+            sequence (Seq): The input sequence for classification.
+            step (int, optional): The amount of kmers that are passed; defaults to one.
+
+        Returns:
+            list[dict]: The results of the prediction.
+
+        Raises:
+            ValueError: If the model has not been trained.
+            ValueError: If the sequence is shorter than k.
+            ValueError: If the sequence is not a Seq-object.
+        """
         if not isinstance(sequence, Seq):
             raise ValueError("Invalid sequence, must be a Bio.Seq object")

@@ -149,7 +208,7 @@ class ProbabilisticFilterMlstSchemeModel:
             raise ValueError("The Model has not been trained yet")

         scheme_path_list = []
-        for entry in sorted(
+        for entry in sorted(cobs_path.iterdir()):
             if str(entry).endswith(".json"):
                 continue
             file_name = str(entry).split("/")[-1]  # file_name = locus
@@ -166,11 +225,12 @@ class ProbabilisticFilterMlstSchemeModel:
                 split_sequence = self.sequence_splitter(str(sequence), allele_len)
                 for split in split_sequence:
                     res = index.search(split, step=step)
-                    split_result = self.get_cobs_result(res)
+                    split_result = self.get_cobs_result(res, True)
                     if not split_result:
                         continue
                     cobs_results.append(split_result)

+                # add all split results of an Allele id into one
                 all_counts = defaultdict(int)
                 for result in cobs_results:
                     for name, value in result.items():
@@ -179,21 +239,36 @@ class ProbabilisticFilterMlstSchemeModel:
                 sorted_counts = dict(
                     sorted(all_counts.items(), key=lambda item: -item[1])
                 )
-
-
-
-
+                if not sorted_counts:
+                    result_dict = "A Strain type could not be detected because of no kmer matches!"
+                    highest_results[scheme_path_list[counter]] = {"N/A": 0}
+                else:
+                    first_key = next(iter(sorted_counts))
+                    highest_result = sorted_counts[first_key]
+                    result_dict[scheme_path_list[counter]] = sorted_counts
+                    highest_results[scheme_path_list[counter]] = {
+                        first_key: highest_result
+                    }
                 counter += 1
         else:
             for index in self.indices:
                 res = index.search(
                     str(sequence), step=step
                )  # COBS can't handle Seq-Objects
-                result_dict[scheme_path_list[counter]] = self.get_cobs_result(
-
-
+                result_dict[scheme_path_list[counter]] = self.get_cobs_result(
+                    res, False
+                )
+                first_key, highest_result = next(
+                    iter(result_dict[scheme_path_list[counter]].items())
                )
+                highest_results[scheme_path_list[counter]] = {first_key: highest_result}
                counter += 1
+        # check if the strain type has sufficient amount of kmer hits
+        is_valid = self.has_sufficient_score(highest_results, self.avg_locus_bp_size)
+        if not is_valid:
+            highest_results["Attention:"] = (
+                "This strain type is not reliable due to low kmer hit rates!"
+            )
        return [{"Strain type": highest_results}, {"All results": result_dict}]

    def predict(
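Note on the split-result aggregation added above: for long inputs each substring is searched separately and the per-allele COBS scores are summed before the top hit is selected. A minimal sketch of that aggregation on invented allele ids and scores (not part of the package):

```python
from collections import defaultdict

# Toy data: COBS scores from two substrings of the same input sequence
# (allele ids and values are invented for illustration).
cobs_results = [{"Oxf_gltA_1": 12, "Oxf_gltA_2": 3}, {"Oxf_gltA_1": 9}]

all_counts = defaultdict(int)
for result in cobs_results:
    for name, value in result.items():
        all_counts[name] += value  # sum scores per allele id across substrings

sorted_counts = dict(sorted(all_counts.items(), key=lambda item: -item[1]))
print(sorted_counts)  # {'Oxf_gltA_1': 21, 'Oxf_gltA_2': 3} -> top hit is Oxf_gltA_1
```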
@@ -208,7 +283,20 @@ class ProbabilisticFilterMlstSchemeModel:
         ),
         step: int = 1,
     ) -> MlstResult:
-        """
+        """
+        Get scores for the sequence(s) based on the filters in the model.
+
+        Args:
+            cobs_path (Path): The path of the COBS-structure directory.
+            sequence_input (Seq): The input sequence for classification
+            step (int, optional): The amount of kmers that are passed; defaults to one
+
+        Returns:
+            MlstResult: The results of the prediction.
+
+        Raises:
+            ValueError: If the sequence input is invalid.
+        """
         if isinstance(sequence_input, SeqRecord):
             if sequence_input.id == "<unknown id>":
                 sequence_input.id = "test"
@@ -238,31 +326,48 @@ class ProbabilisticFilterMlstSchemeModel:
             " SeqIO FastaIterator, or a SeqIO FastqPhredIterator"
         )

-    def
-
-
-
-
-
-
-
-
-
-
+    def get_cobs_result(
+        self, cobs_result: cobs_index.SearchResult, kmer_threshold: bool
+    ) -> dict:
+        """
+        Get every entry in a COBS search result.
+
+        Args:
+            cobs_result (SearchResult): The result of the prediction.
+            kmer_threshold (bool): Applying a kmer threshold to mitigate false positives
+
+        Returns:
+            dict: A dictionary storing the allele id of locus as key and the score as value.
+        """
         return {
             individual_result.doc_name: individual_result.score
             for individual_result in cobs_result
-            if individual_result.score > 50
+            if not kmer_threshold or individual_result.score > 50
         }

     def sequence_splitter(self, input_sequence: str, allele_len: int) -> list[str]:
-        """
+        """
+        Get an equally divided sequence in form of a list.
+
+        This function is splitting very long sequences into substrings.
+        The split is based on sequence and allele length.
+        Measures have been taken to not lose kmers while splitting.
+
+        Args:
+            input_sequence (str): The sequence of interest.
+            allele_len (int): The average length of an allele.
+
+        Returns:
+            list[str]: A list containing all substrings of a sequence greater than 10000 bp.
+
+        Raises:
+            ValueError: If the sequence input is invalid.
+        """
+
         # An input sequence will have 10000 or more base pairs.
         sequence_len = len(input_sequence)

-        if sequence_len <
-        substring_length = allele_len // 10
-        elif 100000 <= sequence_len < 1000000:
+        if sequence_len < 1000000:
             substring_length = allele_len
         elif 1000000 <= sequence_len < 10000000:
             substring_length = allele_len * 10
@@ -285,3 +390,26 @@ class ProbabilisticFilterMlstSchemeModel:
         else:
             substring_list.append(remaining_substring)
         return substring_list
+
+    def has_sufficient_score(
+        self, highest_results: dict, locus_size: list[int]
+    ) -> bool:
+        """
+        Checks if at least one locus in highest_results has a score >= 0.5 * avg base pair size.
+
+        Args:
+            highest_results (dict): Dict where each key is a locus and each value is the kmer score.
+            locus_size (list[int]): List of average base pair sizes per locus (in directory order).
+
+        Returns:
+            bool: True if any locus score >= 0.5 * its avg base pair size, False otherwise.
+        """
+        for i, (locus, allele_score_dict) in enumerate(highest_results.items()):
+            if not allele_score_dict:
+                continue  # skip empty values
+
+            # Take the score (the only value) from the nested dict
+            score = next(iter(allele_score_dict.values()))
+            if score >= 0.5 * locus_size[i]:
+                return True
+        return False
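The new has_sufficient_score check flags a strain type as unreliable unless at least one locus reaches half of its average base-pair size in k-mer hits. A toy run of that rule (locus names, allele ids, and sizes are invented for illustration):

```python
# Toy inputs: highest-scoring allele per locus and the average locus sizes,
# in the same order as the loci appear (values are invented).
highest_results = {"Oxf_gltA": {"Oxf_gltA_1": 310}, "Oxf_gdhB": {"Oxf_gdhB_3": 90}}
avg_locus_bp_size = [480, 460]

is_valid = any(
    next(iter(scores.values())) >= 0.5 * size
    for scores, size in zip(highest_results.values(), avg_locus_bp_size)
    if scores
)
print(is_valid)  # True: 310 >= 0.5 * 480, so the call is considered reliable
```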
xspect/models/result.py
CHANGED

@@ -58,16 +58,28 @@ class ModelResult:
         return total_hits

     def get_filter_mask(self, label: str, filter_threshold: float) -> dict[str, bool]:
-        """Return a mask for filtered subsequences.
-
+        """Return a mask for filtered subsequences.
+
+        The mask is a dictionary with subsequence names as keys and boolean values
+        indicating whether the subsequence is above the filter threshold for the given label.
+        A value of -1 for filter_threshold indicates that the subsequence with the maximum score
+        for the given label should be returned.
+        """
+        if filter_threshold < 0 and not filter_threshold == -1 or filter_threshold > 1:
            raise ValueError("The filter threshold must be between 0 and 1.")

        scores = self.get_scores()
        scores.pop("total")
-
-
-
-
+        if not filter_threshold == -1:
+            return {
+                subsequence: score[label] >= filter_threshold
+                for subsequence, score in scores.items()
+            }
+        else:
+            return {
+                subsequence: score[label] == max(score.values())
+                for subsequence, score in scores.items()
+            }

    def get_filtered_subsequence_labels(
        self, label: str, filter_threshold: float = 0.7
xspect/ncbi.py
CHANGED

@@ -2,8 +2,8 @@

 from enum import Enum
 from pathlib import Path
-import requests
 import time
+import requests

 # pylint: disable=line-too-long

@@ -55,14 +55,14 @@ class NCBIHandler:
         elapsed_time = now - self.last_request_time
         if elapsed_time < self.min_interval:
             time.sleep(self.min_interval - elapsed_time)
-        self.last_request_time = now
+        self.last_request_time = now

-    def _make_request(self, endpoint: str, timeout: int =
+    def _make_request(self, endpoint: str, timeout: int = 15) -> dict:
         """Make a request to the NCBI Datasets API.

         Args:
             endpoint (str): The endpoint to make the request to.
-            timeout (int, optional): The timeout for the request in seconds. Defaults to
+            timeout (int, optional): The timeout for the request in seconds. Defaults to 10.

         Returns:
             dict: The response from the API.
@@ -229,7 +229,9 @@ class NCBIHandler:
                 == "OK"
             ]
         except (IndexError, KeyError, TypeError):
-            print(
+            print(
+                f"Could not get {assembly_level.value} accessions for taxon with ID: {taxon_id}. Skipping."
+            )
             return []
         return accessions[:count]  # Limit to count

@@ -255,7 +257,7 @@ class NCBIHandler:

         self._enforce_rate_limit()

-        response = requests.get(self.base_url + endpoint, stream=True, timeout=
+        response = requests.get(self.base_url + endpoint, stream=True, timeout=15)
         if response.status_code != 200:
             response.raise_for_status()

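The rate-limit context above sleeps until min_interval seconds have passed since the previous request before issuing the next one. A standalone sketch of that pattern (the class name and interval value here are illustrative, not the package's API):

```python
import time

class RateLimitedClient:
    """Illustrative sketch: block until min_interval seconds elapse between requests."""

    def __init__(self, min_interval: float = 0.4):
        self.min_interval = min_interval
        self.last_request_time = 0.0

    def _enforce_rate_limit(self) -> None:
        now = time.time()
        elapsed_time = now - self.last_request_time
        if elapsed_time < self.min_interval:
            # Wait out the remainder of the interval before the next request.
            time.sleep(self.min_interval - elapsed_time)
        self.last_request_time = now
```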
xspect/train.py
CHANGED

@@ -243,11 +243,19 @@ def train_from_ncbi(
     cobs_dir.mkdir(parents=True, exist_ok=True)
     svm_dir.mkdir(parents=True, exist_ok=True)

-
-
-
-
-
+    # download assemblies
+    all_accessions = sum(accessions.values(), [])
+    batch_size = 100
+    accession_paths = {}
+    for i in range(0, len(all_accessions), batch_size):
+        batch = all_accessions[i : i + batch_size]
+        ncbi_handler.download_assemblies(accessions=batch, output_dir=tmp_dir)
+        extract_zip(
+            tmp_dir / "ncbi_dataset.zip", tmp_dir / f"batch-{i}-{i+batch_size}"
+        )
+        accession_paths.update(
+            get_ncbi_dataset_accession_paths(tmp_dir / f"batch-{i}-{i+batch_size}")
+        )

    # select accessions
    cobs_accessions = {}
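The new download loop processes accessions in fixed-size batches of 100 and extracts each downloaded archive into its own batch directory. The chunking itself reduces to this pattern (the accession strings below are placeholders):

```python
# Placeholder accessions to demonstrate the 100-per-batch chunking.
all_accessions = [f"GCF_{i:09d}.1" for i in range(250)]
batch_size = 100

for i in range(0, len(all_accessions), batch_size):
    batch = all_accessions[i : i + batch_size]
    print(f"batch-{i}-{i + batch_size}: {len(batch)} accessions")
# batch-0-100: 100 accessions
# batch-100-200: 100 accessions
# batch-200-300: 50 accessions
```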
xspect/web.py
ADDED

@@ -0,0 +1,173 @@
+"""FastAPI-based web application for XspecT."""
+
+from uuid import uuid4
+import json
+from shutil import copyfileobj
+import importlib.resources as pkg_resources
+from fastapi import APIRouter, FastAPI, HTTPException, UploadFile, BackgroundTasks
+from fastapi.responses import RedirectResponse
+from xspect.definitions import get_xspect_runs_path, get_xspect_upload_path
+from xspect.download_models import download_test_models
+import xspect.model_management as mm
+from xspect.train import train_from_ncbi
+from xspect import classify, filter_sequences
+from fastapi.staticfiles import StaticFiles
+
+app = FastAPI()
+app.mount(
+    "/xspect-web",
+    StaticFiles(directory=str(pkg_resources.files("xspect") / "xspect-web" / "dist")),
+    name="static",
+)
+router = APIRouter()
+
+
+@app.get("/")
+def root():
+    """Root endpoint, forwards to /xspect-web/index.html."""
+    return RedirectResponse(url="/xspect-web/index.html")
+
+
+@router.get("/download-filters")
+def download_filters():
+    """Download filters."""
+    download_test_models("http://assets.adrianromberg.com/xspect-models.zip")
+
+
+@router.get("/classification-result")
+def get_classification_result(uuid: str):
+    """Get classification result."""
+    result_path = get_xspect_runs_path() / f"result_{uuid}.json"
+    if not result_path.exists():
+        raise HTTPException(
+            status_code=404, detail="No result found for the specified uuid."
+        )
+    return json.loads(result_path.read_text())
+
+
+@router.post("/classify")
+def classify_post(
+    classification_type: str,
+    model: str,
+    file: str,
+    background_tasks: BackgroundTasks,
+    step: int = 1,
+):
+    """Classify uploaded sample."""
+    input_path = get_xspect_upload_path() / file
+    if not input_path.exists():
+        raise FileNotFoundError(f"File {input_path} does not exist.")
+
+    uuid = str(uuid4())
+
+    if classification_type == "Genus":
+        background_tasks.add_task(
+            classify.classify_genus,
+            model,
+            input_path,
+            get_xspect_runs_path() / f"result_{uuid}.json",
+            step=step,
+        )
+        return {"message": "Classification started.", "uuid": uuid}
+
+    elif classification_type == "Species":
+        background_tasks.add_task(
+            classify.classify_species,
+            model,
+            input_path,
+            get_xspect_runs_path() / f"result_{uuid}.json",
+            step=step,
+        )
+        return {"message": "Classification started.", "uuid": uuid}
+
+    raise NotImplementedError(
+        f"Classification type {classification_type} is not implemented."
+    )
+
+
+router.post("/filter")
+
+
+def filter_post(
+    filter_type: str,
+    model: str,
+    input_file: str,
+    threshold: float,
+    filter_species: str = None,
+):
+    """Filter sequences."""
+    input_path = get_xspect_upload_path() / input_file
+    output_path = get_xspect_upload_path() / f"filtered_{input_file}"
+
+    if not input_path.exists():
+        raise FileNotFoundError(f"File {input_path} does not exist.")
+
+    if filter_type == "Genus":
+        filter_sequences.filter_genus(model, input_path, output_path, threshold)
+        return {"message": "Genus Filtering started."}
+
+    elif filter_type == "Species":
+        filter_sequences.filter_species(
+            model, filter_species, input_path, output_path, threshold
+        )
+        return {"message": "Species Filtering started."}
+
+    raise NotImplementedError(f"Filter type {filter_type} is not implemented.")
+
+
+@router.post("/train")
+def train(genus: str, background_tasks: BackgroundTasks, svm_steps: int = 1):
+    """Train NCBI model."""
+    background_tasks.add_task(train_from_ncbi, genus, svm_steps)
+
+    return {"message": "Training started."}
+
+
+@router.get("/list-models")
+def list_models():
+    """List available models."""
+    return mm.get_models()
+
+
+@router.get("/model-metadata")
+def get_model_metadata(model_slug: str):
+    """Get metadata of a model."""
+    return mm.get_model_metadata(model_slug)
+
+
+@router.post("/model-metadata")
+def post_model_metadata(model_slug: str, author: str, author_email: str):
+    """Update metadata of a model."""
+    try:
+        mm.update_model_metadata(model_slug, author, author_email)
+    except ValueError as e:
+        return {"error": str(e)}
+    return {"message": "Metadata updated."}
+
+
+@router.post("/model-display-name")
+def post_model_display_name(model_slug: str, filter_id: str, display_name: str):
+    """Update display name of a filter in a model."""
+    try:
+        mm.update_model_display_name(model_slug, filter_id, display_name)
+    except ValueError as e:
+        return {"error": str(e)}
+    return {"message": "Display name updated."}
+
+
+@router.post("/upload-file")
+def upload_file(file: UploadFile):
+    """Upload file to the server."""
+    upload_path = get_xspect_upload_path() / file.filename
+
+    if not upload_path.exists():
+        try:
+            with upload_path.open("wb") as buffer:
+                copyfileobj(file.file, buffer)
+        finally:
+            file.file.close()
+
+    return {"filename": file.filename}
+
+
+app.include_router(router, prefix="/api", tags=["api"])
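The new web module replaces the previous xspect/fastapi.py, mounts the built React frontend under /xspect-web, and exposes the JSON API under /api. A hedged sketch of serving it locally with uvicorn (host and port are arbitrary choices, not package defaults):

```python
# Assumes the xspect package (>= 0.5.0) and uvicorn are installed.
import uvicorn

if __name__ == "__main__":
    # Serves the frontend at http://127.0.0.1:8000/ and the API under /api,
    # e.g. GET http://127.0.0.1:8000/api/list-models
    uvicorn.run("xspect.web:app", host="127.0.0.1", port=8000)
```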
xspect/xspect-web/.gitignore
ADDED

@@ -0,0 +1,24 @@
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+lerna-debug.log*
+
+node_modules
+dist
+dist-ssr
+*.local
+
+# Editor directories and files
+.vscode/*
+!.vscode/extensions.json
+.idea
+.DS_Store
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?
xspect/xspect-web/README.md
ADDED

@@ -0,0 +1,54 @@
+# React + TypeScript + Vite
+
+This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
+
+Currently, two official plugins are available:
+
+- [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Babel](https://babeljs.io/) for Fast Refresh
+- [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh
+
+## Expanding the ESLint configuration
+
+If you are developing a production application, we recommend updating the configuration to enable type-aware lint rules:
+
+```js
+export default tseslint.config({
+  extends: [
+    // Remove ...tseslint.configs.recommended and replace with this
+    ...tseslint.configs.recommendedTypeChecked,
+    // Alternatively, use this for stricter rules
+    ...tseslint.configs.strictTypeChecked,
+    // Optionally, add this for stylistic rules
+    ...tseslint.configs.stylisticTypeChecked,
+  ],
+  languageOptions: {
+    // other options...
+    parserOptions: {
+      project: ['./tsconfig.node.json', './tsconfig.app.json'],
+      tsconfigRootDir: import.meta.dirname,
+    },
+  },
+})
+```
+
+You can also install [eslint-plugin-react-x](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-x) and [eslint-plugin-react-dom](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-dom) for React-specific lint rules:
+
+```js
+// eslint.config.js
+import reactX from 'eslint-plugin-react-x'
+import reactDom from 'eslint-plugin-react-dom'
+
+export default tseslint.config({
+  plugins: {
+    // Add the react-x and react-dom plugins
+    'react-x': reactX,
+    'react-dom': reactDom,
+  },
+  rules: {
+    // other rules...
+    // Enable its recommended typescript rules
+    ...reactX.configs['recommended-typescript'].rules,
+    ...reactDom.configs.recommended.rules,
+  },
+})
+```
xspect/xspect-web/components.json
ADDED

@@ -0,0 +1,21 @@
+{
+  "$schema": "https://ui.shadcn.com/schema.json",
+  "style": "new-york",
+  "rsc": false,
+  "tsx": true,
+  "tailwind": {
+    "config": "",
+    "css": "src/index.css",
+    "baseColor": "neutral",
+    "cssVariables": true,
+    "prefix": ""
+  },
+  "aliases": {
+    "components": "@/components",
+    "utils": "@/lib/utils",
+    "ui": "@/components/ui",
+    "lib": "@/lib",
+    "hooks": "@/hooks"
+  },
+  "iconLibrary": "lucide"
+}