XspecT 0.5.4__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xspect-0.5.4 → xspect-0.6.0}/PKG-INFO +4 -1
- {xspect-0.5.4 → xspect-0.6.0}/docs/benchmark.md +5 -5
- {xspect-0.5.4 → xspect-0.6.0}/docs/cli.md +2 -0
- {xspect-0.5.4 → xspect-0.6.0}/pyproject.toml +5 -2
- {xspect-0.5.4 → xspect-0.6.0}/scripts/benchmark/main.nf +128 -53
- {xspect-0.5.4 → xspect-0.6.0}/src/XspecT.egg-info/PKG-INFO +4 -1
- {xspect-0.5.4 → xspect-0.6.0}/src/XspecT.egg-info/SOURCES.txt +5 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/XspecT.egg-info/requires.txt +3 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/classify.py +8 -1
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/definitions.py +19 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/main.py +64 -3
- xspect-0.6.0/src/xspect/misclassification_detection/mapping.py +168 -0
- xspect-0.6.0/src/xspect/misclassification_detection/point_pattern_analysis.py +102 -0
- xspect-0.6.0/src/xspect/misclassification_detection/simulate_reads.py +55 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/models/probabilistic_filter_model.py +122 -4
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/models/probabilistic_filter_svm_model.py +7 -7
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/models/result.py +2 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/ncbi.py +82 -7
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/train.py +21 -4
- xspect-0.6.0/tests/__init__.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/tests/test_cli.py +3 -1
- xspect-0.6.0/tests/test_misclassification_detection.py +92 -0
- {xspect-0.5.4 → xspect-0.6.0}/tests/test_ncbi.py +53 -3
- {xspect-0.5.4 → xspect-0.6.0}/.github/workflows/black.yml +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/.github/workflows/docs.yml +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/.github/workflows/pylint.yml +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/.github/workflows/pypi.yml +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/.github/workflows/test.yml +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/.gitignore +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/LICENSE +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/README.md +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/docs/contributing.md +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/docs/index.md +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/docs/quickstart.md +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/docs/understanding.md +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/docs/web.md +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/mkdocs.yml +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/scripts/benchmark/classify/main.nf +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/scripts/benchmark/environment.yml +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/scripts/benchmark/nextflow.config +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/scripts/benchmark-data/download_data.slurm +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/setup.cfg +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/XspecT.egg-info/dependency_links.txt +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/XspecT.egg-info/entry_points.txt +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/XspecT.egg-info/top_level.txt +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/__init__.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/download_models.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/file_io.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/filter_sequences.py +0 -0
- {xspect-0.5.4/src/xspect/mlst_feature → xspect-0.6.0/src/xspect/misclassification_detection}/__init__.py +0 -0
- {xspect-0.5.4/src/xspect/models → xspect-0.6.0/src/xspect/mlst_feature}/__init__.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/mlst_feature/mlst_helper.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/mlst_feature/pub_mlst_handler.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/model_management.py +0 -0
- {xspect-0.5.4/tests → xspect-0.6.0/src/xspect/models}/__init__.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/models/probabilistic_filter_mlst_model.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/models/probabilistic_single_filter_model.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/web.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/.gitignore +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/README.md +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/components.json +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/dist/assets/index-Ceo58xui.css +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/dist/assets/index-Dt_UlbgE.js +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/dist/index.html +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/dist/vite.svg +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/eslint.config.js +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/index.html +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/package-lock.json +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/package.json +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/pnpm-lock.yaml +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/public/vite.svg +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/App.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/api.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/assets/react.svg +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/classification-form.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/classify.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/data-table.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/dropdown-checkboxes.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/dropdown-slider.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/filter-form.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/filter.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/filtering-result.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/header.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/landing.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/models-details.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/models.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/result-chart.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/result.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/spinner.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/accordion.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/button.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/card.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/chart.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/command.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/dialog.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/dropdown-menu.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/file-upload.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/form.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/input.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/label.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/navigation-menu.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/popover.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/select.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/separator.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/slider.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/switch.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/table.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/tabs.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/index.css +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/lib/utils.ts +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/main.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/types.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/utils.tsx +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/vite-env.d.ts +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/tsconfig.app.json +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/tsconfig.json +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/tsconfig.node.json +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/vite.config.ts +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/tests/conftest.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/tests/test_file_io.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/tests/test_model_management.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/tests/test_model_result.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/tests/test_probabilisitc_filter_mlst_model.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/tests/test_probabilistic_filter_model.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/tests/test_probabilistic_filter_svm_model.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/tests/test_probabilistic_single_filter_model.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/tests/test_pub_mlst_handler.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/tests/test_train.py +0 -0
- {xspect-0.5.4 → xspect-0.6.0}/tests/test_web.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: XspecT
|
|
3
|
-
Version: 0.5.4
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Tool to monitor and characterize pathogens using Bloom filters.
|
|
5
5
|
License: MIT License
|
|
6
6
|
|
|
@@ -45,6 +45,9 @@ Requires-Dist: xxhash
|
|
|
45
45
|
Requires-Dist: fastapi
|
|
46
46
|
Requires-Dist: uvicorn
|
|
47
47
|
Requires-Dist: python-multipart
|
|
48
|
+
Requires-Dist: mappy
|
|
49
|
+
Requires-Dist: pysam
|
|
50
|
+
Requires-Dist: numpy
|
|
48
51
|
Provides-Extra: docs
|
|
49
52
|
Requires-Dist: mkdocs-material; extra == "docs"
|
|
50
53
|
Requires-Dist: mkdocs-include-markdown-plugin; extra == "docs"
|
|
@@ -6,12 +6,12 @@ The benchmark was performed by first downloading all available Acinetobacter genome
|
|
|
6
6
|
|
|
7
7
|
## Benchmark Results
|
|
8
8
|
|
|
9
|
-
The benchmark results show that XspecT achieves high classification accuracy, with an overall accuracy of
|
|
9
|
+
The benchmark results show that XspecT achieves high classification accuracy, with an overall accuracy of nearly 100% for whole genomes and 82% for simulated reads. However, the low macro-average F1 score (0.41) for the read dataset highlights a substantial class imbalance.
|
|
10
10
|
|
|
11
|
-
|
|
|
12
|
-
|
|
13
|
-
|
|
|
14
|
-
|
|
|
11
|
+
| Dataset | Total Samples | Matches | Mismatches | Match Rate | Mismatch Rate | Accuracy | Macro Avg F1 | Weighted Avg F1 |
|
|
12
|
+
|-----------|--------------:|----------:|-----------:|-----------:|--------------:|---------:|-------------:|----------------:|
|
|
13
|
+
| Assembly | 44,905 | 44,879 | 26 | 99.94% | 0.06% | ≈1.00 | 0.95 | 1.00 |
|
|
14
|
+
| Reads | 9,200,000 | 7,526,902 | 1,673,098 | 81.81% | 18.19% | 0.82 | 0.41 | 0.87 |
|
|
15
15
|
|
|
16
16
|
## Running the benchmark yourself
|
|
17
17
|
|
|
@@ -43,6 +43,8 @@ To train a model with NCBI data, run the following command:
|
|
|
43
43
|
xspect models train ncbi
|
|
44
44
|
```
|
|
45
45
|
|
|
46
|
+
By default, XspecT filters out NCBI accessions that do not meet minimum N50 thresholds, have an inconclusive taxonomy check status, or are deemed atypical by NCBI. Furthermore, species with "Candidatus" and "sp." in their species names are filtered out. To disable filtering behavior, use the respective flag (see `xspect models train ncbi --help`).
|
|
47
|
+
|
|
46
48
|
If you would like to train models with manually curated data from a directory, you can use:
|
|
47
49
|
|
|
48
50
|
```bash
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "XspecT"
|
|
3
|
-
version = "0.5.4"
|
|
3
|
+
version = "0.6.0"
|
|
4
4
|
description = "Tool to monitor and characterize pathogens using Bloom filters."
|
|
5
5
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
6
6
|
license = {file = "LICENSE"}
|
|
@@ -18,7 +18,10 @@ dependencies = [
|
|
|
18
18
|
"xxhash",
|
|
19
19
|
"fastapi",
|
|
20
20
|
"uvicorn",
|
|
21
|
-
"python-multipart"
|
|
21
|
+
"python-multipart",
|
|
22
|
+
"mappy",
|
|
23
|
+
"pysam",
|
|
24
|
+
"numpy"
|
|
22
25
|
]
|
|
23
26
|
classifiers = [
|
|
24
27
|
"Intended Audience :: Developers",
|
|
@@ -127,8 +127,8 @@ process createAssemblyTable {
|
|
|
127
127
|
}
|
|
128
128
|
|
|
129
129
|
process summarizeClassifications {
|
|
130
|
-
conda "
|
|
131
|
-
cpus
|
|
130
|
+
conda "conda-forge::pandas"
|
|
131
|
+
cpus 4
|
|
132
132
|
memory '16 GB'
|
|
133
133
|
publishDir "results"
|
|
134
134
|
|
|
@@ -141,24 +141,38 @@ process summarizeClassifications {
|
|
|
141
141
|
|
|
142
142
|
script:
|
|
143
143
|
"""
|
|
144
|
-
|
|
144
|
+
#!/usr/bin/env python
|
|
145
|
+
import pandas as pd
|
|
146
|
+
import json
|
|
147
|
+
import os
|
|
148
|
+
|
|
149
|
+
df = pd.read_csv('${assemblies}', sep='\\t')
|
|
150
|
+
df['Prediction'] = 'unknown'
|
|
151
|
+
|
|
152
|
+
classifications = '${classifications}'.split()
|
|
145
153
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
154
|
+
with open(classifications[0]) as f:
|
|
155
|
+
data = json.load(f)
|
|
156
|
+
keys = data["scores"]["total"]
|
|
157
|
+
for key in keys:
|
|
158
|
+
df[str(key)] = pd.NA
|
|
150
159
|
|
|
151
|
-
for json_file in
|
|
152
|
-
basename
|
|
153
|
-
accession
|
|
154
|
-
prediction=\$(jq '.["prediction"]' \$json_file | tr -d '"')
|
|
160
|
+
for json_file in classifications:
|
|
161
|
+
basename = os.path.basename(json_file).replace('.json', '')
|
|
162
|
+
accession = '_'.join(basename.split('_')[:2])
|
|
155
163
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
164
|
+
with open(json_file, 'r') as f:
|
|
165
|
+
data = json.load(f)
|
|
166
|
+
prediction = data.get('prediction', 'unknown')
|
|
167
|
+
|
|
168
|
+
mask = df['Assembly Accession'].str.contains(accession, na=False)
|
|
169
|
+
df.loc[mask, 'Prediction'] = prediction
|
|
170
|
+
|
|
171
|
+
scores = data.get('scores', {}).get('total', {})
|
|
172
|
+
for species_id, score in scores.items():
|
|
173
|
+
df.loc[mask, str(species_id)] = score
|
|
174
|
+
|
|
175
|
+
df.to_csv('classifications.tsv', sep='\\t', index=False)
|
|
162
176
|
"""
|
|
163
177
|
}
|
|
164
178
|
|
|
@@ -188,7 +202,10 @@ process selectForReadGen {
|
|
|
188
202
|
for id, accession in species_model["training_accessions"].items():
|
|
189
203
|
training_accessions.extend(accession)
|
|
190
204
|
|
|
191
|
-
assemblies = assemblies[
|
|
205
|
+
assemblies = assemblies[
|
|
206
|
+
(assemblies['Assembly Level'] == 'Complete Genome') |
|
|
207
|
+
(assemblies['Assembly Level'] == 'Chromosome')
|
|
208
|
+
]
|
|
192
209
|
assemblies = assemblies[~assemblies['Assembly Accession'].isin(training_accessions)]
|
|
193
210
|
|
|
194
211
|
# use up to three assemblies for each species
|
|
@@ -238,8 +255,8 @@ process generateReads {
|
|
|
238
255
|
}
|
|
239
256
|
|
|
240
257
|
process summarizeReadClassifications {
|
|
241
|
-
conda "conda-forge::
|
|
242
|
-
cpus
|
|
258
|
+
conda "conda-forge::pandas"
|
|
259
|
+
cpus 4
|
|
243
260
|
memory '16 GB'
|
|
244
261
|
publishDir "results"
|
|
245
262
|
|
|
@@ -252,29 +269,55 @@ process summarizeReadClassifications {
|
|
|
252
269
|
|
|
253
270
|
script:
|
|
254
271
|
"""
|
|
255
|
-
|
|
272
|
+
#!/usr/bin/env python
|
|
273
|
+
import pandas as pd
|
|
274
|
+
import json
|
|
275
|
+
import os
|
|
256
276
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
277
|
+
df_assemblies = pd.read_csv('${read_assemblies}', sep='\\t')
|
|
278
|
+
|
|
279
|
+
# Create a mapping of accession to species ID
|
|
280
|
+
accession_to_species = dict(zip(df_assemblies['Assembly Accession'], df_assemblies['Species ID']))
|
|
281
|
+
|
|
282
|
+
results = []
|
|
283
|
+
|
|
284
|
+
classifications = '${read_classifications}'.split()
|
|
285
|
+
for json_file in classifications:
|
|
286
|
+
basename = os.path.basename(json_file).replace('.json', '')
|
|
287
|
+
accession = '_'.join(basename.split('_')[:2])
|
|
260
288
|
|
|
261
|
-
|
|
262
|
-
species_id=\$(awk -F'\t' -v acc="\$accession" '\$1 == acc {print \$6}' ${read_assemblies})
|
|
289
|
+
species_id = accession_to_species.get(accession, 'unknown')
|
|
263
290
|
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
.scores
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
291
|
+
with open(json_file, 'r') as f:
|
|
292
|
+
data = json.load(f)
|
|
293
|
+
scores = data.get('scores', {})
|
|
294
|
+
|
|
295
|
+
for read_name, read_scores in scores.items():
|
|
296
|
+
if read_name != 'total':
|
|
297
|
+
if read_scores:
|
|
298
|
+
max_score = max(read_scores.values())
|
|
299
|
+
max_species = [species for species, score in read_scores.items() if score == max_score]
|
|
300
|
+
prediction = max_species[0] if len(max_species) == 1 else "ambiguous"
|
|
301
|
+
|
|
302
|
+
result = {
|
|
303
|
+
'Assembly Accession': accession,
|
|
304
|
+
'Read': read_name,
|
|
305
|
+
'Prediction': prediction,
|
|
306
|
+
'Species ID': species_id
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
for species, score in read_scores.items():
|
|
310
|
+
result[species] = score
|
|
311
|
+
|
|
312
|
+
results.append(result)
|
|
313
|
+
|
|
314
|
+
df_results = pd.DataFrame(results)
|
|
315
|
+
df_results.to_csv('read_classifications.tsv', sep='\\t', index=False)
|
|
273
316
|
"""
|
|
274
317
|
}
|
|
275
318
|
|
|
276
319
|
process calculateStats {
|
|
277
|
-
conda "conda-forge::pandas"
|
|
320
|
+
conda "conda-forge::pandas conda-forge::scikit-learn"
|
|
278
321
|
cpus 2
|
|
279
322
|
memory '16 GB'
|
|
280
323
|
publishDir "results"
|
|
@@ -290,33 +333,65 @@ process calculateStats {
|
|
|
290
333
|
"""
|
|
291
334
|
#!/usr/bin/env python
|
|
292
335
|
import pandas as pd
|
|
336
|
+
from sklearn.metrics import classification_report
|
|
293
337
|
|
|
338
|
+
# --- Assembly ---
|
|
294
339
|
df_assembly = pd.read_csv('${assembly_classifications}', sep='\\t')
|
|
295
340
|
df_assembly['Species ID'] = df_assembly['Species ID'].astype(str)
|
|
296
341
|
df_assembly['Prediction'] = df_assembly['Prediction'].astype(str)
|
|
297
|
-
assembly_matches = df_assembly.loc[df_assembly['Species ID'] == df_assembly['Prediction']]
|
|
298
|
-
assembly_mismatches = df_assembly.loc[df_assembly['Species ID'] != df_assembly['Prediction']]
|
|
299
342
|
|
|
343
|
+
y_true_asm = df_assembly['Species ID']
|
|
344
|
+
y_pred_asm = df_assembly['Prediction']
|
|
345
|
+
|
|
346
|
+
asm_matches = (y_true_asm == y_pred_asm).sum()
|
|
347
|
+
asm_total = len(df_assembly)
|
|
348
|
+
|
|
349
|
+
asm_labels = sorted(set(y_true_asm.unique()).union(set(y_pred_asm.unique())))
|
|
350
|
+
asm_report = classification_report(
|
|
351
|
+
y_true_asm,
|
|
352
|
+
y_pred_asm,
|
|
353
|
+
labels=asm_labels,
|
|
354
|
+
zero_division=0
|
|
355
|
+
)
|
|
356
|
+
|
|
357
|
+
# --- Reads ---
|
|
300
358
|
df_read = pd.read_csv('${read_classifications}', sep='\\t')
|
|
301
359
|
df_read['Species ID'] = df_read['Species ID'].astype(str)
|
|
302
360
|
df_read['Prediction'] = df_read['Prediction'].astype(str)
|
|
303
|
-
read_matches = df_read.loc[df_read['Species ID'] == df_read['Prediction']]
|
|
304
|
-
read_mismatches = df_read.loc[df_read['Species ID'] != df_read['Prediction']]
|
|
305
361
|
|
|
362
|
+
y_true_read = df_read['Species ID']
|
|
363
|
+
y_pred_read = df_read['Prediction']
|
|
364
|
+
|
|
365
|
+
read_matches = (y_true_read == y_pred_read).sum()
|
|
366
|
+
read_total = len(df_read)
|
|
367
|
+
|
|
368
|
+
read_labels = sorted(set(y_true_read.unique()).union(set(y_pred_read.unique())))
|
|
369
|
+
read_report = classification_report(
|
|
370
|
+
y_true_read,
|
|
371
|
+
y_pred_read,
|
|
372
|
+
labels=read_labels,
|
|
373
|
+
zero_division=0
|
|
374
|
+
)
|
|
375
|
+
|
|
376
|
+
# --- Output ---
|
|
306
377
|
with open('stats.txt', 'w') as f:
|
|
307
|
-
f.write(
|
|
308
|
-
f.write(f"
|
|
309
|
-
f.write(f"
|
|
310
|
-
f.write(f"
|
|
311
|
-
f.write(f"
|
|
312
|
-
|
|
313
|
-
f.write("
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
f.write(
|
|
317
|
-
f.write(f"
|
|
318
|
-
f.write(f"
|
|
319
|
-
f.write(f"
|
|
378
|
+
f.write("=== Assembly ===\\n")
|
|
379
|
+
f.write(f"Total: {asm_total}\\n")
|
|
380
|
+
f.write(f"Matches: {asm_matches}\\n")
|
|
381
|
+
f.write(f"Mismatches: {asm_total - asm_matches}\\n")
|
|
382
|
+
f.write(f"Match Rate: {asm_matches / asm_total * 100:.2f}%\\n")
|
|
383
|
+
f.write(f"Mismatch Rate: {(asm_total - asm_matches) / asm_total * 100:.2f}%\\n\\n")
|
|
384
|
+
f.write("Classification report (per class):\\n")
|
|
385
|
+
f.write(asm_report + "\\n")
|
|
386
|
+
|
|
387
|
+
f.write("=== Reads ===\\n")
|
|
388
|
+
f.write(f"Total: {read_total}\\n")
|
|
389
|
+
f.write(f"Matches: {read_matches}\\n")
|
|
390
|
+
f.write(f"Mismatches: {read_total - read_matches}\\n")
|
|
391
|
+
f.write(f"Match Rate: {read_matches / read_total * 100:.2f}%\\n")
|
|
392
|
+
f.write(f"Mismatch Rate: {(read_total - read_matches) / read_total * 100:.2f}%\\n\\n")
|
|
393
|
+
f.write("Classification report (per class):\\n")
|
|
394
|
+
f.write(read_report + "\\n")
|
|
320
395
|
"""
|
|
321
396
|
}
|
|
322
397
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: XspecT
|
|
3
|
-
Version: 0.5.4
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Tool to monitor and characterize pathogens using Bloom filters.
|
|
5
5
|
License: MIT License
|
|
6
6
|
|
|
@@ -45,6 +45,9 @@ Requires-Dist: xxhash
|
|
|
45
45
|
Requires-Dist: fastapi
|
|
46
46
|
Requires-Dist: uvicorn
|
|
47
47
|
Requires-Dist: python-multipart
|
|
48
|
+
Requires-Dist: mappy
|
|
49
|
+
Requires-Dist: pysam
|
|
50
|
+
Requires-Dist: numpy
|
|
48
51
|
Provides-Extra: docs
|
|
49
52
|
Requires-Dist: mkdocs-material; extra == "docs"
|
|
50
53
|
Requires-Dist: mkdocs-include-markdown-plugin; extra == "docs"
|
|
@@ -37,6 +37,10 @@ src/xspect/model_management.py
|
|
|
37
37
|
src/xspect/ncbi.py
|
|
38
38
|
src/xspect/train.py
|
|
39
39
|
src/xspect/web.py
|
|
40
|
+
src/xspect/misclassification_detection/__init__.py
|
|
41
|
+
src/xspect/misclassification_detection/mapping.py
|
|
42
|
+
src/xspect/misclassification_detection/point_pattern_analysis.py
|
|
43
|
+
src/xspect/misclassification_detection/simulate_reads.py
|
|
40
44
|
src/xspect/mlst_feature/__init__.py
|
|
41
45
|
src/xspect/mlst_feature/mlst_helper.py
|
|
42
46
|
src/xspect/mlst_feature/pub_mlst_handler.py
|
|
@@ -110,6 +114,7 @@ tests/__init__.py
|
|
|
110
114
|
tests/conftest.py
|
|
111
115
|
tests/test_cli.py
|
|
112
116
|
tests/test_file_io.py
|
|
117
|
+
tests/test_misclassification_detection.py
|
|
113
118
|
tests/test_model_management.py
|
|
114
119
|
tests/test_model_result.py
|
|
115
120
|
tests/test_ncbi.py
|
|
@@ -46,6 +46,7 @@ def classify_species(
|
|
|
46
46
|
output_path: Path,
|
|
47
47
|
step: int = 1,
|
|
48
48
|
display_name: bool = False,
|
|
49
|
+
validation: bool = False,
|
|
49
50
|
):
|
|
50
51
|
"""
|
|
51
52
|
Classify the species of sequences.
|
|
@@ -59,6 +60,7 @@ def classify_species(
|
|
|
59
60
|
output_path (Path): The path to the output file where results will be saved.
|
|
60
61
|
step (int): The amount of kmers to be skipped.
|
|
61
62
|
display_name (bool): Includes a display name for each tax_ID.
|
|
63
|
+
validation (bool): Sorts out misclassified reads.
|
|
62
64
|
"""
|
|
63
65
|
ProbabilisticFilterSVMModel = import_module(
|
|
64
66
|
"xspect.models.probabilistic_filter_svm_model"
|
|
@@ -69,7 +71,12 @@ def classify_species(
|
|
|
69
71
|
input_paths, get_output_path = prepare_input_output_paths(input_path)
|
|
70
72
|
|
|
71
73
|
for idx, current_path in enumerate(input_paths):
|
|
72
|
-
result = model.predict(
|
|
74
|
+
result = model.predict(
|
|
75
|
+
current_path,
|
|
76
|
+
step=step,
|
|
77
|
+
display_name=display_name,
|
|
78
|
+
validation=validation,
|
|
79
|
+
)
|
|
73
80
|
result.input_source = current_path.name
|
|
74
81
|
cls_path = get_output_path(idx, output_path)
|
|
75
82
|
result.save(cls_path)
|
|
@@ -89,3 +89,22 @@ def get_xspect_mlst_path() -> Path:
|
|
|
89
89
|
mlst_path = get_xspect_root_path() / "mlst"
|
|
90
90
|
mlst_path.mkdir(exist_ok=True, parents=True)
|
|
91
91
|
return mlst_path
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def get_xspect_misclassification_path() -> Path:
|
|
95
|
+
"""
|
|
96
|
+
Notes:
|
|
97
|
+
Developed by Oemer Cetin as part of a Bsc thesis at Goethe University Frankfurt am Main (2025).
|
|
98
|
+
(An Integration of Alignment-Free and Alignment-Based Approaches for Bacterial Taxon Assignment)
|
|
99
|
+
|
|
100
|
+
Return the path to the XspecT Misclassification directory.
|
|
101
|
+
|
|
102
|
+
Returns the path to the XspecT Misclassification directory, which is located within the XspecT data
|
|
103
|
+
directory. If the directory does not exist, it creates the directory.
|
|
104
|
+
|
|
105
|
+
Returns:
|
|
106
|
+
Path: The path to the XspecT Misclassification directory.
|
|
107
|
+
"""
|
|
108
|
+
misclassification_path = get_xspect_root_path() / "misclassification"
|
|
109
|
+
misclassification_path.mkdir(exist_ok=True, parents=True)
|
|
110
|
+
return misclassification_path
|
|
@@ -87,13 +87,62 @@ def train():
|
|
|
87
87
|
help="Email of the author.",
|
|
88
88
|
default=None,
|
|
89
89
|
)
|
|
90
|
-
|
|
90
|
+
@click.option(
|
|
91
|
+
"--min-n50",
|
|
92
|
+
type=int,
|
|
93
|
+
help="Minimum contig N50 to filter the accessions (default: 10000).",
|
|
94
|
+
default=10000,
|
|
95
|
+
)
|
|
96
|
+
@click.option(
|
|
97
|
+
"--include-atypical/--exclude-atypical",
|
|
98
|
+
help="Include or exclude atypical accessions (default: exclude).",
|
|
99
|
+
default=False,
|
|
100
|
+
)
|
|
101
|
+
@click.option(
|
|
102
|
+
"--allow-inconclusive",
|
|
103
|
+
is_flag=True,
|
|
104
|
+
help="Allow the use of accessions with inconclusive taxonomy check status for training.",
|
|
105
|
+
default=False,
|
|
106
|
+
)
|
|
107
|
+
@click.option(
|
|
108
|
+
"--allow-candidatus",
|
|
109
|
+
is_flag=True,
|
|
110
|
+
help="Allow the use of Candidatus species for training.",
|
|
111
|
+
default=False,
|
|
112
|
+
)
|
|
113
|
+
@click.option(
|
|
114
|
+
"--allow-sp",
|
|
115
|
+
is_flag=True,
|
|
116
|
+
help="Allow the use of species with 'sp.' in their names for training.",
|
|
117
|
+
default=False,
|
|
118
|
+
)
|
|
119
|
+
def train_ncbi(
|
|
120
|
+
model_genus,
|
|
121
|
+
svm_steps,
|
|
122
|
+
author,
|
|
123
|
+
author_email,
|
|
124
|
+
min_n50,
|
|
125
|
+
include_atypical,
|
|
126
|
+
allow_inconclusive,
|
|
127
|
+
allow_candidatus,
|
|
128
|
+
allow_sp,
|
|
129
|
+
):
|
|
91
130
|
"""Train a species and a genus model based on NCBI data."""
|
|
92
131
|
click.echo(f"Training {model_genus} species and genus metagenome model.")
|
|
93
132
|
try:
|
|
94
133
|
train_from_ncbi = import_module("xspect.train").train_from_ncbi
|
|
95
134
|
|
|
96
|
-
train_from_ncbi(
|
|
135
|
+
train_from_ncbi(
|
|
136
|
+
model_genus,
|
|
137
|
+
svm_steps,
|
|
138
|
+
author,
|
|
139
|
+
author_email,
|
|
140
|
+
min_n50=min_n50,
|
|
141
|
+
exclude_atypical=not include_atypical,
|
|
142
|
+
allow_inconclusive=allow_inconclusive,
|
|
143
|
+
allow_candidatus=allow_candidatus,
|
|
144
|
+
allow_sp=allow_sp,
|
|
145
|
+
)
|
|
97
146
|
except ValueError as e:
|
|
98
147
|
click.echo(f"Error: {e}")
|
|
99
148
|
return
|
|
@@ -287,8 +336,19 @@ def classify_genus(model_genus, input_path, output_path, sparse_sampling_step):
|
|
|
287
336
|
help="Includes the display names next to taxonomy-IDs.",
|
|
288
337
|
is_flag=True,
|
|
289
338
|
)
|
|
339
|
+
@click.option(
|
|
340
|
+
"-v",
|
|
341
|
+
"--validation",
|
|
342
|
+
help="Detects misclassification for small reads or contigs.",
|
|
343
|
+
is_flag=True,
|
|
344
|
+
)
|
|
290
345
|
def classify_species(
|
|
291
|
-
model_genus,
|
|
346
|
+
model_genus,
|
|
347
|
+
input_path,
|
|
348
|
+
output_path,
|
|
349
|
+
sparse_sampling_step,
|
|
350
|
+
display_names,
|
|
351
|
+
validation,
|
|
292
352
|
):
|
|
293
353
|
"""Classify samples using a species model."""
|
|
294
354
|
click.echo("Classifying...")
|
|
@@ -300,6 +360,7 @@ def classify_species(
|
|
|
300
360
|
Path(output_path),
|
|
301
361
|
sparse_sampling_step,
|
|
302
362
|
display_names,
|
|
363
|
+
validation,
|
|
303
364
|
)
|
|
304
365
|
|
|
305
366
|
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Mapping handler for the alignment-based misclassification detection.
|
|
3
|
+
|
|
4
|
+
Notes:
|
|
5
|
+
Developed by Oemer Cetin as part of a Bsc thesis at Goethe University Frankfurt am Main (2025).
|
|
6
|
+
(An Integration of Alignment-Free and Alignment-Based Approaches for Bacterial Taxon Assignment)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import mappy, pysam, os, csv
|
|
10
|
+
from Bio import SeqIO
|
|
11
|
+
from xspect.definitions import fasta_endings
|
|
12
|
+
|
|
13
|
+
__author__ = "Cetin, Oemer"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MappingHandler:
|
|
17
|
+
"""Handler class for all mapping related procedures."""
|
|
18
|
+
|
|
19
|
+
def __init__(self, ref_genome_path: str, reads_path: str) -> None:
|
|
20
|
+
"""
|
|
21
|
+
Initialise the mapping handler.
|
|
22
|
+
|
|
23
|
+
This method sets up the paths to the reference genome and query sequences.
|
|
24
|
+
Additionally, the paths to the output formats (SAM, BAM and TSV) are generated.
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
ref_genome_path (str): The path to the reference genome.
|
|
28
|
+
reads_path (str): The path to the query sequences.
|
|
29
|
+
"""
|
|
30
|
+
if not os.path.isfile(ref_genome_path):
|
|
31
|
+
raise ValueError("The path to the reference genome does not exist.")
|
|
32
|
+
|
|
33
|
+
if not os.path.isfile(reads_path):
|
|
34
|
+
raise ValueError("The path to the reads does not exist.")
|
|
35
|
+
|
|
36
|
+
if not ref_genome_path.endswith(tuple(fasta_endings)) and reads_path.endswith(
|
|
37
|
+
tuple(fasta_endings)
|
|
38
|
+
):
|
|
39
|
+
raise ValueError("The files must be FASTA-files!")
|
|
40
|
+
|
|
41
|
+
stem = reads_path.rsplit(".", 1)[0] + "_mapped"
|
|
42
|
+
self.ref_genome_path = ref_genome_path
|
|
43
|
+
self.reads_path = reads_path
|
|
44
|
+
self.sam = stem + ".sam"
|
|
45
|
+
self.bam = stem + ".sorted.bam"
|
|
46
|
+
self.tsv = stem + ".start_coordinates.tsv"
|
|
47
|
+
|
|
48
|
+
def map_reads_onto_reference(self) -> None:
|
|
49
|
+
"""
|
|
50
|
+
A Method that maps reads against the respective reference genome.
|
|
51
|
+
|
|
52
|
+
This function creates a SAM file via Mappy and converts it into a BAM file.
|
|
53
|
+
"""
|
|
54
|
+
# create header (entry = sequences of the reference genome)
|
|
55
|
+
ref_seq = [
|
|
56
|
+
{"SN": rec.id, "LN": len(rec.seq)}
|
|
57
|
+
for rec in SeqIO.parse(self.ref_genome_path, "fasta")
|
|
58
|
+
]
|
|
59
|
+
header = {"HD": {"VN": "1.0"}, "SQ": ref_seq}
|
|
60
|
+
target_id = {sequence["SN"]: number for number, sequence in enumerate(ref_seq)}
|
|
61
|
+
|
|
62
|
+
reads = list(SeqIO.parse(self.reads_path, "fasta"))
|
|
63
|
+
if not reads:
|
|
64
|
+
raise ValueError("Reads file is empty.")
|
|
65
|
+
|
|
66
|
+
read_length = len(reads[0].seq)
|
|
67
|
+
preset = "map-ont" if read_length > 150 else "sr"
|
|
68
|
+
# create SAM-file
|
|
69
|
+
aln = mappy.Aligner(self.ref_genome_path, preset=preset)
|
|
70
|
+
with pysam.AlignmentFile(self.sam, "w", header=header) as out:
|
|
71
|
+
for read in reads:
|
|
72
|
+
read_seq = str(read.seq)
|
|
73
|
+
for hit in aln.map(read_seq):
|
|
74
|
+
if hit.cigar_str is None:
|
|
75
|
+
continue
|
|
76
|
+
# add soft-clips so CIGAR length == len(read_seq) IMPORTANT!!
|
|
77
|
+
leftS = hit.q_st
|
|
78
|
+
rightS = len(read_seq) - hit.q_en
|
|
79
|
+
cigar = (
|
|
80
|
+
(f"{leftS}S" if leftS > 0 else "")
|
|
81
|
+
+ hit.cigar_str
|
|
82
|
+
+ (f"{rightS}S" if rightS > 0 else "")
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
mapped_region = pysam.AlignedSegment()
|
|
86
|
+
mapped_region.query_name = read.id
|
|
87
|
+
mapped_region.query_sequence = read_seq
|
|
88
|
+
mapped_region.flag = 16 if hit.strand == -1 else 0
|
|
89
|
+
mapped_region.reference_id = target_id[hit.ctg]
|
|
90
|
+
mapped_region.reference_start = hit.r_st
|
|
91
|
+
mapped_region.mapping_quality = (
|
|
92
|
+
hit.mapq or 255
|
|
93
|
+
) # 0-60 (255 means unavailable)
|
|
94
|
+
mapped_region.cigarstring = cigar
|
|
95
|
+
out.write(mapped_region)
|
|
96
|
+
break # keep only primary
|
|
97
|
+
|
|
98
|
+
# create BAM-file
|
|
99
|
+
pysam.sort("-o", self.bam, self.sam)
|
|
100
|
+
pysam.index(self.bam)
|
|
101
|
+
|
|
102
|
+
def get_total_genome_length(self) -> int:
|
|
103
|
+
"""
|
|
104
|
+
Get the genome length from a BAM-file.
|
|
105
|
+
|
|
106
|
+
This function opens a BAM-file and extracts the genome length information.
|
|
107
|
+
|
|
108
|
+
Returns:
|
|
109
|
+
int: The genome length.
|
|
110
|
+
"""
|
|
111
|
+
with pysam.AlignmentFile(self.bam, "rb") as bam:
|
|
112
|
+
return sum(bam.lengths)
|
|
113
|
+
|
|
114
|
+
def extract_starting_coordinates(self) -> None:
|
|
115
|
+
"""
|
|
116
|
+
Extract starting coordinates of mapped regions from a BAM-file.
|
|
117
|
+
|
|
118
|
+
This function scans through a BAM-file and creates a TSV-file.
|
|
119
|
+
The information that is extracted is the starting coordinate for each mapped read.
|
|
120
|
+
"""
|
|
121
|
+
# create tsv-file with all start positions
|
|
122
|
+
with open(self.tsv, "w") as tsv:
|
|
123
|
+
tsv.write("reference_genome\tread\tmapped_starting_coordinate\n")
|
|
124
|
+
try:
|
|
125
|
+
with pysam.AlignmentFile(self.bam, "rb") as bam:
|
|
126
|
+
entry = {
|
|
127
|
+
i: seq["SN"] for i, seq in enumerate(bam.header.to_dict()["SQ"])
|
|
128
|
+
}
|
|
129
|
+
seen = set()
|
|
130
|
+
for ref_seq in bam.references:
|
|
131
|
+
for hit in bam.fetch(ref_seq):
|
|
132
|
+
if (
|
|
133
|
+
hit.is_unmapped
|
|
134
|
+
or hit.is_secondary
|
|
135
|
+
or hit.is_supplementary
|
|
136
|
+
):
|
|
137
|
+
continue
|
|
138
|
+
key = (hit.reference_id, hit.reference_start)
|
|
139
|
+
if key in seen:
|
|
140
|
+
continue
|
|
141
|
+
seen.add(key)
|
|
142
|
+
tsv.write(
|
|
143
|
+
f"{entry[hit.reference_id]}\t{hit.query_name}\t{hit.reference_start}\n"
|
|
144
|
+
)
|
|
145
|
+
except ValueError:
|
|
146
|
+
tsv.write("dummy_reference\tdummy_read\t1000\n")
|
|
147
|
+
|
|
148
|
+
def get_start_coordinates(self) -> list[int]:
|
|
149
|
+
"""
|
|
150
|
+
Get the coordinates of a TSV-file.
|
|
151
|
+
|
|
152
|
+
This function opens a TSV-file and saves all starting coordinates in a list.
|
|
153
|
+
|
|
154
|
+
Returns:
|
|
155
|
+
list[int]: The list containing all starting coordinates.
|
|
156
|
+
|
|
157
|
+
Raises:
|
|
158
|
+
ValueError: If no column with starting coordinates is found.
|
|
159
|
+
"""
|
|
160
|
+
coordinates = []
|
|
161
|
+
with open(self.tsv, "r", newline="") as f:
|
|
162
|
+
reader = csv.DictReader(f, delimiter="\t")
|
|
163
|
+
for row in reader:
|
|
164
|
+
val = row.get("mapped_starting_coordinate")
|
|
165
|
+
if val is None:
|
|
166
|
+
raise ValueError("Column with starting coordinates not found.")
|
|
167
|
+
coordinates.append(int(val))
|
|
168
|
+
return coordinates
|