XspecT 0.5.4__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. {xspect-0.5.4 → xspect-0.6.0}/PKG-INFO +4 -1
  2. {xspect-0.5.4 → xspect-0.6.0}/docs/benchmark.md +5 -5
  3. {xspect-0.5.4 → xspect-0.6.0}/docs/cli.md +2 -0
  4. {xspect-0.5.4 → xspect-0.6.0}/pyproject.toml +5 -2
  5. {xspect-0.5.4 → xspect-0.6.0}/scripts/benchmark/main.nf +128 -53
  6. {xspect-0.5.4 → xspect-0.6.0}/src/XspecT.egg-info/PKG-INFO +4 -1
  7. {xspect-0.5.4 → xspect-0.6.0}/src/XspecT.egg-info/SOURCES.txt +5 -0
  8. {xspect-0.5.4 → xspect-0.6.0}/src/XspecT.egg-info/requires.txt +3 -0
  9. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/classify.py +8 -1
  10. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/definitions.py +19 -0
  11. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/main.py +64 -3
  12. xspect-0.6.0/src/xspect/misclassification_detection/mapping.py +168 -0
  13. xspect-0.6.0/src/xspect/misclassification_detection/point_pattern_analysis.py +102 -0
  14. xspect-0.6.0/src/xspect/misclassification_detection/simulate_reads.py +55 -0
  15. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/models/probabilistic_filter_model.py +122 -4
  16. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/models/probabilistic_filter_svm_model.py +7 -7
  17. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/models/result.py +2 -0
  18. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/ncbi.py +82 -7
  19. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/train.py +21 -4
  20. xspect-0.6.0/tests/__init__.py +0 -0
  21. {xspect-0.5.4 → xspect-0.6.0}/tests/test_cli.py +3 -1
  22. xspect-0.6.0/tests/test_misclassification_detection.py +92 -0
  23. {xspect-0.5.4 → xspect-0.6.0}/tests/test_ncbi.py +53 -3
  24. {xspect-0.5.4 → xspect-0.6.0}/.github/workflows/black.yml +0 -0
  25. {xspect-0.5.4 → xspect-0.6.0}/.github/workflows/docs.yml +0 -0
  26. {xspect-0.5.4 → xspect-0.6.0}/.github/workflows/pylint.yml +0 -0
  27. {xspect-0.5.4 → xspect-0.6.0}/.github/workflows/pypi.yml +0 -0
  28. {xspect-0.5.4 → xspect-0.6.0}/.github/workflows/test.yml +0 -0
  29. {xspect-0.5.4 → xspect-0.6.0}/.gitignore +0 -0
  30. {xspect-0.5.4 → xspect-0.6.0}/LICENSE +0 -0
  31. {xspect-0.5.4 → xspect-0.6.0}/README.md +0 -0
  32. {xspect-0.5.4 → xspect-0.6.0}/docs/contributing.md +0 -0
  33. {xspect-0.5.4 → xspect-0.6.0}/docs/index.md +0 -0
  34. {xspect-0.5.4 → xspect-0.6.0}/docs/quickstart.md +0 -0
  35. {xspect-0.5.4 → xspect-0.6.0}/docs/understanding.md +0 -0
  36. {xspect-0.5.4 → xspect-0.6.0}/docs/web.md +0 -0
  37. {xspect-0.5.4 → xspect-0.6.0}/mkdocs.yml +0 -0
  38. {xspect-0.5.4 → xspect-0.6.0}/scripts/benchmark/classify/main.nf +0 -0
  39. {xspect-0.5.4 → xspect-0.6.0}/scripts/benchmark/environment.yml +0 -0
  40. {xspect-0.5.4 → xspect-0.6.0}/scripts/benchmark/nextflow.config +0 -0
  41. {xspect-0.5.4 → xspect-0.6.0}/scripts/benchmark-data/download_data.slurm +0 -0
  42. {xspect-0.5.4 → xspect-0.6.0}/setup.cfg +0 -0
  43. {xspect-0.5.4 → xspect-0.6.0}/src/XspecT.egg-info/dependency_links.txt +0 -0
  44. {xspect-0.5.4 → xspect-0.6.0}/src/XspecT.egg-info/entry_points.txt +0 -0
  45. {xspect-0.5.4 → xspect-0.6.0}/src/XspecT.egg-info/top_level.txt +0 -0
  46. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/__init__.py +0 -0
  47. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/download_models.py +0 -0
  48. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/file_io.py +0 -0
  49. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/filter_sequences.py +0 -0
  50. {xspect-0.5.4/src/xspect/mlst_feature → xspect-0.6.0/src/xspect/misclassification_detection}/__init__.py +0 -0
  51. {xspect-0.5.4/src/xspect/models → xspect-0.6.0/src/xspect/mlst_feature}/__init__.py +0 -0
  52. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/mlst_feature/mlst_helper.py +0 -0
  53. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/mlst_feature/pub_mlst_handler.py +0 -0
  54. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/model_management.py +0 -0
  55. {xspect-0.5.4/tests → xspect-0.6.0/src/xspect/models}/__init__.py +0 -0
  56. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/models/probabilistic_filter_mlst_model.py +0 -0
  57. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/models/probabilistic_single_filter_model.py +0 -0
  58. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/web.py +0 -0
  59. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/.gitignore +0 -0
  60. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/README.md +0 -0
  61. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/components.json +0 -0
  62. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/dist/assets/index-Ceo58xui.css +0 -0
  63. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/dist/assets/index-Dt_UlbgE.js +0 -0
  64. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/dist/index.html +0 -0
  65. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/dist/vite.svg +0 -0
  66. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/eslint.config.js +0 -0
  67. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/index.html +0 -0
  68. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/package-lock.json +0 -0
  69. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/package.json +0 -0
  70. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/pnpm-lock.yaml +0 -0
  71. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/public/vite.svg +0 -0
  72. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/App.tsx +0 -0
  73. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/api.tsx +0 -0
  74. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/assets/react.svg +0 -0
  75. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/classification-form.tsx +0 -0
  76. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/classify.tsx +0 -0
  77. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/data-table.tsx +0 -0
  78. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/dropdown-checkboxes.tsx +0 -0
  79. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/dropdown-slider.tsx +0 -0
  80. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/filter-form.tsx +0 -0
  81. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/filter.tsx +0 -0
  82. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/filtering-result.tsx +0 -0
  83. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/header.tsx +0 -0
  84. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/landing.tsx +0 -0
  85. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/models-details.tsx +0 -0
  86. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/models.tsx +0 -0
  87. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/result-chart.tsx +0 -0
  88. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/result.tsx +0 -0
  89. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/spinner.tsx +0 -0
  90. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/accordion.tsx +0 -0
  91. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/button.tsx +0 -0
  92. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/card.tsx +0 -0
  93. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/chart.tsx +0 -0
  94. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/command.tsx +0 -0
  95. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/dialog.tsx +0 -0
  96. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/dropdown-menu.tsx +0 -0
  97. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/file-upload.tsx +0 -0
  98. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/form.tsx +0 -0
  99. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/input.tsx +0 -0
  100. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/label.tsx +0 -0
  101. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/navigation-menu.tsx +0 -0
  102. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/popover.tsx +0 -0
  103. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/select.tsx +0 -0
  104. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/separator.tsx +0 -0
  105. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/slider.tsx +0 -0
  106. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/switch.tsx +0 -0
  107. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/table.tsx +0 -0
  108. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/components/ui/tabs.tsx +0 -0
  109. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/index.css +0 -0
  110. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/lib/utils.ts +0 -0
  111. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/main.tsx +0 -0
  112. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/types.tsx +0 -0
  113. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/utils.tsx +0 -0
  114. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/src/vite-env.d.ts +0 -0
  115. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/tsconfig.app.json +0 -0
  116. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/tsconfig.json +0 -0
  117. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/tsconfig.node.json +0 -0
  118. {xspect-0.5.4 → xspect-0.6.0}/src/xspect/xspect-web/vite.config.ts +0 -0
  119. {xspect-0.5.4 → xspect-0.6.0}/tests/conftest.py +0 -0
  120. {xspect-0.5.4 → xspect-0.6.0}/tests/test_file_io.py +0 -0
  121. {xspect-0.5.4 → xspect-0.6.0}/tests/test_model_management.py +0 -0
  122. {xspect-0.5.4 → xspect-0.6.0}/tests/test_model_result.py +0 -0
  123. {xspect-0.5.4 → xspect-0.6.0}/tests/test_probabilisitc_filter_mlst_model.py +0 -0
  124. {xspect-0.5.4 → xspect-0.6.0}/tests/test_probabilistic_filter_model.py +0 -0
  125. {xspect-0.5.4 → xspect-0.6.0}/tests/test_probabilistic_filter_svm_model.py +0 -0
  126. {xspect-0.5.4 → xspect-0.6.0}/tests/test_probabilistic_single_filter_model.py +0 -0
  127. {xspect-0.5.4 → xspect-0.6.0}/tests/test_pub_mlst_handler.py +0 -0
  128. {xspect-0.5.4 → xspect-0.6.0}/tests/test_train.py +0 -0
  129. {xspect-0.5.4 → xspect-0.6.0}/tests/test_web.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: XspecT
3
- Version: 0.5.4
3
+ Version: 0.6.0
4
4
  Summary: Tool to monitor and characterize pathogens using Bloom filters.
5
5
  License: MIT License
6
6
 
@@ -45,6 +45,9 @@ Requires-Dist: xxhash
45
45
  Requires-Dist: fastapi
46
46
  Requires-Dist: uvicorn
47
47
  Requires-Dist: python-multipart
48
+ Requires-Dist: mappy
49
+ Requires-Dist: pysam
50
+ Requires-Dist: numpy
48
51
  Provides-Extra: docs
49
52
  Requires-Dist: mkdocs-material; extra == "docs"
50
53
  Requires-Dist: mkdocs-include-markdown-plugin; extra == "docs"
@@ -6,12 +6,12 @@ The benchmark was performed by first download all available Acinetobacter genome
6
6
 
7
7
  ## Benchmark Results
8
8
 
9
- The benchmark results show that XspecT achieves high classification accuracy, with an overall accuracy of 99.94% for whole genomes and 87.11% for simulated reads.
9
+ The benchmark results show that XspecT achieves high classification accuracy, with an overall accuracy of nearly 100% for whole genomes and 82% for simulated reads. However, the low macro-average F1 score (0.41) for the read dataset highlights a substantial class imbalance.
10
10
 
11
- | Category | Total | Matches | Mismatches | Match Rate | Mismatch Rate |
12
- |-------------------|----------|----------|------------|------------|---------------|
13
- | Assemblies | 44,905 | 44,879 | 26 | 99.94% | 0.06% |
14
- | Simulated reads | 9,000,000| 7,839,877| 1,160,123 | 87.11% | 12.89% |
11
+ | Dataset | Total Samples | Matches | Mismatches | Match Rate | Mismatch Rate | Accuracy | Macro Avg F1 | Weighted Avg F1 |
12
+ |-----------|--------------:|----------:|-----------:|-----------:|--------------:|---------:|-------------:|----------------:|
13
+ | Assembly | 44,905 | 44,879 | 26 | 99.94% | 0.06% | ≈1.00 | 0.95 | 1.00 |
14
+ | Reads | 9,200,000 | 7,526,902 | 1,673,098 | 81.81% | 18.19% | 0.82 | 0.41 | 0.87 |
15
15
 
16
16
  ## Running the benchmark yourself
17
17
 
@@ -43,6 +43,8 @@ To train a model with NCBI data, run the following command:
43
43
  xspect models train ncbi
44
44
  ```
45
45
 
46
+ By default, XspecT filters out NCBI accessions that do not meet minimum N50 thresholds, have an inconclusive taxonomy check status, or are deemed atypical by NCBI. Furthermore, species with "Candidatus" and "sp." in their species names are filtered out. To disable this filtering behavior, use the respective flag (see `xspect models train ncbi --help`).
47
+
46
48
  If you would like to train models with manually curated data from a directory, you can use:
47
49
 
48
50
  ```bash
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "XspecT"
3
- version = "0.5.4"
3
+ version = "0.6.0"
4
4
  description = "Tool to monitor and characterize pathogens using Bloom filters."
5
5
  readme = {file = "README.md", content-type = "text/markdown"}
6
6
  license = {file = "LICENSE"}
@@ -18,7 +18,10 @@ dependencies = [
18
18
  "xxhash",
19
19
  "fastapi",
20
20
  "uvicorn",
21
- "python-multipart"
21
+ "python-multipart",
22
+ "mappy",
23
+ "pysam",
24
+ "numpy"
22
25
  ]
23
26
  classifiers = [
24
27
  "Intended Audience :: Developers",
@@ -127,8 +127,8 @@ process createAssemblyTable {
127
127
  }
128
128
 
129
129
  process summarizeClassifications {
130
- conda "jq"
131
- cpus 2
130
+ conda "conda-forge::pandas"
131
+ cpus 4
132
132
  memory '16 GB'
133
133
  publishDir "results"
134
134
 
@@ -141,24 +141,38 @@ process summarizeClassifications {
141
141
 
142
142
  script:
143
143
  """
144
- cp ${assemblies} classifications.tsv
144
+ #!/usr/bin/env python
145
+ import pandas as pd
146
+ import json
147
+ import os
148
+
149
+ df = pd.read_csv('${assemblies}', sep='\\t')
150
+ df['Prediction'] = 'unknown'
151
+
152
+ classifications = '${classifications}'.split()
145
153
 
146
- awk 'BEGIN {FS=OFS="\t"}
147
- NR==1 {print \$0, "Prediction"}
148
- NR>1 {print \$0, "unknown"}' classifications.tsv > temp_classifications.tsv
149
- mv temp_classifications.tsv classifications.tsv
154
+ with open(classifications[0]) as f:
155
+ data = json.load(f)
156
+ keys = data["scores"]["total"]
157
+ for key in keys:
158
+ df[str(key)] = pd.NA
150
159
 
151
- for json_file in ${classifications}; do
152
- basename=\$(basename \$json_file .json)
153
- accession=\$(echo \$basename | cut -d'_' -f1-2)
154
- prediction=\$(jq '.["prediction"]' \$json_file | tr -d '"')
160
+ for json_file in classifications:
161
+ basename = os.path.basename(json_file).replace('.json', '')
162
+ accession = '_'.join(basename.split('_')[:2])
155
163
 
156
- awk -v acc="\$accession" -v pred="\$prediction" 'BEGIN {FS=OFS="\t"}
157
- NR==1 {print}
158
- NR>1 && \$1 ~ acc {\$NF=pred; print}
159
- NR>1 && \$1 !~ acc {print}' classifications.tsv > temp_classifications.tsv
160
- mv temp_classifications.tsv classifications.tsv
161
- done
164
+ with open(json_file, 'r') as f:
165
+ data = json.load(f)
166
+ prediction = data.get('prediction', 'unknown')
167
+
168
+ mask = df['Assembly Accession'].str.contains(accession, na=False)
169
+ df.loc[mask, 'Prediction'] = prediction
170
+
171
+ scores = data.get('scores', {}).get('total', {})
172
+ for species_id, score in scores.items():
173
+ df.loc[mask, str(species_id)] = score
174
+
175
+ df.to_csv('classifications.tsv', sep='\\t', index=False)
162
176
  """
163
177
  }
164
178
 
@@ -188,7 +202,10 @@ process selectForReadGen {
188
202
  for id, accession in species_model["training_accessions"].items():
189
203
  training_accessions.extend(accession)
190
204
 
191
- assemblies = assemblies[assemblies['Assembly Level'] == 'Complete Genome']
205
+ assemblies = assemblies[
206
+ (assemblies['Assembly Level'] == 'Complete Genome') |
207
+ (assemblies['Assembly Level'] == 'Chromosome')
208
+ ]
192
209
  assemblies = assemblies[~assemblies['Assembly Accession'].isin(training_accessions)]
193
210
 
194
211
  # use up to three assemblies for each species
@@ -238,8 +255,8 @@ process generateReads {
238
255
  }
239
256
 
240
257
  process summarizeReadClassifications {
241
- conda "conda-forge::jq"
242
- cpus 2
258
+ conda "conda-forge::pandas"
259
+ cpus 4
243
260
  memory '16 GB'
244
261
  publishDir "results"
245
262
 
@@ -252,29 +269,55 @@ process summarizeReadClassifications {
252
269
 
253
270
  script:
254
271
  """
255
- echo -e "Assembly Accession\tRead\tPrediction\tSpecies ID" > read_classifications.tsv
272
+ #!/usr/bin/env python
273
+ import pandas as pd
274
+ import json
275
+ import os
256
276
 
257
- for json_file in ${read_classifications}; do
258
- basename=\$(basename \$json_file .json)
259
- accession=\$(echo \$basename | cut -d'_' -f1-2)
277
+ df_assemblies = pd.read_csv('${read_assemblies}', sep='\\t')
278
+
279
+ # Create a mapping of accession to species ID
280
+ accession_to_species = dict(zip(df_assemblies['Assembly Accession'], df_assemblies['Species ID']))
281
+
282
+ results = []
283
+
284
+ classifications = '${read_classifications}'.split()
285
+ for json_file in classifications:
286
+ basename = os.path.basename(json_file).replace('.json', '')
287
+ accession = '_'.join(basename.split('_')[:2])
260
288
 
261
- # Get species ID from assemblies table
262
- species_id=\$(awk -F'\t' -v acc="\$accession" '\$1 == acc {print \$6}' ${read_assemblies})
289
+ species_id = accession_to_species.get(accession, 'unknown')
263
290
 
264
- # Extract predictions from JSON and append to TSV
265
- jq -r --arg acc "\$accession" --arg species "\$species_id" '
266
- .scores
267
- | to_entries[]
268
- | select(.key != "total")
269
- | "\\(.key)\\t\\(.value | to_entries | max_by(.value) | .key)"
270
- | "\\(\$acc)\\t" + . + "\\t\\(\$species)"
271
- ' "\$json_file" >> read_classifications.tsv
272
- done
291
+ with open(json_file, 'r') as f:
292
+ data = json.load(f)
293
+ scores = data.get('scores', {})
294
+
295
+ for read_name, read_scores in scores.items():
296
+ if read_name != 'total':
297
+ if read_scores:
298
+ max_score = max(read_scores.values())
299
+ max_species = [species for species, score in read_scores.items() if score == max_score]
300
+ prediction = max_species[0] if len(max_species) == 1 else "ambiguous"
301
+
302
+ result = {
303
+ 'Assembly Accession': accession,
304
+ 'Read': read_name,
305
+ 'Prediction': prediction,
306
+ 'Species ID': species_id
307
+ }
308
+
309
+ for species, score in read_scores.items():
310
+ result[species] = score
311
+
312
+ results.append(result)
313
+
314
+ df_results = pd.DataFrame(results)
315
+ df_results.to_csv('read_classifications.tsv', sep='\\t', index=False)
273
316
  """
274
317
  }
275
318
 
276
319
  process calculateStats {
277
- conda "conda-forge::pandas"
320
+ conda "conda-forge::pandas conda-forge::scikit-learn"
278
321
  cpus 2
279
322
  memory '16 GB'
280
323
  publishDir "results"
@@ -290,33 +333,65 @@ process calculateStats {
290
333
  """
291
334
  #!/usr/bin/env python
292
335
  import pandas as pd
336
+ from sklearn.metrics import classification_report
293
337
 
338
+ # --- Assembly ---
294
339
  df_assembly = pd.read_csv('${assembly_classifications}', sep='\\t')
295
340
  df_assembly['Species ID'] = df_assembly['Species ID'].astype(str)
296
341
  df_assembly['Prediction'] = df_assembly['Prediction'].astype(str)
297
- assembly_matches = df_assembly.loc[df_assembly['Species ID'] == df_assembly['Prediction']]
298
- assembly_mismatches = df_assembly.loc[df_assembly['Species ID'] != df_assembly['Prediction']]
299
342
 
343
+ y_true_asm = df_assembly['Species ID']
344
+ y_pred_asm = df_assembly['Prediction']
345
+
346
+ asm_matches = (y_true_asm == y_pred_asm).sum()
347
+ asm_total = len(df_assembly)
348
+
349
+ asm_labels = sorted(set(y_true_asm.unique()).union(set(y_pred_asm.unique())))
350
+ asm_report = classification_report(
351
+ y_true_asm,
352
+ y_pred_asm,
353
+ labels=asm_labels,
354
+ zero_division=0
355
+ )
356
+
357
+ # --- Reads ---
300
358
  df_read = pd.read_csv('${read_classifications}', sep='\\t')
301
359
  df_read['Species ID'] = df_read['Species ID'].astype(str)
302
360
  df_read['Prediction'] = df_read['Prediction'].astype(str)
303
- read_matches = df_read.loc[df_read['Species ID'] == df_read['Prediction']]
304
- read_mismatches = df_read.loc[df_read['Species ID'] != df_read['Prediction']]
305
361
 
362
+ y_true_read = df_read['Species ID']
363
+ y_pred_read = df_read['Prediction']
364
+
365
+ read_matches = (y_true_read == y_pred_read).sum()
366
+ read_total = len(df_read)
367
+
368
+ read_labels = sorted(set(y_true_read.unique()).union(set(y_pred_read.unique())))
369
+ read_report = classification_report(
370
+ y_true_read,
371
+ y_pred_read,
372
+ labels=read_labels,
373
+ zero_division=0
374
+ )
375
+
376
+ # --- Output ---
306
377
  with open('stats.txt', 'w') as f:
307
- f.write(f"Assembly Total: {len(df_assembly)}\\n")
308
- f.write(f"Assembly Matches: {len(assembly_matches)}\\n")
309
- f.write(f"Assembly Mismatches: {len(assembly_mismatches)}\\n")
310
- f.write(f"Assembly Match Rate: {len(assembly_matches) / len(df_assembly) * 100:.2f}%\\n")
311
- f.write(f"Assembly Mismatch Rate: {len(assembly_mismatches) / len(df_assembly) * 100:.2f}%\\n")
312
-
313
- f.write("\\n")
314
-
315
- f.write(f"Read Total: {len(df_read)}\\n")
316
- f.write(f"Read Matches: {len(read_matches)}\\n")
317
- f.write(f"Read Mismatches: {len(read_mismatches)}\\n")
318
- f.write(f"Read Match Rate: {len(read_matches) / len(df_read) * 100:.2f}%\\n")
319
- f.write(f"Read Mismatch Rate: {len(read_mismatches) / len(df_read) * 100:.2f}%\\n")
378
+ f.write("=== Assembly ===\\n")
379
+ f.write(f"Total: {asm_total}\\n")
380
+ f.write(f"Matches: {asm_matches}\\n")
381
+ f.write(f"Mismatches: {asm_total - asm_matches}\\n")
382
+ f.write(f"Match Rate: {asm_matches / asm_total * 100:.2f}%\\n")
383
+ f.write(f"Mismatch Rate: {(asm_total - asm_matches) / asm_total * 100:.2f}%\\n\\n")
384
+ f.write("Classification report (per class):\\n")
385
+ f.write(asm_report + "\\n")
386
+
387
+ f.write("=== Reads ===\\n")
388
+ f.write(f"Total: {read_total}\\n")
389
+ f.write(f"Matches: {read_matches}\\n")
390
+ f.write(f"Mismatches: {read_total - read_matches}\\n")
391
+ f.write(f"Match Rate: {read_matches / read_total * 100:.2f}%\\n")
392
+ f.write(f"Mismatch Rate: {(read_total - read_matches) / read_total * 100:.2f}%\\n\\n")
393
+ f.write("Classification report (per class):\\n")
394
+ f.write(read_report + "\\n")
320
395
  """
321
396
  }
322
397
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: XspecT
3
- Version: 0.5.4
3
+ Version: 0.6.0
4
4
  Summary: Tool to monitor and characterize pathogens using Bloom filters.
5
5
  License: MIT License
6
6
 
@@ -45,6 +45,9 @@ Requires-Dist: xxhash
45
45
  Requires-Dist: fastapi
46
46
  Requires-Dist: uvicorn
47
47
  Requires-Dist: python-multipart
48
+ Requires-Dist: mappy
49
+ Requires-Dist: pysam
50
+ Requires-Dist: numpy
48
51
  Provides-Extra: docs
49
52
  Requires-Dist: mkdocs-material; extra == "docs"
50
53
  Requires-Dist: mkdocs-include-markdown-plugin; extra == "docs"
@@ -37,6 +37,10 @@ src/xspect/model_management.py
37
37
  src/xspect/ncbi.py
38
38
  src/xspect/train.py
39
39
  src/xspect/web.py
40
+ src/xspect/misclassification_detection/__init__.py
41
+ src/xspect/misclassification_detection/mapping.py
42
+ src/xspect/misclassification_detection/point_pattern_analysis.py
43
+ src/xspect/misclassification_detection/simulate_reads.py
40
44
  src/xspect/mlst_feature/__init__.py
41
45
  src/xspect/mlst_feature/mlst_helper.py
42
46
  src/xspect/mlst_feature/pub_mlst_handler.py
@@ -110,6 +114,7 @@ tests/__init__.py
110
114
  tests/conftest.py
111
115
  tests/test_cli.py
112
116
  tests/test_file_io.py
117
+ tests/test_misclassification_detection.py
113
118
  tests/test_model_management.py
114
119
  tests/test_model_result.py
115
120
  tests/test_ncbi.py
@@ -11,6 +11,9 @@ xxhash
11
11
  fastapi
12
12
  uvicorn
13
13
  python-multipart
14
+ mappy
15
+ pysam
16
+ numpy
14
17
 
15
18
  [docs]
16
19
  mkdocs-material
@@ -46,6 +46,7 @@ def classify_species(
46
46
  output_path: Path,
47
47
  step: int = 1,
48
48
  display_name: bool = False,
49
+ validation: bool = False,
49
50
  ):
50
51
  """
51
52
  Classify the species of sequences.
@@ -59,6 +60,7 @@ def classify_species(
59
60
  output_path (Path): The path to the output file where results will be saved.
60
61
  step (int): The amount of kmers to be skipped.
61
62
  display_name (bool): Includes a display name for each tax_ID.
63
+ validation (bool): Sorts out misclassified reads.
62
64
  """
63
65
  ProbabilisticFilterSVMModel = import_module(
64
66
  "xspect.models.probabilistic_filter_svm_model"
@@ -69,7 +71,12 @@ def classify_species(
69
71
  input_paths, get_output_path = prepare_input_output_paths(input_path)
70
72
 
71
73
  for idx, current_path in enumerate(input_paths):
72
- result = model.predict(current_path, step=step, display_name=display_name)
74
+ result = model.predict(
75
+ current_path,
76
+ step=step,
77
+ display_name=display_name,
78
+ validation=validation,
79
+ )
73
80
  result.input_source = current_path.name
74
81
  cls_path = get_output_path(idx, output_path)
75
82
  result.save(cls_path)
@@ -89,3 +89,22 @@ def get_xspect_mlst_path() -> Path:
89
89
  mlst_path = get_xspect_root_path() / "mlst"
90
90
  mlst_path.mkdir(exist_ok=True, parents=True)
91
91
  return mlst_path
92
+
93
+
94
+ def get_xspect_misclassification_path() -> Path:
95
+ """
96
+ Notes:
97
+ Developed by Oemer Cetin as part of a Bsc thesis at Goethe University Frankfurt am Main (2025).
98
+ (An Integration of Alignment-Free and Alignment-Based Approaches for Bacterial Taxon Assignment)
99
+
100
+ Return the path to the XspecT Misclassification directory.
101
+
102
+ Returns the path to the XspecT Misclassification directory, which is located within the XspecT data
103
+ directory. If the directory does not exist, it creates the directory.
104
+
105
+ Returns:
106
+ Path: The path to the XspecT Misclassification directory.
107
+ """
108
+ misclassification_path = get_xspect_root_path() / "misclassification"
109
+ misclassification_path.mkdir(exist_ok=True, parents=True)
110
+ return misclassification_path
@@ -87,13 +87,62 @@ def train():
87
87
  help="Email of the author.",
88
88
  default=None,
89
89
  )
90
- def train_ncbi(model_genus, svm_steps, author, author_email):
90
+ @click.option(
91
+ "--min-n50",
92
+ type=int,
93
+ help="Minimum contig N50 to filter the accessions (default: 10000).",
94
+ default=10000,
95
+ )
96
+ @click.option(
97
+ "--include-atypical/--exclude-atypical",
98
+ help="Include or exclude atypical accessions (default: exclude).",
99
+ default=False,
100
+ )
101
+ @click.option(
102
+ "--allow-inconclusive",
103
+ is_flag=True,
104
+ help="Allow the use of accessions with inconclusive taxonomy check status for training.",
105
+ default=False,
106
+ )
107
+ @click.option(
108
+ "--allow-candidatus",
109
+ is_flag=True,
110
+ help="Allow the use of Candidatus species for training.",
111
+ default=False,
112
+ )
113
+ @click.option(
114
+ "--allow-sp",
115
+ is_flag=True,
116
+ help="Allow the use of species with 'sp.' in their names for training.",
117
+ default=False,
118
+ )
119
+ def train_ncbi(
120
+ model_genus,
121
+ svm_steps,
122
+ author,
123
+ author_email,
124
+ min_n50,
125
+ include_atypical,
126
+ allow_inconclusive,
127
+ allow_candidatus,
128
+ allow_sp,
129
+ ):
91
130
  """Train a species and a genus model based on NCBI data."""
92
131
  click.echo(f"Training {model_genus} species and genus metagenome model.")
93
132
  try:
94
133
  train_from_ncbi = import_module("xspect.train").train_from_ncbi
95
134
 
96
- train_from_ncbi(model_genus, svm_steps, author, author_email)
135
+ train_from_ncbi(
136
+ model_genus,
137
+ svm_steps,
138
+ author,
139
+ author_email,
140
+ min_n50=min_n50,
141
+ exclude_atypical=not include_atypical,
142
+ allow_inconclusive=allow_inconclusive,
143
+ allow_candidatus=allow_candidatus,
144
+ allow_sp=allow_sp,
145
+ )
97
146
  except ValueError as e:
98
147
  click.echo(f"Error: {e}")
99
148
  return
@@ -287,8 +336,19 @@ def classify_genus(model_genus, input_path, output_path, sparse_sampling_step):
287
336
  help="Includes the display names next to taxonomy-IDs.",
288
337
  is_flag=True,
289
338
  )
339
+ @click.option(
340
+ "-v",
341
+ "--validation",
342
+ help="Detects misclassification for small reads or contigs.",
343
+ is_flag=True,
344
+ )
290
345
  def classify_species(
291
- model_genus, input_path, output_path, sparse_sampling_step, display_names
346
+ model_genus,
347
+ input_path,
348
+ output_path,
349
+ sparse_sampling_step,
350
+ display_names,
351
+ validation,
292
352
  ):
293
353
  """Classify samples using a species model."""
294
354
  click.echo("Classifying...")
@@ -300,6 +360,7 @@ def classify_species(
300
360
  Path(output_path),
301
361
  sparse_sampling_step,
302
362
  display_names,
363
+ validation,
303
364
  )
304
365
 
305
366
 
@@ -0,0 +1,168 @@
1
+ """
2
+ Mapping handler for the alignment-based misclassification detection.
3
+
4
+ Notes:
5
+ Developed by Oemer Cetin as part of a Bsc thesis at Goethe University Frankfurt am Main (2025).
6
+ (An Integration of Alignment-Free and Alignment-Based Approaches for Bacterial Taxon Assignment)
7
+ """
8
+
9
+ import mappy, pysam, os, csv
10
+ from Bio import SeqIO
11
+ from xspect.definitions import fasta_endings
12
+
13
+ __author__ = "Cetin, Oemer"
14
+
15
+
16
+ class MappingHandler:
17
+ """Handler class for all mapping related procedures."""
18
+
19
+ def __init__(self, ref_genome_path: str, reads_path: str) -> None:
20
+ """
21
+ Initialise the mapping handler.
22
+
23
+ This method sets up the paths to the reference genome and query sequences.
24
+ Additionally, the paths to the output formats (SAM, BAM and TSV) are generated.
25
+
26
+ Args:
27
+ ref_genome_path (str): The path to the reference genome.
28
+ reads_path (str): The path to the query sequences.
29
+ """
30
+ if not os.path.isfile(ref_genome_path):
31
+ raise ValueError("The path to the reference genome does not exist.")
32
+
33
+ if not os.path.isfile(reads_path):
34
+ raise ValueError("The path to the reads does not exist.")
35
+
36
+ if not ref_genome_path.endswith(tuple(fasta_endings)) and reads_path.endswith(
37
+ tuple(fasta_endings)
38
+ ):
39
+ raise ValueError("The files must be FASTA-files!")
40
+
41
+ stem = reads_path.rsplit(".", 1)[0] + "_mapped"
42
+ self.ref_genome_path = ref_genome_path
43
+ self.reads_path = reads_path
44
+ self.sam = stem + ".sam"
45
+ self.bam = stem + ".sorted.bam"
46
+ self.tsv = stem + ".start_coordinates.tsv"
47
+
48
+ def map_reads_onto_reference(self) -> None:
49
+ """
50
+ A method that maps reads against the respective reference genome.
51
+
52
+ This function creates a SAM file via Mappy and converts it into a BAM file.
53
+ """
54
+ # create header (entry = sequences of the reference genome)
55
+ ref_seq = [
56
+ {"SN": rec.id, "LN": len(rec.seq)}
57
+ for rec in SeqIO.parse(self.ref_genome_path, "fasta")
58
+ ]
59
+ header = {"HD": {"VN": "1.0"}, "SQ": ref_seq}
60
+ target_id = {sequence["SN"]: number for number, sequence in enumerate(ref_seq)}
61
+
62
+ reads = list(SeqIO.parse(self.reads_path, "fasta"))
63
+ if not reads:
64
+ raise ValueError("Reads file is empty.")
65
+
66
+ read_length = len(reads[0].seq)
67
+ preset = "map-ont" if read_length > 150 else "sr"
68
+ # create SAM-file
69
+ aln = mappy.Aligner(self.ref_genome_path, preset=preset)
70
+ with pysam.AlignmentFile(self.sam, "w", header=header) as out:
71
+ for read in reads:
72
+ read_seq = str(read.seq)
73
+ for hit in aln.map(read_seq):
74
+ if hit.cigar_str is None:
75
+ continue
76
+ # add soft-clips so CIGAR length == len(read_seq) IMPORTANT!!
77
+ leftS = hit.q_st
78
+ rightS = len(read_seq) - hit.q_en
79
+ cigar = (
80
+ (f"{leftS}S" if leftS > 0 else "")
81
+ + hit.cigar_str
82
+ + (f"{rightS}S" if rightS > 0 else "")
83
+ )
84
+
85
+ mapped_region = pysam.AlignedSegment()
86
+ mapped_region.query_name = read.id
87
+ mapped_region.query_sequence = read_seq
88
+ mapped_region.flag = 16 if hit.strand == -1 else 0
89
+ mapped_region.reference_id = target_id[hit.ctg]
90
+ mapped_region.reference_start = hit.r_st
91
+ mapped_region.mapping_quality = (
92
+ hit.mapq or 255
93
+ ) # 0-60 (255 means unavailable)
94
+ mapped_region.cigarstring = cigar
95
+ out.write(mapped_region)
96
+ break # keep only primary
97
+
98
+ # create BAM-file
99
+ pysam.sort("-o", self.bam, self.sam)
100
+ pysam.index(self.bam)
101
+
102
+ def get_total_genome_length(self) -> int:
103
+ """
104
+ Get the genome length from a BAM-file.
105
+
106
+ This function opens a BAM-file and extracts the genome length information.
107
+
108
+ Returns:
109
+ int: The genome length.
110
+ """
111
+ with pysam.AlignmentFile(self.bam, "rb") as bam:
112
+ return sum(bam.lengths)
113
+
114
+ def extract_starting_coordinates(self) -> None:
115
+ """
116
+ Extract starting coordinates of mapped regions from a BAM-file.
117
+
118
+ This function scans through a BAM-file and creates a TSV-file.
119
+ The information that is extracted is the starting coordinate for each mapped read.
120
+ """
121
+ # create tsv-file with all start positions
122
+ with open(self.tsv, "w") as tsv:
123
+ tsv.write("reference_genome\tread\tmapped_starting_coordinate\n")
124
+ try:
125
+ with pysam.AlignmentFile(self.bam, "rb") as bam:
126
+ entry = {
127
+ i: seq["SN"] for i, seq in enumerate(bam.header.to_dict()["SQ"])
128
+ }
129
+ seen = set()
130
+ for ref_seq in bam.references:
131
+ for hit in bam.fetch(ref_seq):
132
+ if (
133
+ hit.is_unmapped
134
+ or hit.is_secondary
135
+ or hit.is_supplementary
136
+ ):
137
+ continue
138
+ key = (hit.reference_id, hit.reference_start)
139
+ if key in seen:
140
+ continue
141
+ seen.add(key)
142
+ tsv.write(
143
+ f"{entry[hit.reference_id]}\t{hit.query_name}\t{hit.reference_start}\n"
144
+ )
145
+ except ValueError:
146
+ tsv.write("dummy_reference\tdummy_read\t1000\n")
147
+
148
+ def get_start_coordinates(self) -> list[int]:
149
+ """
150
+ Get the coordinates of a TSV-file.
151
+
152
+ This function opens a TSV-file and saves all starting coordinates in a list.
153
+
154
+ Returns:
155
+ list[int]: The list containing all starting coordinates.
156
+
157
+ Raises:
158
+ ValueError: If no column with starting coordinates is found.
159
+ """
160
+ coordinates = []
161
+ with open(self.tsv, "r", newline="") as f:
162
+ reader = csv.DictReader(f, delimiter="\t")
163
+ for row in reader:
164
+ val = row.get("mapped_starting_coordinate")
165
+ if val is None:
166
+ raise ValueError("Column with starting coordinates not found.")
167
+ coordinates.append(int(val))
168
+ return coordinates