XspecT 0.6.0__tar.gz → 0.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. {xspect-0.6.0 → xspect-0.7.1}/.github/workflows/docs.yml +1 -1
  2. {xspect-0.6.0 → xspect-0.7.1}/.github/workflows/pypi.yml +1 -1
  3. {xspect-0.6.0 → xspect-0.7.1}/.github/workflows/test.yml +2 -3
  4. {xspect-0.6.0 → xspect-0.7.1}/.gitignore +5 -1
  5. {xspect-0.6.0 → xspect-0.7.1}/PKG-INFO +7 -7
  6. {xspect-0.6.0 → xspect-0.7.1}/README.md +5 -5
  7. {xspect-0.6.0 → xspect-0.7.1}/docs/benchmark.md +4 -4
  8. {xspect-0.6.0 → xspect-0.7.1}/docs/cli.md +8 -4
  9. {xspect-0.6.0 → xspect-0.7.1}/docs/contributing.md +5 -5
  10. {xspect-0.6.0 → xspect-0.7.1}/mkdocs.yml +1 -1
  11. {xspect-0.6.0 → xspect-0.7.1}/pyproject.toml +2 -2
  12. {xspect-0.6.0 → xspect-0.7.1}/scripts/benchmark/classify/main.nf +2 -1
  13. {xspect-0.6.0 → xspect-0.7.1}/scripts/benchmark/environment.yml +2 -1
  14. {xspect-0.6.0 → xspect-0.7.1}/scripts/benchmark/main.nf +115 -104
  15. {xspect-0.6.0 → xspect-0.7.1}/scripts/benchmark/nextflow.config +0 -1
  16. {xspect-0.6.0 → xspect-0.7.1}/scripts/benchmark-data/download_data.slurm +1 -1
  17. xspect-0.7.1/scripts/nextflow-utils/environment.yml +8 -0
  18. xspect-0.7.1/scripts/nextflow-utils/main.nf +31 -0
  19. {xspect-0.6.0 → xspect-0.7.1}/src/XspecT.egg-info/PKG-INFO +7 -7
  20. {xspect-0.6.0 → xspect-0.7.1}/src/XspecT.egg-info/SOURCES.txt +8 -6
  21. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/classify.py +18 -10
  22. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/download_models.py +3 -1
  23. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/file_io.py +24 -0
  24. xspect-0.7.1/src/xspect/handlers/pubmlst.py +130 -0
  25. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/main.py +79 -44
  26. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/model_management.py +61 -3
  27. xspect-0.7.1/src/xspect/models/mlst_result.py +62 -0
  28. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/models/probabilistic_filter_mlst_model.py +96 -101
  29. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/models/probabilistic_filter_model.py +7 -7
  30. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/train.py +48 -1
  31. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/web.py +1 -3
  32. xspect-0.6.0/src/xspect/xspect-web/dist/assets/index-Dt_UlbgE.js → xspect-0.7.1/src/xspect/xspect-web/dist/assets/index-Bg0QP9Ys.js +1 -1
  33. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/dist/index.html +1 -1
  34. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/header.tsx +1 -1
  35. {xspect-0.6.0 → xspect-0.7.1}/tests/test_misclassification_detection.py +1 -1
  36. {xspect-0.6.0 → xspect-0.7.1}/tests/test_model_result.py +2 -0
  37. {xspect-0.6.0 → xspect-0.7.1}/tests/test_ncbi.py +1 -1
  38. xspect-0.6.0/tests/test_probabilisitc_filter_mlst_model.py → xspect-0.7.1/tests/test_probabilistic_filter_mlst_model.py +41 -28
  39. xspect-0.7.1/tests/test_pub_mlst_handler.py +41 -0
  40. xspect-0.6.0/src/xspect/mlst_feature/mlst_helper.py +0 -241
  41. xspect-0.6.0/src/xspect/mlst_feature/pub_mlst_handler.py +0 -184
  42. xspect-0.6.0/tests/test_pub_mlst_handler.py +0 -53
  43. {xspect-0.6.0 → xspect-0.7.1}/.github/workflows/black.yml +0 -0
  44. {xspect-0.6.0 → xspect-0.7.1}/.github/workflows/pylint.yml +0 -0
  45. {xspect-0.6.0 → xspect-0.7.1}/LICENSE +0 -0
  46. {xspect-0.6.0 → xspect-0.7.1}/docs/index.md +0 -0
  47. {xspect-0.6.0 → xspect-0.7.1}/docs/quickstart.md +0 -0
  48. {xspect-0.6.0 → xspect-0.7.1}/docs/understanding.md +0 -0
  49. {xspect-0.6.0 → xspect-0.7.1}/docs/web.md +0 -0
  50. {xspect-0.6.0 → xspect-0.7.1}/setup.cfg +0 -0
  51. {xspect-0.6.0 → xspect-0.7.1}/src/XspecT.egg-info/dependency_links.txt +0 -0
  52. {xspect-0.6.0 → xspect-0.7.1}/src/XspecT.egg-info/entry_points.txt +0 -0
  53. {xspect-0.6.0 → xspect-0.7.1}/src/XspecT.egg-info/requires.txt +0 -0
  54. {xspect-0.6.0 → xspect-0.7.1}/src/XspecT.egg-info/top_level.txt +0 -0
  55. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/__init__.py +0 -0
  56. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/definitions.py +0 -0
  57. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/filter_sequences.py +0 -0
  58. {xspect-0.6.0/src/xspect/misclassification_detection → xspect-0.7.1/src/xspect/handlers}/__init__.py +0 -0
  59. {xspect-0.6.0/src/xspect → xspect-0.7.1/src/xspect/handlers}/ncbi.py +0 -0
  60. {xspect-0.6.0/src/xspect/mlst_feature → xspect-0.7.1/src/xspect/misclassification_detection}/__init__.py +0 -0
  61. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/misclassification_detection/mapping.py +0 -0
  62. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/misclassification_detection/point_pattern_analysis.py +0 -0
  63. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/misclassification_detection/simulate_reads.py +0 -0
  64. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/models/__init__.py +0 -0
  65. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/models/probabilistic_filter_svm_model.py +0 -0
  66. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/models/probabilistic_single_filter_model.py +0 -0
  67. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/models/result.py +0 -0
  68. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/.gitignore +0 -0
  69. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/README.md +0 -0
  70. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/components.json +0 -0
  71. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/dist/assets/index-Ceo58xui.css +0 -0
  72. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/dist/vite.svg +0 -0
  73. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/eslint.config.js +0 -0
  74. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/index.html +0 -0
  75. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/package-lock.json +0 -0
  76. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/package.json +0 -0
  77. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/pnpm-lock.yaml +0 -0
  78. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/public/vite.svg +0 -0
  79. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/App.tsx +0 -0
  80. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/api.tsx +0 -0
  81. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/assets/react.svg +0 -0
  82. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/classification-form.tsx +0 -0
  83. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/classify.tsx +0 -0
  84. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/data-table.tsx +0 -0
  85. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/dropdown-checkboxes.tsx +0 -0
  86. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/dropdown-slider.tsx +0 -0
  87. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/filter-form.tsx +0 -0
  88. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/filter.tsx +0 -0
  89. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/filtering-result.tsx +0 -0
  90. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/landing.tsx +0 -0
  91. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/models-details.tsx +0 -0
  92. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/models.tsx +0 -0
  93. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/result-chart.tsx +0 -0
  94. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/result.tsx +0 -0
  95. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/spinner.tsx +0 -0
  96. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/accordion.tsx +0 -0
  97. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/button.tsx +0 -0
  98. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/card.tsx +0 -0
  99. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/chart.tsx +0 -0
  100. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/command.tsx +0 -0
  101. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/dialog.tsx +0 -0
  102. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/dropdown-menu.tsx +0 -0
  103. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/file-upload.tsx +0 -0
  104. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/form.tsx +0 -0
  105. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/input.tsx +0 -0
  106. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/label.tsx +0 -0
  107. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/navigation-menu.tsx +0 -0
  108. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/popover.tsx +0 -0
  109. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/select.tsx +0 -0
  110. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/separator.tsx +0 -0
  111. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/slider.tsx +0 -0
  112. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/switch.tsx +0 -0
  113. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/table.tsx +0 -0
  114. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/tabs.tsx +0 -0
  115. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/index.css +0 -0
  116. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/lib/utils.ts +0 -0
  117. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/main.tsx +0 -0
  118. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/types.tsx +0 -0
  119. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/utils.tsx +0 -0
  120. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/vite-env.d.ts +0 -0
  121. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/tsconfig.app.json +0 -0
  122. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/tsconfig.json +0 -0
  123. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/tsconfig.node.json +0 -0
  124. {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/vite.config.ts +0 -0
  125. {xspect-0.6.0 → xspect-0.7.1}/tests/__init__.py +0 -0
  126. {xspect-0.6.0 → xspect-0.7.1}/tests/conftest.py +0 -0
  127. {xspect-0.6.0 → xspect-0.7.1}/tests/test_cli.py +0 -0
  128. {xspect-0.6.0 → xspect-0.7.1}/tests/test_file_io.py +0 -0
  129. {xspect-0.6.0 → xspect-0.7.1}/tests/test_model_management.py +0 -0
  130. {xspect-0.6.0 → xspect-0.7.1}/tests/test_probabilistic_filter_model.py +0 -0
  131. {xspect-0.6.0 → xspect-0.7.1}/tests/test_probabilistic_filter_svm_model.py +0 -0
  132. {xspect-0.6.0 → xspect-0.7.1}/tests/test_probabilistic_single_filter_model.py +0 -0
  133. {xspect-0.6.0 → xspect-0.7.1}/tests/test_train.py +0 -0
  134. {xspect-0.6.0 → xspect-0.7.1}/tests/test_web.py +0 -0
@@ -16,7 +16,7 @@ jobs:
16
16
  git config user.email 41898282+github-actions[bot]@users.noreply.github.com
17
17
  - uses: actions/setup-python@v5
18
18
  with:
19
- python-version: 3.x
19
+ python-version: 3.13
20
20
  - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
21
21
  - uses: actions/cache@v4
22
22
  with:
@@ -25,7 +25,7 @@ jobs:
25
25
 
26
26
  - uses: actions/setup-python@v5
27
27
  with:
28
- python-version: "3.x"
28
+ python-version: "3.13"
29
29
 
30
30
  - name: Build release distributions
31
31
  run: |
@@ -19,15 +19,14 @@ jobs:
19
19
  - name: Set up Python
20
20
  uses: actions/setup-python@v4
21
21
  with:
22
- python-version: "3.x"
22
+ python-version: "3.13"
23
23
  - name: Install package
24
24
  run: |
25
25
  python -m pip install --upgrade pip
26
26
  pip install '.[test]'
27
- - name: Download models and train MLST
27
+ - name: Download models
28
28
  run: |
29
29
  xspect models download
30
- yes 1 | xspect models train mlst
31
30
  - name: Test with pytest
32
31
  env:
33
32
  NCBI_API_KEY: ${{ secrets.NCBI_API_KEY }}
@@ -187,4 +187,8 @@ data/
187
187
  results/
188
188
 
189
189
  # Slurm
190
- slurm-*
190
+ slurm-*
191
+ delete.slurm
192
+
193
+
194
+ playground.ipynb
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: XspecT
3
- Version: 0.6.0
3
+ Version: 0.7.1
4
4
  Summary: Tool to monitor and characterize pathogens using Bloom filters.
5
5
  License: MIT License
6
6
 
@@ -29,7 +29,7 @@ Project-URL: Repository, https://github.com/BIONF/XspecT2.git
29
29
  Classifier: Intended Audience :: Developers
30
30
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
31
31
  Classifier: License :: OSI Approved :: MIT License
32
- Requires-Python: >=3.10
32
+ Requires-Python: <3.14,>=3.10
33
33
  Description-Content-Type: text/markdown
34
34
  License-File: LICENSE
35
35
  Requires-Dist: biopython
@@ -64,19 +64,19 @@ Dynamic: license-file
64
64
  [![linting: pylint](https://img.shields.io/badge/linting-pylint-yellowgreen)](https://github.com/pylint-dev/pylint)
65
65
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
66
66
 
67
- XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or MLST level using [kmer indices] and a [Support Vector Machine].
67
+ XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or MLST level using [kmer indices] and a [support vector machine].
68
68
 
69
- XspecT utilizes the uniqueness of kmers and compares extracted kmers from the input-data to a kmer index. Probablistic data structures ensure a fast lookup in this process. For a final prediction, the results are classified using a Support Vector Machine.
69
+ XspecT utilizes the uniqueness of kmers and compares extracted kmers from the input-data to a kmer index. Probabilistic data structures ensure a fast lookup in this process. For a final prediction, the results are classified using a support vector machine.
70
70
 
71
71
  The tool is available as a web-based application and as a command line interface.
72
72
 
73
73
  [kmer indices]: https://arxiv.org/abs/1905.09624
74
- [Support Vector Machine]: https://en.wikipedia.org/wiki/Support-vector_machine
74
+ [support vector machine]: https://en.wikipedia.org/wiki/Support-vector_machine
75
75
  <!-- end intro -->
76
76
 
77
77
  <!-- start quickstart -->
78
78
  ## Installation
79
- To install XspecT, please download the lastest 64 bit Python version and install the package using pip:
79
+ To install XspecT, please download Python 3.10 - 3.13 and install the package using pip:
80
80
  ```
81
81
  pip install xspect
82
82
  ```
@@ -114,5 +114,5 @@ For further instructions on how to use the command line interface, please refer
114
114
  ```
115
115
  xspect --help
116
116
  ```
117
- [documentation]: https://bionf.github.io/XspecT2/cli/index.html
117
+ [documentation]: https://bionf.github.io/XspecT/cli/index.html
118
118
  <!-- end quickstart -->
@@ -4,19 +4,19 @@
4
4
  [![linting: pylint](https://img.shields.io/badge/linting-pylint-yellowgreen)](https://github.com/pylint-dev/pylint)
5
5
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
6
6
 
7
- XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or MLST level using [kmer indices] and a [Support Vector Machine].
7
+ XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or MLST level using [kmer indices] and a [support vector machine].
8
8
 
9
- XspecT utilizes the uniqueness of kmers and compares extracted kmers from the input-data to a kmer index. Probablistic data structures ensure a fast lookup in this process. For a final prediction, the results are classified using a Support Vector Machine.
9
+ XspecT utilizes the uniqueness of kmers and compares extracted kmers from the input-data to a kmer index. Probabilistic data structures ensure a fast lookup in this process. For a final prediction, the results are classified using a support vector machine.
10
10
 
11
11
  The tool is available as a web-based application and as a command line interface.
12
12
 
13
13
  [kmer indices]: https://arxiv.org/abs/1905.09624
14
- [Support Vector Machine]: https://en.wikipedia.org/wiki/Support-vector_machine
14
+ [support vector machine]: https://en.wikipedia.org/wiki/Support-vector_machine
15
15
  <!-- end intro -->
16
16
 
17
17
  <!-- start quickstart -->
18
18
  ## Installation
19
- To install XspecT, please download the lastest 64 bit Python version and install the package using pip:
19
+ To install XspecT, please download Python 3.10 - 3.13 and install the package using pip:
20
20
  ```
21
21
  pip install xspect
22
22
  ```
@@ -54,5 +54,5 @@ For further instructions on how to use the command line interface, please refer
54
54
  ```
55
55
  xspect --help
56
56
  ```
57
- [documentation]: https://bionf.github.io/XspecT2/cli/index.html
57
+ [documentation]: https://bionf.github.io/XspecT/cli/index.html
58
58
  <!-- end quickstart -->
@@ -2,16 +2,16 @@
2
2
 
3
3
  XspecT is a tool designed for fast and accurate species classification of genome assemblies and simulated reads. To evaluate its classification accuracy, we conducted a benchmark using a set of Acinetobacter genomes.
4
4
 
5
- The benchmark was performed by first download all available Acinetobacter genomes from Genbank, filtered on a passed ("OK") taxonomy check status. Genomes assigned to strain IDs were remapped to their respective species IDs, after which genomes with species IDs not contained in XspecT's Acinetobacter model were removed. The remaining genomes were then used to classify both assemblies and simulated reads generated from them. Simulated reads were generated by first filtering on genomes that were not part of the training data and that were categorized as "complete" by NCBI. The reads were then simulated from the longest contig of each genome (assumed to be the chromosome) using a custom Python script. Up to three genomes were selected per species. 100 000 reads were simulated for each genome, with a read length of 100 bp and no simulated sequencing errors. The reads were then classified using XspecT with predictions based on the maximum-scoring species.
5
+ The benchmark was performed by first downloading all available Acinetobacter genomes from RefSeq, filtered on a passed ("OK") taxonomy check status and on them not being part of the training dataset. Genomes assigned to strain IDs were remapped to their respective species IDs, after which genomes with species IDs not contained in XspecT's Acinetobacter model were removed. The remaining genomes were then used to classify both assemblies and simulated reads generated from them. Simulated reads were generated by first filtering on genomes that were categorized as "complete" or "chromosome" by NCBI. The reads were then simulated from the longest contig of each genome (assumed to be the chromosome) using ART. 100 000 reads were simulated for each genome based on the HiSeq 2500 profile, with a read length of 125 bp. The reads were then classified using XspecT with predictions based on the maximum-scoring species.
6
6
 
7
7
  ## Benchmark Results
8
8
 
9
- The benchmark results show that XspecT achieves high classification accuracy, with an overall accuracy of nearly 100% for whole genomes and 82% for simulated reads. However, the low macro-average F1 score (0.41) for the read dataset highlights a substantial class imbalance.
9
+ The benchmark results show that XspecT achieves very high classification accuracy of nearly 100% for whole genomes and strong but reduced accuracy of 70% for simulated reads. However, the low macro-average F1 score (0.21) for the read dataset highlights a substantial class imbalance.
10
10
 
11
11
  | Dataset | Total Samples | Matches | Mismatches | Match Rate | Mismatch Rate | Accuracy | Macro Avg F1 | Weighted Avg F1 |
12
12
  |-----------|--------------:|----------:|-----------:|-----------:|--------------:|---------:|-------------:|----------------:|
13
- | Assembly | 44,905 | 44,879 | 26 | 99.94% | 0.06% | ≈1.00 | 0.95 | 1.00 |
14
- | Reads | 9,200,000 | 7,526,902 | 1,673,098 | 81.81% | 18.19% | 0.82 | 0.41 | 0.87 |
13
+ | Assembly | 13,795 | 13,776 | 19 | 99.86% | 0.14% | ≈1.00 | 0.96 | 1.00 |
14
+ | Reads | 121,590,139 | 85,679,572| 35,910,567 | 70.47% | 29.53% | 0.70 | 0.21 | 0.79 |
15
15
 
16
16
  ## Running the benchmark yourself
17
17
 
@@ -27,7 +27,7 @@ This will show a list of all available models, separated by their type (species,
27
27
 
28
28
  ### Downloading Models
29
29
 
30
- To download a basic set of pre-trained models (Acinetobacter and Salonella), run:
30
+ To download a basic set of pre-trained models (Acinetobacter, including Oxford MLST scheme, and Salmonella), run:
31
31
 
32
32
  ```bash
33
33
  xspect models download
@@ -43,7 +43,7 @@ To train a model with NCBI data, run the following command:
43
43
  xspect models train ncbi
44
44
  ```
45
45
 
46
- By default, XspecT filters out NCBI accessions that do not meed minimum N50 thresholds, have an inconclusive taxonomy check status, or are deemed atypical by NCBI. Furthermore, species with "Candidatus" and "sp." in their species names are filtered out. To disable filtering behavior, use the respective flag (see `xspect models train ncbi --help`).
46
+ By default, XspecT filters out NCBI accessions that do not meet minimum N50 thresholds, have an inconclusive taxonomy check status, or are deemed atypical by NCBI. Furthermore, species with "Candidatus" and "sp." in their species names are filtered out. To disable filtering behavior, use the respective flag (see `xspect models train ncbi --help`).
47
47
 
48
48
  If you would like to train models with manually curated data from a directory, you can use:
49
49
 
@@ -82,6 +82,8 @@ To train models for MLST classifications, run:
82
82
  xspect models train mlst
83
83
  ```
84
84
 
85
+ XspecT will prompt you for the organism name and the MLST scheme you would like to train a model for.
86
+
85
87
  ## Classification
86
88
 
87
89
  To classify samples, the command `xspect classify` can be used. This command will classify the sample based on the models available in your XspecT installation.
@@ -111,7 +113,7 @@ XspecT uses a kmer-based approach to classify samples. This means that the entir
111
113
 
112
114
  **Example**:
113
115
  ```bash
114
- xspect classify species --sparse-sampling-step 10 Acinetobacter path
116
+ xspect classify species --sparse-sampling-step 10
115
117
  ```
116
118
 
117
119
  This will only consider every 10th kmer in the sample.
@@ -120,7 +122,7 @@ This will only consider every 10th kmer in the sample.
120
122
  By default, the classification results show only the taxonomy ID of each species along with its corresponding score for better readability. To display the full names associated with each taxonomy ID, you can use the `--display-names` (or `-n`) option:
121
123
 
122
124
  ```bash
123
- xspect classify species --display-names Acinetobacter path
125
+ xspect classify species --display-names
124
126
  ```
125
127
  The output will then be formatted as: `Taxonomy_ID - Display_Name: Score` for each species.
126
128
 
@@ -132,6 +134,8 @@ Samples can also be classified based on Multi-locus sequence type schemas. To ML
132
134
  xspect classify mlst
133
135
  ```
134
136
 
137
+ XspecT will prompt you for the organism, MLST scheme, and path to your sample directory.
138
+
135
139
  ## Filtering
136
140
  XspecT can also be used to filter samples based on their classification results. This is useful when analyzing metagenomic samples, for example when looking at genomic bycatch.
137
141
 
@@ -5,8 +5,8 @@ Thank you for your interest in contributing to XspecT! This page provides guidel
5
5
 
6
6
  When contributing to XspecT, please follow the following steps to ensure a smooth process:
7
7
 
8
- - **Read the documentation**: Familiarize yourself with the project by reading the [documentation](https://bionf.github.io/XspecT2/), including the [Understanding XspecT](understanding.md) page and the [architecture overview](#architecture-overview).
9
- - **Follow the coding standards**: Adhere to the project's coding standards and best practices. This includes using consistent naming conventions, writing clear and concise code, and documentation. Furthermore, please make sure your changes are algined with the project's [architecture](#architecture-overview).
8
+ - **Read the documentation**: Familiarize yourself with the project by reading the [documentation](https://bionf.github.io/XspecT/), including the [Understanding XspecT](understanding.md) page and the [architecture overview](#architecture-overview).
9
+ - **Follow the coding standards**: Adhere to the project's coding standards and best practices. This includes ensuring that your code is formatted using [Black](https://black.readthedocs.io/en/stable/) and linted with [Pylint](https://pylint.pycqa.org/en/latest/) for Python code, as well as using consistent naming conventions, writing clear and concise code, and documentation. Please use [pure functions](https://goodresearch.dev/decoupled#learn-to-identify-and-use-pure-functions) where possible and make sure your changes are aligned with the project's [architecture](#architecture-overview).
10
10
  - **Write tests**: Ensure that your changes are covered by tests. We use [pytest](https://docs.pytest.org/en/stable/) for testing. If you add new features or fix bugs, please include tests to verify your changes.
11
11
  - **Document your changes**: Update the documentation to reflect any new features or changes you make. This includes updating the README, Google-style docstrings, and the [Mkdocs](https://www.mkdocs.org)-based documentation.
12
12
  - **Use clear commit messages**: When committing your changes, use clear and descriptive commit messages that explain the purpose of the changes.
@@ -17,12 +17,12 @@ To set up XspecT for development, first make sure you have [Python](https://www.
17
17
 
18
18
  Get started by cloning the repository:
19
19
  ```bash
20
- git clone https://github.com/BIONF/XspecT2.git
20
+ git clone https://github.com/BIONF/XspecT.git
21
21
  ```
22
22
 
23
23
  You then need to build the web application using Vite. Navigate to the `xspect-web` directory, install dependencies, and run the build command, which will also watch for changes:
24
24
  ```bash
25
- cd XspecT2/src/xspect/xspect-web
25
+ cd XspecT/src/xspect/xspect-web
26
26
  ```
27
27
  ```bash
28
28
  npm i
@@ -86,7 +86,7 @@ We use GitHub Actions to run checks on commits and pull requests. These checks i
86
86
 
87
87
  Additionally, Github Actions are also used for deployment:
88
88
 
89
- - **Documentation**: The Mkdocs-based documentation is built and deployed to GitHub Pages on changes to the `main` branch. You can view the documentation at [https://bionf.github.io/XspecT2/](https://bionf.github.io/XspecT2/).
89
+ - **Documentation**: The Mkdocs-based documentation is built and deployed to GitHub Pages on changes to the `main` branch. You can view the documentation at [https://bionf.github.io/XspecT/](https://bionf.github.io/XspecT/).
90
90
  - **Python package**: The Python package is built and uploaded to PyPI when a new release is created. This allows users to easily install the latest version of XspecT using `pip install xspect`. Pre-releases are uploaded to TestPyPI and can be installed using `pip install --index-url https://test.pypi.org/simple/ xspect`.
91
91
 
92
92
  ## Pull Request Process
@@ -7,7 +7,7 @@ theme:
7
7
  plugins:
8
8
  - include-markdown
9
9
  - search
10
- repo_url: https://github.com/BIONF/XspecT2
10
+ repo_url: https://github.com/BIONF/XspecT
11
11
  markdown_extensions:
12
12
  - attr_list
13
13
  nav:
@@ -1,10 +1,10 @@
1
1
  [project]
2
2
  name = "XspecT"
3
- version = "0.6.0"
3
+ version = "0.7.1"
4
4
  description = "Tool to monitor and characterize pathogens using Bloom filters."
5
5
  readme = {file = "README.md", content-type = "text/markdown"}
6
6
  license = {file = "LICENSE"}
7
- requires-python = ">=3.10"
7
+ requires-python = ">=3.10,<3.14"
8
8
  dependencies = [
9
9
  "biopython",
10
10
  "requests",
@@ -5,13 +5,14 @@ process classifySample {
5
5
 
6
6
  input:
7
7
  path sample
8
+ val model
8
9
 
9
10
  output:
10
11
  path "${sample.baseName}.json"
11
12
 
12
13
  script:
13
14
  """
14
- xspect classify species -g Acinetobacter -i ${sample} -o ${sample.baseName}.json
15
+ xspect classify species -g ${model} -i ${sample} -o ${sample.baseName}.json
15
16
  """
16
17
 
17
18
  stub:
@@ -2,6 +2,7 @@ name: xspect-benchmark
2
2
  channels:
3
3
  - conda-forge
4
4
  dependencies:
5
+ - python=3.12
5
6
  - pip
6
7
  - pip:
7
- - XspecT
8
+ - XspecT==0.5.4
@@ -2,9 +2,56 @@
2
2
 
3
3
  include { classifySample as classifyAssembly } from './classify'
4
4
  include { classifySample as classifyRead } from './classify'
5
+ include { strain_species_mapping } from '../nextflow-utils'
5
6
 
6
- process downloadModels {
7
- conda "./scripts/benchmark/environment.yml"
7
+ // --------------------- PARAMETERS ---------------------
8
+ params.publishDir = "results/benchmark"
9
+ params.xspectModel = "Acinetobacter"
10
+
11
+ // --------------------- WORKFLOW -----------------------
12
+ workflow {
13
+ species_model = getModelJSON()
14
+ name_mapping = getNameMapping(species_model)
15
+ genomes = file("data/genomes")
16
+ tax_report = file("data/aci_species.json")
17
+ tax_mapping_json = strain_species_mapping(tax_report)
18
+ assemblies = createAssemblyTable(genomes, tax_mapping_json, species_model)
19
+
20
+ // Whole genome assemblies
21
+ samples = Channel.fromPath("${genomes}/**/*.fna")
22
+ .flatten()
23
+ filtered_samples = assemblies
24
+ .splitCsv(header: true, sep: '\t')
25
+ .map { row -> row['Assembly Accession'] }
26
+ .cross(samples.map { sample ->
27
+ [sample.baseName.split('_')[0..1].join('_'), sample]
28
+ })
29
+ .map { it[1][1] }
30
+ classifications = classifyAssembly(filtered_samples, params.xspectModel)
31
+ summarizeClassifications(assemblies, classifications.collect())
32
+ confusionMatrix(summarizeClassifications.out, name_mapping)
33
+ mismatchConfusionMatrix(summarizeClassifications.out, name_mapping)
34
+
35
+ // Simulated reads
36
+ selectForReadGen(assemblies, species_model)
37
+ read_assemblies = selectForReadGen.out
38
+ .splitCsv(header: true, sep: '\t')
39
+ .map { row -> row['Assembly Accession'] }
40
+ .cross(samples.map { sample ->
41
+ [sample.baseName.split('_')[0..1].join('_'), sample]
42
+ })
43
+ .map { it[1][1] }
44
+ filterForChromosome(read_assemblies)
45
+ generateReads(filterForChromosome.out)
46
+ read_classifications = classifyRead(generateReads.out, params.xspectModel)
47
+ summarizeReadClassifications(selectForReadGen.out, read_classifications.collect())
48
+
49
+ calculateStats(summarizeClassifications.out, summarizeReadClassifications.out)
50
+ }
51
+
52
+ // --------------------- PROCESSES ---------------------
53
+
54
+ process getModelJSON {
8
55
  cpus 2
9
56
  memory '16 GB'
10
57
 
@@ -13,10 +60,8 @@ process downloadModels {
13
60
 
14
61
  script:
15
62
  """
16
- if [ ! "$HOME/xspect-data/models/acinetobacter-species.json" ]; then
17
- xspect models download
18
- fi
19
- cp "$HOME/xspect-data/models/acinetobacter-species.json" species_model.json
63
+ model_name="${params.xspectModel.toLowerCase().replaceAll('_','-')}-species.json"
64
+ cp "$HOME/xspect-data/models/\$model_name" species_model.json
20
65
  """
21
66
  }
22
67
 
@@ -50,7 +95,7 @@ process createAssemblyTable {
50
95
 
51
96
  input:
52
97
  path genomes
53
- path tax_report
98
+ path tax_mapping_json
54
99
  path species_model
55
100
 
56
101
  output:
@@ -70,28 +115,11 @@ process createAssemblyTable {
70
115
  awk -F'\t' 'NR==1 || \$5 == "OK"' assemblies.tsv > assemblies_filtered.tsv
71
116
  mv assemblies_filtered.tsv assemblies.tsv
72
117
 
73
- # map taxonmic IDs to species IDs (taxonomic IDs might be strain IDs)
74
- jq '
75
- .reports
76
- | map(select(.taxonomy.children != null))
77
- | map({
78
- species_id: .taxonomy.tax_id,
79
- children: .taxonomy.children
80
- })
81
- | map(
82
- . as \$entry
83
- | \$entry.children
84
- | map({ (tostring): \$entry.species_id })
85
- | add
86
- )
87
- | add
88
- ' ${tax_report} > tax_mapping.json
89
-
90
118
  # add species IDs to assemblies.tsv
91
119
  declare -A species_map
92
120
  while IFS="=" read -r key val; do
93
121
  species_map["\$key"]="\$val"
94
- done < <(jq -r 'to_entries[] | "\\(.key)=\\(.value)"' tax_mapping.json)
122
+ done < <(jq -r 'to_entries[] | "\\(.key)=\\(.value)"' ${tax_mapping_json})
95
123
 
96
124
  {
97
125
  IFS='\t' read -r -a header < assemblies.tsv
@@ -118,6 +146,21 @@ process createAssemblyTable {
118
146
  ' assemblies.tsv > temp_assemblies.tsv
119
147
  mv temp_assemblies.tsv assemblies.tsv
120
148
  rm valid_species.txt
149
+
150
+ # filter out assemblies that are part of the training set
151
+ jq -r '.training_accessions | to_entries[] | .value[]' ${species_model} > training_accessions.txt
152
+ awk -F'\t' '
153
+ BEGIN {
154
+ while ((getline acc < "training_accessions.txt") > 0) {
155
+ training[acc] = 1;
156
+ }
157
+ close("training_accessions.txt");
158
+ }
159
+ NR==1 { print; next }
160
+ !(\$1 in training) { print }
161
+ ' assemblies.tsv > temp_assemblies.tsv
162
+ mv temp_assemblies.tsv assemblies.tsv
163
+ rm training_accessions.txt
121
164
  """
122
165
 
123
166
  stub:
@@ -130,7 +173,7 @@ process summarizeClassifications {
130
173
  conda "conda-forge::pandas"
131
174
  cpus 4
132
175
  memory '16 GB'
133
- publishDir "results"
176
+ publishDir { params.publishDir }, mode: 'copy'
134
177
 
135
178
  input:
136
179
  path assemblies
@@ -208,15 +251,32 @@ process selectForReadGen {
208
251
  ]
209
252
  assemblies = assemblies[~assemblies['Assembly Accession'].isin(training_accessions)]
210
253
 
211
- # use up to three assemblies for each species
212
- assemblies = assemblies.groupby('Species ID').head(3)
213
-
214
254
  assemblies.to_csv('selected_samples.tsv', sep='\\t', index=False)
215
255
  """
216
256
  }
217
257
 
258
+ process filterForChromosome {
259
+ conda "bioconda::seqkit"
260
+ cpus 2
261
+ memory '16 GB'
262
+
263
+ input:
264
+ path sample
265
+
266
+ output:
267
+ path "${sample.baseName}_chromosome.fna"
268
+
269
+ script:
270
+ """
271
+ set -euo pipefail
272
+
273
+ seqkit sort -l -r ${sample} > sorted.tmp
274
+ seqkit head -n 1 sorted.tmp | seqkit seq -t dna -o "${sample.baseName}_chromosome.fna"
275
+ """
276
+ }
277
+
218
278
  process generateReads {
219
- conda "conda-forge::pandas conda-forge::biopython"
279
+ conda "bioconda::art"
220
280
  cpus 2
221
281
  memory '16 GB'
222
282
 
@@ -228,37 +288,24 @@ process generateReads {
228
288
 
229
289
  script:
230
290
  """
231
- #!/usr/bin/env python
232
- import random
233
- from Bio import SeqIO
234
-
235
- read_length = 100
236
- num_reads = 100000
237
- seed = 42
238
-
239
- random.seed(seed)
240
- sequences = list(SeqIO.parse("${sample}", "fasta"))
241
- chromosome_sequence = max(sequences, key=len) # we assume the longest sequence is the chromosome
242
-
243
- ch_rec_id = chromosome_sequence.id
244
- ch_seq = chromosome_sequence.seq
245
- ch_seqlen = len(chromosome_sequence.seq)
246
- with open("${sample.baseName}_simulated.fq", "w") as f:
247
- for i in range(num_reads):
248
- start = random.randint(0, ch_seqlen - read_length)
249
- read_seq = ch_seq[start:start + read_length]
250
- f.write(f"@read_{i}_{ch_rec_id}_{start}-{start+read_length}\\n")
251
- f.write(f"{read_seq}\\n")
252
- f.write("+\\n")
253
- f.write(f"{len(read_seq)*'~'}\\n")
291
+ set -euo pipefail
292
+
293
+ art_illumina \
294
+ -ss HS25 \
295
+ -i "${sample}" \
296
+ -l 125 \
297
+ -c 100000 \
298
+ -na \
299
+ -rs 42 \
300
+ -o "${sample.baseName}_simulated"
254
301
  """
255
302
  }
256
303
 
257
304
  process summarizeReadClassifications {
258
305
  conda "conda-forge::pandas"
259
306
  cpus 4
260
- memory '16 GB'
261
- publishDir "results"
307
+ memory '64 GB'
308
+ publishDir { params.publishDir }, mode: 'copy'
262
309
 
263
310
  input:
264
311
  path read_assemblies
@@ -278,10 +325,9 @@ process summarizeReadClassifications {
278
325
 
279
326
  # Create a mapping of accession to species ID
280
327
  accession_to_species = dict(zip(df_assemblies['Assembly Accession'], df_assemblies['Species ID']))
281
-
282
- results = []
283
328
 
284
329
  classifications = '${read_classifications}'.split()
330
+ include_header = True
285
331
  for json_file in classifications:
286
332
  basename = os.path.basename(json_file).replace('.json', '')
287
333
  accession = '_'.join(basename.split('_')[:2])
@@ -291,6 +337,7 @@ process summarizeReadClassifications {
291
337
  with open(json_file, 'r') as f:
292
338
  data = json.load(f)
293
339
  scores = data.get('scores', {})
340
+ results = []
294
341
 
295
342
  for read_name, read_scores in scores.items():
296
343
  if read_name != 'total':
@@ -310,17 +357,20 @@ process summarizeReadClassifications {
310
357
  result[species] = score
311
358
 
312
359
  results.append(result)
360
+
361
+
313
362
 
314
- df_results = pd.DataFrame(results)
315
- df_results.to_csv('read_classifications.tsv', sep='\\t', index=False)
363
+ df_results = pd.DataFrame(results)
364
+ df_results.to_csv('read_classifications.tsv', sep='\\t', index=False, mode='a', header=include_header)
365
+ include_header = False
316
366
  """
317
367
  }
318
368
 
319
369
  process calculateStats {
320
370
  conda "conda-forge::pandas conda-forge::scikit-learn"
321
- cpus 2
322
- memory '16 GB'
323
- publishDir "results"
371
+ cpus 8
372
+ memory '256 GB'
373
+ publishDir { params.publishDir }, mode: 'copy'
324
374
 
325
375
  input:
326
376
  path assembly_classifications
@@ -399,7 +449,7 @@ process confusionMatrix {
399
449
  conda "conda-forge::pandas conda-forge::scikit-learn conda-forge::numpy conda-forge::matplotlib"
400
450
  cpus 2
401
451
  memory '16 GB'
402
- publishDir "results"
452
+ publishDir { params.publishDir }, mode: 'copy'
403
453
 
404
454
  input:
405
455
  path classifications
@@ -449,7 +499,7 @@ process mismatchConfusionMatrix {
449
499
  conda "conda-forge::pandas conda-forge::scikit-learn conda-forge::numpy conda-forge::matplotlib"
450
500
  cpus 2
451
501
  memory '16 GB'
452
- publishDir "results"
502
+ publishDir { params.publishDir }, mode: 'copy'
453
503
 
454
504
  input:
455
505
  path classifications
@@ -506,43 +556,4 @@ process mismatchConfusionMatrix {
506
556
 
507
557
  plt.savefig('mismatches_confusion_matrix.png', dpi=300, bbox_inches='tight')
508
558
  """
509
- }
510
-
511
-
512
- workflow {
513
- species_model = downloadModels()
514
- name_mapping = getNameMapping(species_model)
515
- genomes = file("data/genomes")
516
- tax_report = file("data/aci_species.json")
517
- assemblies = createAssemblyTable(genomes, tax_report, species_model)
518
-
519
- // Whole genome assemblies
520
- samples = Channel.fromPath("${genomes}/**/*.fna")
521
- .flatten()
522
- filtered_samples = assemblies
523
- .splitCsv(header: true, sep: '\t')
524
- .map { row -> row['Assembly Accession'] }
525
- .cross(samples.map { sample ->
526
- [sample.baseName.split('_')[0..1].join('_'), sample]
527
- })
528
- .map { it[1][1] }
529
- classifications = classifyAssembly(filtered_samples)
530
- summarizeClassifications(assemblies, classifications.collect())
531
- confusionMatrix(summarizeClassifications.out, name_mapping)
532
- mismatchConfusionMatrix(summarizeClassifications.out, name_mapping)
533
-
534
- // Simulated reads
535
- selectForReadGen(assemblies, species_model)
536
- read_assemblies = selectForReadGen.out
537
- .splitCsv(header: true, sep: '\t')
538
- .map { row -> row['Assembly Accession'] }
539
- .cross(samples.map { sample ->
540
- [sample.baseName.split('_')[0..1].join('_'), sample]
541
- })
542
- .map { it[1][1] }
543
- generateReads(read_assemblies)
544
- read_classifications = classifyRead(generateReads.out)
545
- summarizeReadClassifications(selectForReadGen.out, read_classifications.collect())
546
-
547
- calculateStats(summarizeClassifications.out, summarizeReadClassifications.out)
548
- }
559
+ }
@@ -1,7 +1,6 @@
1
1
  process.executor = 'slurm'
2
2
  executor.account = 'intern'
3
3
  process.queue = 'all'
4
- executor.perCpuMemAllocation = true
5
4
 
6
5
 
7
6
  conda.enabled = true
@@ -5,7 +5,7 @@
5
5
  #SBATCH --mem-per-cpu=8gb
6
6
  #SBATCH --job-name="download_acinetobacter"
7
7
 
8
- datasets download genome taxon 469 --filename acinetobacter_dataset.zip --assembly-source GenBank --assembly-version latest --exclude-atypical --dehydrated
8
+ datasets download genome taxon 469 --filename acinetobacter_dataset.zip --assembly-source RefSeq --assembly-version latest --exclude-atypical --dehydrated
9
9
  unzip -o acinetobacter_dataset.zip -d genomes
10
10
  datasets rehydrate --directory genomes
11
11
  rm acinetobacter_dataset.zip
@@ -0,0 +1,8 @@
1
+ name: xspect
2
+ channels:
3
+ - conda-forge
4
+ dependencies:
5
+ - python=3.12
6
+ - pip
7
+ - pip:
8
+ - XspecT==0.5.4