XspecT 0.5.2.tar.gz → 0.5.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. {xspect-0.5.2 → xspect-0.5.4}/.gitignore +11 -1
  2. {xspect-0.5.2/src/XspecT.egg-info → xspect-0.5.4}/PKG-INFO +1 -1
  3. xspect-0.5.4/docs/benchmark.md +34 -0
  4. {xspect-0.5.2 → xspect-0.5.4}/docs/cli.md +11 -3
  5. {xspect-0.5.2 → xspect-0.5.4}/docs/contributing.md +4 -1
  6. {xspect-0.5.2 → xspect-0.5.4}/mkdocs.yml +3 -1
  7. {xspect-0.5.2 → xspect-0.5.4}/pyproject.toml +1 -1
  8. xspect-0.5.4/scripts/benchmark/classify/main.nf +22 -0
  9. xspect-0.5.4/scripts/benchmark/environment.yml +7 -0
  10. xspect-0.5.4/scripts/benchmark/main.nf +473 -0
  11. xspect-0.5.4/scripts/benchmark/nextflow.config +7 -0
  12. xspect-0.5.4/scripts/benchmark-data/download_data.slurm +13 -0
  13. {xspect-0.5.2 → xspect-0.5.4/src/XspecT.egg-info}/PKG-INFO +1 -1
  14. {xspect-0.5.2 → xspect-0.5.4}/src/XspecT.egg-info/SOURCES.txt +6 -0
  15. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/classify.py +31 -8
  16. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/definitions.py +11 -10
  17. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/file_io.py +2 -1
  18. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/filter_sequences.py +20 -4
  19. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/main.py +66 -27
  20. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/mlst_feature/mlst_helper.py +15 -19
  21. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/mlst_feature/pub_mlst_handler.py +16 -19
  22. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/model_management.py +14 -17
  23. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/models/probabilistic_filter_mlst_model.py +11 -10
  24. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/models/probabilistic_filter_model.py +21 -5
  25. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/models/probabilistic_filter_svm_model.py +30 -15
  26. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/models/probabilistic_single_filter_model.py +9 -7
  27. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/models/result.py +20 -15
  28. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/ncbi.py +3 -2
  29. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/web.py +16 -5
  30. {xspect-0.5.2 → xspect-0.5.4}/tests/test_cli.py +41 -0
  31. {xspect-0.5.2 → xspect-0.5.4}/tests/test_model_management.py +11 -17
  32. {xspect-0.5.2 → xspect-0.5.4}/tests/test_web.py +12 -0
  33. {xspect-0.5.2 → xspect-0.5.4}/.github/workflows/black.yml +0 -0
  34. {xspect-0.5.2 → xspect-0.5.4}/.github/workflows/docs.yml +0 -0
  35. {xspect-0.5.2 → xspect-0.5.4}/.github/workflows/pylint.yml +0 -0
  36. {xspect-0.5.2 → xspect-0.5.4}/.github/workflows/pypi.yml +0 -0
  37. {xspect-0.5.2 → xspect-0.5.4}/.github/workflows/test.yml +0 -0
  38. {xspect-0.5.2 → xspect-0.5.4}/LICENSE +0 -0
  39. {xspect-0.5.2 → xspect-0.5.4}/README.md +0 -0
  40. {xspect-0.5.2 → xspect-0.5.4}/docs/index.md +0 -0
  41. {xspect-0.5.2 → xspect-0.5.4}/docs/quickstart.md +0 -0
  42. {xspect-0.5.2 → xspect-0.5.4}/docs/understanding.md +0 -0
  43. {xspect-0.5.2 → xspect-0.5.4}/docs/web.md +0 -0
  44. {xspect-0.5.2 → xspect-0.5.4}/setup.cfg +0 -0
  45. {xspect-0.5.2 → xspect-0.5.4}/src/XspecT.egg-info/dependency_links.txt +0 -0
  46. {xspect-0.5.2 → xspect-0.5.4}/src/XspecT.egg-info/entry_points.txt +0 -0
  47. {xspect-0.5.2 → xspect-0.5.4}/src/XspecT.egg-info/requires.txt +0 -0
  48. {xspect-0.5.2 → xspect-0.5.4}/src/XspecT.egg-info/top_level.txt +0 -0
  49. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/__init__.py +0 -0
  50. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/download_models.py +0 -0
  51. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/mlst_feature/__init__.py +0 -0
  52. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/models/__init__.py +0 -0
  53. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/train.py +0 -0
  54. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/.gitignore +0 -0
  55. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/README.md +0 -0
  56. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/components.json +0 -0
  57. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/dist/assets/index-Ceo58xui.css +0 -0
  58. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/dist/assets/index-Dt_UlbgE.js +0 -0
  59. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/dist/index.html +0 -0
  60. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/dist/vite.svg +0 -0
  61. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/eslint.config.js +0 -0
  62. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/index.html +0 -0
  63. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/package-lock.json +0 -0
  64. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/package.json +0 -0
  65. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/pnpm-lock.yaml +0 -0
  66. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/public/vite.svg +0 -0
  67. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/App.tsx +0 -0
  68. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/api.tsx +0 -0
  69. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/assets/react.svg +0 -0
  70. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/classification-form.tsx +0 -0
  71. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/classify.tsx +0 -0
  72. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/data-table.tsx +0 -0
  73. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/dropdown-checkboxes.tsx +0 -0
  74. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/dropdown-slider.tsx +0 -0
  75. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/filter-form.tsx +0 -0
  76. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/filter.tsx +0 -0
  77. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/filtering-result.tsx +0 -0
  78. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/header.tsx +0 -0
  79. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/landing.tsx +0 -0
  80. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/models-details.tsx +0 -0
  81. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/models.tsx +0 -0
  82. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/result-chart.tsx +0 -0
  83. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/result.tsx +0 -0
  84. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/spinner.tsx +0 -0
  85. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/accordion.tsx +0 -0
  86. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/button.tsx +0 -0
  87. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/card.tsx +0 -0
  88. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/chart.tsx +0 -0
  89. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/command.tsx +0 -0
  90. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/dialog.tsx +0 -0
  91. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/dropdown-menu.tsx +0 -0
  92. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/file-upload.tsx +0 -0
  93. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/form.tsx +0 -0
  94. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/input.tsx +0 -0
  95. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/label.tsx +0 -0
  96. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/navigation-menu.tsx +0 -0
  97. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/popover.tsx +0 -0
  98. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/select.tsx +0 -0
  99. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/separator.tsx +0 -0
  100. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/slider.tsx +0 -0
  101. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/switch.tsx +0 -0
  102. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/table.tsx +0 -0
  103. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/components/ui/tabs.tsx +0 -0
  104. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/index.css +0 -0
  105. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/lib/utils.ts +0 -0
  106. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/main.tsx +0 -0
  107. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/types.tsx +0 -0
  108. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/utils.tsx +0 -0
  109. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/src/vite-env.d.ts +0 -0
  110. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/tsconfig.app.json +0 -0
  111. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/tsconfig.json +0 -0
  112. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/tsconfig.node.json +0 -0
  113. {xspect-0.5.2 → xspect-0.5.4}/src/xspect/xspect-web/vite.config.ts +0 -0
  114. {xspect-0.5.2 → xspect-0.5.4}/tests/__init__.py +0 -0
  115. {xspect-0.5.2 → xspect-0.5.4}/tests/conftest.py +0 -0
  116. {xspect-0.5.2 → xspect-0.5.4}/tests/test_file_io.py +0 -0
  117. {xspect-0.5.2 → xspect-0.5.4}/tests/test_model_result.py +0 -0
  118. {xspect-0.5.2 → xspect-0.5.4}/tests/test_ncbi.py +0 -0
  119. {xspect-0.5.2 → xspect-0.5.4}/tests/test_probabilisitc_filter_mlst_model.py +0 -0
  120. {xspect-0.5.2 → xspect-0.5.4}/tests/test_probabilistic_filter_model.py +0 -0
  121. {xspect-0.5.2 → xspect-0.5.4}/tests/test_probabilistic_filter_svm_model.py +0 -0
  122. {xspect-0.5.2 → xspect-0.5.4}/tests/test_probabilistic_single_filter_model.py +0 -0
  123. {xspect-0.5.2 → xspect-0.5.4}/tests/test_pub_mlst_handler.py +0 -0
  124. {xspect-0.5.2 → xspect-0.5.4}/tests/test_train.py +0 -0
{xspect-0.5.2 → xspect-0.5.4}/.gitignore
@@ -177,4 +177,14 @@ out.png
  
  xspect-data/
  
- .devcontainer/
+ .devcontainer/
+
+ # Nextflow
+ .nextflow.log*
+ .nextflow/
+ work/
+ data/
+ results/
+
+ # Slurm
+ slurm-*
{xspect-0.5.2/src/XspecT.egg-info → xspect-0.5.4}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: XspecT
- Version: 0.5.2
+ Version: 0.5.4
  Summary: Tool to monitor and characterize pathogens using Bloom filters.
  License: MIT License
  
xspect-0.5.4/docs/benchmark.md
@@ -0,0 +1,34 @@
+ # Benchmark
+
+ XspecT is a tool designed for fast and accurate species classification of genome assemblies and simulated reads. To evaluate its classification accuracy, we conducted a benchmark using a set of Acinetobacter genomes.
+
+ The benchmark was performed by first downloading all available Acinetobacter genomes from GenBank, filtered on a passed ("OK") taxonomy check status. Genomes assigned to strain IDs were remapped to their respective species IDs, after which genomes with species IDs not contained in XspecT's Acinetobacter model were removed. The remaining genomes were then used to classify both assemblies and simulated reads generated from them. Simulated reads were generated by first filtering for genomes that were not part of the training data and that were categorized as "complete" by NCBI. The reads were then simulated from the longest contig of each genome (assumed to be the chromosome) using a custom Python script. Up to three genomes were selected per species. 100,000 reads were simulated for each genome, with a read length of 100 bp and no simulated sequencing errors. The reads were then classified using XspecT, with predictions based on the maximum-scoring species.
+
+ ## Benchmark Results
+
+ The benchmark results show that XspecT achieves high classification accuracy, with an overall accuracy of 99.94% for whole genomes and 87.11% for simulated reads.
+
+ | Category        | Total     | Matches   | Mismatches | Match Rate | Mismatch Rate |
+ |-----------------|-----------|-----------|------------|------------|---------------|
+ | Assemblies      | 44,905    | 44,879    | 26         | 99.94%     | 0.06%         |
+ | Simulated reads | 9,000,000 | 7,839,877 | 1,160,123  | 87.11%     | 12.89%        |
+
+ ## Running the benchmark yourself
+
+ To benchmark XspecT performance yourself, you can use the Nextflow workflow provided in the `scripts/benchmark` directory. This workflow allows you to run XspecT on a set of samples and measure species classification accuracy on both genome assemblies and simulated reads.
+
+ Before you run the benchmark, you first need to download benchmarking data to the `data` directory, for example from NCBI. To do so, you can use the bash script in `scripts/benchmark-data`, which downloads the data using the [NCBI Datasets CLI](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/command-line-tools/download-and-install/); the CLI needs to be installed first. The script downloads all available Acinetobacter genomes, as well as taxonomic data.
+
+ To run the benchmark, install [Nextflow](https://www.nextflow.io/docs/latest/install.html) and run the following command:
+
+ ```bash
+ nextflow run scripts/benchmark
+ ```
+
+ This will execute the benchmark workflow, which classifies the assemblies, as well as reads generated from them, using XspecT. The results are saved in the `results` directory:
+
+ - `results/classifications.tsv` for the classifications of the assemblies
+ - `results/read_classifications.tsv` for the classifications of the simulated reads
+ - `results/confusion_matrix.png` for the confusion matrix of genome assembly classifications
+ - `results/mismatches_confusion_matrix.png` for a confusion matrix restricted to mismatched genome assembly classifications
+ - `results/stats.txt` for the statistics of the benchmark run
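Note (not part of the packaged docs): a full run classifies tens of thousands of assemblies, so it can be useful to lean on Nextflow's caching and stub support while iterating. A minimal sketch, assuming the workflow is invoked from the repository root as above:

```bash
# Resume a previous run, reusing cached results for tasks that already completed
nextflow run scripts/benchmark -resume

# Dry-run the workflow; processes that define a stub block (e.g. classifySample)
# execute their stub instead of the real command
nextflow run scripts/benchmark -stub-run
```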
{xspect-0.5.2 → xspect-0.5.4}/docs/cli.md
@@ -12,7 +12,7 @@ In general, XspecT commands will prompt you for parameters if they are not provi
  
  ## Model Management
  
- At its core, XspecT uses models to classify and filter samples. These models are based on kmer indices trained on publicly availabel genomes as well as, possibly, a support vector machine (SVM) classifier.
+ At its core, XspecT uses models to classify and filter samples. These models are based on kmer indices trained on publicly available genomes as well as, possibly, a support vector machine (SVM) classifier.
  
  To manage models, the `xspect models` command can be used. This command allows you to download, train, and view available models.
  
@@ -114,16 +114,24 @@ xspect classify species --sparse-sampling-step 10 Acinetobacter path
  
  This will only consider every 10th kmer in the sample.
  
+ ### Inclusion of display names
+ For better readability, the classification results show, by default, only the taxonomy ID of each species along with its corresponding score. To display the full names associated with each taxonomy ID, you can use the `--display-names` (or `-n`) option:
+
+ ```bash
+ xspect classify species --display-names Acinetobacter path
+ ```
+ The output will then be formatted as `Taxonomy_ID - Display_Name: Score` for each species.
+
  ### MLST Classification
  
  Samples can also be classified based on Multi-locus sequence type schemas. To MLST-classify a sample, run:
  
  ```bash
- xspect classify-mlst -p path
+ xspect classify mlst
  ```
  
  ## Filtering
- XspecT can also be used to filter samples based on their classification results. This is useful when analyzing metagenome samples, for example when looking at genomic bycatch.
+ XspecT can also be used to filter samples based on their classification results. This is useful when analyzing metagenomic samples, for example when looking at genomic bycatch.
  
  To filter samples, the command `xspect filter` can be used. This command will filter the samples based on the specified criteria.
  
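Note (not part of the packaged docs): the documented classification options can presumably be combined in a single call. A hedged sketch, assuming `--display-names` and `--sparse-sampling-step` can be mixed freely, with the same placeholder arguments used above:

```bash
# "Acinetobacter" is the model genus and "path" the input file or directory,
# as in the examples in cli.md; combinability of the options is an assumption
xspect classify species --display-names --sparse-sampling-step 10 Acinetobacter path
```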
{xspect-0.5.2 → xspect-0.5.4}/docs/contributing.md
@@ -20,11 +20,14 @@ Get started by cloning the repository:
  git clone https://github.com/BIONF/XspecT2.git
  ```
  
- You then need to build the web application using Vite. Navigate to the `xspect-web` directory and run the build command, which will also watch for changes:
+ You then need to build the web application using Vite. Navigate to the `xspect-web` directory, install dependencies, and run the build command, which will also watch for changes:
  ```bash
  cd XspecT2/src/xspect/xspect-web
  ```
  ```bash
+ npm i
+ ```
+ ```bash
  npx vite build --watch
  ```
  
{xspect-0.5.2 → xspect-0.5.4}/mkdocs.yml
@@ -15,5 +15,7 @@ nav:
  - Quickstart: quickstart.md
  - CLI: cli.md
  - "Web App": web.md
- - "Understanding XspecT": understanding.md
+ - "Understanding XspecT":
+     - understanding.md
+     - benchmark.md
  - Contributing: contributing.md
{xspect-0.5.2 → xspect-0.5.4}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "XspecT"
- version = "0.5.2"
+ version = "0.5.4"
  description = "Tool to monitor and characterize pathogens using Bloom filters."
  readme = {file = "README.md", content-type = "text/markdown"}
  license = {file = "LICENSE"}
xspect-0.5.4/scripts/benchmark/classify/main.nf
@@ -0,0 +1,22 @@
+ process classifySample {
+     conda "./scripts/benchmark/environment.yml"
+     cpus 4
+     memory '32 GB'
+
+     input:
+     path sample
+
+     output:
+     path "${sample.baseName}.json"
+
+     script:
+     """
+     xspect classify species -g Acinetobacter -i ${sample} -o ${sample.baseName}.json
+     """
+
+     stub:
+     """
+     mkdir -p results
+     touch results/${sample.baseName}.json
+     """
+ }
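For reference, the process above is a thin wrapper around a single CLI call, so the same classification can be run outside Nextflow. A sketch with a hypothetical input file name:

```bash
# GCA_000000000.1_example.fna is a placeholder assembly; the output JSON is
# named after the input's base name, mirroring the process above
xspect classify species -g Acinetobacter -i GCA_000000000.1_example.fna -o GCA_000000000.1_example.json
```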
xspect-0.5.4/scripts/benchmark/environment.yml
@@ -0,0 +1,7 @@
+ name: xspect-benchmark
+ channels:
+   - conda-forge
+ dependencies:
+   - pip
+   - pip:
+       - XspecT
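With `conda.enabled = true` in the workflow configuration (see `nextflow.config` below), Nextflow creates this environment automatically. Building it by hand is optional, e.g. for debugging a single step; a sketch:

```bash
# Create and activate the benchmark environment manually
# (the environment name comes from the YAML above)
conda env create -f scripts/benchmark/environment.yml
conda activate xspect-benchmark
```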
xspect-0.5.4/scripts/benchmark/main.nf
@@ -0,0 +1,473 @@
+ #!/usr/bin/env nextflow
+
+ include { classifySample as classifyAssembly } from './classify'
+ include { classifySample as classifyRead } from './classify'
+
+ process downloadModels {
+     conda "./scripts/benchmark/environment.yml"
+     cpus 2
+     memory '16 GB'
+
+     output:
+     path "species_model.json"
+
+     script:
+     """
+     if [ ! -f "$HOME/xspect-data/models/acinetobacter-species.json" ]; then
+         xspect models download
+     fi
+     cp "$HOME/xspect-data/models/acinetobacter-species.json" species_model.json
+     """
+ }
+
+ process getNameMapping {
+     conda "conda-forge::jq"
+     cpus 2
+     memory '16 GB'
+
+     input:
+     path species_model
+
+     output:
+     path "name_mapping.json"
+
+     script:
+     """
+     jq '.display_names | to_entries | map({key: .key, value: (.value | sub("Acinetobacter"; "A."))}) | from_entries' ${species_model} > name_mapping.json
+     """
+
+     stub:
+     """
+     touch name_mapping.json
+     """
+ }
+
+
+ process createAssemblyTable {
+     conda "conda-forge::ncbi-datasets-cli conda-forge::jq"
+     cpus 2
+     memory '16 GB'
+
+     input:
+     path genomes
+     path tax_report
+     path species_model
+
+     output:
+     path "assemblies.tsv"
+
+     when:
+     !file("assemblies.tsv").exists()
+
+
+     script:
+     """
+     inputfile="${genomes}/ncbi_dataset/data/assembly_data_report.jsonl"
+
+     dataformat tsv genome --inputfile \$inputfile --fields accession,assminfo-name,organism-tax-id,assminfo-level,ani-check-status > assemblies.tsv
+
+     # filter out assemblies with ANI check status other than "OK"
+     awk -F'\t' 'NR==1 || \$5 == "OK"' assemblies.tsv > assemblies_filtered.tsv
+     mv assemblies_filtered.tsv assemblies.tsv
+
+     # map taxonomic IDs to species IDs (taxonomic IDs might be strain IDs)
+     jq '
+       .reports
+       | map(select(.taxonomy.children != null))
+       | map({
+           species_id: .taxonomy.tax_id,
+           children: .taxonomy.children
+         })
+       | map(
+           . as \$entry
+           | \$entry.children
+           | map({ (tostring): \$entry.species_id })
+           | add
+         )
+       | add
+     ' ${tax_report} > tax_mapping.json
+
+     # add species IDs to assemblies.tsv
+     declare -A species_map
+     while IFS="=" read -r key val; do
+         species_map["\$key"]="\$val"
+     done < <(jq -r 'to_entries[] | "\\(.key)=\\(.value)"' tax_mapping.json)
+
+     {
+         IFS='\t' read -r -a header < assemblies.tsv
+         IFS='\t'; echo -e "\${header[*]}\tSpecies ID"
+
+         tail -n +2 assemblies.tsv | while IFS='\t' read -r acc name taxid level status; do
+             species_id="\${species_map[\$taxid]:-\$taxid}"
+             echo -e "\$acc\t\$name\t\$taxid\t\$level\t\$status\t\$species_id"
+         done
+     } > temp_assemblies.tsv
+     mv temp_assemblies.tsv assemblies.tsv
+
+     # filter out assemblies with species ID not in the species model
+     jq -r '.display_names | keys | .[]' ${species_model} > valid_species.txt
+     awk -F'\t' '
+         BEGIN {
+             while ((getline species < "valid_species.txt") > 0) {
+                 valid[species] = 1;
+             }
+             close("valid_species.txt");
+         }
+         NR==1 { print; next }
+         \$6 in valid { print }
+     ' assemblies.tsv > temp_assemblies.tsv
+     mv temp_assemblies.tsv assemblies.tsv
+     rm valid_species.txt
+     """
+
+     stub:
+     """
+     touch assemblies.tsv
+     """
+ }
+
+ process summarizeClassifications {
+     conda "jq"
+     cpus 2
+     memory '16 GB'
+     publishDir "results"
+
+     input:
+     path assemblies
+     path classifications
+
+     output:
+     path "classifications.tsv"
+
+     script:
+     """
+     cp ${assemblies} classifications.tsv
+
+     awk 'BEGIN {FS=OFS="\t"}
+         NR==1 {print \$0, "Prediction"}
+         NR>1 {print \$0, "unknown"}' classifications.tsv > temp_classifications.tsv
+     mv temp_classifications.tsv classifications.tsv
+
+     for json_file in ${classifications}; do
+         basename=\$(basename \$json_file .json)
+         accession=\$(echo \$basename | cut -d'_' -f1-2)
+         prediction=\$(jq '.["prediction"]' \$json_file | tr -d '"')
+
+         awk -v acc="\$accession" -v pred="\$prediction" 'BEGIN {FS=OFS="\t"}
+             NR==1 {print}
+             NR>1 && \$1 ~ acc {\$NF=pred; print}
+             NR>1 && \$1 !~ acc {print}' classifications.tsv > temp_classifications.tsv
+         mv temp_classifications.tsv classifications.tsv
+     done
+     """
+ }
+
+ process selectForReadGen {
+     conda "conda-forge::pandas"
+     cpus 2
+     memory '16 GB'
+
+     input:
+     path assemblies
+     path species_model
+
+     output:
+     path "selected_samples.tsv"
+
+     script:
+     """
+     #!/usr/bin/env python
+     import pandas as pd
+     import json
+
+     assemblies = pd.read_csv('${assemblies}', sep='\\t')
+
+     training_accessions = []
+     with open('${species_model}', 'r') as f:
+         species_model = json.load(f)
+         for id, accession in species_model["training_accessions"].items():
+             training_accessions.extend(accession)
+
+     assemblies = assemblies[assemblies['Assembly Level'] == 'Complete Genome']
+     assemblies = assemblies[~assemblies['Assembly Accession'].isin(training_accessions)]
+
+     # use up to three assemblies for each species
+     assemblies = assemblies.groupby('Species ID').head(3)
+
+     assemblies.to_csv('selected_samples.tsv', sep='\\t', index=False)
+     """
+ }
+
+ process generateReads {
+     conda "conda-forge::pandas conda-forge::biopython"
+     cpus 2
+     memory '16 GB'
+
+     input:
+     path sample
+
+     output:
+     path "${sample.baseName}_simulated.fq"
+
+     script:
+     """
+     #!/usr/bin/env python
+     import random
+     from Bio import SeqIO
+
+     read_length = 100
+     num_reads = 100000
+     seed = 42
+
+     random.seed(seed)
+     sequences = list(SeqIO.parse("${sample}", "fasta"))
+     chromosome_sequence = max(sequences, key=len)  # we assume the longest sequence is the chromosome
+
+     ch_rec_id = chromosome_sequence.id
+     ch_seq = chromosome_sequence.seq
+     ch_seqlen = len(chromosome_sequence.seq)
+     with open("${sample.baseName}_simulated.fq", "w") as f:
+         for i in range(num_reads):
+             start = random.randint(0, ch_seqlen - read_length)
+             read_seq = ch_seq[start:start + read_length]
+             f.write(f"@read_{i}_{ch_rec_id}_{start}-{start+read_length}\\n")
+             f.write(f"{read_seq}\\n")
+             f.write("+\\n")
+             f.write(f"{len(read_seq)*'~'}\\n")
+     """
+ }
+
+ process summarizeReadClassifications {
+     conda "conda-forge::jq"
+     cpus 2
+     memory '16 GB'
+     publishDir "results"
+
+     input:
+     path read_assemblies
+     path read_classifications
+
+     output:
+     path "read_classifications.tsv"
+
+     script:
+     """
+     echo -e "Assembly Accession\tRead\tPrediction\tSpecies ID" > read_classifications.tsv
+
+     for json_file in ${read_classifications}; do
+         basename=\$(basename \$json_file .json)
+         accession=\$(echo \$basename | cut -d'_' -f1-2)
+
+         # Get species ID from assemblies table
+         species_id=\$(awk -F'\t' -v acc="\$accession" '\$1 == acc {print \$6}' ${read_assemblies})
+
+         # Extract predictions from JSON and append to TSV
+         jq -r --arg acc "\$accession" --arg species "\$species_id" '
+             .scores
+             | to_entries[]
+             | select(.key != "total")
+             | "\\(.key)\\t\\(.value | to_entries | max_by(.value) | .key)"
+             | "\\(\$acc)\\t" + . + "\\t\\(\$species)"
+         ' "\$json_file" >> read_classifications.tsv
+     done
+     """
+ }
+
+ process calculateStats {
+     conda "conda-forge::pandas"
+     cpus 2
+     memory '16 GB'
+     publishDir "results"
+
+     input:
+     path assembly_classifications
+     path read_classifications
+
+     output:
+     path "stats.txt"
+
+     script:
+     """
+     #!/usr/bin/env python
+     import pandas as pd
+
+     df_assembly = pd.read_csv('${assembly_classifications}', sep='\\t')
+     df_assembly['Species ID'] = df_assembly['Species ID'].astype(str)
+     df_assembly['Prediction'] = df_assembly['Prediction'].astype(str)
+     assembly_matches = df_assembly.loc[df_assembly['Species ID'] == df_assembly['Prediction']]
+     assembly_mismatches = df_assembly.loc[df_assembly['Species ID'] != df_assembly['Prediction']]
+
+     df_read = pd.read_csv('${read_classifications}', sep='\\t')
+     df_read['Species ID'] = df_read['Species ID'].astype(str)
+     df_read['Prediction'] = df_read['Prediction'].astype(str)
+     read_matches = df_read.loc[df_read['Species ID'] == df_read['Prediction']]
+     read_mismatches = df_read.loc[df_read['Species ID'] != df_read['Prediction']]
+
+     with open('stats.txt', 'w') as f:
+         f.write(f"Assembly Total: {len(df_assembly)}\\n")
+         f.write(f"Assembly Matches: {len(assembly_matches)}\\n")
+         f.write(f"Assembly Mismatches: {len(assembly_mismatches)}\\n")
+         f.write(f"Assembly Match Rate: {len(assembly_matches) / len(df_assembly) * 100:.2f}%\\n")
+         f.write(f"Assembly Mismatch Rate: {len(assembly_mismatches) / len(df_assembly) * 100:.2f}%\\n")
+
+         f.write("\\n")
+
+         f.write(f"Read Total: {len(df_read)}\\n")
+         f.write(f"Read Matches: {len(read_matches)}\\n")
+         f.write(f"Read Mismatches: {len(read_mismatches)}\\n")
+         f.write(f"Read Match Rate: {len(read_matches) / len(df_read) * 100:.2f}%\\n")
+         f.write(f"Read Mismatch Rate: {len(read_mismatches) / len(df_read) * 100:.2f}%\\n")
+     """
+ }
+
+ process confusionMatrix {
+     conda "conda-forge::pandas conda-forge::scikit-learn conda-forge::numpy conda-forge::matplotlib"
+     cpus 2
+     memory '16 GB'
+     publishDir "results"
+
+     input:
+     path classifications
+     path name_mapping
+
+     output:
+     path "confusion_matrix.png"
+
+     script:
+     """
+     #!/usr/bin/env python
+     import pandas as pd
+     from sklearn.metrics import confusion_matrix
+     import matplotlib.pyplot as plt
+     import numpy as np
+     import json
+
+     df = pd.read_csv('${classifications}', sep='\\t')
+     y_true = df["Species ID"].astype(str)
+     y_pred = df["Prediction"].astype(str)
+
+     with open('${name_mapping}', 'r') as f:
+         name_mapping_dict = json.load(f)
+     labels = list(set(y_true) | set(y_pred))
+     labels = sorted(labels, key=lambda x: name_mapping_dict.get(x, x))
+     display_labels = [name_mapping_dict.get(label, label) for label in labels]
+
+     cm = confusion_matrix(y_true, y_pred, labels=labels)
+     cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+
+     plt.figure(figsize=(30, 30))
+     plt.imshow(cm_normalized, interpolation='nearest', cmap=plt.cm.Blues)
+     plt.colorbar()
+
+     plt.xticks(ticks=np.arange(len(labels)), labels=display_labels, rotation=90, fontsize=12)
+     plt.yticks(ticks=np.arange(len(labels)), labels=display_labels, fontsize=12)
+
+     plt.title('Xspect Acinetobacter Confusion Matrix', fontsize=24)
+     plt.xlabel('Predicted Labels', fontsize=20)
+     plt.ylabel('True Labels', fontsize=20)
+
+     plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
+     """
+ }
+
+ process mismatchConfusionMatrix {
+     conda "conda-forge::pandas conda-forge::scikit-learn conda-forge::numpy conda-forge::matplotlib"
+     cpus 2
+     memory '16 GB'
+     publishDir "results"
+
+     input:
+     path classifications
+     path name_mapping
+
+     output:
+     path "mismatches_confusion_matrix.png"
+
+     script:
+     """
+     #!/usr/bin/env python
+     import pandas as pd
+     from sklearn.metrics import confusion_matrix
+     import matplotlib.pyplot as plt
+     import numpy as np
+     import json
+
+
+     df = pd.read_csv('${classifications}', sep='\\t')
+     df["Species ID"] = df["Species ID"].astype(str)
+     df["Prediction"] = df["Prediction"].astype(str)
+     df_comparison_mismatch = df[df["Species ID"] != df["Prediction"]]
+
+     with open('${name_mapping}', 'r') as f:
+         name_mapping_dict = json.load(f)
+     y_true = df_comparison_mismatch["Species ID"]
+     y_pred = df_comparison_mismatch["Prediction"]
+
+     labels = list(set(y_true) | set(y_pred))
+     labels = sorted(labels, key=lambda x: name_mapping_dict.get(x, x))
+     display_labels = [name_mapping_dict.get(label, label) for label in labels]
+
+     cm = confusion_matrix(y_true, y_pred, labels=labels)
+
+     plt.figure(figsize=(30, 30))
+     plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
+     cbar = plt.colorbar()
+     cbar.ax.tick_params(labelsize=20)
+
+     plt.xticks(ticks=np.arange(len(labels)), labels=display_labels, rotation=90, fontsize=16)
+     plt.yticks(ticks=np.arange(len(labels)), labels=display_labels, fontsize=16)
+
+     thresh = cm.max() / 2.
+     for i in range(cm.shape[0]):
+         for j in range(cm.shape[1]):
+             plt.text(j, i, format(cm[i, j], 'd'),  # 'd' ensures integer formatting
+                      horizontalalignment="center",
+                      color="white" if cm[i, j] > thresh else "black",
+                      fontsize=14)
+
+     plt.title('Mismatches Confusion Matrix', fontsize=30)
+     plt.xlabel('Predicted Labels', fontsize=24)
+     plt.ylabel('True Labels', fontsize=24)
+
+     plt.savefig('mismatches_confusion_matrix.png', dpi=300, bbox_inches='tight')
+     """
+ }
+
+
+ workflow {
+     species_model = downloadModels()
+     name_mapping = getNameMapping(species_model)
+     genomes = file("data/genomes")
+     tax_report = file("data/aci_species.json")
+     assemblies = createAssemblyTable(genomes, tax_report, species_model)
+
+     // Whole genome assemblies
+     samples = Channel.fromPath("${genomes}/**/*.fna")
+         .flatten()
+     filtered_samples = assemblies
+         .splitCsv(header: true, sep: '\t')
+         .map { row -> row['Assembly Accession'] }
+         .cross(samples.map { sample ->
+             [sample.baseName.split('_')[0..1].join('_'), sample]
+         })
+         .map { it[1][1] }
+     classifications = classifyAssembly(filtered_samples)
+     summarizeClassifications(assemblies, classifications.collect())
+     confusionMatrix(summarizeClassifications.out, name_mapping)
+     mismatchConfusionMatrix(summarizeClassifications.out, name_mapping)
+
+     // Simulated reads
+     selectForReadGen(assemblies, species_model)
+     read_assemblies = selectForReadGen.out
+         .splitCsv(header: true, sep: '\t')
+         .map { row -> row['Assembly Accession'] }
+         .cross(samples.map { sample ->
+             [sample.baseName.split('_')[0..1].join('_'), sample]
+         })
+         .map { it[1][1] }
+     generateReads(read_assemblies)
+     read_classifications = classifyRead(generateReads.out)
+     summarizeReadClassifications(selectForReadGen.out, read_classifications.collect())
+
+     calculateStats(summarizeClassifications.out, summarizeReadClassifications.out)
+ }
xspect-0.5.4/scripts/benchmark/nextflow.config
@@ -0,0 +1,7 @@
+ process.executor = 'slurm'
+ executor.account = 'intern'
+ process.queue = 'all'
+ executor.perCpuMemAllocation = true
+
+
+ conda.enabled = true
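This configuration targets a Slurm cluster with an `intern` account. To try the workflow without Slurm, the executor can be overridden through an additional config file passed with `-c`; a sketch, assuming a local machine with sufficient resources:

```bash
# Hypothetical local override; -c layers an extra config file on top of nextflow.config
cat > local.config <<'EOF'
process.executor = 'local'
EOF
nextflow run scripts/benchmark -c local.config
```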
xspect-0.5.4/scripts/benchmark-data/download_data.slurm
@@ -0,0 +1,13 @@
+ #!/bin/bash
+ #SBATCH --partition=all
+ #SBATCH --account=intern
+ #SBATCH --cpus-per-task=4
+ #SBATCH --mem-per-cpu=8gb
+ #SBATCH --job-name="download_acinetobacter"
+
+ datasets download genome taxon 469 --filename acinetobacter_dataset.zip --assembly-source GenBank --assembly-version latest --exclude-atypical --dehydrated
+ unzip -o acinetobacter_dataset.zip -d genomes
+ datasets rehydrate --directory genomes
+ rm acinetobacter_dataset.zip
+
+ datasets summary taxonomy taxon 469 --rank species --children > aci_species.json
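The workflow reads `data/genomes` and `data/aci_species.json` (see `main.nf` above), so the download script is presumably submitted from inside the `data` directory; a sketch:

```bash
# Submit the download as a Slurm job from within the data directory,
# so genomes/ and aci_species.json end up where the workflow expects them
mkdir -p data && cd data
sbatch ../scripts/benchmark-data/download_data.slurm
```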
{xspect-0.5.2 → xspect-0.5.4/src/XspecT.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: XspecT
- Version: 0.5.2
+ Version: 0.5.4
  Summary: Tool to monitor and characterize pathogens using Bloom filters.
  License: MIT License
  
{xspect-0.5.2 → xspect-0.5.4}/src/XspecT.egg-info/SOURCES.txt
@@ -8,12 +8,18 @@ pyproject.toml
  .github/workflows/pylint.yml
  .github/workflows/pypi.yml
  .github/workflows/test.yml
+ docs/benchmark.md
  docs/cli.md
  docs/contributing.md
  docs/index.md
  docs/quickstart.md
  docs/understanding.md
  docs/web.md
+ scripts/benchmark/environment.yml
+ scripts/benchmark/main.nf
+ scripts/benchmark/nextflow.config
+ scripts/benchmark-data/download_data.slurm
+ scripts/benchmark/classify/main.nf
  src/XspecT.egg-info/PKG-INFO
  src/XspecT.egg-info/SOURCES.txt
  src/XspecT.egg-info/dependency_links.txt