XspecT 0.6.0__tar.gz → 0.7.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xspect-0.6.0 → xspect-0.7.1}/.github/workflows/docs.yml +1 -1
- {xspect-0.6.0 → xspect-0.7.1}/.github/workflows/pypi.yml +1 -1
- {xspect-0.6.0 → xspect-0.7.1}/.github/workflows/test.yml +2 -3
- {xspect-0.6.0 → xspect-0.7.1}/.gitignore +5 -1
- {xspect-0.6.0 → xspect-0.7.1}/PKG-INFO +7 -7
- {xspect-0.6.0 → xspect-0.7.1}/README.md +5 -5
- {xspect-0.6.0 → xspect-0.7.1}/docs/benchmark.md +4 -4
- {xspect-0.6.0 → xspect-0.7.1}/docs/cli.md +8 -4
- {xspect-0.6.0 → xspect-0.7.1}/docs/contributing.md +5 -5
- {xspect-0.6.0 → xspect-0.7.1}/mkdocs.yml +1 -1
- {xspect-0.6.0 → xspect-0.7.1}/pyproject.toml +2 -2
- {xspect-0.6.0 → xspect-0.7.1}/scripts/benchmark/classify/main.nf +2 -1
- {xspect-0.6.0 → xspect-0.7.1}/scripts/benchmark/environment.yml +2 -1
- {xspect-0.6.0 → xspect-0.7.1}/scripts/benchmark/main.nf +115 -104
- {xspect-0.6.0 → xspect-0.7.1}/scripts/benchmark/nextflow.config +0 -1
- {xspect-0.6.0 → xspect-0.7.1}/scripts/benchmark-data/download_data.slurm +1 -1
- xspect-0.7.1/scripts/nextflow-utils/environment.yml +8 -0
- xspect-0.7.1/scripts/nextflow-utils/main.nf +31 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/XspecT.egg-info/PKG-INFO +7 -7
- {xspect-0.6.0 → xspect-0.7.1}/src/XspecT.egg-info/SOURCES.txt +8 -6
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/classify.py +18 -10
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/download_models.py +3 -1
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/file_io.py +24 -0
- xspect-0.7.1/src/xspect/handlers/pubmlst.py +130 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/main.py +79 -44
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/model_management.py +61 -3
- xspect-0.7.1/src/xspect/models/mlst_result.py +62 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/models/probabilistic_filter_mlst_model.py +96 -101
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/models/probabilistic_filter_model.py +7 -7
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/train.py +48 -1
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/web.py +1 -3
- xspect-0.6.0/src/xspect/xspect-web/dist/assets/index-Dt_UlbgE.js → xspect-0.7.1/src/xspect/xspect-web/dist/assets/index-Bg0QP9Ys.js +1 -1
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/dist/index.html +1 -1
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/header.tsx +1 -1
- {xspect-0.6.0 → xspect-0.7.1}/tests/test_misclassification_detection.py +1 -1
- {xspect-0.6.0 → xspect-0.7.1}/tests/test_model_result.py +2 -0
- {xspect-0.6.0 → xspect-0.7.1}/tests/test_ncbi.py +1 -1
- xspect-0.6.0/tests/test_probabilisitc_filter_mlst_model.py → xspect-0.7.1/tests/test_probabilistic_filter_mlst_model.py +41 -28
- xspect-0.7.1/tests/test_pub_mlst_handler.py +41 -0
- xspect-0.6.0/src/xspect/mlst_feature/mlst_helper.py +0 -241
- xspect-0.6.0/src/xspect/mlst_feature/pub_mlst_handler.py +0 -184
- xspect-0.6.0/tests/test_pub_mlst_handler.py +0 -53
- {xspect-0.6.0 → xspect-0.7.1}/.github/workflows/black.yml +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/.github/workflows/pylint.yml +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/LICENSE +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/docs/index.md +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/docs/quickstart.md +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/docs/understanding.md +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/docs/web.md +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/setup.cfg +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/XspecT.egg-info/dependency_links.txt +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/XspecT.egg-info/entry_points.txt +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/XspecT.egg-info/requires.txt +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/XspecT.egg-info/top_level.txt +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/__init__.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/definitions.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/filter_sequences.py +0 -0
- {xspect-0.6.0/src/xspect/misclassification_detection → xspect-0.7.1/src/xspect/handlers}/__init__.py +0 -0
- {xspect-0.6.0/src/xspect → xspect-0.7.1/src/xspect/handlers}/ncbi.py +0 -0
- {xspect-0.6.0/src/xspect/mlst_feature → xspect-0.7.1/src/xspect/misclassification_detection}/__init__.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/misclassification_detection/mapping.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/misclassification_detection/point_pattern_analysis.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/misclassification_detection/simulate_reads.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/models/__init__.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/models/probabilistic_filter_svm_model.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/models/probabilistic_single_filter_model.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/models/result.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/.gitignore +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/README.md +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/components.json +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/dist/assets/index-Ceo58xui.css +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/dist/vite.svg +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/eslint.config.js +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/index.html +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/package-lock.json +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/package.json +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/pnpm-lock.yaml +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/public/vite.svg +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/App.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/api.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/assets/react.svg +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/classification-form.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/classify.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/data-table.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/dropdown-checkboxes.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/dropdown-slider.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/filter-form.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/filter.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/filtering-result.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/landing.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/models-details.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/models.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/result-chart.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/result.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/spinner.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/accordion.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/button.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/card.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/chart.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/command.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/dialog.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/dropdown-menu.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/file-upload.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/form.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/input.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/label.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/navigation-menu.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/popover.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/select.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/separator.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/slider.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/switch.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/table.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/components/ui/tabs.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/index.css +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/lib/utils.ts +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/main.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/types.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/utils.tsx +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/src/vite-env.d.ts +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/tsconfig.app.json +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/tsconfig.json +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/tsconfig.node.json +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/src/xspect/xspect-web/vite.config.ts +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/tests/__init__.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/tests/conftest.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/tests/test_cli.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/tests/test_file_io.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/tests/test_model_management.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/tests/test_probabilistic_filter_model.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/tests/test_probabilistic_filter_svm_model.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/tests/test_probabilistic_single_filter_model.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/tests/test_train.py +0 -0
- {xspect-0.6.0 → xspect-0.7.1}/tests/test_web.py +0 -0
|
@@ -16,7 +16,7 @@ jobs:
|
|
|
16
16
|
git config user.email 41898282+github-actions[bot]@users.noreply.github.com
|
|
17
17
|
- uses: actions/setup-python@v5
|
|
18
18
|
with:
|
|
19
|
-
python-version: 3.
|
|
19
|
+
python-version: 3.13
|
|
20
20
|
- run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
|
|
21
21
|
- uses: actions/cache@v4
|
|
22
22
|
with:
|
|
@@ -19,15 +19,14 @@ jobs:
|
|
|
19
19
|
- name: Set up Python
|
|
20
20
|
uses: actions/setup-python@v4
|
|
21
21
|
with:
|
|
22
|
-
python-version: "3.
|
|
22
|
+
python-version: "3.13"
|
|
23
23
|
- name: Install package
|
|
24
24
|
run: |
|
|
25
25
|
python -m pip install --upgrade pip
|
|
26
26
|
pip install '.[test]'
|
|
27
|
-
- name: Download models
|
|
27
|
+
- name: Download models
|
|
28
28
|
run: |
|
|
29
29
|
xspect models download
|
|
30
|
-
yes 1 | xspect models train mlst
|
|
31
30
|
- name: Test with pytest
|
|
32
31
|
env:
|
|
33
32
|
NCBI_API_KEY: ${{ secrets.NCBI_API_KEY }}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: XspecT
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.7.1
|
|
4
4
|
Summary: Tool to monitor and characterize pathogens using Bloom filters.
|
|
5
5
|
License: MIT License
|
|
6
6
|
|
|
@@ -29,7 +29,7 @@ Project-URL: Repository, https://github.com/BIONF/XspecT2.git
|
|
|
29
29
|
Classifier: Intended Audience :: Developers
|
|
30
30
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
31
31
|
Classifier: License :: OSI Approved :: MIT License
|
|
32
|
-
Requires-Python:
|
|
32
|
+
Requires-Python: <3.14,>=3.10
|
|
33
33
|
Description-Content-Type: text/markdown
|
|
34
34
|
License-File: LICENSE
|
|
35
35
|
Requires-Dist: biopython
|
|
@@ -64,19 +64,19 @@ Dynamic: license-file
|
|
|
64
64
|
[](https://github.com/pylint-dev/pylint)
|
|
65
65
|
[](https://github.com/psf/black)
|
|
66
66
|
|
|
67
|
-
XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or MLST level using [kmer indices] and a [
|
|
67
|
+
XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or MLST level using [kmer indices] and a [support vector machine].
|
|
68
68
|
|
|
69
|
-
XspecT utilizes the uniqueness of kmers and compares extracted kmers from the input-data to a kmer index. Probablistic data structures ensure a fast lookup in this process. For a final prediction, the results are classified using a
|
|
69
|
+
XspecT utilizes the uniqueness of kmers and compares extracted kmers from the input-data to a kmer index. Probabilistic data structures ensure a fast lookup in this process. For a final prediction, the results are classified using a support vector machine.
|
|
70
70
|
|
|
71
71
|
The tool is available as a web-based application and as a command line interface.
|
|
72
72
|
|
|
73
73
|
[kmer indices]: https://arxiv.org/abs/1905.09624
|
|
74
|
-
[
|
|
74
|
+
[support vector machine]: https://en.wikipedia.org/wiki/Support-vector_machine
|
|
75
75
|
<!-- end intro -->
|
|
76
76
|
|
|
77
77
|
<!-- start quickstart -->
|
|
78
78
|
## Installation
|
|
79
|
-
To install XspecT, please download
|
|
79
|
+
To install XspecT, please download Python 3.10 - 3.13 and install the package using pip:
|
|
80
80
|
```
|
|
81
81
|
pip install xspect
|
|
82
82
|
```
|
|
@@ -114,5 +114,5 @@ For further instructions on how to use the command line interface, please refer
|
|
|
114
114
|
```
|
|
115
115
|
xspect --help
|
|
116
116
|
```
|
|
117
|
-
[documentation]: https://bionf.github.io/
|
|
117
|
+
[documentation]: https://bionf.github.io/XspecT/cli/index.html
|
|
118
118
|
<!-- end quickstart -->
|
|
@@ -4,19 +4,19 @@
|
|
|
4
4
|
[](https://github.com/pylint-dev/pylint)
|
|
5
5
|
[](https://github.com/psf/black)
|
|
6
6
|
|
|
7
|
-
XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or MLST level using [kmer indices] and a [
|
|
7
|
+
XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or MLST level using [kmer indices] and a [support vector machine].
|
|
8
8
|
|
|
9
|
-
XspecT utilizes the uniqueness of kmers and compares extracted kmers from the input-data to a kmer index. Probablistic data structures ensure a fast lookup in this process. For a final prediction, the results are classified using a
|
|
9
|
+
XspecT utilizes the uniqueness of kmers and compares extracted kmers from the input-data to a kmer index. Probabilistic data structures ensure a fast lookup in this process. For a final prediction, the results are classified using a support vector machine.
|
|
10
10
|
|
|
11
11
|
The tool is available as a web-based application and as a command line interface.
|
|
12
12
|
|
|
13
13
|
[kmer indices]: https://arxiv.org/abs/1905.09624
|
|
14
|
-
[
|
|
14
|
+
[support vector machine]: https://en.wikipedia.org/wiki/Support-vector_machine
|
|
15
15
|
<!-- end intro -->
|
|
16
16
|
|
|
17
17
|
<!-- start quickstart -->
|
|
18
18
|
## Installation
|
|
19
|
-
To install XspecT, please download
|
|
19
|
+
To install XspecT, please download Python 3.10 - 3.13 and install the package using pip:
|
|
20
20
|
```
|
|
21
21
|
pip install xspect
|
|
22
22
|
```
|
|
@@ -54,5 +54,5 @@ For further instructions on how to use the command line interface, please refer
|
|
|
54
54
|
```
|
|
55
55
|
xspect --help
|
|
56
56
|
```
|
|
57
|
-
[documentation]: https://bionf.github.io/
|
|
57
|
+
[documentation]: https://bionf.github.io/XspecT/cli/index.html
|
|
58
58
|
<!-- end quickstart -->
|
|
@@ -2,16 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
XspecT is a tool designed for fast and accurate species classification of genome assemblies and simulated reads. To evaluate its classification accuracy, we conducted a benchmark using a set of Acinetobacter genomes.
|
|
4
4
|
|
|
5
|
-
The benchmark was performed by first
|
|
5
|
+
The benchmark was performed by first downloading all available Acinetobacter genomes from RefSeq, keeping only those with a passed ("OK") taxonomy check status that were not part of the training dataset. Genomes assigned to strain IDs were remapped to their respective species IDs, after which genomes with species IDs not contained in XspecT's Acinetobacter model were removed. The remaining genomes were then used to classify both assemblies and simulated reads generated from them. Simulated reads were generated by first selecting genomes that were categorized as "complete" or "chromosome" by NCBI. The reads were then simulated from the longest contig of each genome (assumed to be the chromosome) using ART. For each genome, 100,000 reads were simulated based on the HiSeq 2500 profile, with a read length of 125 bp. The reads were then classified using XspecT with predictions based on the maximum-scoring species.
|
|
6
6
|
|
|
7
7
|
## Benchmark Results
|
|
8
8
|
|
|
9
|
-
The benchmark results show that XspecT achieves high classification accuracy
|
|
9
|
+
The benchmark results show that XspecT achieves very high classification accuracy of nearly 100% for whole genomes and strong but reduced accuracy of 70% for simulated reads. However, the low macro-average F1 score (0.21) for the read dataset highlights a substantial class imbalance.
|
|
10
10
|
|
|
11
11
|
| Dataset | Total Samples | Matches | Mismatches | Match Rate | Mismatch Rate | Accuracy | Macro Avg F1 | Weighted Avg F1 |
|
|
12
12
|
|-----------|--------------:|----------:|-----------:|-----------:|--------------:|---------:|-------------:|----------------:|
|
|
13
|
-
| Assembly |
|
|
14
|
-
| Reads |
|
|
13
|
+
| Assembly | 13,795 | 13,776 | 19 | 99.86% | 0.14% | ≈1.00 | 0.96 | ≈1.00 |
|
|
14
|
+
| Reads | 121,590,139 | 85,679,572| 35,910,567 | 70.47% | 29.53% | 0.70 | 0.21 | 0.79 |
|
|
15
15
|
|
|
16
16
|
## Running the benchmark yourself
|
|
17
17
|
|
|
@@ -27,7 +27,7 @@ This will show a list of all available models, separated by their type (species,
|
|
|
27
27
|
|
|
28
28
|
### Downloading Models
|
|
29
29
|
|
|
30
|
-
To download a basic set of pre-trained models (Acinetobacter and Salonella), run:
|
|
30
|
+
To download a basic set of pre-trained models (Acinetobacter, including Oxford MLST scheme, and Salmonella), run:
|
|
31
31
|
|
|
32
32
|
```bash
|
|
33
33
|
xspect models download
|
|
@@ -43,7 +43,7 @@ To train a model with NCBI data, run the following command:
|
|
|
43
43
|
xspect models train ncbi
|
|
44
44
|
```
|
|
45
45
|
|
|
46
|
-
By default, XspecT filters out NCBI accessions that do not
|
|
46
|
+
By default, XspecT filters out NCBI accessions that do not meet minimum N50 thresholds, have an inconclusive taxonomy check status, or are deemed atypical by NCBI. Furthermore, species with "Candidatus" and "sp." in their species names are filtered out. To disable filtering behavior, use the respective flag (see `xspect models train ncbi --help`).
|
|
47
47
|
|
|
48
48
|
If you would like to train models with manually curated data from a directory, you can use:
|
|
49
49
|
|
|
@@ -82,6 +82,8 @@ To train models for MLST classifications, run:
|
|
|
82
82
|
xspect models train mlst
|
|
83
83
|
```
|
|
84
84
|
|
|
85
|
+
XspecT will prompt you for the organism name and the MLST scheme you would like to train a model for.
|
|
86
|
+
|
|
85
87
|
## Classification
|
|
86
88
|
|
|
87
89
|
To classify samples, the command `xspect classify` can be used. This command will classify the sample based on the models available in your XspecT installation.
|
|
@@ -111,7 +113,7 @@ XspecT uses a kmer-based approach to classify samples. This means that the entir
|
|
|
111
113
|
|
|
112
114
|
**Example**:
|
|
113
115
|
```bash
|
|
114
|
-
xspect classify species --sparse-sampling-step 10
|
|
116
|
+
xspect classify species --sparse-sampling-step 10
|
|
115
117
|
```
|
|
116
118
|
|
|
117
119
|
This will only consider every 10th kmer in the sample.
|
|
@@ -120,7 +122,7 @@ This will only consider every 10th kmer in the sample.
|
|
|
120
122
|
By default, the classification results show only the taxonomy ID of each species along with its corresponding score for better readability. To display the full names associated with each taxonomy ID, you can use the `--display-names` (or `-n`) option:
|
|
121
123
|
|
|
122
124
|
```bash
|
|
123
|
-
xspect classify species --display-names
|
|
125
|
+
xspect classify species --display-names
|
|
124
126
|
```
|
|
125
127
|
The output will then be formatted as: `Taxonomy_ID - Display_Name: Score` for each species.
|
|
126
128
|
|
|
@@ -132,6 +134,8 @@ Samples can also be classified based on Multi-locus sequence type schemas. To ML
|
|
|
132
134
|
xspect classify mlst
|
|
133
135
|
```
|
|
134
136
|
|
|
137
|
+
XspecT will prompt you for the organism, MLST scheme, and path to your sample directory.
|
|
138
|
+
|
|
135
139
|
## Filtering
|
|
136
140
|
XspecT can also be used to filter samples based on their classification results. This is useful when analyzing metagenomic samples, for example when looking at genomic bycatch.
|
|
137
141
|
|
|
@@ -5,8 +5,8 @@ Thank you for your interest in contributing to XspecT! This page provides guidel
|
|
|
5
5
|
|
|
6
6
|
When contributing to XspecT, please follow these steps to ensure a smooth process:
|
|
7
7
|
|
|
8
|
-
- **Read the documentation**: Familiarize yourself with the project by reading the [documentation](https://bionf.github.io/
|
|
9
|
-
- **Follow the coding standards**: Adhere to the project's coding standards and best practices. This includes using consistent naming conventions, writing clear and concise code, and documentation.
|
|
8
|
+
- **Read the documentation**: Familiarize yourself with the project by reading the [documentation](https://bionf.github.io/XspecT/), including the [Understanding XspecT](understanding.md) page and the [architecture overview](#architecture-overview).
|
|
9
|
+
- **Follow the coding standards**: Adhere to the project's coding standards and best practices. This includes ensuring that your code is formatted using [Black](https://black.readthedocs.io/en/stable/) and linted with [Pylint](https://pylint.pycqa.org/en/latest/) for Python code, as well as using consistent naming conventions, writing clear and concise code, and documentation. Please use [pure functions](https://goodresearch.dev/decoupled#learn-to-identify-and-use-pure-functions) where possible and make sure your changes are aligned with the project's [architecture](#architecture-overview).
|
|
10
10
|
- **Write tests**: Ensure that your changes are covered by tests. We use [pytest](https://docs.pytest.org/en/stable/) for testing. If you add new features or fix bugs, please include tests to verify your changes.
|
|
11
11
|
- **Document your changes**: Update the documentation to reflect any new features or changes you make. This includes updating the README, Google-style docstrings, and the [Mkdocs](https://www.mkdocs.org)-based documentation.
|
|
12
12
|
- **Use clear commit messages**: When committing your changes, use clear and descriptive commit messages that explain the purpose of the changes.
|
|
@@ -17,12 +17,12 @@ To set up XspecT for development, first make sure you have [Python](https://www.
|
|
|
17
17
|
|
|
18
18
|
Get started by cloning the repository:
|
|
19
19
|
```bash
|
|
20
|
-
git clone https://github.com/BIONF/
|
|
20
|
+
git clone https://github.com/BIONF/XspecT.git
|
|
21
21
|
```
|
|
22
22
|
|
|
23
23
|
You then need to build the web application using Vite. Navigate to the `xspect-web` directory, install dependencies, and run the build command, which will also watch for changes:
|
|
24
24
|
```bash
|
|
25
|
-
cd
|
|
25
|
+
cd XspecT/src/xspect/xspect-web
|
|
26
26
|
```
|
|
27
27
|
```bash
|
|
28
28
|
npm i
|
|
@@ -86,7 +86,7 @@ We use GitHub Actions to run checks on commits and pull requests. These checks i
|
|
|
86
86
|
|
|
87
87
|
Additionally, GitHub Actions are also used for deployment:
|
|
88
88
|
|
|
89
|
-
- **Documentation**: The Mkdocs-based documentation is built and deployed to GitHub Pages on changes to the `main` branch. You can view the documentation at [https://bionf.github.io/
|
|
89
|
+
- **Documentation**: The Mkdocs-based documentation is built and deployed to GitHub Pages on changes to the `main` branch. You can view the documentation at [https://bionf.github.io/XspecT/](https://bionf.github.io/XspecT/).
|
|
90
90
|
- **Python package**: The Python package is built and uploaded to PyPI when a new release is created. This allows users to easily install the latest version of XspecT using `pip install xspect`. Pre-releases are uploaded to TestPyPI and can be installed using `pip install --index-url https://test.pypi.org/simple/ xspect`.
|
|
91
91
|
|
|
92
92
|
## Pull Request Process
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "XspecT"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.7.1"
|
|
4
4
|
description = "Tool to monitor and characterize pathogens using Bloom filters."
|
|
5
5
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
6
6
|
license = {file = "LICENSE"}
|
|
7
|
-
requires-python = ">=3.10"
|
|
7
|
+
requires-python = ">=3.10,<3.14"
|
|
8
8
|
dependencies = [
|
|
9
9
|
"biopython",
|
|
10
10
|
"requests",
|
|
@@ -5,13 +5,14 @@ process classifySample {
|
|
|
5
5
|
|
|
6
6
|
input:
|
|
7
7
|
path sample
|
|
8
|
+
val model
|
|
8
9
|
|
|
9
10
|
output:
|
|
10
11
|
path "${sample.baseName}.json"
|
|
11
12
|
|
|
12
13
|
script:
|
|
13
14
|
"""
|
|
14
|
-
xspect classify species -g
|
|
15
|
+
xspect classify species -g ${model} -i ${sample} -o ${sample.baseName}.json
|
|
15
16
|
"""
|
|
16
17
|
|
|
17
18
|
stub:
|
|
@@ -2,9 +2,56 @@
|
|
|
2
2
|
|
|
3
3
|
include { classifySample as classifyAssembly } from './classify'
|
|
4
4
|
include { classifySample as classifyRead } from './classify'
|
|
5
|
+
include { strain_species_mapping } from '../nextflow-utils'
|
|
5
6
|
|
|
6
|
-
|
|
7
|
-
|
|
7
|
+
// --------------------- PARAMETERS ---------------------
|
|
8
|
+
params.publishDir = "results/benchmark"
|
|
9
|
+
params.xspectModel = "Acinetobacter"
|
|
10
|
+
|
|
11
|
+
// --------------------- WORKFLOW -----------------------
|
|
12
|
+
workflow {
|
|
13
|
+
species_model = getModelJSON()
|
|
14
|
+
name_mapping = getNameMapping(species_model)
|
|
15
|
+
genomes = file("data/genomes")
|
|
16
|
+
tax_report = file("data/aci_species.json")
|
|
17
|
+
tax_mapping_json = strain_species_mapping(tax_report)
|
|
18
|
+
assemblies = createAssemblyTable(genomes, tax_mapping_json, species_model)
|
|
19
|
+
|
|
20
|
+
// Whole genome assemblies
|
|
21
|
+
samples = Channel.fromPath("${genomes}/**/*.fna")
|
|
22
|
+
.flatten()
|
|
23
|
+
filtered_samples = assemblies
|
|
24
|
+
.splitCsv(header: true, sep: '\t')
|
|
25
|
+
.map { row -> row['Assembly Accession'] }
|
|
26
|
+
.cross(samples.map { sample ->
|
|
27
|
+
[sample.baseName.split('_')[0..1].join('_'), sample]
|
|
28
|
+
})
|
|
29
|
+
.map { it[1][1] }
|
|
30
|
+
classifications = classifyAssembly(filtered_samples, params.xspectModel)
|
|
31
|
+
summarizeClassifications(assemblies, classifications.collect())
|
|
32
|
+
confusionMatrix(summarizeClassifications.out, name_mapping)
|
|
33
|
+
mismatchConfusionMatrix(summarizeClassifications.out, name_mapping)
|
|
34
|
+
|
|
35
|
+
// Simulated reads
|
|
36
|
+
selectForReadGen(assemblies, species_model)
|
|
37
|
+
read_assemblies = selectForReadGen.out
|
|
38
|
+
.splitCsv(header: true, sep: '\t')
|
|
39
|
+
.map { row -> row['Assembly Accession'] }
|
|
40
|
+
.cross(samples.map { sample ->
|
|
41
|
+
[sample.baseName.split('_')[0..1].join('_'), sample]
|
|
42
|
+
})
|
|
43
|
+
.map { it[1][1] }
|
|
44
|
+
filterForChromosome(read_assemblies)
|
|
45
|
+
generateReads(filterForChromosome.out)
|
|
46
|
+
read_classifications = classifyRead(generateReads.out, params.xspectModel)
|
|
47
|
+
summarizeReadClassifications(selectForReadGen.out, read_classifications.collect())
|
|
48
|
+
|
|
49
|
+
calculateStats(summarizeClassifications.out, summarizeReadClassifications.out)
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
// --------------------- PROCESSES ---------------------
|
|
53
|
+
|
|
54
|
+
process getModelJSON {
|
|
8
55
|
cpus 2
|
|
9
56
|
memory '16 GB'
|
|
10
57
|
|
|
@@ -13,10 +60,8 @@ process downloadModels {
|
|
|
13
60
|
|
|
14
61
|
script:
|
|
15
62
|
"""
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
fi
|
|
19
|
-
cp "$HOME/xspect-data/models/acinetobacter-species.json" species_model.json
|
|
63
|
+
model_name="${params.xspectModel.toLowerCase().replaceAll('_','-')}-species.json"
|
|
64
|
+
cp "$HOME/xspect-data/models/\$model_name" species_model.json
|
|
20
65
|
"""
|
|
21
66
|
}
|
|
22
67
|
|
|
@@ -50,7 +95,7 @@ process createAssemblyTable {
|
|
|
50
95
|
|
|
51
96
|
input:
|
|
52
97
|
path genomes
|
|
53
|
-
path
|
|
98
|
+
path tax_mapping_json
|
|
54
99
|
path species_model
|
|
55
100
|
|
|
56
101
|
output:
|
|
@@ -70,28 +115,11 @@ process createAssemblyTable {
|
|
|
70
115
|
awk -F'\t' 'NR==1 || \$5 == "OK"' assemblies.tsv > assemblies_filtered.tsv
|
|
71
116
|
mv assemblies_filtered.tsv assemblies.tsv
|
|
72
117
|
|
|
73
|
-
# map taxonmic IDs to species IDs (taxonomic IDs might be strain IDs)
|
|
74
|
-
jq '
|
|
75
|
-
.reports
|
|
76
|
-
| map(select(.taxonomy.children != null))
|
|
77
|
-
| map({
|
|
78
|
-
species_id: .taxonomy.tax_id,
|
|
79
|
-
children: .taxonomy.children
|
|
80
|
-
})
|
|
81
|
-
| map(
|
|
82
|
-
. as \$entry
|
|
83
|
-
| \$entry.children
|
|
84
|
-
| map({ (tostring): \$entry.species_id })
|
|
85
|
-
| add
|
|
86
|
-
)
|
|
87
|
-
| add
|
|
88
|
-
' ${tax_report} > tax_mapping.json
|
|
89
|
-
|
|
90
118
|
# add species IDs to assemblies.tsv
|
|
91
119
|
declare -A species_map
|
|
92
120
|
while IFS="=" read -r key val; do
|
|
93
121
|
species_map["\$key"]="\$val"
|
|
94
|
-
done < <(jq -r 'to_entries[] | "\\(.key)=\\(.value)"'
|
|
122
|
+
done < <(jq -r 'to_entries[] | "\\(.key)=\\(.value)"' ${tax_mapping_json})
|
|
95
123
|
|
|
96
124
|
{
|
|
97
125
|
IFS='\t' read -r -a header < assemblies.tsv
|
|
@@ -118,6 +146,21 @@ process createAssemblyTable {
|
|
|
118
146
|
' assemblies.tsv > temp_assemblies.tsv
|
|
119
147
|
mv temp_assemblies.tsv assemblies.tsv
|
|
120
148
|
rm valid_species.txt
|
|
149
|
+
|
|
150
|
+
# filter out assemblies that are part of the training set
|
|
151
|
+
jq -r '.training_accessions | to_entries[] | .value[]' ${species_model} > training_accessions.txt
|
|
152
|
+
awk -F'\t' '
|
|
153
|
+
BEGIN {
|
|
154
|
+
while ((getline acc < "training_accessions.txt") > 0) {
|
|
155
|
+
training[acc] = 1;
|
|
156
|
+
}
|
|
157
|
+
close("training_accessions.txt");
|
|
158
|
+
}
|
|
159
|
+
NR==1 { print; next }
|
|
160
|
+
!(\$1 in training) { print }
|
|
161
|
+
' assemblies.tsv > temp_assemblies.tsv
|
|
162
|
+
mv temp_assemblies.tsv assemblies.tsv
|
|
163
|
+
rm training_accessions.txt
|
|
121
164
|
"""
|
|
122
165
|
|
|
123
166
|
stub:
|
|
@@ -130,7 +173,7 @@ process summarizeClassifications {
|
|
|
130
173
|
conda "conda-forge::pandas"
|
|
131
174
|
cpus 4
|
|
132
175
|
memory '16 GB'
|
|
133
|
-
publishDir
|
|
176
|
+
publishDir { params.publishDir }, mode: 'copy'
|
|
134
177
|
|
|
135
178
|
input:
|
|
136
179
|
path assemblies
|
|
@@ -208,15 +251,32 @@ process selectForReadGen {
|
|
|
208
251
|
]
|
|
209
252
|
assemblies = assemblies[~assemblies['Assembly Accession'].isin(training_accessions)]
|
|
210
253
|
|
|
211
|
-
# use up to three assemblies for each species
|
|
212
|
-
assemblies = assemblies.groupby('Species ID').head(3)
|
|
213
|
-
|
|
214
254
|
assemblies.to_csv('selected_samples.tsv', sep='\\t', index=False)
|
|
215
255
|
"""
|
|
216
256
|
}
|
|
217
257
|
|
|
258
|
+
process filterForChromosome {
|
|
259
|
+
conda "bioconda::seqkit"
|
|
260
|
+
cpus 2
|
|
261
|
+
memory '16 GB'
|
|
262
|
+
|
|
263
|
+
input:
|
|
264
|
+
path sample
|
|
265
|
+
|
|
266
|
+
output:
|
|
267
|
+
path "${sample.baseName}_chromosome.fna"
|
|
268
|
+
|
|
269
|
+
script:
|
|
270
|
+
"""
|
|
271
|
+
set -euo pipefail
|
|
272
|
+
|
|
273
|
+
seqkit sort -l -r ${sample} > sorted.tmp
|
|
274
|
+
seqkit head -n 1 sorted.tmp | seqkit seq -t dna -o "${sample.baseName}_chromosome.fna"
|
|
275
|
+
"""
|
|
276
|
+
}
|
|
277
|
+
|
|
218
278
|
process generateReads {
|
|
219
|
-
conda "
|
|
279
|
+
conda "bioconda::art"
|
|
220
280
|
cpus 2
|
|
221
281
|
memory '16 GB'
|
|
222
282
|
|
|
@@ -228,37 +288,24 @@ process generateReads {
|
|
|
228
288
|
|
|
229
289
|
script:
|
|
230
290
|
"""
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
chromosome_sequence = max(sequences, key=len) # we assume the longest sequence is the chromosome
|
|
242
|
-
|
|
243
|
-
ch_rec_id = chromosome_sequence.id
|
|
244
|
-
ch_seq = chromosome_sequence.seq
|
|
245
|
-
ch_seqlen = len(chromosome_sequence.seq)
|
|
246
|
-
with open("${sample.baseName}_simulated.fq", "w") as f:
|
|
247
|
-
for i in range(num_reads):
|
|
248
|
-
start = random.randint(0, ch_seqlen - read_length)
|
|
249
|
-
read_seq = ch_seq[start:start + read_length]
|
|
250
|
-
f.write(f"@read_{i}_{ch_rec_id}_{start}-{start+read_length}\\n")
|
|
251
|
-
f.write(f"{read_seq}\\n")
|
|
252
|
-
f.write("+\\n")
|
|
253
|
-
f.write(f"{len(read_seq)*'~'}\\n")
|
|
291
|
+
set -euo pipefail
|
|
292
|
+
|
|
293
|
+
art_illumina \
|
|
294
|
+
-ss HS25 \
|
|
295
|
+
-i "${sample}" \
|
|
296
|
+
-l 125 \
|
|
297
|
+
-c 100000 \
|
|
298
|
+
-na \
|
|
299
|
+
-rs 42 \
|
|
300
|
+
-o "${sample.baseName}_simulated"
|
|
254
301
|
"""
|
|
255
302
|
}
|
|
256
303
|
|
|
257
304
|
process summarizeReadClassifications {
|
|
258
305
|
conda "conda-forge::pandas"
|
|
259
306
|
cpus 4
|
|
260
|
-
memory '
|
|
261
|
-
publishDir
|
|
307
|
+
memory '64 GB'
|
|
308
|
+
publishDir { params.publishDir }, mode: 'copy'
|
|
262
309
|
|
|
263
310
|
input:
|
|
264
311
|
path read_assemblies
|
|
@@ -278,10 +325,9 @@ process summarizeReadClassifications {
|
|
|
278
325
|
|
|
279
326
|
# Create a mapping of accession to species ID
|
|
280
327
|
accession_to_species = dict(zip(df_assemblies['Assembly Accession'], df_assemblies['Species ID']))
|
|
281
|
-
|
|
282
|
-
results = []
|
|
283
328
|
|
|
284
329
|
classifications = '${read_classifications}'.split()
|
|
330
|
+
include_header = True
|
|
285
331
|
for json_file in classifications:
|
|
286
332
|
basename = os.path.basename(json_file).replace('.json', '')
|
|
287
333
|
accession = '_'.join(basename.split('_')[:2])
|
|
@@ -291,6 +337,7 @@ process summarizeReadClassifications {
|
|
|
291
337
|
with open(json_file, 'r') as f:
|
|
292
338
|
data = json.load(f)
|
|
293
339
|
scores = data.get('scores', {})
|
|
340
|
+
results = []
|
|
294
341
|
|
|
295
342
|
for read_name, read_scores in scores.items():
|
|
296
343
|
if read_name != 'total':
|
|
@@ -310,17 +357,20 @@ process summarizeReadClassifications {
|
|
|
310
357
|
result[species] = score
|
|
311
358
|
|
|
312
359
|
results.append(result)
|
|
360
|
+
|
|
361
|
+
|
|
313
362
|
|
|
314
|
-
|
|
315
|
-
|
|
363
|
+
df_results = pd.DataFrame(results)
|
|
364
|
+
df_results.to_csv('read_classifications.tsv', sep='\\t', index=False, mode='a', header=include_header)
|
|
365
|
+
include_header = False
|
|
316
366
|
"""
|
|
317
367
|
}
|
|
318
368
|
|
|
319
369
|
process calculateStats {
|
|
320
370
|
conda "conda-forge::pandas conda-forge::scikit-learn"
|
|
321
|
-
cpus
|
|
322
|
-
memory '
|
|
323
|
-
publishDir
|
|
371
|
+
cpus 8
|
|
372
|
+
memory '256 GB'
|
|
373
|
+
publishDir { params.publishDir }, mode: 'copy'
|
|
324
374
|
|
|
325
375
|
input:
|
|
326
376
|
path assembly_classifications
|
|
@@ -399,7 +449,7 @@ process confusionMatrix {
|
|
|
399
449
|
conda "conda-forge::pandas conda-forge::scikit-learn conda-forge::numpy conda-forge::matplotlib"
|
|
400
450
|
cpus 2
|
|
401
451
|
memory '16 GB'
|
|
402
|
-
publishDir
|
|
452
|
+
publishDir { params.publishDir }, mode: 'copy'
|
|
403
453
|
|
|
404
454
|
input:
|
|
405
455
|
path classifications
|
|
@@ -449,7 +499,7 @@ process mismatchConfusionMatrix {
|
|
|
449
499
|
conda "conda-forge::pandas conda-forge::scikit-learn conda-forge::numpy conda-forge::matplotlib"
|
|
450
500
|
cpus 2
|
|
451
501
|
memory '16 GB'
|
|
452
|
-
publishDir
|
|
502
|
+
publishDir { params.publishDir }, mode: 'copy'
|
|
453
503
|
|
|
454
504
|
input:
|
|
455
505
|
path classifications
|
|
@@ -506,43 +556,4 @@ process mismatchConfusionMatrix {
|
|
|
506
556
|
|
|
507
557
|
plt.savefig('mismatches_confusion_matrix.png', dpi=300, bbox_inches='tight')
|
|
508
558
|
"""
|
|
509
|
-
}
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
workflow {
|
|
513
|
-
species_model = downloadModels()
|
|
514
|
-
name_mapping = getNameMapping(species_model)
|
|
515
|
-
genomes = file("data/genomes")
|
|
516
|
-
tax_report = file("data/aci_species.json")
|
|
517
|
-
assemblies = createAssemblyTable(genomes, tax_report, species_model)
|
|
518
|
-
|
|
519
|
-
// Whole genome assemblies
|
|
520
|
-
samples = Channel.fromPath("${genomes}/**/*.fna")
|
|
521
|
-
.flatten()
|
|
522
|
-
filtered_samples = assemblies
|
|
523
|
-
.splitCsv(header: true, sep: '\t')
|
|
524
|
-
.map { row -> row['Assembly Accession'] }
|
|
525
|
-
.cross(samples.map { sample ->
|
|
526
|
-
[sample.baseName.split('_')[0..1].join('_'), sample]
|
|
527
|
-
})
|
|
528
|
-
.map { it[1][1] }
|
|
529
|
-
classifications = classifyAssembly(filtered_samples)
|
|
530
|
-
summarizeClassifications(assemblies, classifications.collect())
|
|
531
|
-
confusionMatrix(summarizeClassifications.out, name_mapping)
|
|
532
|
-
mismatchConfusionMatrix(summarizeClassifications.out, name_mapping)
|
|
533
|
-
|
|
534
|
-
// Simulated reads
|
|
535
|
-
selectForReadGen(assemblies, species_model)
|
|
536
|
-
read_assemblies = selectForReadGen.out
|
|
537
|
-
.splitCsv(header: true, sep: '\t')
|
|
538
|
-
.map { row -> row['Assembly Accession'] }
|
|
539
|
-
.cross(samples.map { sample ->
|
|
540
|
-
[sample.baseName.split('_')[0..1].join('_'), sample]
|
|
541
|
-
})
|
|
542
|
-
.map { it[1][1] }
|
|
543
|
-
generateReads(read_assemblies)
|
|
544
|
-
read_classifications = classifyRead(generateReads.out)
|
|
545
|
-
summarizeReadClassifications(selectForReadGen.out, read_classifications.collect())
|
|
546
|
-
|
|
547
|
-
calculateStats(summarizeClassifications.out, summarizeReadClassifications.out)
|
|
548
|
-
}
|
|
559
|
+
}
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
#SBATCH --mem-per-cpu=8gb
|
|
6
6
|
#SBATCH --job-name="download_acinetobacter"
|
|
7
7
|
|
|
8
|
-
datasets download genome taxon 469 --filename acinetobacter_dataset.zip --assembly-source
|
|
8
|
+
datasets download genome taxon 469 --filename acinetobacter_dataset.zip --assembly-source RefSeq --assembly-version latest --exclude-atypical --dehydrated
|
|
9
9
|
unzip -o acinetobacter_dataset.zip -d genomes
|
|
10
10
|
datasets rehydrate --directory genomes
|
|
11
11
|
rm acinetobacter_dataset.zip
|