XspecT 0.5.1__tar.gz → 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of XspecT might be problematic. Click here for more details.

Files changed (127) hide show
  1. {xspect-0.5.1 → xspect-0.5.2}/.github/workflows/test.yml +2 -1
  2. {xspect-0.5.1 → xspect-0.5.2}/PKG-INFO +1 -1
  3. xspect-0.5.2/docs/contributing.md +95 -0
  4. xspect-0.5.2/docs/understanding.md +24 -0
  5. {xspect-0.5.1 → xspect-0.5.2}/pyproject.toml +1 -1
  6. {xspect-0.5.1 → xspect-0.5.2}/src/XspecT.egg-info/PKG-INFO +1 -1
  7. {xspect-0.5.1 → xspect-0.5.2}/src/XspecT.egg-info/SOURCES.txt +3 -2
  8. xspect-0.5.2/src/xspect/classify.py +80 -0
  9. xspect-0.5.2/src/xspect/definitions.py +90 -0
  10. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/download_models.py +10 -2
  11. xspect-0.5.2/src/xspect/file_io.py +232 -0
  12. xspect-0.5.2/src/xspect/filter_sequences.py +108 -0
  13. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/main.py +41 -10
  14. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/mlst_feature/mlst_helper.py +3 -0
  15. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/mlst_feature/pub_mlst_handler.py +43 -1
  16. xspect-0.5.2/src/xspect/model_management.py +149 -0
  17. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/models/probabilistic_filter_mlst_model.py +75 -37
  18. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/models/probabilistic_filter_model.py +194 -12
  19. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/models/probabilistic_filter_svm_model.py +99 -6
  20. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/models/probabilistic_single_filter_model.py +66 -5
  21. xspect-0.5.2/src/xspect/models/result.py +182 -0
  22. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/ncbi.py +45 -10
  23. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/train.py +2 -1
  24. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/web.py +68 -12
  25. xspect-0.5.2/src/xspect/xspect-web/dist/assets/index-Ceo58xui.css +1 -0
  26. xspect-0.5.1/src/xspect/xspect-web/dist/assets/index-CMG4V7fZ.js → xspect-0.5.2/src/xspect/xspect-web/dist/assets/index-Dt_UlbgE.js +82 -77
  27. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/dist/index.html +2 -2
  28. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/App.tsx +4 -2
  29. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/api.tsx +23 -1
  30. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/filter-form.tsx +16 -3
  31. xspect-0.5.2/src/xspect/xspect-web/src/components/filtering-result.tsx +65 -0
  32. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/result.tsx +2 -2
  33. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/types.tsx +5 -0
  34. {xspect-0.5.1 → xspect-0.5.2}/tests/test_file_io.py +6 -10
  35. {xspect-0.5.1 → xspect-0.5.2}/tests/test_probabilisitc_filter_mlst_model.py +49 -7
  36. {xspect-0.5.1 → xspect-0.5.2}/tests/test_pub_mlst_handler.py +19 -0
  37. {xspect-0.5.1 → xspect-0.5.2}/tests/test_web.py +65 -1
  38. xspect-0.5.1/docs/contributing.md +0 -3
  39. xspect-0.5.1/docs/understanding.md +0 -3
  40. xspect-0.5.1/src/xspect/classify.py +0 -67
  41. xspect-0.5.1/src/xspect/definitions.py +0 -50
  42. xspect-0.5.1/src/xspect/file_io.py +0 -165
  43. xspect-0.5.1/src/xspect/filter_sequences.py +0 -138
  44. xspect-0.5.1/src/xspect/model_management.py +0 -79
  45. xspect-0.5.1/src/xspect/models/result.py +0 -115
  46. xspect-0.5.1/src/xspect/xspect-web/dist/assets/index-jIKg1HIy.css +0 -1
  47. {xspect-0.5.1 → xspect-0.5.2}/.github/workflows/black.yml +0 -0
  48. {xspect-0.5.1 → xspect-0.5.2}/.github/workflows/docs.yml +0 -0
  49. {xspect-0.5.1 → xspect-0.5.2}/.github/workflows/pylint.yml +0 -0
  50. {xspect-0.5.1 → xspect-0.5.2}/.github/workflows/pypi.yml +0 -0
  51. {xspect-0.5.1 → xspect-0.5.2}/.gitignore +0 -0
  52. {xspect-0.5.1 → xspect-0.5.2}/LICENSE +0 -0
  53. {xspect-0.5.1 → xspect-0.5.2}/README.md +0 -0
  54. {xspect-0.5.1 → xspect-0.5.2}/docs/cli.md +0 -0
  55. {xspect-0.5.1 → xspect-0.5.2}/docs/index.md +0 -0
  56. {xspect-0.5.1 → xspect-0.5.2}/docs/quickstart.md +0 -0
  57. {xspect-0.5.1 → xspect-0.5.2}/docs/web.md +0 -0
  58. {xspect-0.5.1 → xspect-0.5.2}/mkdocs.yml +0 -0
  59. {xspect-0.5.1 → xspect-0.5.2}/setup.cfg +0 -0
  60. {xspect-0.5.1 → xspect-0.5.2}/src/XspecT.egg-info/dependency_links.txt +0 -0
  61. {xspect-0.5.1 → xspect-0.5.2}/src/XspecT.egg-info/entry_points.txt +0 -0
  62. {xspect-0.5.1 → xspect-0.5.2}/src/XspecT.egg-info/requires.txt +0 -0
  63. {xspect-0.5.1 → xspect-0.5.2}/src/XspecT.egg-info/top_level.txt +0 -0
  64. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/__init__.py +0 -0
  65. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/mlst_feature/__init__.py +0 -0
  66. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/models/__init__.py +0 -0
  67. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/.gitignore +0 -0
  68. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/README.md +0 -0
  69. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/components.json +0 -0
  70. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/dist/vite.svg +0 -0
  71. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/eslint.config.js +0 -0
  72. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/index.html +0 -0
  73. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/package-lock.json +0 -0
  74. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/package.json +0 -0
  75. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/pnpm-lock.yaml +0 -0
  76. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/public/vite.svg +0 -0
  77. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/assets/react.svg +0 -0
  78. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/classification-form.tsx +0 -0
  79. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/classify.tsx +0 -0
  80. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/data-table.tsx +0 -0
  81. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/dropdown-checkboxes.tsx +0 -0
  82. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/dropdown-slider.tsx +0 -0
  83. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/filter.tsx +0 -0
  84. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/header.tsx +0 -0
  85. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/landing.tsx +0 -0
  86. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/models-details.tsx +0 -0
  87. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/models.tsx +0 -0
  88. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/result-chart.tsx +0 -0
  89. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/spinner.tsx +0 -0
  90. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/accordion.tsx +0 -0
  91. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/button.tsx +0 -0
  92. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/card.tsx +0 -0
  93. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/chart.tsx +0 -0
  94. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/command.tsx +0 -0
  95. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/dialog.tsx +0 -0
  96. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/dropdown-menu.tsx +0 -0
  97. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/file-upload.tsx +0 -0
  98. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/form.tsx +0 -0
  99. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/input.tsx +0 -0
  100. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/label.tsx +0 -0
  101. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/navigation-menu.tsx +0 -0
  102. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/popover.tsx +0 -0
  103. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/select.tsx +0 -0
  104. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/separator.tsx +0 -0
  105. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/slider.tsx +0 -0
  106. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/switch.tsx +0 -0
  107. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/table.tsx +0 -0
  108. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/components/ui/tabs.tsx +0 -0
  109. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/index.css +0 -0
  110. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/lib/utils.ts +0 -0
  111. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/main.tsx +0 -0
  112. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/utils.tsx +0 -0
  113. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/src/vite-env.d.ts +0 -0
  114. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/tsconfig.app.json +0 -0
  115. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/tsconfig.json +0 -0
  116. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/tsconfig.node.json +0 -0
  117. {xspect-0.5.1 → xspect-0.5.2}/src/xspect/xspect-web/vite.config.ts +0 -0
  118. {xspect-0.5.1 → xspect-0.5.2}/tests/__init__.py +0 -0
  119. {xspect-0.5.1 → xspect-0.5.2}/tests/conftest.py +0 -0
  120. {xspect-0.5.1 → xspect-0.5.2}/tests/test_cli.py +0 -0
  121. {xspect-0.5.1 → xspect-0.5.2}/tests/test_model_management.py +0 -0
  122. {xspect-0.5.1 → xspect-0.5.2}/tests/test_model_result.py +0 -0
  123. {xspect-0.5.1 → xspect-0.5.2}/tests/test_ncbi.py +0 -0
  124. {xspect-0.5.1 → xspect-0.5.2}/tests/test_probabilistic_filter_model.py +0 -0
  125. {xspect-0.5.1 → xspect-0.5.2}/tests/test_probabilistic_filter_svm_model.py +0 -0
  126. {xspect-0.5.1 → xspect-0.5.2}/tests/test_probabilistic_single_filter_model.py +0 -0
  127. {xspect-0.5.1 → xspect-0.5.2}/tests/test_train.py +0 -0
@@ -24,9 +24,10 @@ jobs:
24
24
  run: |
25
25
  python -m pip install --upgrade pip
26
26
  pip install '.[test]'
27
- - name: Download models
27
+ - name: Download models and train MLST
28
28
  run: |
29
29
  xspect models download
30
+ yes 1 | xspect models train mlst
30
31
  - name: Test with pytest
31
32
  env:
32
33
  NCBI_API_KEY: ${{ secrets.NCBI_API_KEY }}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: XspecT
3
- Version: 0.5.1
3
+ Version: 0.5.2
4
4
  Summary: Tool to monitor and characterize pathogens using Bloom filters.
5
5
  License: MIT License
6
6
 
@@ -0,0 +1,95 @@
1
+ # Contributing to XspecT
2
+
3
+ ## Introduction
4
+ Thank you for your interest in contributing to XspecT! This page provides guidelines for contributing to the project, including how to set up your own development environment, the XspecT architecture, CI/CD, and the process for submitting contributions.
5
+
6
+ When contributing to XspecT, please follow these guidelines to ensure a smooth process:
7
+
8
+ - **Read the documentation**: Familiarize yourself with the project by reading the [documentation](https://bionf.github.io/XspecT2/), including the [Understanding XspecT](understanding.md) page and the [architecture overview](#architecture-overview).
9
+ - **Follow the coding standards**: Adhere to the project's coding standards and best practices. This includes using consistent naming conventions, writing clear and concise code, and documentation. Furthermore, please make sure your changes are aligned with the project's [architecture](#architecture-overview).
10
+ - **Write tests**: Ensure that your changes are covered by tests. We use [pytest](https://docs.pytest.org/en/stable/) for testing. If you add new features or fix bugs, please include tests to verify your changes.
11
+ - **Document your changes**: Update the documentation to reflect any new features or changes you make. This includes updating the README, Google-style docstrings, and the [Mkdocs](https://www.mkdocs.org)-based documentation.
12
+ - **Use clear commit messages**: When committing your changes, use clear and descriptive commit messages that explain the purpose of the changes.
13
+ - **Follow the pull request process**: When you're ready to submit your changes, follow the [pull request process](#pull-request-process) outlined below.
14
+
15
+ ## Development Installation
16
+ To set up XspecT for development, first make sure you have [Python](https://www.python.org/downloads/) and [Node.js](https://nodejs.org/en/download/) installed. Please note that XspecT is currently not supported in Windows or Alpine Linux environments, unless you build [COBS](https://github.com/aromberg/cobs) yourself.
17
+
18
+ Get started by cloning the repository:
19
+ ```bash
20
+ git clone https://github.com/BIONF/XspecT2.git
21
+ ```
22
+
23
+ You then need to build the web application using Vite. Navigate to the `xspect-web` directory and run the build command, which will also watch for changes:
24
+ ```bash
25
+ cd XspecT2/src/xspect/xspect-web
26
+ ```
27
+ ```bash
28
+ npx vite build --watch
29
+ ```
30
+
31
+ Finally, in a separate terminal, navigate to the root of the cloned repository and install the Python package in editable mode:
32
+ ```bash
33
+ pip install -e .
34
+ ```
35
+
36
+ By combining the two processes, you can develop both the frontend and backend simultaneously.
37
+
38
+ ## Architecture Overview
39
+ XspecT consists of a Python component (`src/xspect`) and a web application built with [Vite](https://vitejs.dev/) (`src/xspect/xspect-web`). The Python component provides the core functionality, including the command-line interface (CLI) and the backend API, while the web application provides a user-friendly interface for interacting with XspecT. Furthermore, tests for the Python component reside in the `tests/` directory, while documentation is provided in the `docs/` directory.
40
+
41
+ ### Python Component
42
+
43
+ The Python component of XspecT is structured as follows:
44
+
45
+ - `main.py`: The entry point for the command-line interface (CLI) and the backend API.
46
+ - `web.py`: The [FastAPI](https://fastapi.tiangolo.com/) application that serves the web interface and handles API requests.
47
+
48
+ The core functionality of XspecT is implemented using the following modules:
49
+
50
+ - `classify.py`: Contains methods to classify sequences based on previously trained XspecT models.
51
+ - `filter_sequences.py`: Contains methods to filter sequences based on classification results.
52
+ - `model_management.py`: Contains methods to manage XspecT models.
53
+ - `train.py`: Contains methods to train XspecT models based on user-provided data or data from the NCBI/PubMLST API.
54
+ - `download_models.py`: Contains methods to download pre-trained XspecT models.
55
+
56
+ In the background, these modules utilize model classes and a result class, which are defined in the `/models/` folder.
57
+
58
+ - `/models/probabilistic_filter_model.py`: Base class for probabilistic filter models, which uses COBS indices for classification and stores the model's metadata. Results from the classification are stored in a `ModelResult` class.
59
+ - `/models/probabilistic_filter_svm_model.py`: This class extends the base model class and implements a probabilistic filter model, in which classification scores are passed to a support vector machine (SVM) for a final prediction. This model is typically used for species-level classification.
60
+ - `/models/probabilistic_filter_mlst_model.py`: This class extends the base model class and implements multilocus sequence typing (MLST) by using multiple COBS indices.
61
+ - `/models/probabilistic_single_filter_model.py`: This class extends the base model class and implements a model that uses a single Bloom filter for classification. It is typically used for genus-level classification.
62
+ - `/models/result.py`: Contains the `ModelResult` class, which stores the results of a classification operation, including classification metadata, hits, and a prediction, if applicable.
63
+
64
+ Supplementary modules are documented in their respective files.
65
+
66
+ ### Web Application
67
+ The web application (`src/xspect/xspect-web`) is built using Vite, [Axios](https://axios-http.com/), [Tailwind CSS](https://tailwindcss.com/), and [shadcn/ui](https://ui.shadcn.com/). It provides a user-friendly interface for interacting with XspecT and includes the following main components:
68
+
69
+ - `src/api.tsx`: Contains the API client for making requests to the backend FastAPI application.
70
+ - `src/App.tsx`: The main application component that renders the user interface. It uses React Router for navigation and includes the main layout as well as routing logic.
71
+ - `src/assets/`: Contains static assets such as images and icons.
72
+ - `src/components/`: Contains reusable components for the user interface, such as buttons, forms, and modals.
73
+ - `src/components/ui/`: Contains UI components from shadcn/ui, which are used to build the user interface.
74
+ - `src/types.tsx`: Contains TypeScript type definitions for the application, including types for API responses.
75
+ - `vite.config.ts`: The Vite configuration file that defines how the web application is built and served. Also includes a configuration for the API proxy to the FastAPI backend.
76
+
77
+ ## Continuous Integration and Deployment
78
+ We use GitHub Actions to run checks on commits and pull requests. These checks include:
79
+
80
+ - **Code style and formatting**: Ensures that changes align with the project's code style. We use [Black](https://black.readthedocs.io/en/stable/) for Python code formatting.
81
+ - **Linting**: [Pylint](https://pylint.pycqa.org/en/latest/) is used for Python code linting. It checks for coding standards, potential errors, and code smells.
82
+ - **Tests**: Ensures that all tests pass. We use [pytest](https://docs.pytest.org/en/stable/) for testing.
83
+
84
+ Additionally, GitHub Actions is used for deployment:
85
+
86
+ - **Documentation**: The Mkdocs-based documentation is built and deployed to GitHub Pages on changes to the `main` branch. You can view the documentation at [https://bionf.github.io/XspecT2/](https://bionf.github.io/XspecT2/).
87
+ - **Python package**: The Python package is built and uploaded to PyPI when a new release is created. This allows users to easily install the latest version of XspecT using `pip install xspect`. Pre-releases are uploaded to TestPyPI and can be installed using `pip install --index-url https://test.pypi.org/simple/ xspect`.
88
+
89
+ ## Pull Request Process
90
+ Once you have made your changes and tested them, you can submit a pull request. Please follow these steps:
91
+
92
+ 1. Ensure your code is up to date with the `dev` branch
93
+ 2. Create a pull request with a clear description of your changes to the `dev` branch
94
+ 3. Address any feedback from reviewers
95
+ 4. Once approved, your changes will be merged
@@ -0,0 +1,24 @@
1
+ # Understanding XspecT
2
+
3
+ ## What is XspecT?
4
+
5
+ XspecT is a tool designed to monitor and characterize pathogens using exact pattern matching of kmers. It allows users to filter for pathogen sequences in metagenomic datasets, classify these sequences on a species level, and perform strain-level typing.
6
+
7
+ ## Key Features
8
+ - **Genus-Level Classification**: Classify sequences at the genus level, enabling researchers to quickly identify the presence of specific microbial groups.
9
+ - **Species-Level Classification**: Provides detailed classification of sequences at the species level, enhancing the understanding of microbial diversity.
10
+ - **Multi-Locus Sequence Typing (MLST)**: Offers the ability to type sequences at the strain level, which is crucial for understanding variations within species.
11
+ - **Filtering**: Classification results can be used to filter sequences, enabling analysis of metagenomic samples.
12
+ - **Model Management**: XspecT models can be easily downloaded or trained from scratch using the command line interface. Training is possible from local data as well as from the NCBI Datasets and PubMLST APIs.
13
+ - **User-friendly Interface**: In addition to the command line interface (CLI), a React-based web interface is available for easy interaction and visualization of results.
14
+ - **Works with Large Datasets**: Entire folders of input data can be passed to the tool, allowing for efficient processing of large datasets.
15
+
16
+ ## How XspecT Works
17
+ At its core, XspecT uses exact pattern matching of kmers to identify and classify sequences. The tool leverages indices of known pathogen sequences stored in XspecT models to match against input data. This process involves:
18
+
19
+ 1. **Kmer Extraction**: The input sequences are processed to extract kmers, which are short sequences of a fixed length.
20
+ 2. **Pattern Matching**: The extracted kmers are matched against an index of known sequences using exact matching algorithms. The number of matches is recorded and stored as hits.
21
+ 3. **Classification**: Based on hits, scores are calculated as the fraction of kmers that match known sequences. These scores are then used to classify the sequences at different taxonomic levels.
22
+
23
+ ### COBS Index
24
+ In order to store kmers in a space-efficient manner, XspecT uses a COBS ("Compact Bit-Sliced Signature Index") classic index. This index uses a probabilistic data structure to store kmers, allowing for efficient storage and retrieval. The COBS index is designed to handle large datasets while maintaining fast query performance. More information about the COBS index can be found in the [COBS research paper](https://arxiv.org/abs/1905.09624).
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "XspecT"
3
- version = "0.5.1"
3
+ version = "0.5.2"
4
4
  description = "Tool to monitor and characterize pathogens using Bloom filters."
5
5
  readme = {file = "README.md", content-type = "text/markdown"}
6
6
  license = {file = "LICENSE"}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: XspecT
3
- Version: 0.5.1
3
+ Version: 0.5.2
4
4
  Summary: Tool to monitor and characterize pathogens using Bloom filters.
5
5
  License: MIT License
6
6
 
@@ -54,8 +54,8 @@ src/xspect/xspect-web/tsconfig.node.json
54
54
  src/xspect/xspect-web/vite.config.ts
55
55
  src/xspect/xspect-web/dist/index.html
56
56
  src/xspect/xspect-web/dist/vite.svg
57
- src/xspect/xspect-web/dist/assets/index-CMG4V7fZ.js
58
- src/xspect/xspect-web/dist/assets/index-jIKg1HIy.css
57
+ src/xspect/xspect-web/dist/assets/index-Ceo58xui.css
58
+ src/xspect/xspect-web/dist/assets/index-Dt_UlbgE.js
59
59
  src/xspect/xspect-web/public/vite.svg
60
60
  src/xspect/xspect-web/src/App.tsx
61
61
  src/xspect/xspect-web/src/api.tsx
@@ -72,6 +72,7 @@ src/xspect/xspect-web/src/components/dropdown-checkboxes.tsx
72
72
  src/xspect/xspect-web/src/components/dropdown-slider.tsx
73
73
  src/xspect/xspect-web/src/components/filter-form.tsx
74
74
  src/xspect/xspect-web/src/components/filter.tsx
75
+ src/xspect/xspect-web/src/components/filtering-result.tsx
75
76
  src/xspect/xspect-web/src/components/header.tsx
76
77
  src/xspect/xspect-web/src/components/landing.tsx
77
78
  src/xspect/xspect-web/src/components/models-details.tsx
@@ -0,0 +1,80 @@
1
+ from pathlib import Path
2
+ from xspect.mlst_feature.mlst_helper import pick_scheme_from_models_dir
3
+ import xspect.model_management as mm
4
+ from xspect.models.probabilistic_filter_mlst_model import (
5
+ ProbabilisticFilterMlstSchemeModel,
6
+ )
7
+ from xspect.file_io import prepare_input_output_paths
8
+
9
+
10
def classify_genus(
    model_genus: str, input_path: Path, output_path: Path, step: int = 1
):
    """
    Run genus-level classification on a file or a directory of files.

    The genus model identified by ``model_genus`` is fetched through the
    model management module and applied to every input path resolved from
    ``input_path``. Each result is tagged with the name of the file it was
    computed from and saved to the output path resolved for that input.

    Args:
        model_genus (str): The genus model slug.
        input_path (Path): A sequence file, or a directory containing sequence files.
        output_path (Path): Where results are saved; the concrete path per input
            is resolved by the helper returned from ``prepare_input_output_paths``.
        step (int): The amount of kmers to be skipped; forwarded to the model's
            ``predict`` call.
    """
    genus_model = mm.get_genus_model(model_genus)
    sequence_files, resolve_output = prepare_input_output_paths(input_path)

    for position, sequence_file in enumerate(sequence_files):
        prediction = genus_model.predict(sequence_file, step=step)
        # Record which input produced this result before it is persisted.
        prediction.input_source = sequence_file.name
        destination = resolve_output(position, output_path)
        prediction.save(destination)
        print(f"Saved result as {destination.name}")
34
+
35
+
36
def classify_species(
    model_genus: str, input_path: Path, output_path: Path, step: int = 1
):
    """
    Run species-level classification on a file or a directory of files.

    The species model belonging to ``model_genus`` is fetched through the
    model management module and applied to every input path resolved from
    ``input_path``. Each result is tagged with the name of the file it was
    computed from and saved to the output path resolved for that input.

    Args:
        model_genus (str): The genus model slug.
        input_path (Path): A sequence file, or a directory containing sequence files.
        output_path (Path): Where results are saved; the concrete path per input
            is resolved by the helper returned from ``prepare_input_output_paths``.
        step (int): The amount of kmers to be skipped; forwarded to the model's
            ``predict`` call.
    """
    species_model = mm.get_species_model(model_genus)
    sequence_files, resolve_output = prepare_input_output_paths(input_path)

    for position, sequence_file in enumerate(sequence_files):
        prediction = species_model.predict(sequence_file, step=step)
        # Record which input produced this result before it is persisted.
        prediction.input_source = sequence_file.name
        destination = resolve_output(position, output_path)
        prediction.save(destination)
        print(f"Saved result as {destination.name}")
60
+
61
+
62
def classify_mlst(input_path: Path, output_path: Path, limit: bool):
    """
    Determine the strain type of sequences with an MLST scheme model.

    The scheme is chosen via ``pick_scheme_from_models_dir`` and the matching
    model is loaded from that path. Every input resolved from ``input_path``
    is then classified, tagged with its file name, and saved.

    Args:
        input_path (Path): A sequence file, or a directory containing sequence files.
        output_path (Path): Where results are saved; the concrete path per input
            is resolved by the helper returned from ``prepare_input_output_paths``.
        limit (bool): A limit for the highest allele_id results that are shown;
            forwarded to the model's ``predict`` call.
    """
    chosen_scheme = pick_scheme_from_models_dir()
    mlst_model = ProbabilisticFilterMlstSchemeModel.load(chosen_scheme)
    sequence_files, resolve_output = prepare_input_output_paths(input_path)

    for position, sequence_file in enumerate(sequence_files):
        # MLST prediction always uses a step of 1.
        prediction = mlst_model.predict(
            chosen_scheme, sequence_file, step=1, limit=limit
        )
        prediction.input_source = sequence_file.name
        destination = resolve_output(position, output_path)
        prediction.save(destination)
        print(f"Saved result as {destination.name}")
@@ -0,0 +1,90 @@
1
+ """This module contains definitions for the XspecT package."""
2
+
3
+ from pathlib import Path
4
+ from os import getcwd
5
+
6
+ fasta_endings = ["fasta", "fna", "fa", "ffn", "frn"]
7
+ fastq_endings = ["fastq", "fq"]
8
+
9
+
10
def get_xspect_root_path() -> Path:
    """
    Locate (or create) the XspecT data directory.

    Lookup order:

    1. ``xspect-data`` in the user's home directory, if it exists.
    2. ``xspect-data`` in the current working directory, if it exists.
    3. Otherwise ``xspect-data`` is created in the home directory.

    Returns:
        Path: The path to the XspecT data directory.
    """
    in_home = Path.home() / "xspect-data"
    if not in_home.exists():
        # The working directory is only consulted when the home-based
        # directory is missing.
        in_cwd = Path(getcwd()) / "xspect-data"
        if in_cwd.exists():
            return in_cwd
        in_home.mkdir(exist_ok=True, parents=True)
    return in_home
31
+
32
+
33
def get_xspect_model_path() -> Path:
    """
    Return the XspecT models directory, creating it when missing.

    The directory is the ``models`` folder inside the XspecT data directory.

    Returns:
        Path: The path to the XspecT models directory.
    """
    models_dir = Path(get_xspect_root_path(), "models")
    models_dir.mkdir(parents=True, exist_ok=True)
    return models_dir
46
+
47
+
48
def get_xspect_upload_path() -> Path:
    """
    Return the XspecT uploads directory, creating it when missing.

    The directory is the ``uploads`` folder inside the XspecT data directory.

    Returns:
        Path: The path to the XspecT uploads directory.
    """
    uploads_dir = Path(get_xspect_root_path(), "uploads")
    uploads_dir.mkdir(parents=True, exist_ok=True)
    return uploads_dir
61
+
62
+
63
def get_xspect_runs_path() -> Path:
    """
    Return the XspecT runs directory, creating it when missing.

    The directory is the ``runs`` folder inside the XspecT data directory.

    Returns:
        Path: The path to the XspecT runs directory.
    """
    runs_dir = Path(get_xspect_root_path(), "runs")
    runs_dir.mkdir(parents=True, exist_ok=True)
    return runs_dir
76
+
77
+
78
def get_xspect_mlst_path() -> Path:
    """
    Return the XspecT MLST directory, creating it when missing.

    The directory is the ``mlst`` folder inside the XspecT data directory.

    Returns:
        Path: The path to the XspecT MLST directory.
    """
    mlst_dir = Path(get_xspect_root_path(), "mlst")
    mlst_dir.mkdir(parents=True, exist_ok=True)
    return mlst_dir
@@ -8,8 +8,16 @@ import requests
8
8
  from xspect.definitions import get_xspect_model_path
9
9
 
10
10
 
11
- def download_test_models(url):
12
- """Download models."""
11
+ def download_test_models(url: str) -> None:
12
+ """
13
+ Download models from the specified URL.
14
+
15
+ This function downloads a zip file from the given URL, extracts its contents,
16
+ and copies the extracted files to the XspecT model directory.
17
+
18
+ Args:
19
+ url (str): The URL from which to download the models.
20
+ """
13
21
  with TemporaryDirectory() as tmp_dir:
14
22
  tmp_dir = Path(tmp_dir)
15
23
  download_path = tmp_dir / "models.zip"
@@ -0,0 +1,232 @@
1
+ """
2
+ File IO module.
3
+ """
4
+
5
+ from json import loads
6
+ import os
7
+ from pathlib import Path
8
+ import zipfile
9
+ from typing import Callable, Iterator
10
+ from Bio import SeqIO
11
+ from xspect.definitions import fasta_endings, fastq_endings
12
+
13
+
14
def delete_zip_files(dir_path) -> None:
    """
    Delete all zip files in the given directory.

    Every regular file directly inside the directory is checked with
    zipfile.is_zipfile and removed if it is a zip archive. Subdirectories
    are not traversed.

    Args:
        dir_path (Path | str): Path to the directory where zip files should be deleted.
    """
    # Work with full paths: a bare file name would be resolved against the
    # current working directory rather than against dir_path.
    for entry in Path(dir_path).iterdir():
        if entry.is_file() and zipfile.is_zipfile(entry):
            entry.unlink()
28
+
29
+
30
def extract_zip(zip_path: Path, unzipped_path: Path) -> None:
    """
    Extract every member of a zip archive into a directory.

    The target directory (and any missing parents) is created first, then
    the complete archive is unpacked into it.

    Args:
        zip_path (Path): Path to the zip file to be extracted.
        unzipped_path (Path): Path to the directory where the contents will be extracted.
    """
    unzipped_path.mkdir(exist_ok=True, parents=True)

    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(path=unzipped_path)
44
+
45
+
46
def get_record_iterator(file_path: Path) -> Iterator:
    """
    Return a record iterator for a fasta or fastq file.

    The file extension (without the leading dot) is looked up in
    fasta_endings and then in fastq_endings; the first match selects the
    format handed to Biopython's SeqIO.parse.

    Args:
        file_path (Path): Path to the fasta or fastq file.

    Returns:
        Iterator: An iterator over the records in the file.

    Raises:
        ValueError: If the file path is not a Path object, does not exist, is not a file,
            or has an invalid file format.
    """
    if not isinstance(file_path, Path):
        raise ValueError("Path must be a Path object")
    if not file_path.exists():
        raise ValueError("File does not exist")
    if not file_path.is_file():
        raise ValueError("Path must be a file")

    extension = file_path.suffix[1:]
    # fasta is checked before fastq, mirroring the lookup order of the endings.
    for seq_format, known_endings in (
        ("fasta", fasta_endings),
        ("fastq", fastq_endings),
    ):
        if extension in known_endings:
            return SeqIO.parse(file_path, seq_format)

    raise ValueError("Invalid file format, must be a fasta or fastq file")
79
+
80
+
81
def concatenate_species_fasta_files(
    input_folders: list[Path], output_directory: Path
) -> None:
    """
    Concatenate fasta files from different species into one file per species.

    For each species folder, all files whose extension is listed in
    fasta_endings are collected and written, one after another, into
    "<folder name>.fasta" inside the output directory. Files are processed in
    sorted path order, and a newline is inserted after any input that does
    not end with one so that records of consecutive files never run together.

    Args:
        input_folders (list[Path]): List of paths to species folders.
        output_directory (Path): Path to the output directory.

    Raises:
        ValueError: If a species folder contains no fasta files.
    """
    for species_folder in input_folders:
        species_name = species_folder.name
        # Sorted so the record order in the output is reproducible across runs.
        fasta_files = sorted(
            f for ending in fasta_endings for f in species_folder.glob(f"*.{ending}")
        )
        if not fasta_files:
            raise ValueError(f"no fasta files found in {species_folder}")

        concatenated_fasta = output_directory / f"{species_name}.fasta"
        with open(concatenated_fasta, "w", encoding="utf-8") as f:
            for fasta_file in fasta_files:
                last_line = ""
                # Stream line by line instead of loading whole genomes into memory.
                with open(fasta_file, "r", encoding="utf-8") as f_in:
                    for line in f_in:
                        f.write(line)
                        last_line = line
                # Without this, "ACGT" + ">next_header" would end up on one line.
                if last_line and not last_line.endswith("\n"):
                    f.write("\n")
109
+
110
+
111
def concatenate_metagenome(fasta_dir: Path, meta_path: Path) -> None:
    """
    Concatenate all fasta files in a directory into one file.

    All files in the directory whose extension is listed in fasta_endings are
    written, in sorted path order, into the output file. The output file itself
    is never used as an input, even if it lives in the same directory, and a
    newline is inserted after any input that does not end with one so that
    records of consecutive files never run together.

    Args:
        fasta_dir (Path): Path to the directory with the fasta files.
        meta_path (Path): Path to the output file.
    """
    output_resolved = meta_path.resolve()
    # Sorted for a reproducible record order; the output file is excluded so a
    # previous result stored next to the inputs is not read while being rewritten.
    fasta_files = sorted(
        file
        for ending in fasta_endings
        for file in fasta_dir.glob(f"*.{ending}")
        if file.resolve() != output_resolved
    )
    with open(meta_path, "w", encoding="utf-8") as meta_file:
        for fasta_file in fasta_files:
            last_line = ""
            # Stream line by line instead of loading whole files into memory.
            with open(fasta_file, "r", encoding="utf-8") as f_in:
                for line in f_in:
                    meta_file.write(line)
                    last_line = line
            # Without this, "ACGT" + ">next_header" would end up on one line.
            if last_line and not last_line.endswith("\n"):
                meta_file.write("\n")
129
+
130
+
131
def get_ncbi_dataset_accession_paths(
    ncbi_dataset_path: Path,
) -> dict[str, Path]:
    """
    Get the paths of the NCBI dataset accessions.

    This function reads "dataset_catalog.json" from the "ncbi_dataset/data"
    directory and returns a dictionary mapping each accession to the path of the
    first file listed for it. Catalog entries without an "accession" key (such as
    the data report entry) are skipped, wherever they appear in the catalog.

    Args:
        ncbi_dataset_path (Path): Path to the NCBI dataset directory.

    Returns:
        dict[str, Path]: Dictionary with the accession as key and the path as value.

    Raises:
        ValueError: If the "ncbi_dataset/data" directory does not exist.
    """
    data_path = ncbi_dataset_path / "ncbi_dataset" / "data"
    if not data_path.exists():
        raise ValueError(f"Path {data_path} does not exist.")

    with open(data_path / "dataset_catalog.json", "r", encoding="utf-8") as f:
        catalog = loads(f.read())

    accession_paths = {}
    for assembly in catalog["assemblies"]:
        # Identify the data report by its missing accession rather than by its
        # position, so no real assembly is dropped if the report is absent.
        accession = assembly.get("accession")
        if accession is None:
            continue
        accession_paths[accession] = data_path / assembly["files"][0]["filePath"]
    return accession_paths
162
+
163
+
164
def filter_sequences(
    input_file: Path,
    output_file: Path,
    included_ids: list[str],
) -> None:
    """
    Filter sequences by IDs from an input file and save them to an output file.

    This function reads a fasta or fastq file, keeps the records whose ID is
    contained in the provided IDs, and writes them to the output file. Matching
    records are always written in fasta format. If no IDs are provided, a
    message is printed and no output file is created.

    Args:
        input_file (Path): Path to the input file.
        output_file (Path): Path to the output file.
        included_ids (list[str]): List of IDs to include. If empty or None, no
            output file is created.
    """
    if not included_ids:
        print("No IDs provided, no output file will be created.")
        return

    # Build the lookup once: membership in a set is O(1) per record, whereas
    # scanning the list for every record is O(len(included_ids)).
    wanted_ids = set(included_ids)

    with open(output_file, "w", encoding="utf-8") as out_f:
        for record in get_record_iterator(input_file):
            if record.id in wanted_ids:
                SeqIO.write(record, out_f, "fasta")
190
+
191
+
192
def prepare_input_output_paths(
    input_path: Path,
) -> tuple[list[Path], Callable[[int, Path], Path]]:
    """
    Turn an input path into a list of input files and an output-path generator.

    A directory is expanded to all files directly inside it whose extension is
    listed in fasta_endings or fastq_endings; a single file is used as is. The
    returned function maps an input index and a base output path to the output
    path for that input: for directory input the 1-based index is appended to
    the file stem, for single-file input the base output path is returned
    unchanged.

    Args:
        input_path (Path): Path to the directory or file.

    Returns:
        tuple[list[Path], Callable[[int, Path], Path]]: A tuple containing:
            - A list of input file paths
            - A function that takes an index and the output path,
              and returns the processed output path.

    Raises:
        ValueError: If the input path is neither a directory nor a file.
    """
    is_directory = input_path.is_dir()
    patterns = [f"*.{ending}" for ending in fasta_endings + fastq_endings]

    if is_directory:
        collected: list[Path] = []
        for pattern in patterns:
            collected.extend(input_path.glob(pattern))
    elif input_path.is_file():
        collected = [input_path]
    else:
        raise ValueError("Invalid input path")

    def get_output_path(idx: int, output_path: Path) -> Path:
        # A single input file keeps the requested output path untouched.
        if not is_directory:
            return output_path
        numbered_name = f"{output_path.stem}_{idx+1}{output_path.suffix}"
        return output_path.parent / numbered_name

    return collected, get_output_path