XspecT 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of XspecT might be problematic. Click here for more details.

Files changed (58)
  1. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/METADATA +23 -29
  2. XspecT-0.2.0.dist-info/RECORD +30 -0
  3. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/WHEEL +1 -1
  4. xspect/definitions.py +42 -0
  5. xspect/download_filters.py +11 -26
  6. xspect/fastapi.py +101 -0
  7. xspect/file_io.py +34 -103
  8. xspect/main.py +70 -66
  9. xspect/model_management.py +88 -0
  10. xspect/models/__init__.py +0 -0
  11. xspect/models/probabilistic_filter_model.py +277 -0
  12. xspect/models/probabilistic_filter_svm_model.py +169 -0
  13. xspect/models/probabilistic_single_filter_model.py +109 -0
  14. xspect/models/result.py +148 -0
  15. xspect/pipeline.py +201 -0
  16. xspect/run.py +38 -0
  17. xspect/train.py +304 -0
  18. xspect/train_filter/create_svm.py +6 -183
  19. xspect/train_filter/extract_and_concatenate.py +117 -121
  20. xspect/train_filter/html_scrap.py +16 -28
  21. xspect/train_filter/ncbi_api/download_assemblies.py +7 -8
  22. xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +9 -17
  23. xspect/train_filter/ncbi_api/ncbi_children_tree.py +3 -2
  24. xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +7 -5
  25. XspecT-0.1.3.dist-info/RECORD +0 -49
  26. xspect/BF_v2.py +0 -637
  27. xspect/Bootstrap.py +0 -29
  28. xspect/Classifier.py +0 -142
  29. xspect/OXA_Table.py +0 -53
  30. xspect/WebApp.py +0 -724
  31. xspect/XspecT_mini.py +0 -1363
  32. xspect/XspecT_trainer.py +0 -611
  33. xspect/map_kmers.py +0 -155
  34. xspect/search_filter.py +0 -504
  35. xspect/static/How-To.png +0 -0
  36. xspect/static/Logo.png +0 -0
  37. xspect/static/Logo2.png +0 -0
  38. xspect/static/Workflow_AspecT.png +0 -0
  39. xspect/static/Workflow_ClAssT.png +0 -0
  40. xspect/static/js.js +0 -615
  41. xspect/static/main.css +0 -280
  42. xspect/templates/400.html +0 -64
  43. xspect/templates/401.html +0 -62
  44. xspect/templates/404.html +0 -62
  45. xspect/templates/500.html +0 -62
  46. xspect/templates/about.html +0 -544
  47. xspect/templates/home.html +0 -51
  48. xspect/templates/layoutabout.html +0 -87
  49. xspect/templates/layouthome.html +0 -63
  50. xspect/templates/layoutspecies.html +0 -468
  51. xspect/templates/species.html +0 -33
  52. xspect/train_filter/README_XspecT_Erweiterung.md +0 -119
  53. xspect/train_filter/get_paths.py +0 -35
  54. xspect/train_filter/interface_XspecT.py +0 -204
  55. xspect/train_filter/k_mer_count.py +0 -162
  56. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/LICENSE +0 -0
  57. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/entry_points.txt +0 -0
  58. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: XspecT
3
- Version: 0.1.3
3
+ Version: 0.2.0
4
4
  Summary: Tool to monitor and characterize pathogens using Bloom filters.
5
5
  License: MIT License
6
6
 
@@ -32,25 +32,19 @@ Classifier: License :: OSI Approved :: MIT License
32
32
  Requires-Python: >=3.10
33
33
  Description-Content-Type: text/markdown
34
34
  License-File: LICENSE
35
- Requires-Dist: Flask
36
- Requires-Dist: Flask-WTF
37
- Requires-Dist: WTForms
38
- Requires-Dist: Werkzeug
39
35
  Requires-Dist: biopython
40
- Requires-Dist: bitarray
41
- Requires-Dist: mmh3
42
- Requires-Dist: numpy
43
- Requires-Dist: pandas
44
36
  Requires-Dist: requests
45
37
  Requires-Dist: scikit-learn
46
- Requires-Dist: Psutil
47
- Requires-Dist: Matplotlib
48
- Requires-Dist: Pympler
49
- Requires-Dist: H5py
50
38
  Requires-Dist: Bio
51
- Requires-Dist: wheel
52
39
  Requires-Dist: loguru
53
40
  Requires-Dist: click
41
+ Requires-Dist: python-slugify
42
+ Requires-Dist: cobs-reloaded
43
+ Requires-Dist: rbloom
44
+ Requires-Dist: xxhash
45
+ Requires-Dist: fastapi
46
+ Requires-Dist: uvicorn
47
+ Requires-Dist: python-multipart
54
48
  Provides-Extra: docs
55
49
  Requires-Dist: sphinx ; extra == 'docs'
56
50
  Requires-Dist: furo ; extra == 'docs'
@@ -62,9 +56,13 @@ Requires-Dist: pytest ; extra == 'test'
62
56
  Requires-Dist: pytest-cov ; extra == 'test'
63
57
 
64
58
  # XspecT - Acinetobacter Species Assignment Tool
65
- <img src="/src/xspect/static/Logo.png" height="50%" width="50%">
59
+ ![Test](https://github.com/bionf/xspect2/actions/workflows/test.yml/badge.svg)
60
+ [![linting: pylint](https://img.shields.io/badge/linting-pylint-yellowgreen)](https://github.com/pylint-dev/pylint)
61
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
62
+
63
+ <img src="/src/docs/img/logo.png" height="50%" width="50%">
66
64
  <!-- start intro -->
67
- XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or sub-type level using [Bloom Filters](https://en.wikipedia.org/wiki/Bloom_filter) and a [Support Vector Machine](https://en.wikipedia.org/wiki/Support-vector_machine). It also identifies existing [blaOxa-genes](https://en.wikipedia.org/wiki/Beta-lactamase#OXA_beta-lactamases_(class_D)) and provides a list of relevant research papers for further information.
65
+ XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or sub-type level using [Bloom Filters] and a [Support Vector Machine]. It also identifies existing [blaOxa-genes] and provides a list of relevant research papers for further information.
68
66
  <br/><br/>
69
67
 
70
68
  XspecT utilizes the uniqueness of kmers and compares extracted kmers from the input-data to a reference database. Bloom Filters ensure a fast lookup in this process. For a final prediction, the results are classified using a Support Vector Machine.
@@ -74,6 +72,10 @@ Local extensions of the reference database are supported.
74
72
  <br/>
75
73
 
76
74
  The tool is available as a web-based application and a smaller command line interface.
75
+
76
+ [Bloom Filters]: https://en.wikipedia.org/wiki/Bloom_filter
77
+ [Support Vector Machine]: https://en.wikipedia.org/wiki/Support-vector_machine
78
+ [blaOxa-genes]: https://en.wikipedia.org/wiki/Beta-lactamase#OXA_beta-lactamases_(class_D)
77
79
  <!-- end intro -->
78
80
 
79
81
  <!-- start quickstart -->
@@ -82,11 +84,7 @@ To install Xspect, please download the latest 64 bit Python version and install
82
84
  ```
83
85
  pip install xspect
84
86
  ```
85
- If you would like to train filters yourself, you need to install Jellyfish, which is used to count distinct k-meres in the assemblies. It can be installed using bioconda:
86
- ```
87
- conda install -c bioconda jellyfish
88
- ```
89
- On Apple Silicon, it is possible that this command installs an incorrect Jellyfish package. Please refer to the official [Jellyfish project](https://github.com/gmarcais/Jellyfish) for installation guidance.
87
+ Please note that Apple Silicon is currently not supported.
90
88
 
91
89
  ## Usage
92
90
  ### Get the Bloomfilters
@@ -100,9 +98,9 @@ xspect train you-ncbi-genus-name
100
98
  ```
101
99
 
102
100
  ### How to run the web app
103
- Run the following command lines in a console, a browser window will open automatically after the application is fully loaded.
101
+ To run the web app, install and run [XspecT Web](https://github.com/aromberg/xspect-web). Additionally, run XspecT in API mode:
104
102
  ```
105
- xspect web
103
+ xspect api
106
104
  ```
107
105
 
108
106
  ### How to use the XspecT command line interface
@@ -110,13 +108,9 @@ Run xspect with the configuration you want to run it with as arguments.
110
108
  ```
111
109
  xspect classify your-genus path/to/your/input-set
112
110
  ```
113
- For further instructions on how to use the command line interface, execute:
111
+ For further instructions on how to use the command line interface, please refer to the [documentation] or execute:
114
112
  ```
115
113
  xspect --help
116
114
  ```
115
+ [documentation]: https://bionf.github.io/XspecT2/cli.html
117
116
  <!-- end quickstart -->
118
-
119
- ## Input Data
120
- XspecT is able to use either raw sequence-reads (FASTQ-format .fq/.fastq) or already assembled genomes (FASTA-format .fasta/.fna). Using sequence-reads saves up the assembly process but high-quality reads with a low error-rate are needed (e.g. Illumina-reads).
121
-
122
- The amount of reads that will be used has to be set by the user when using sequence-reads. The minimum amount is 5000 reads for species classification and 500 reads for sub-type classification. The maximum number of reads is limited by the browser and is usually around ~8 million reads. Using more reads will lead to a increased runtime (xsec./1mio reads).
@@ -0,0 +1,30 @@
1
+ xspect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ xspect/definitions.py,sha256=gg6NvT8ypNzlnJvMMo3nHsyh8DHFFu41lOfnILkRDpE,1215
3
+ xspect/download_filters.py,sha256=ByE7Oggx-AyJ02Wirk_wcJHNdRDrJMfjwhmUe5tgWbE,741
4
+ xspect/fastapi.py,sha256=UuUr3eQUL0tCcB2d_ZKMToqreNLSNRKpCKK3-lwAzVo,3208
5
+ xspect/file_io.py,sha256=zKhl6Fd9KZAYiD8YgIyje5TbDYk5lxMp1WUrNkGSBo8,2779
6
+ xspect/main.py,sha256=rFoHKBC9UANlZh3TccZAJbOZ6023BnQaGEoPjjJjW0A,3572
7
+ xspect/model_management.py,sha256=w0aqjLUoixCokyKTYrcN1vih5IoLYLJG9p8aeYdVc8Y,3560
8
+ xspect/pipeline.py,sha256=h7duhVZ-hupwO_KQPstzFo8KMfMI2yleb9HmtTiMjic,7219
9
+ xspect/run.py,sha256=OJ7pCFqva3AhIYklKjVnqWGooVRO7S3b56kIAy-xabY,1189
10
+ xspect/train.py,sha256=khC1lldqfr4NvzLUiSJjSlh7DBG1ePielvQMiB29Hl8,10399
11
+ xspect/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ xspect/models/probabilistic_filter_model.py,sha256=ImyNRzR7jf2CBPGI65ItG0_eYmrQjo9soQYlsM0r-P0,9829
13
+ xspect/models/probabilistic_filter_svm_model.py,sha256=9Q4SBAzgbqATpS2E3IoardPpBwqkyrYSnrMwh0zwSag,5420
14
+ xspect/models/probabilistic_single_filter_model.py,sha256=nDAd_-_Ci2eH0KOJtf4wA-w63FMq9rGSR1LGiIA-gdw,3884
15
+ xspect/models/result.py,sha256=vHUEFXvbFyB8WmasXp99IrztjwaxH1f9QMFiRUPe40Q,4824
16
+ xspect/train_filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
+ xspect/train_filter/create_svm.py,sha256=w6gq40yHINVfNzLhJfYFykUaNCwpU9AEDcbkUfis3DY,1504
18
+ xspect/train_filter/extract_and_concatenate.py,sha256=lLrczGgfZi2vAGqxq8fcEmJi5pvqyK33JkB_ZoCNYG8,4840
19
+ xspect/train_filter/html_scrap.py,sha256=76VV_ZbvD2I3IxRb62SiQwRPu2tr4fwn1HkfJQYaosM,3809
20
+ xspect/train_filter/ncbi_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
+ xspect/train_filter/ncbi_api/download_assemblies.py,sha256=MB_mxSjCTL05DqIt1WQem8AGU3PjtJnzPndeI9J-AOI,1285
22
+ xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py,sha256=puzDIws-yyBAEHwSAIYUM7g8FpLFmvOKh5xH1EsY8ZE,3830
23
+ xspect/train_filter/ncbi_api/ncbi_children_tree.py,sha256=_8puOsnsKp5lsMV2gZY1ijkfD_BZKG9eXZCX09qph5E,1819
24
+ xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py,sha256=O6JDXC4E6AYaf7NPnb34eSJyZhMB8r--bjoVF_ZsEdA,1868
25
+ XspecT-0.2.0.dist-info/LICENSE,sha256=bhBGDKIRUVwYIHGOGO5hshzuVHyqFJajvSOA3XXOLKI,1094
26
+ XspecT-0.2.0.dist-info/METADATA,sha256=efT3SkWV55firuZJh1gHCN7061Fxda7teuFLeZHvJQ0,4826
27
+ XspecT-0.2.0.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
28
+ XspecT-0.2.0.dist-info/entry_points.txt,sha256=L7qliX3pIuwupQxpuOSsrBJCSHYPOPNEzH8KZKQGGUw,43
29
+ XspecT-0.2.0.dist-info/top_level.txt,sha256=hdoa4cnBv6OVzpyhMmyxpJxEydH5n2lDciy8urc1paE,7
30
+ XspecT-0.2.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.42.0)
2
+ Generator: setuptools (73.0.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
xspect/definitions.py ADDED
@@ -0,0 +1,42 @@
1
+ """This module contains definitions for the XspecT package."""
2
+
3
+ from pathlib import Path
4
+ from os import getcwd
5
+
6
+ fasta_endings = ["fasta", "fna", "fa", "ffn", "frn"]
7
+ fastq_endings = ["fastq", "fq"]
8
+
9
+
10
+ def get_xspect_root_path():
11
+ """Return the root path for XspecT data."""
12
+ root_path = Path(getcwd()) / "xspect-data"
13
+ root_path.mkdir(exist_ok=True, parents=True)
14
+ return root_path
15
+
16
+
17
+ def get_xspect_model_path():
18
+ """Return the path to the XspecT models."""
19
+ model_path = get_xspect_root_path() / "models"
20
+ model_path.mkdir(exist_ok=True, parents=True)
21
+ return model_path
22
+
23
+
24
+ def get_xspect_tmp_path():
25
+ """Return the path to the XspecT temporary files."""
26
+ tmp_path = get_xspect_root_path() / "tmp"
27
+ tmp_path.mkdir(exist_ok=True, parents=True)
28
+ return tmp_path
29
+
30
+
31
+ def get_xspect_upload_path():
32
+ """Return the path to the XspecT upload directory."""
33
+ upload_path = get_xspect_root_path() / "uploads"
34
+ upload_path.mkdir(exist_ok=True, parents=True)
35
+ return upload_path
36
+
37
+
38
+ def get_xspect_runs_path():
39
+ """Return the path to the XspecT runs directory."""
40
+ runs_path = get_xspect_root_path() / "runs"
41
+ runs_path.mkdir(exist_ok=True, parents=True)
42
+ return runs_path
@@ -4,45 +4,30 @@ import os
4
4
  import shutil
5
5
  import requests
6
6
 
7
+ from xspect.definitions import get_xspect_model_path, get_xspect_tmp_path
8
+
7
9
 
8
10
  def download_test_filters(url):
9
11
  """Download filters."""
10
12
 
11
- if not os.path.exists("filter"):
12
- os.makedirs("filter")
13
-
14
- if not os.path.exists("Training_data"):
15
- os.makedirs("Training_data")
13
+ download_path = get_xspect_tmp_path() / "models.zip"
14
+ extract_path = get_xspect_tmp_path() / "extracted_models"
16
15
 
17
16
  r = requests.get(url, allow_redirects=True, timeout=10)
18
- with open("filter/filters.zip", "wb") as f:
17
+ with open(download_path, "wb") as f:
19
18
  f.write(r.content)
20
19
 
21
20
  shutil.unpack_archive(
22
- "filter/filters.zip",
23
- "filter/temp",
21
+ download_path,
22
+ extract_path,
24
23
  "zip",
25
24
  )
26
25
 
27
26
  shutil.copytree(
28
- "filter/temp/filters/Training_data",
29
- "Training_data",
27
+ extract_path,
28
+ get_xspect_model_path(),
30
29
  dirs_exist_ok=True,
31
30
  )
32
31
 
33
- shutil.rmtree("filter/temp/filters/Training_data")
34
-
35
- shutil.copytree(
36
- "filter/temp/filters",
37
- "filter",
38
- dirs_exist_ok=True,
39
- )
40
-
41
- shutil.rmtree("filter/temp")
42
-
43
- os.remove("filter/filters.zip")
44
-
45
- saved_options = ["Salmonella"]
46
- with open("saved_options.txt", "w") as f:
47
- for item in saved_options:
48
- f.write("%s\n" % item)
32
+ os.remove(download_path)
33
+ shutil.rmtree(extract_path)
xspect/fastapi.py ADDED
@@ -0,0 +1,101 @@
1
+ """FastAPI application for XspecT."""
2
+
3
+ import datetime
4
+ from pathlib import Path
5
+ from shutil import copyfileobj
6
+ from fastapi import FastAPI, UploadFile, BackgroundTasks
7
+ from xspect.definitions import get_xspect_runs_path, get_xspect_upload_path
8
+ from xspect.download_filters import download_test_filters
9
+ import xspect.model_management as mm
10
+ from xspect.models.result import StepType
11
+ from xspect.pipeline import ModelExecution, Pipeline, PipelineStep
12
+ from xspect.train import train_ncbi
13
+
14
+ app = FastAPI()
15
+
16
+
17
+ @app.get("/download-filters")
18
+ def download_filters():
19
+ """Download filters."""
20
+ download_test_filters("https://xspect2.s3.eu-central-1.amazonaws.com/models.zip")
21
+
22
+
23
+ @app.get("/classify")
24
+ def classify(genus: str, file: str, meta: bool = False, step: int = 500):
25
+ """Classify uploaded sample."""
26
+
27
+ path = get_xspect_upload_path() / file
28
+
29
+ pipeline = Pipeline(genus + " classification", "Test Author", "test@example.com")
30
+ species_execution = ModelExecution(genus + "-species", sparse_sampling_step=step)
31
+ if meta:
32
+ species_filtering_step = PipelineStep(
33
+ StepType.FILTERING, genus, 0.7, species_execution
34
+ )
35
+ genus_execution = ModelExecution(genus + "-genus", sparse_sampling_step=step)
36
+ genus_execution.add_pipeline_step(species_filtering_step)
37
+ pipeline.add_pipeline_step(genus_execution)
38
+ else:
39
+ pipeline.add_pipeline_step(species_execution)
40
+
41
+ run = pipeline.run(Path(path))
42
+ time_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
43
+ save_path = get_xspect_runs_path() / f"run_{time_str}.json"
44
+ run.save(save_path)
45
+
46
+ return run.to_dict()
47
+
48
+
49
+ @app.post("/train")
50
+ def train(genus: str, background_tasks: BackgroundTasks, svm_steps: int = 1):
51
+ """Train NCBI model."""
52
+ background_tasks.add_task(train_ncbi, genus, svm_steps)
53
+
54
+ return {"message": "Training started."}
55
+
56
+
57
+ @app.get("/list-models")
58
+ def list_models():
59
+ """List available models."""
60
+ return mm.get_models()
61
+
62
+
63
+ @app.get("/model-metadata")
64
+ def get_model_metadata(model_slug: str):
65
+ """Get metadata of a model."""
66
+ return mm.get_model_metadata(model_slug)
67
+
68
+
69
+ @app.post("/model-metadata")
70
+ def post_model_metadata(model_slug: str, author: str, author_email: str):
71
+ """Update metadata of a model."""
72
+ try:
73
+ mm.update_model_metadata(model_slug, author, author_email)
74
+ except ValueError as e:
75
+ return {"error": str(e)}
76
+ return {"message": "Metadata updated."}
77
+
78
+
79
+ @app.post("/model-display-name")
80
+ def post_model_display_name(model_slug: str, filter_id: str, display_name: str):
81
+ """Update display name of a filter in a model."""
82
+ try:
83
+ mm.update_model_display_name(model_slug, filter_id, display_name)
84
+ except ValueError as e:
85
+ return {"error": str(e)}
86
+ return {"message": "Display name updated."}
87
+
88
+
89
+ @app.post("/upload-file")
90
+ def upload_file(file: UploadFile):
91
+ """Upload file to the server."""
92
+ upload_path = get_xspect_upload_path() / file.filename
93
+
94
+ if not upload_path.exists():
95
+ try:
96
+ with upload_path.open("wb") as buffer:
97
+ copyfileobj(file.file, buffer)
98
+ finally:
99
+ file.file.close()
100
+
101
+ return {"filename": file.filename}
xspect/file_io.py CHANGED
@@ -2,107 +2,11 @@
2
2
  File IO module.
3
3
  """
4
4
 
5
- from linecache import getline
6
5
  import os
7
6
  from pathlib import Path
8
7
  import zipfile
9
-
10
- from loguru import logger
11
-
12
-
13
- def check_folder_structure():
14
- """Checks the folder structure and creates new folders if needed."""
15
- # Create list of all folder paths.
16
- root_path = Path(os.getcwd())
17
- filter_path = root_path / "filter"
18
- meta_path = root_path / "genus_metadata"
19
- filter_folder_names = [
20
- "array_sizes",
21
- "Metagenomes",
22
- "species_names",
23
- "translation_dicts",
24
- ]
25
- folder_paths = [filter_path, meta_path]
26
- for filter_folder_name in filter_folder_names:
27
- filter_folder_path = filter_path / filter_folder_name
28
- folder_paths.append(filter_folder_path)
29
-
30
- # Check if folders exist. If not create them.
31
- for folder_path in folder_paths:
32
- if not os.path.isdir(folder_path):
33
- os.mkdir(folder_path)
34
-
35
-
36
- def delete_non_fasta(files):
37
- """Delete all non fasta files from the list and return the list without those file names.
38
-
39
- :param files: List of file names.
40
- :type files: list[str]
41
- :return: List with only fasta files.
42
- """
43
- # All possible fasta file endings.
44
- fasta_endings = ["fasta", "fna", "fa", "ffn", "frn"]
45
-
46
- # Iterate through file list backwards and delete all non fasta files.
47
- for i in range(len(files) - 1, -1, -1):
48
- file = files[i].split(".")
49
- if file[-1] in fasta_endings:
50
- continue
51
- else:
52
- del files[i]
53
-
54
- return files
55
-
56
-
57
- def get_accessions(file_names: list[str]) -> list[str]:
58
- """Extract accessions from file names.
59
-
60
- :param files: List of file names.
61
- :type files: list[str]
62
- :return: List of all accessions.
63
- :rtype: list[str]
64
- """
65
- accessions = []
66
- for idx, file in enumerate(file_names):
67
- accessions.append(file.split("_"))
68
- accessions[idx] = accessions[idx][0] + "_" + accessions[idx][1]
69
-
70
- return accessions
71
-
72
-
73
- def get_file_paths(base_path: Path, file_names: list[str]) -> list[Path]:
74
- """Make a list with the paths to the files.
75
-
76
- :param base_path: Path of the parent directory.
77
- :type base_path: Path
78
- :param files: List of file names.
79
- :type files: list[str]
80
- :return: A list with all file paths.
81
- :rtype: list[Path]
82
- """
83
- return [base_path / file for file in file_names]
84
-
85
-
86
- def get_species_names(file_paths: list[Path]):
87
- """Extracts the species names.
88
-
89
- :param file_paths: List with the file paths.
90
- :type file_paths: list[Path]
91
- :return: List with all species names.
92
- """
93
- names = list()
94
- for path in file_paths:
95
- header = getline(str(path), 1)
96
- name = header.replace("\n", "").replace(">", "")
97
- if not name.isdigit():
98
- logger.error(
99
- "The header of file: {path} does not contain a correct ID: {name}. The ID needs to be "
100
- "just numbers"
101
- )
102
- logger.error("Aborting")
103
- exit()
104
- names.append(name)
105
- return names
8
+ from Bio import SeqIO
9
+ from xspect.definitions import fasta_endings, fastq_endings
106
10
 
107
11
 
108
12
  def delete_zip_files(dir_path):
@@ -129,7 +33,7 @@ def extract_zip(zip_path, unzipped_path):
129
33
 
130
34
 
131
35
  def concatenate_meta(path: Path, genus: str):
132
- """Concatenates all concatenated fasta files that are used to train bloomfilters to one fasta file.
36
+ """Concatenates all species files to one fasta file.
133
37
 
134
38
  :param path: Path to the directory with the concatenated fasta files.
135
39
  :type path: Path
@@ -137,20 +41,47 @@ def concatenate_meta(path: Path, genus: str):
137
41
  :type genus: str
138
42
  """
139
43
  files_path = path / "concatenate"
140
- fasta_endings = ["fasta", "fna", "fa", "ffn", "frn"]
141
44
  meta_path = path / (genus + ".fasta")
142
45
  files = os.listdir(files_path)
143
46
 
144
- with open(meta_path, "w") as meta_file:
47
+ with open(meta_path, "w", encoding="utf-8") as meta_file:
145
48
  # Write the header.
146
49
  meta_header = f">{genus} metagenome\n"
147
50
  meta_file.write(meta_header)
148
51
 
149
52
  # Open each concatenated species file and write the sequence in the meta file.
150
53
  for file in files:
151
- file_ending = str(file).split(".")[-1]
54
+ file_ending = str(file).rsplit(".", maxsplit=1)[-1]
152
55
  if file_ending in fasta_endings:
153
- with open((files_path / str(file)), "r") as species_file:
56
+ with open(
57
+ (files_path / str(file)), "r", encoding="utf-8"
58
+ ) as species_file:
154
59
  for line in species_file:
155
60
  if line[0] != ">":
156
61
  meta_file.write(line.replace("\n", ""))
62
+
63
+
64
+ def get_record_iterator(file_path: Path):
65
+ """Returns a record iterator for a fasta or fastq file."""
66
+ if not isinstance(file_path, Path):
67
+ raise ValueError("Path must be a Path object")
68
+
69
+ if not file_path.exists():
70
+ raise ValueError("File does not exist")
71
+
72
+ if not file_path.is_file():
73
+ raise ValueError("Path must be a file")
74
+
75
+ if file_path.suffix[1:] in fasta_endings:
76
+ return SeqIO.parse(file_path, "fasta")
77
+
78
+ if file_path.suffix[1:] in fastq_endings:
79
+ return SeqIO.parse(file_path, "fastq")
80
+
81
+ raise ValueError("Invalid file format, must be a fasta or fastq file")
82
+
83
+
84
+ def get_records_by_id(file: Path, ids: list[str]):
85
+ """Return records with the specified ids."""
86
+ records = get_record_iterator(file)
87
+ return [record for record in records if record.id in ids]