q2-eplacer 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- q2_eplacer-0.1.1/LICENSE.txt +5 -0
- q2_eplacer-0.1.1/PKG-INFO +215 -0
- q2_eplacer-0.1.1/README.md +184 -0
- q2_eplacer-0.1.1/pyproject.toml +77 -0
- q2_eplacer-0.1.1/q2_eplacer/__init__.py +6 -0
- q2_eplacer-0.1.1/q2_eplacer/_formats.py +80 -0
- q2_eplacer-0.1.1/q2_eplacer/_methods.py +472 -0
- q2_eplacer-0.1.1/q2_eplacer/_types.py +7 -0
- q2_eplacer-0.1.1/q2_eplacer/_version.py +1 -0
- q2_eplacer-0.1.1/q2_eplacer/citations.bib +6 -0
- q2_eplacer-0.1.1/q2_eplacer/plugin_setup.py +133 -0
- q2_eplacer-0.1.1/q2_eplacer/tests/__init__.py +8 -0
- q2_eplacer-0.1.1/q2_eplacer/tests/data/alignedSeqs.qza +0 -0
- q2_eplacer-0.1.1/q2_eplacer/tests/data/full_taxonomy.tsv +17 -0
- q2_eplacer-0.1.1/q2_eplacer/tests/data/geoData.tsv +17 -0
- q2_eplacer-0.1.1/q2_eplacer/tests/data/geoData_run.tsv +3 -0
- q2_eplacer-0.1.1/q2_eplacer/tests/data/seqs.qza +0 -0
- q2_eplacer-0.1.1/q2_eplacer/tests/data/testfasta.fa +6 -0
- q2_eplacer-0.1.1/q2_eplacer/tests/data/testfasta.qza +0 -0
- q2_eplacer-0.1.1/q2_eplacer/tests/test_methods.py +113 -0
- q2_eplacer-0.1.1/q2_eplacer.egg-info/PKG-INFO +215 -0
- q2_eplacer-0.1.1/q2_eplacer.egg-info/SOURCES.txt +25 -0
- q2_eplacer-0.1.1/q2_eplacer.egg-info/dependency_links.txt +1 -0
- q2_eplacer-0.1.1/q2_eplacer.egg-info/entry_points.txt +2 -0
- q2_eplacer-0.1.1/q2_eplacer.egg-info/requires.txt +4 -0
- q2_eplacer-0.1.1/q2_eplacer.egg-info/top_level.txt +1 -0
- q2_eplacer-0.1.1/setup.cfg +4 -0
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
Software code created by U.S. Government employees is not subject to copyright in the United States (17 U.S.C. §105).
|
|
2
|
+
The United States/Department of Commerce reserve all rights to seek and obtain copyright protection in countries other
|
|
3
|
+
than the United States for Software authored in its entirety by the Department of Commerce. To this end, the Department
|
|
4
|
+
of Commerce hereby grants to Recipient a royalty-free, nonexclusive license to use, copy, and create derivative works of
|
|
5
|
+
the Software outside of the United States.
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: q2-eplacer
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: ASV classifier with deep-learning and biogeography
|
|
5
|
+
Author-email: Christopher Powers <christopher.powers@noaa.gov>
|
|
6
|
+
License: Software code created by U.S. Government employees is not subject to copyright in the United States (17 U.S.C. §105).
|
|
7
|
+
The United States/Department of Commerce reserve all rights to seek and obtain copyright protection in countries other
|
|
8
|
+
than the United States for Software authored in its entirety by the Department of Commerce. To this end, the Department
|
|
9
|
+
of Commerce hereby grants to Recipient a royalty-free, nonexclusive license to use, copy, and create derivative works of
|
|
10
|
+
the Software outside of the United States.
|
|
11
|
+
Project-URL: Homepage, https://github.com/NEFSC/PEMAD-PBB-q2-ePlacer
|
|
12
|
+
Project-URL: Repository, https://github.com/NEFSC/PEMAD-PBB-q2-ePlacer
|
|
13
|
+
Project-URL: Bug Tracker, https://github.com/NEFSC/PEMAD-PBB-q2-ePlacer/issues
|
|
14
|
+
Keywords: qiime2,microbiome,taxonomy,deep-learning,biogeography
|
|
15
|
+
Classifier: Development Status :: 4 - Beta
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE.txt
|
|
26
|
+
Requires-Dist: qiime2
|
|
27
|
+
Requires-Dist: pandas
|
|
28
|
+
Requires-Dist: biom-format
|
|
29
|
+
Requires-Dist: eplacer
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# q2-eplacer
|
|
33
|
+
|
|
34
|
+
A [QIIME 2](https://qiime2.org) plugin [developed](https://develop.qiime2.org) by Christopher Powers (christopher.powers@noaa.gov) that allows the [ePlacer taxonomic classifier](https://github.com/NEFSC/PEMAD-PBB-ePlacer/) to interface with QIIME2.
|
|
35
|
+
|
|
36
|
+
ePlacer is a taxonomic classification tool that uses deep-learning approaches to incorporate both sequence information and biogeographic information into taxonomic assignment of DNA sequences.
|
|
37
|
+
|
|
38
|
+
## Why use ePlacer
|
|
39
|
+
|
|
40
|
+
The machine learning architecture of ePlacer enables powerful prediction beyond sequence-only classification tools (e.g. sequence alignment with blast or naive-bayes classifiers) by directly incorporating additional data into the probabilistic estimate of taxonomy, specifically developed for metabarcoding data. This novel application of deep-learning is immensely useful, as there can be many cases in metabarcoding data where two reference species have 100% sequence overlap, but distinct geographic ranges. This tool discriminates these cases and provides additional data for downstream taxonomic curation. Due to this, ePlacer provides enhanced interoperability between metabarcoding datasets.
|
|
41
|
+
|
|
42
|
+
Currently, ePlacer offers pre-trained models for two popular metabarcoding regions: the [MiFish](https://doi.org/10.1007/s12562-020-01461-x) and the [ecoPrimer, or Riaz,](https://doi.org/10.1093/nar/gkr732) marker gene regions. For these two regions, ePlacer offers the following benefits:
|
|
43
|
+
|
|
44
|
+
* **Interoperability.** ePlacer is trained on global datasets, allowing for direct comparison between metabarcoding datasets, regardless of geographic region.
|
|
45
|
+
* **Portability.** ePlacer has pre-trained models available for both MiFish and Riaz marker gene regions containerized and available for out-of-the-box use
|
|
46
|
+
* **Increased Accuracy.** The ePlacer model architecture provides increased accuracy, precision, and recall as compared to blast, Naive-Bayes, or least common ancestor approaches.
|
|
47
|
+
* **Trainability.** In addition to the two provided barcodes, this code repository provides tools for training new models.
|
|
48
|
+
|
|
49
|
+
For other barcode regions, there will be significant advantages with the training of new models. If you are interested in training a new model for ePlacer, please do not hesitate to reach out!
|
|
50
|
+
|
|
51
|
+
## Installation instructions
|
|
52
|
+
|
|
53
|
+
**The following instructions are intended to be a starting point** and should be replaced when `q2-eplacer` is ready to share with others.
|
|
54
|
+
They will enable you to install the most recent *development* version of `q2-eplacer`.
|
|
55
|
+
Remember that *release* versions should be used for all "real" work (i.e., where you're not testing or prototyping) - if there aren't instructions for installing a release version of this plugin, it is probably not yet intended for use in practice.
|
|
56
|
+
|
|
57
|
+
### Install Prerequisites
|
|
58
|
+
|
|
59
|
+
[Miniconda](https://conda.io/miniconda.html) provides the `conda` environment and package manager, and is currently the only supported way to install QIIME 2.
|
|
60
|
+
Follow the instructions for downloading and installing Miniconda.
|
|
61
|
+
|
|
62
|
+
After installing Miniconda and opening a new terminal, make sure you're running the latest version of `conda`:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
conda update conda
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
You also need to install the base qiime2 as a conda environment. Follow the [install instructions here](https://docs.qiime2.org/2024.10/install/native/).
|
|
69
|
+
|
|
70
|
+
### Install `q2-eplacer`
|
|
71
|
+
|
|
72
|
+
Next, you will install the ePlacer qiime plugin from pip
|
|
73
|
+
```bash
|
|
74
|
+
pip install q2-eplacer
|
|
75
|
+
```
|
|
76
|
+
This will also install all other required dependencies.
|
|
77
|
+
|
|
78
|
+
## Using `q2-eplacer`
|
|
79
|
+
|
|
80
|
+
### Data preparation
|
|
81
|
+
|
|
82
|
+
In order to use ePlacer, you must first prepare your data before running the model, including prepping input data and collecting a pre-trained model for inference.
|
|
83
|
+
|
|
84
|
+
#### Pre-Trained models
|
|
85
|
+
|
|
86
|
+
Currently, two pre-trained models are available: the [MiFish](https://doi.org/10.1007/s12562-020-01461-x) and the [ecoPrimer, or Riaz,](https://doi.org/10.1093/nar/gkr732) marker gene regions. These are available in a QIIME2-compatible format:
|
|
87
|
+
```bash
|
|
88
|
+
# Mifish marker
|
|
89
|
+
wget https://zenodo.org/records/20820029/files/mifish.qza
|
|
90
|
+
# Riaz marker
|
|
91
|
+
wget https://zenodo.org/records/20820029/files/riaz.qza
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
If desired, users can also train new models; see the `Training the model` section below. Any new, high-performing models may be added to a Zenodo record by reaching out to the maintainers.
|
|
95
|
+
|
|
96
|
+
If you trained a new model with the qiime2 plugin, it will be automatically formatted into the `.qza` format. Otherwise, run the following:
|
|
97
|
+
```bash
|
|
98
|
+
qiime tools import \
|
|
99
|
+
--type EplacerModel \
|
|
100
|
+
--input-path ./model/ \
|
|
101
|
+
--output-path model.qza
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
##### Prepping input data.
|
|
105
|
+
|
|
106
|
+
In addition to the models, users must import their input data properly. Input data formatting requirements may be seen in documentation for the original [ePlacer](https://github.com/NEFSC/PEMAD-PBB-ePlacer) package.
|
|
107
|
+
|
|
108
|
+
##### Sequence data
|
|
109
|
+
|
|
110
|
+
Input sequence data is required in fasta format, which can be imported into QIIME2 formats with the following:
|
|
111
|
+
```bash
|
|
112
|
+
qiime tools import \
|
|
113
|
+
--type "FeatureData[Sequence]" \
|
|
114
|
+
--input-path seqs.fa \
|
|
115
|
+
--output-path seqs.qza
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
The sequence data should also be aligned, which can be done with the q2-eplacer function:
|
|
119
|
+
```bash
|
|
120
|
+
qiime eplacer align-sequences --i-fasta ./seqs.qza \
|
|
121
|
+
--i-model ./model.qza \
|
|
122
|
+
--o-aligned-sequences ./aligned_seqs.qza \
|
|
123
|
+
--p-threads 8
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
##### Count data
|
|
127
|
+
|
|
128
|
+
Count data must first be converted to a `.biom` format, then to a `.qza` format
|
|
129
|
+
```bash
|
|
130
|
+
biom convert -i ./counts.tsv \
|
|
131
|
+
-o ./counts.biom \
|
|
132
|
+
--table-type="OTU table" \
|
|
133
|
+
--to-hdf5
|
|
134
|
+
qiime tools import --input-path ./counts.biom \
|
|
135
|
+
--type 'FeatureTable[Frequency]' \
|
|
136
|
+
--input-format BIOMV210Format \
|
|
137
|
+
--output-path ./counts.qza
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
##### geoData
|
|
141
|
+
|
|
142
|
+
The geographic data can be read in as a metadata file, and requires no further transformations.
|
|
143
|
+
|
|
144
|
+
##### blast data
|
|
145
|
+
|
|
146
|
+
Although not used by the machine learning model, blast results are incredibly useful for screening the results for mismatches when 100% matches are possible. Thus, a function for running blast was also included:
|
|
147
|
+
```bash
|
|
148
|
+
qiime eplacer run-blast \
|
|
149
|
+
--i-fasta ./seqs.qza \
|
|
150
|
+
--i-model ./model.qza \
|
|
151
|
+
--o-blast ./hits.qza \
|
|
152
|
+
--p-threads 8
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
Note the unaligned sequences were used for blast.
|
|
156
|
+
|
|
157
|
+
### Running the model
|
|
158
|
+
|
|
159
|
+
Congratulations! You are ready to run `ePlacer`!
|
|
160
|
+
```bash
|
|
161
|
+
qiime eplacer run-model-qiime \
|
|
162
|
+
--i-fasta ./aligned_seqs.qza \
|
|
163
|
+
--i-model ./model.qza \
|
|
164
|
+
--i-blast ./hits.qza \
|
|
165
|
+
--i-counts ./counts.qza \
|
|
166
|
+
--m-geodata-file ./geoData.tsv \
|
|
167
|
+
--o-eplacer-table ./ePlacerAssignment.qza \
|
|
168
|
+
--o-curated-taxonomy ./qiimeAssignmentCurated.qza \
|
|
169
|
+
--o-raw-taxonomy ./qiimeAssignmentRaw.qza
|
|
170
|
+
qiime tools export --input-path ./ePlacerAssignment.qza \
|
|
171
|
+
--output-path ./ePlacerAssignment
|
|
172
|
+
qiime tools export --input-path ./qiimeAssignmentCurated.qza \
|
|
173
|
+
--output-path ./qiimeAssignmentCurated
|
|
174
|
+
qiime tools export --input-path ./qiimeAssignmentRaw.qza \
|
|
175
|
+
--output-path ./qiimeAssignmentRaw
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
You may notice there are three output files present. These are three different output formats. The first, `--o-eplacer-table`, details the native ePlacer output format described in the [ePlacer repository](https://github.com/NEFSC/PEMAD-PBB-ePlacer). The second, `--o-curated-taxonomy`, outputs the curated assignments in QIIME2 compatible format. The third, `--o-raw-taxonomy`, outputs the raw taxonomic assignments in QIIME2 compatible format.
|
|
179
|
+
|
|
180
|
+
##### A special note
|
|
181
|
+
|
|
182
|
+
As with all other taxonomic assignment tools, all taxonomic assignments should still be manually curated after assignment. ePlacer exhibits higher accuracy than other tools, but is not perfect.
|
|
183
|
+
|
|
184
|
+
### Training the model
|
|
185
|
+
|
|
186
|
+
The QIIME2 implementation of ePlacer also supports training new models. File format requirements are detailed in the [ePlacer repository](https://github.com/NEFSC/PEMAD-PBB-ePlacer).
|
|
187
|
+
```bash
|
|
188
|
+
qiime eplacer train-model \
|
|
189
|
+
--i-fasta ./unalignedSeqs.qza \
|
|
190
|
+
--i-alignedfasta ./alignedSeqs.qza \
|
|
191
|
+
--m-taxonomy-file ./taxonomy.tsv \
|
|
192
|
+
--m-geodata-file ./geoData.tsv \
|
|
193
|
+
--p-num-augments 100 \
|
|
194
|
+
--o-model ./toyModel.qza \
|
|
195
|
+
--o-training-stats ./stats.qza \
|
|
196
|
+
--verbose
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## About
|
|
200
|
+
|
|
201
|
+
The `q2-eplacer` Python package was [created from a template](https://develop.qiime2.org/en/stable/plugins/tutorials/create-from-template.html).
|
|
202
|
+
To learn more about `q2-eplacer`, refer to the [project website](https://github.com/NEFSC/PEMAD-PBB-ePlacer).
|
|
203
|
+
To learn how to use QIIME 2, refer to the [QIIME 2 User Documentation](https://docs.qiime2.org).
|
|
204
|
+
To learn QIIME 2 plugin development, refer to [*Developing with QIIME 2*](https://develop.qiime2.org).
|
|
205
|
+
|
|
206
|
+
`q2-eplacer` is a QIIME 2 community plugin, meaning that it is not necessarily developed and maintained by the developers of QIIME 2.
|
|
207
|
+
Please be aware that because community plugins are developed by the QIIME 2 developer community, and not necessarily the QIIME 2 developers themselves, some may not be actively maintained or compatible with current release versions of the QIIME 2 distributions.
|
|
208
|
+
More information on development and support for community plugins can be found [here](https://library.qiime2.org).
|
|
209
|
+
If you need help with a community plugin, first refer to the [project website](https://github.com/NEFSC/PEMAD-PBB-ePlacer).
|
|
210
|
+
If that page doesn't provide information on how to get help, or you need additional help, head to the [Community Plugins category](https://forum.qiime2.org/c/community-contributions/community-plugins/14) on the QIIME 2 Forum where the QIIME 2 developers will do their best to help you.
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
==============================================================
|
|
214
|
+
|
|
215
|
+
This repository is a scientific product and is not official communication of the National Oceanic and Atmospheric Administration, or the United States Department of Commerce. All NOAA GitHub project code is provided on an ‘as is’ basis and the user assumes responsibility for its use. Any claims against the Department of Commerce or Department of Commerce bureaus stemming from the use of this GitHub project will be governed by all applicable Federal law. Any reference to specific commercial products, processes, or services by service mark, trademark, manufacturer, or otherwise, does not constitute or imply their endorsement, recommendation or favoring by the Department of Commerce. The Department of Commerce seal and logo, or the seal and logo of a DOC bureau, shall not be used in any manner to imply endorsement of any commercial product or activity by DOC or the United States Government.
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
# q2-eplacer
|
|
2
|
+
|
|
3
|
+
A [QIIME 2](https://qiime2.org) plugin [developed](https://develop.qiime2.org) by Christopher Powers (christopher.powers@noaa.gov) that allows the [ePlacer taxonomic classifier](https://github.com/NEFSC/PEMAD-PBB-ePlacer/) to interface with QIIME2.
|
|
4
|
+
|
|
5
|
+
ePlacer is a taxonomic classification tool that uses deep-learning approaches to incorporate both sequence information and biogeographic information into taxonomic assignment of DNA sequences.
|
|
6
|
+
|
|
7
|
+
## Why use ePlacer
|
|
8
|
+
|
|
9
|
+
The machine learning architecture of ePlacer enables powerful prediction beyond sequence-only classification tools (e.g. sequence alignment with blast or naive-bayes classifiers) by directly incorporating additional data into the probabilistic estimate of taxonomy, specifically developed for metabarcoding data. This novel application of deep-learning is immensely useful, as there can be many cases in metabarcoding data where two reference species have 100% sequence overlap, but distinct geographic ranges. This tool discriminates these cases and provides additional data for downstream taxonomic curation. Due to this, ePlacer provides enhanced interoperability between metabarcoding datasets.
|
|
10
|
+
|
|
11
|
+
Currently, ePlacer offers pre-trained models for two popular metabarcoding regions: the [MiFish](https://doi.org/10.1007/s12562-020-01461-x) and the [ecoPrimer, or Riaz,](https://doi.org/10.1093/nar/gkr732) marker gene regions. For these two regions, ePlacer offers the following benefits:
|
|
12
|
+
|
|
13
|
+
* **Interoperability.** ePlacer is trained on global datasets, allowing for direct comparison between metabarcoding datasets, regardless of geographic region.
|
|
14
|
+
* **Portability.** ePlacer has pre-trained models available for both MiFish and Riaz marker gene regions containerized and available for out-of-the-box use
|
|
15
|
+
* **Increased Accuracy.** The ePlacer model architecture provides increased accuracy, precision, and recall as compared to blast, Naive-Bayes, or least common ancestor approaches.
|
|
16
|
+
* **Trainability.** In addition to the two provided barcodes, this code repository provides tools for training new models.
|
|
17
|
+
|
|
18
|
+
For other barcode regions, there will be significant advantages with the training of new models. If you are interested in training a new model for ePlacer, please do not hesitate to reach out!
|
|
19
|
+
|
|
20
|
+
## Installation instructions
|
|
21
|
+
|
|
22
|
+
**The following instructions are intended to be a starting point** and should be replaced when `q2-eplacer` is ready to share with others.
|
|
23
|
+
They will enable you to install the most recent *development* version of `q2-eplacer`.
|
|
24
|
+
Remember that *release* versions should be used for all "real" work (i.e., where you're not testing or prototyping) - if there aren't instructions for installing a release version of this plugin, it is probably not yet intended for use in practice.
|
|
25
|
+
|
|
26
|
+
### Install Prerequisites
|
|
27
|
+
|
|
28
|
+
[Miniconda](https://conda.io/miniconda.html) provides the `conda` environment and package manager, and is currently the only supported way to install QIIME 2.
|
|
29
|
+
Follow the instructions for downloading and installing Miniconda.
|
|
30
|
+
|
|
31
|
+
After installing Miniconda and opening a new terminal, make sure you're running the latest version of `conda`:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
conda update conda
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
You also need to install the base qiime2 as a conda environment. Follow the [install instructions here](https://docs.qiime2.org/2024.10/install/native/).
|
|
38
|
+
|
|
39
|
+
### Install `q2-eplacer`
|
|
40
|
+
|
|
41
|
+
Next, you will install the ePlacer qiime plugin from pip
|
|
42
|
+
```bash
|
|
43
|
+
pip install q2-eplacer
|
|
44
|
+
```
|
|
45
|
+
This will also install all other required dependencies.
|
|
46
|
+
|
|
47
|
+
## Using `q2-eplacer`
|
|
48
|
+
|
|
49
|
+
### Data preparation
|
|
50
|
+
|
|
51
|
+
In order to use ePlacer, you must first prepare your data before running the model, including prepping input data and collecting a pre-trained model for inference.
|
|
52
|
+
|
|
53
|
+
#### Pre-Trained models
|
|
54
|
+
|
|
55
|
+
Currently, two pre-trained models are available: the [MiFish](https://doi.org/10.1007/s12562-020-01461-x) and the [ecoPrimer, or Riaz,](https://doi.org/10.1093/nar/gkr732) marker gene regions. These are available in a QIIME2-compatible format:
|
|
56
|
+
```bash
|
|
57
|
+
# Mifish marker
|
|
58
|
+
wget https://zenodo.org/records/20820029/files/mifish.qza
|
|
59
|
+
# Riaz marker
|
|
60
|
+
wget https://zenodo.org/records/20820029/files/riaz.qza
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
If desired, users can also train new models; see the `Training the model` section below. Any new, high-performing models may be added to a Zenodo record by reaching out to the maintainers.
|
|
64
|
+
|
|
65
|
+
If you trained a new model with the qiime2 plugin, it will be automatically formatted into the `.qza` format. Otherwise, run the following:
|
|
66
|
+
```bash
|
|
67
|
+
qiime tools import \
|
|
68
|
+
--type EplacerModel \
|
|
69
|
+
--input-path ./model/ \
|
|
70
|
+
--output-path model.qza
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
##### Prepping input data.
|
|
74
|
+
|
|
75
|
+
In addition to the models, users must import their input data properly. Input data formatting requirements may be seen in documentation for the original [ePlacer](https://github.com/NEFSC/PEMAD-PBB-ePlacer) package.
|
|
76
|
+
|
|
77
|
+
##### Sequence data
|
|
78
|
+
|
|
79
|
+
Input sequence data is required in fasta format, which can be imported into QIIME2 formats with the following:
|
|
80
|
+
```bash
|
|
81
|
+
qiime tools import \
|
|
82
|
+
--type "FeatureData[Sequence]" \
|
|
83
|
+
--input-path seqs.fa \
|
|
84
|
+
--output-path seqs.qza
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
The sequence data should also be aligned, which can be done with the q2-eplacer function:
|
|
88
|
+
```bash
|
|
89
|
+
qiime eplacer align-sequences --i-fasta ./seqs.qza \
|
|
90
|
+
--i-model ./model.qza \
|
|
91
|
+
--o-aligned-sequences ./aligned_seqs.qza \
|
|
92
|
+
--p-threads 8
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
##### Count data
|
|
96
|
+
|
|
97
|
+
Count data must first be converted to a `.biom` format, then to a `.qza` format
|
|
98
|
+
```bash
|
|
99
|
+
biom convert -i ./counts.tsv \
|
|
100
|
+
-o ./counts.biom \
|
|
101
|
+
--table-type="OTU table" \
|
|
102
|
+
--to-hdf5
|
|
103
|
+
qiime tools import --input-path ./counts.biom \
|
|
104
|
+
--type 'FeatureTable[Frequency]' \
|
|
105
|
+
--input-format BIOMV210Format \
|
|
106
|
+
--output-path ./counts.qza
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
##### geoData
|
|
110
|
+
|
|
111
|
+
The geographic data can be read in as a metadata file, and requires no further transformations.
|
|
112
|
+
|
|
113
|
+
##### blast data
|
|
114
|
+
|
|
115
|
+
Although not used by the machine learning model, blast results are incredibly useful for screening the results for mismatches when 100% matches are possible. Thus, a function for running blast was also included:
|
|
116
|
+
```bash
|
|
117
|
+
qiime eplacer run-blast \
|
|
118
|
+
--i-fasta ./seqs.qza \
|
|
119
|
+
--i-model ./model.qza \
|
|
120
|
+
--o-blast ./hits.qza \
|
|
121
|
+
--p-threads 8
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Note the unaligned sequences were used for blast.
|
|
125
|
+
|
|
126
|
+
### Running the model
|
|
127
|
+
|
|
128
|
+
Congratulations! You are ready to run `ePlacer`!
|
|
129
|
+
```bash
|
|
130
|
+
qiime eplacer run-model-qiime \
|
|
131
|
+
--i-fasta ./aligned_seqs.qza \
|
|
132
|
+
--i-model ./model.qza \
|
|
133
|
+
--i-blast ./hits.qza \
|
|
134
|
+
--i-counts ./counts.qza \
|
|
135
|
+
--m-geodata-file ./geoData.tsv \
|
|
136
|
+
--o-eplacer-table ./ePlacerAssignment.qza \
|
|
137
|
+
--o-curated-taxonomy ./qiimeAssignmentCurated.qza \
|
|
138
|
+
--o-raw-taxonomy ./qiimeAssignmentRaw.qza
|
|
139
|
+
qiime tools export --input-path ./ePlacerAssignment.qza \
|
|
140
|
+
--output-path ./ePlacerAssignment
|
|
141
|
+
qiime tools export --input-path ./qiimeAssignmentCurated.qza \
|
|
142
|
+
--output-path ./qiimeAssignmentCurated
|
|
143
|
+
qiime tools export --input-path ./qiimeAssignmentRaw.qza \
|
|
144
|
+
--output-path ./qiimeAssignmentRaw
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
You may notice there are three output files present. These are three different output formats. The first, `--o-eplacer-table`, details the native ePlacer output format described in the [ePlacer repository](https://github.com/NEFSC/PEMAD-PBB-ePlacer). The second, `--o-curated-taxonomy`, outputs the curated assignments in QIIME2 compatible format. The third, `--o-raw-taxonomy`, outputs the raw taxonomic assignments in QIIME2 compatible format.
|
|
148
|
+
|
|
149
|
+
##### A special note
|
|
150
|
+
|
|
151
|
+
As with all other taxonomic assignment tools, all taxonomic assignments should still be manually curated after assignment. ePlacer exhibits higher accuracy than other tools, but is not perfect.
|
|
152
|
+
|
|
153
|
+
### Training the model
|
|
154
|
+
|
|
155
|
+
The QIIME2 implementation of ePlacer also supports training new models. File format requirements are detailed in the [ePlacer repository](https://github.com/NEFSC/PEMAD-PBB-ePlacer).
|
|
156
|
+
```bash
|
|
157
|
+
qiime eplacer train-model \
|
|
158
|
+
--i-fasta ./unalignedSeqs.qza \
|
|
159
|
+
--i-alignedfasta ./alignedSeqs.qza \
|
|
160
|
+
--m-taxonomy-file ./taxonomy.tsv \
|
|
161
|
+
--m-geodata-file ./geoData.tsv \
|
|
162
|
+
--p-num-augments 100 \
|
|
163
|
+
--o-model ./toyModel.qza \
|
|
164
|
+
--o-training-stats ./stats.qza \
|
|
165
|
+
--verbose
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## About
|
|
169
|
+
|
|
170
|
+
The `q2-eplacer` Python package was [created from a template](https://develop.qiime2.org/en/stable/plugins/tutorials/create-from-template.html).
|
|
171
|
+
To learn more about `q2-eplacer`, refer to the [project website](https://github.com/NEFSC/PEMAD-PBB-ePlacer).
|
|
172
|
+
To learn how to use QIIME 2, refer to the [QIIME 2 User Documentation](https://docs.qiime2.org).
|
|
173
|
+
To learn QIIME 2 plugin development, refer to [*Developing with QIIME 2*](https://develop.qiime2.org).
|
|
174
|
+
|
|
175
|
+
`q2-eplacer` is a QIIME 2 community plugin, meaning that it is not necessarily developed and maintained by the developers of QIIME 2.
|
|
176
|
+
Please be aware that because community plugins are developed by the QIIME 2 developer community, and not necessarily the QIIME 2 developers themselves, some may not be actively maintained or compatible with current release versions of the QIIME 2 distributions.
|
|
177
|
+
More information on development and support for community plugins can be found [here](https://library.qiime2.org).
|
|
178
|
+
If you need help with a community plugin, first refer to the [project website](https://github.com/NEFSC/PEMAD-PBB-ePlacer).
|
|
179
|
+
If that page doesn't provide information on how to get help, or you need additional help, head to the [Community Plugins category](https://forum.qiime2.org/c/community-contributions/community-plugins/14) on the QIIME 2 Forum where the QIIME 2 developers will do their best to help you.
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
==============================================================
|
|
183
|
+
|
|
184
|
+
This repository is a scientific product and is not official communication of the National Oceanic and Atmospheric Administration, or the United States Department of Commerce. All NOAA GitHub project code is provided on an ‘as is’ basis and the user assumes responsibility for its use. Any claims against the Department of Commerce or Department of Commerce bureaus stemming from the use of this GitHub project will be governed by all applicable Federal law. Any reference to specific commercial products, processes, or services by service mark, trademark, manufacturer, or otherwise, does not constitute or imply their endorsement, recommendation or favoring by the Department of Commerce. The Department of Commerce seal and logo, or the seal and logo of a DOC bureau, shall not be used in any manner to imply endorsement of any commercial product or activity by DOC or the United States Government.
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "q2-eplacer"
|
|
3
|
+
authors = [
|
|
4
|
+
{ name = "Christopher Powers", email = "christopher.powers@noaa.gov"}
|
|
5
|
+
]
|
|
6
|
+
description = "ASV classifier with deep-learning and biogeography"
|
|
7
|
+
readme = {file = "README.md", content-type = "text/markdown"}
|
|
8
|
+
license = {file = "LICENSE.txt"}
|
|
9
|
+
dynamic = ["version"]
|
|
10
|
+
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
dependencies = [
|
|
13
|
+
"qiime2",
|
|
14
|
+
"pandas",
|
|
15
|
+
"biom-format",
|
|
16
|
+
"eplacer"
|
|
17
|
+
]
|
|
18
|
+
classifiers = [
|
|
19
|
+
"Development Status :: 4 - Beta",
|
|
20
|
+
"Intended Audience :: Science/Research",
|
|
21
|
+
"License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication",
|
|
22
|
+
"Programming Language :: Python :: 3.9",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
27
|
+
]
|
|
28
|
+
keywords = ["qiime2", "microbiome", "taxonomy", "deep-learning", "biogeography"]
|
|
29
|
+
|
|
30
|
+
[project.entry-points.'qiime2.plugins']
|
|
31
|
+
"q2-eplacer" = "q2_eplacer.plugin_setup:plugin"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://github.com/NEFSC/PEMAD-PBB-q2-ePlacer"
|
|
38
|
+
Repository = "https://github.com/NEFSC/PEMAD-PBB-q2-ePlacer"
|
|
39
|
+
"Bug Tracker" = "https://github.com/NEFSC/PEMAD-PBB-q2-ePlacer/issues"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
[build-system]
|
|
43
|
+
requires = [
|
|
44
|
+
"setuptools",
|
|
45
|
+
"versioningit",
|
|
46
|
+
"wheel"
|
|
47
|
+
]
|
|
48
|
+
build-backend = "setuptools.build_meta"
|
|
49
|
+
|
|
50
|
+
[tool.versioningit.vcs]
|
|
51
|
+
method = "git-archive"
|
|
52
|
+
describe-subst = "$Format:%(describe:tags)$"
|
|
53
|
+
default-tag = "0.1.0"
|
|
54
|
+
|
|
55
|
+
[tool.versioningit.next-version]
|
|
56
|
+
method = "minor"
|
|
57
|
+
|
|
58
|
+
[tool.versioningit.format]
|
|
59
|
+
distance = "{base_version}+{distance}.{vcs}{rev}"
|
|
60
|
+
dirty = "{base_version}+{distance}.{vcs}{rev}.dirty"
|
|
61
|
+
distance-dirty = "{base_version}+{distance}.{vcs}{rev}.dirty"
|
|
62
|
+
|
|
63
|
+
[tool.versioningit.write]
|
|
64
|
+
file = "q2_eplacer/_version.py"
|
|
65
|
+
|
|
66
|
+
[tool.versioningit]
|
|
67
|
+
default-version = "0.1.1"
|
|
68
|
+
|
|
69
|
+
[tool.setuptools]
|
|
70
|
+
include-package-data = true
|
|
71
|
+
|
|
72
|
+
[tool.setuptools.packages.find]
|
|
73
|
+
where = ["."]
|
|
74
|
+
include = ["q2_eplacer*"]
|
|
75
|
+
|
|
76
|
+
[tool.setuptools.package-data]
|
|
77
|
+
q2_eplacer = ["**/*"]
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import qiime2.plugin.model as model
|
|
2
|
+
import csv
|
|
3
|
+
class BlastOutfmt6Format(model.TextFileFormat):
    """Format for tabular blastn output (outfmt 6).

    Every non-empty row must have exactly 12 tab-separated columns. The
    expected order is: qseqid, sseqid, pident, evalue, length, qlen, slen,
    qstart, qend, sstart, send, sseq.
    """

    def validate(self, level):
        """Check the column count and basic numeric columns of the table.

        Parameters
        ----------
        level : str
            ``'min'`` inspects only the first 10 lines of the file; any
            other value inspects every line.

        Raises
        ------
        model.ValidationError
            If a non-empty row does not have exactly 12 columns, or if
            ``pident``/``evalue`` do not parse as floats or ``length`` does
            not parse as an int.
        """
        lines_to_check = 10 if level == 'min' else None
        # newline='' is what the csv module asks for, so that it handles
        # line endings itself instead of relying on universal-newline
        # translation.
        with open(str(self.path), 'r', newline='') as fh:
            reader = csv.reader(fh, delimiter='\t')

            for i, row in enumerate(reader):
                # Stop early if we only need a 'min' validation
                if lines_to_check is not None and i >= lines_to_check:
                    break
                # Skip empty lines
                if not row:
                    continue

                # Expecting: qseqid, sseqid, pident, evalue, length, qlen,
                # slen, qstart, qend, sstart, send, sseq
                if len(row) != 12:
                    raise model.ValidationError(
                        f"Invalid BLAST format on line {i+1}. "
                        f"Expected exactly 12 columns, but found {len(row)}."
                    )

                try:
                    # pident
                    float(row[2])
                    # evalue
                    float(row[3])
                    # length (index 4)
                    int(row[4])
                except ValueError as err:
                    # Chain the original error so the offending value is
                    # visible in the traceback.
                    raise model.ValidationError(
                        f"Invalid data type on line {i+1}. "
                        "Columns like 'pident', 'evalue', and 'length' must be numeric."
                    ) from err
|
|
38
|
+
|
|
39
|
+
class BlastOutfmt6DirFormat(model.DirectoryFormat):
    """Directory format holding a tabular BLAST result file."""

    # The path pattern below is paired with BlastOutfmt6Format, which
    # validates the file's contents.
    blast_file = model.File(
        r'blast_results\.tsv',
        format=BlastOutfmt6Format,
    )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class ePlacerTextFileFormat(model.TextFileFormat):
    """Generic text file format used for ePlacer model files."""

    def validate(self, level):
        """Accept any content; no checks are performed at any level."""
        return None
|
|
48
|
+
|
|
49
|
+
class ePlacerBinaryFileFormat(model.BinaryFileFormat):
    """Generic binary file format used for ePlacer model files."""

    def validate(self, level):
        """Accept any content; no checks are performed at any level."""
        return None
|
|
52
|
+
|
|
53
|
+
class EplacerModelDirectoryFormat(model.DirectoryFormat):
    """Directory layout of an eplacer pre-trained model.

    Each attribute pairs a file name (or file-name pattern) with the format
    class used for it. Neither format class inspects file contents.
    """

    # Fixed-name configuration and pickle files.
    config = model.File(
        'config.yml', format=ePlacerTextFileFormat)
    geo_encoder = model.File(
        'geoEncoder.pkl', format=ePlacerBinaryFileFormat)
    accession_dict = model.File(
        'accessionKeyDict.pkl', format=ePlacerBinaryFileFormat)

    # Files whose names carry a variable suffix, matched by pattern.
    grid_config = model.File(
        r'grid_config_.*\.npy', format=ePlacerBinaryFileFormat)
    best_model = model.File(
        r'best_geo_model_.*\.pth', format=ePlacerBinaryFileFormat)
    best_param = model.File(
        r'best_geo_param_.*\.pth', format=ePlacerBinaryFileFormat)
    taxa_key = model.File(
        r'taxa_key_.*\.tsv', format=ePlacerTextFileFormat)

    # Sequence and taxonomy text files.
    alignment = model.File(
        r'alignment.fa', format=ePlacerTextFileFormat)
    fasta = model.File(
        r'reference.fa', format=ePlacerTextFileFormat)
    taxfile = model.File(
        r'full_taxonomy.tsv', format=ePlacerTextFileFormat)

    # Training pickles.
    geopkl = model.File(
        r'geoTrain.pkl', format=ePlacerBinaryFileFormat)
    labelpkl = model.File(
        r'labelTrain.pkl', format=ePlacerBinaryFileFormat)
|
|
67
|
+
|
|
68
|
+
class EplacerOutputTableFormat(model.TextFileFormat):
    """Text file format for the eplacer prediction table."""

    def validate(self, level):
        """Accept any content; no checks are performed at any level."""
        return None
|
|
71
|
+
|
|
72
|
+
class EplacerOutputTableTrainFormat(model.TextFileFormat):
    """Text file format for the eplacer training statistics table."""

    def validate(self, level):
        """Accept any content; no checks are performed at any level."""
        return None
|
|
75
|
+
|
|
76
|
+
class EplacerOutputTableDirFormat(model.DirectoryFormat):
    """Directory format wrapping the ``bestGeoPredict.tsv`` table."""

    predictions = model.File(
        'bestGeoPredict.tsv',
        format=EplacerOutputTableFormat,
    )
|
|
78
|
+
|
|
79
|
+
class EplacerOutputTableTrainDirFormat(model.DirectoryFormat):
    """Directory format wrapping the ``model_geo_stats.tsv`` table."""

    predictions = model.File(
        'model_geo_stats.tsv',
        format=EplacerOutputTableTrainFormat,
    )
|