PyPI - q2-eplacer - Versions diffs - 0.1.1__py3-none-any.whl - Mend

q2-eplacer 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

q2_eplacer/__init__.py +6 -0
q2_eplacer/_formats.py +80 -0
q2_eplacer/_methods.py +472 -0
q2_eplacer/_types.py +7 -0
q2_eplacer/_version.py +1 -0
q2_eplacer/citations.bib +6 -0
q2_eplacer/plugin_setup.py +133 -0
q2_eplacer/tests/__init__.py +8 -0
q2_eplacer/tests/data/alignedSeqs.qza +0 -0
q2_eplacer/tests/data/full_taxonomy.tsv +17 -0
q2_eplacer/tests/data/geoData.tsv +17 -0
q2_eplacer/tests/data/geoData_run.tsv +3 -0
q2_eplacer/tests/data/seqs.qza +0 -0
q2_eplacer/tests/data/testfasta.fa +6 -0
q2_eplacer/tests/data/testfasta.qza +0 -0
q2_eplacer/tests/test_methods.py +113 -0
q2_eplacer-0.1.1.dist-info/METADATA +215 -0
q2_eplacer-0.1.1.dist-info/RECORD +22 -0
q2_eplacer-0.1.1.dist-info/WHEEL +5 -0
q2_eplacer-0.1.1.dist-info/entry_points.txt +2 -0
q2_eplacer-0.1.1.dist-info/licenses/LICENSE.txt +5 -0
q2_eplacer-0.1.1.dist-info/top_level.txt +1 -0

q2_eplacer/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+# flake8: noqa
+# Expose the package version; fall back to a placeholder string when the
+# _version module cannot be imported.
+try:
+    from ._version import __version__
+except ModuleNotFoundError:
+    __version__ = '0.0.0+notfound'

q2_eplacer/_formats.py ADDED Viewed

@@ -0,0 +1,80 @@
+import qiime2.plugin.model as model
+import csv
+class BlastOutfmt6Format(model.TextFileFormat):
+    """Format for tabular blastn output (outfmt 6).
+
+    Every non-empty line must hold exactly 12 tab-separated columns, in the
+    order: qseqid, sseqid, pident, evalue, length, qlen, slen, qstart, qend,
+    sstart, send, sseq.
+    """
+    def _validate_(self, level):
+        """Check the column count and the numeric columns of the table.
+
+        This is the QIIME 2 validation hook: the inherited ``validate``
+        method checks the level and the file itself and then delegates here.
+
+        Parameters
+        ----------
+        level : str
+            ``'min'`` inspects only the first 10 lines; any other value
+            scans the whole file.
+
+        Raises
+        ------
+        model.ValidationError
+            If a line does not have 12 columns, or if pident, evalue or
+            length cannot be parsed as a number.
+        """
+        lines_to_check = 10 if level == 'min' else None
+        # newline='' leaves line-ending handling to the csv module.
+        with open(str(self.path), 'r', newline='') as fh:
+            reader = csv.reader(fh, delimiter='\t')
+            for i, row in enumerate(reader):
+                # Stop early if we only need a 'min' validation
+                if lines_to_check is not None and i >= lines_to_check:
+                    break
+                # Skip empty lines
+                if not row:
+                    continue
+                if len(row) != 12:
+                    raise model.ValidationError(
+                        f"Invalid BLAST format on line {i+1}. "
+                        f"Expected exactly 12 columns, but found {len(row)}."
+                    )
+                try:
+                    # pident
+                    float(row[2])
+                    # evalue
+                    float(row[3])
+                    # length (index 4)
+                    int(row[4])
+                except ValueError as err:
+                    # Keep the parsing error as the cause so the offending
+                    # value shows up in the traceback.
+                    raise model.ValidationError(
+                        f"Invalid data type on line {i+1}. "
+                        "Columns like 'pident', 'evalue', and 'length' must be numeric."
+                    ) from err
+class BlastOutfmt6DirFormat(model.DirectoryFormat):
+    """Directory format containing exactly one tabular BLAST output file."""
+    # The member file is matched by a regex pattern (hence the escaped dot)
+    # and validated with BlastOutfmt6Format.
+    blast_file = model.File(r'blast_results\.tsv', format=BlastOutfmt6Format)
+class ePlacerTextFileFormat(model.TextFileFormat):
+    """Text file member of an ePlacer model directory."""
+    def validate(self, level):
+        # No-op: the file content is accepted without inspection at any level.
+        pass
+class ePlacerBinaryFileFormat(model.BinaryFileFormat):
+    """Binary file member of an ePlacer model directory."""
+    def validate(self, level):
+        # No-op: the file content is accepted without inspection at any level.
+        pass
+class EplacerModelDirectoryFormat(model.DirectoryFormat):
+    """Format representing the eplacer pre-trained model directory."""
+    # Training hyperparameters and the encoders/lookup tables saved with them.
+    config = model.File('config.yml', format=ePlacerTextFileFormat)
+    geo_encoder = model.File('geoEncoder.pkl', format=ePlacerBinaryFileFormat)
+    accession_dict = model.File('accessionKeyDict.pkl', format=ePlacerBinaryFileFormat)
+    # Files whose names carry a variable suffix, matched by regex.
+    # NOTE(review): the suffix appears to be the taxonomic level used for
+    # training - confirm against the code that writes these files.
+    grid_config = model.File(r'grid_config_.*\.npy', format=ePlacerBinaryFileFormat)
+    best_model = model.File(r'best_geo_model_.*\.pth', format=ePlacerBinaryFileFormat)
+    best_param = model.File(r'best_geo_param_.*\.pth', format=ePlacerBinaryFileFormat)
+    taxa_key = model.File(r'taxa_key_.*\.tsv', format=ePlacerTextFileFormat)
+    # Reference data bundled with the model: aligned and unaligned sequences
+    # and the taxonomy table.
+    alignment = model.File(r'alignment.fa', format=ePlacerTextFileFormat)
+    fasta = model.File(r'reference.fa', format=ePlacerTextFileFormat)
+    taxfile = model.File(r'full_taxonomy.tsv', format=ePlacerTextFileFormat)
+    # Pickled training-set geographic encodings and labels.
+    geopkl = model.File(r'geoTrain.pkl', format=ePlacerBinaryFileFormat)
+    labelpkl = model.File(r'labelTrain.pkl', format=ePlacerBinaryFileFormat)
+class EplacerOutputTableFormat(model.TextFileFormat):
+    """Text format used for the 'bestGeoPredict.tsv' prediction table."""
+    def validate(self, level):
+        # No-op: the file content is accepted without inspection at any level.
+        pass
+class EplacerOutputTableTrainFormat(model.TextFileFormat):
+    """Text format used for the 'model_geo_stats.tsv' training table."""
+    def validate(self, level):
+        # No-op: the file content is accepted without inspection at any level.
+        pass
+class EplacerOutputTableDirFormat(model.DirectoryFormat):
+    """Directory format holding the prediction table 'bestGeoPredict.tsv'."""
+    predictions = model.File('bestGeoPredict.tsv', format=EplacerOutputTableFormat)
+class EplacerOutputTableTrainDirFormat(model.DirectoryFormat):
+    """Directory format holding the training table 'model_geo_stats.tsv'."""
+    predictions = model.File('model_geo_stats.tsv', format=EplacerOutputTableTrainFormat)

q2_eplacer/_methods.py ADDED Viewed

@@ -0,0 +1,472 @@
+import pandas as pd
+import eplacer as ep
+from eplacer import external
+from eplacer.geographicRep import SpeciesGeoEncoder
+from eplacer.run_model import run_model_geoOBIS_bootstrap
+from eplacer.train_evaluate import mask_sequence, check_data_loader_geo, train_and_evaluate_geo
+import subprocess
+import qiime2
+from q2_types.feature_data import DNAFASTAFormat, AlignedDNAFASTAFormat, FeatureData, Sequence, AlignedSequence
+from q2_types.feature_table import FeatureTable, Frequency
+from ._formats import EplacerModelDirectoryFormat, BlastOutfmt6Format, BlastOutfmt6DirFormat, EplacerOutputTableFormat, EplacerOutputTableTrainFormat
+from ._types import BlastResults, EplacerModel, BlastResultsDir, EplacerTableTrain, EplacerTable
+from ._formats import BlastOutfmt6DirFormat, EplacerModelDirectoryFormat, EplacerOutputTableDirFormat, EplacerOutputTableTrainDirFormat
+from qiime2.plugin import Str, Float, Int, Bool, Metadata
+import biom
+import os
+import tempfile
+from collections import defaultdict
+import sys
+import re
+import numpy as np
+import pickle
+import yaml
+import time
+import random
+from tqdm import tqdm
+from sklearn.metrics import accuracy_score, f1_score
+import shutil
+from eplacer.models import build_dataloaders_geo, CNNWithSpatialEncoding, GeographicAuxiliaryLoss
+import torch
+from torch import nn
+import torch.nn.functional as F
+import torch.optim as optim
+import typing
+def align_sequences(fasta: DNAFASTAFormat,
+                    model: EplacerModelDirectoryFormat,
+                    threads: int = 1) -> AlignedDNAFASTAFormat:
+    """Align query sequences against the model's reference alignment.
+
+    ``external.run_mafft`` is called with the query FASTA and the model's
+    ``alignment.fa``. The full MAFFT output goes to a scratch file that is
+    removed afterwards; the subset output is written to the returned
+    ``AlignedDNAFASTAFormat``.
+    """
+    query_path = str(fasta)
+    reference_path = os.path.join(str(model.path), "alignment.fa")
+    result = AlignedDNAFASTAFormat()
+    with tempfile.TemporaryDirectory() as scratch:
+        combined_path = os.path.join(scratch, 'mafft_total_output.afa')
+        print("Invoking MAFFT via ePlacer...")
+        # The return value of run_mafft is not needed here; only the file
+        # written to subset_output is kept.
+        external.run_mafft(
+            input=query_path,
+            threads=str(threads),
+            reference=reference_path,
+            moutput=combined_path,
+            subset_output=str(result.path)
+        )
+    return result
+def run_blast(fasta: DNAFASTAFormat,
+              model: EplacerModelDirectoryFormat,
+              threads: int = 1) -> BlastOutfmt6DirFormat:
+    """Run blastn for the query sequences against the model's reference.
+
+    A nucleotide BLAST database is built from the model's ``reference.fa``
+    and queried with ``fasta``. Results are written as ``blast_results.tsv``
+    (12-column outfmt 6) into the returned directory format.
+
+    Parameters
+    ----------
+    fasta : DNAFASTAFormat
+        Query sequences.
+    model : EplacerModelDirectoryFormat
+        Model directory that provides ``reference.fa``.
+    threads : int
+        Value passed to blastn ``-num_threads``.
+
+    Raises
+    ------
+    subprocess.CalledProcessError
+        If ``makeblastdb`` or ``blastn`` exits with a non-zero status.
+    """
+    input_fasta_path = str(fasta)
+    reference_fasta_path = os.path.join(str(model.path), "reference.fa")
+    blast_artifact = BlastOutfmt6DirFormat()
+    output_blast_path = os.path.join(str(blast_artifact.path), 'blast_results.tsv')
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Build the database inside the temporary directory so the index
+        # files are cleaned up afterwards and concurrent runs cannot
+        # overwrite each other's database in the working directory.
+        db_prefix = os.path.join(temp_dir, "blastdb")
+        print("making blast db...")
+        db_cmd = [
+            "makeblastdb",
+            "-in", reference_fasta_path,
+            "-dbtype", "nucl",
+            "-out", db_prefix
+        ]
+        subprocess.run(db_cmd, check=True)
+        blast_cmd = [
+            "blastn",
+            "-query", input_fasta_path,
+            "-db", db_prefix,
+            "-outfmt", "6 qseqid sseqid pident evalue length qlen slen qstart qend sstart send sseq",
+            "-out", output_blast_path,
+            "-num_threads", str(threads)
+        ]
+        print(f"Running BLAST using: {' '.join(blast_cmd)}")
+        subprocess.run(blast_cmd, check=True)
+        print(f"BLAST search completed. Results saved to {output_blast_path}")
+    return blast_artifact
+def train_model(fasta: DNAFASTAFormat,
+                alignedfasta: AlignedDNAFASTAFormat,
+                taxonomy: qiime2.Metadata,
+                geodata: qiime2.Metadata,
+                taxlevel: str = "SPECIES",
+                num_augments: int = 10,
+                maskrate: float = 0.01,
+                sigma: float = 1,
+                kernel: int = 3,
+                precision: int = 2
+               ) -> (EplacerModelDirectoryFormat, EplacerOutputTableTrainDirFormat):
+    """Train an ePlacer classifier and package it as a model directory.
+
+    The reference FASTA, its alignment and the taxonomy table are copied
+    into the model directory. Masked copies of every aligned sequence are
+    generated, taxon labels are integer-encoded, occurrence coordinates are
+    encoded with ``SpeciesGeoEncoder``, and the network is trained with
+    ``train_and_evaluate_geo``.
+
+    Parameters
+    ----------
+    fasta : DNAFASTAFormat
+        Unaligned reference sequences; stored as ``reference.fa``.
+    alignedfasta : AlignedDNAFASTAFormat
+        Aligned reference sequences; stored as ``alignment.fa`` and used as
+        the training sequences.
+    taxonomy : qiime2.Metadata
+        Taxonomy table. Column 0 is the sequence id; the rank column is
+        picked by ``taxlevel`` (PHYLUM=2 ... SPECIES=7).
+    geodata : qiime2.Metadata
+        Occurrence table. If the third header field is 'decimallongitude'
+        the OBIS layout is used (taxon in field 9, coordinates in fields 4
+        and 3); otherwise fields 1-3 are taxon, latitude, longitude.
+    taxlevel : str
+        One of SPECIES, GENUS, FAMILY, ORDER, CLASS, PHYLUM.
+    num_augments : int
+        Number of masked copies attempted per sequence.
+    maskrate : float
+        Mask rate passed to ``mask_sequence``.
+    sigma, kernel : passed through to ``train_and_evaluate_geo``.
+    precision : int
+        Precision passed to ``SpeciesGeoEncoder``.
+
+    Returns
+    -------
+    (EplacerModelDirectoryFormat, EplacerOutputTableTrainDirFormat)
+        The model directory and the directory given to
+        ``train_and_evaluate_geo`` for its statistics output.
+
+    Raises
+    ------
+    ValueError
+        If the aligned FASTA contains no sequences.
+    """
+    # The parameter may arrive as a float from the framework; range() below
+    # needs an integer.
+    num_augments = int(num_augments)
+    output_artifact = EplacerModelDirectoryFormat()
+    out = str(output_artifact.path)
+    # Bundle the inputs with the model so later steps can reuse them.
+    taxa_tsv_path = os.path.join(out, "full_taxonomy.tsv")
+    taxonomy.save(taxa_tsv_path)
+    fasta_path = os.path.join(out, "reference.fa")
+    fasta.save(fasta_path)
+    alignedfasta_path = os.path.join(out, "alignment.fa")
+    alignedfasta.save(alignedfasta_path)
+    config_dict = {"Augments":num_augments, "Mask Rate":maskrate, "Sigma":sigma, "Kernel":kernel, "Precision":precision}
+    with open(os.path.join(out, "config.yml"), "w") as outfile:
+        yaml.dump(config_dict, outfile, sort_keys=False)
+    # Map every sequence id to its taxon at the requested level.
+    taxDict = {}
+    geo_dict = {}
+    levels = {"SPECIES":7,"GENUS":6,"FAMILY":5,"ORDER":4,"CLASS":3,"PHYLUM":2}
+    with open(taxa_tsv_path, 'r') as f:
+        for line in f:
+            line = line.rstrip()
+            listall = re.split("\t", line)
+            if listall[0] != "accession":
+                taxDict[listall[0]] = listall[levels[taxlevel]]
+                geo_dict[listall[0]] = listall[7]
+    # Read in the sequence data. The aligned file is used: the length filter
+    # below strips gap characters and the network input length is taken
+    # from the first training sequence, so all sequences must be aligned.
+    seqDict = {}
+    key = ''
+    seq = ''
+    lengths = []
+    unique = defaultdict(int)
+    with open(alignedfasta_path, 'r') as f:
+        for line in f:
+            line = line.rstrip()
+            if line.startswith(">"):
+                if key != '':
+                    seqDict[key[1:]] = seq.upper()
+                    unique[seq.upper()] += 1
+                    lengths.append(len(seq.upper().replace("-", "")))
+                    seq = ''
+                key = line
+            else:
+                seq += line
+    # Flush the final record with the same bookkeeping as the others.
+    if key != '':
+        seqDict[key[1:]] = seq.upper()
+        unique[seq.upper()] += 1
+        lengths.append(len(seq.upper().replace("-", "")))
+    if not lengths:
+        raise ValueError("The aligned FASTA file contains no sequences.")
+    mean_length = sum(lengths)/len(lengths)
+    print(mean_length)
+    # Drop sequences whose ungapped length is at most 70% of the mean.
+    seqDict = {
+        name: sequence for name, sequence in seqDict.items()
+        if len(sequence.replace("-", "")) > mean_length * 0.7
+    }
+    print(f"Sequences after removing outlier length sequences: {len(seqDict)}")
+    # Create augmented sequences
+    augmented_seqDict = {}
+    augmented_taxDict = {}
+    augmented_geo_dict = {}
+    added = 0
+    print("Augmenting sequences with a mask rate of {}".format(str(maskrate)))
+    for key, seq in seqDict.items():
+        # Generate augmented sequences
+        for i in range(num_augments):
+            new_key = f"{key}_aug_{i+1}"
+            augmented_seq = mask_sequence(
+                seq, mask_rate=maskrate
+            )
+            # Only keep augmented sequences that have not been seen before.
+            if unique[augmented_seq] == 0:
+                added += 1
+                augmented_seqDict[new_key] = augmented_seq
+                augmented_taxDict[new_key] = taxDict[key]
+                augmented_geo_dict[new_key] = geo_dict[key]
+            unique[augmented_seq] += 1
+    # From here on only the augmented sequences are used.
+    seqDict = augmented_seqDict.copy()
+    taxDict = {name: augmented_taxDict[name] for name in seqDict}
+    geo_dict = {name: augmented_geo_dict[name] for name in seqDict}
+    print(f"{added} augmented sequences added to the dataset!")
+    print(f"{len(seqDict)} sequences remaining after downsampling")
+    # convert to number format for encoding and store in a key dictionary
+    count = 0
+    accessionKeyDict = {}
+    taxaSeqDict = defaultdict(list)
+    for i in taxDict:
+        if i in seqDict:
+            if taxDict[i] not in accessionKeyDict:
+                accessionKeyDict[taxDict[i]] = count
+                count += 1
+            taxaSeqDict[taxDict[i]].append(seqDict[i])
+    # Store in a file for reference later. This can be referred to later.
+    with open(os.path.join(out, f"taxa_key_{taxlevel}.tsv"), "w") as outfile:
+        for i in taxDict:
+            outfile.write("{}\t{}\t{}\n".format(accessionKeyDict[taxDict[i]], i, taxDict[i]))
+    with open(f"{out}/accessionKeyDict.pkl", "wb") as outfile:
+        pickle.dump(accessionKeyDict, outfile, pickle.HIGHEST_PROTOCOL)
+    # develop the training dataset
+    train_seq = []
+    train_labels = []
+    train_geo = []
+    test_seq = []
+    test_labels = []
+    test_geo = []
+    with tempfile.TemporaryDirectory() as temp_dir:
+        geo_tsv_path = os.path.join(temp_dir, "geodata_temp.tsv")
+        geodata.save(geo_tsv_path)
+        geoDict = defaultdict(list)
+        obis = False
+        with open(geo_tsv_path, "r") as infile:
+            for count, line in enumerate(infile, start=1):
+                listall = re.split("\t", line.rstrip())
+                if count == 1:
+                    # The layout is decided once, from the header line;
+                    # data rows never contain the column name themselves.
+                    obis = len(listall) > 2 and listall[2] == 'decimallongitude'
+                    continue
+                if obis: # if obis, collect based on their formatting
+                    if len(listall) > 8 and listall[8] in accessionKeyDict:
+                        geoDict[listall[8]].append((float(listall[3]), float(listall[2])))
+                else: # else, assume that first field is species, field 2 is latitude, field 3 is longitude
+                    if listall[0] in accessionKeyDict:
+                        geoDict[listall[0]].append((float(listall[1]), float(listall[2])))
+                if count % 1000000 == 0:
+                    print("{} Entries read in".format(count))
+    print("confirming precision is set at: {}".format(precision))
+    geoEncoder = SpeciesGeoEncoder(precision=precision)
+    geoEncoded = geoEncoder.encode_species(geoDict)
+    with open(f"{out}/geoEncoder.pkl", "wb") as outfile:
+        pickle.dump(geoEncoder, outfile, pickle.HIGHEST_PROTOCOL)
+    geoEncoder.save_grid('{}/grid_config_{}.npy'.format(out, taxlevel))
+    geoDictionary = {}
+    for label, label_id in accessionKeyDict.items():
+        encoded = geoEncoded[label]
+        geoDictionary[label] = encoded
+        label_seqs = taxaSeqDict[label]
+        # The first sequence of every label always goes to the training set
+        # so that each class is seen during training.
+        train_seq.append(label_seqs[0])
+        train_labels.append(label_id)
+        train_geo.append(encoded)
+        # The remaining sequences are split at random (about 30% test).
+        for sequence in label_seqs[1:]:
+            if random.random() >= 0.7:
+                test_seq.append(sequence)
+                test_labels.append(label_id)
+                test_geo.append(encoded)
+            else:
+                train_seq.append(sequence)
+                train_labels.append(label_id)
+                train_geo.append(encoded)
+    with open(f"{out}/geoTrain.pkl", "wb") as outfile:
+        pickle.dump(train_geo, outfile, pickle.HIGHEST_PROTOCOL)
+    with open(f"{out}/labelTrain.pkl", "wb") as outfile:
+        pickle.dump(train_labels, outfile, pickle.HIGHEST_PROTOCOL)
+    sys.stdout.write("INFO: Number of sequences in the training dataset: {}\n".format(len(train_seq)))
+    sys.stdout.write("INFO: Number of labels in the training dataset: {}\n".format(len(train_labels)))
+    sys.stdout.write("INFO: Number of labels in the geo dataset: {}\n".format(len(train_geo)))
+    seq_len = len(train_seq[0])
+    batch=256
+    train_dl_geo, val_dl_geo = build_dataloaders_geo(train_seq,
+                                                    train_labels,
+                                                    test_seq,
+                                                    test_labels,
+                                                    train_geo,
+                                                    test_geo,
+                                                    batch)
+    num_classes_seq = check_data_loader_geo(train_dl_geo)+1
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = CNNWithSpatialEncoding(
+        seq_len=seq_len,
+        num_classes=num_classes_seq,
+        spatial_dim=geoEncoder.get_feature_dimension()
+    )
+    criterion = nn.NLLLoss()
+    optimizer = optim.Adam(model.parameters(), lr=0.0001)
+    scheduler = torch.optim.lr_scheduler.StepLR(
+            optimizer, step_size=5, gamma=0.5)
+    num_epochs = 100
+    stats_artifact = EplacerOutputTableTrainDirFormat()
+    stats_dir = str(stats_artifact.path)
+    model = train_and_evaluate_geo(scheduler, sigma, kernel, accessionKeyDict, geoDictionary, stats_dir, taxlevel, model, train_dl_geo, val_dl_geo, criterion, optimizer, num_epochs, device, mask_prob=0.5, geo_encoder=geoEncoder)
+    torch.save(model, '{}/best_geo_model_{}.pth'.format(out, taxlevel))
+    # If a weights file was left in the statistics directory, move it into
+    # the model directory.
+    weights_path = os.path.join(stats_dir, f'best_geo_param_{taxlevel}.pth')
+    permanent_weights_path = os.path.join(out, f'best_geo_param_{taxlevel}.pth')
+    if os.path.exists(weights_path):
+        shutil.move(weights_path, permanent_weights_path)
+    return output_artifact, stats_artifact
+def run_model(fasta: AlignedDNAFASTAFormat,
+              model: EplacerModelDirectoryFormat,
+              blast: BlastOutfmt6DirFormat,
+              counts: biom.Table,
+              geodata: qiime2.Metadata,
+              taxlevel: str = "SPECIES",
+              maskrate: float = 0.01,
+              sigma: float = 1.0,
+              kernel: int = 1,
+              threads: int = 1,
+              confidence: float = 0.9,
+              force: bool = False
+              ) -> typing.Tuple[EplacerOutputTableDirFormat, pd.DataFrame, pd.DataFrame]:
+    """Classify aligned query sequences with a trained ePlacer model.
+
+    Sample coordinates from ``geodata`` are attached to every feature that
+    has a non-zero count in that sample, encoded with the model's grid, and
+    passed with the alignment and the BLAST table to
+    ``run_model_geoOBIS_bootstrap``.
+
+    Parameters
+    ----------
+    fasta : AlignedDNAFASTAFormat
+        Aligned query sequences.
+    model : EplacerModelDirectoryFormat
+        Trained model directory.
+    blast : BlastOutfmt6DirFormat
+        Directory holding ``blast_results.tsv``.
+    counts : biom.Table
+        Feature-by-sample count table.
+    geodata : qiime2.Metadata
+        Per-sample metadata with 'Latitude' and 'Longitude' columns.
+    taxlevel : str
+        Level the model was trained for; selects ``taxa_key_<level>.tsv``
+        and ``grid_config_<level>.npy`` in the model directory.
+    maskrate, sigma, kernel, threads, confidence :
+        Passed through to ``run_model_geoOBIS_bootstrap``.
+    force : bool
+        Not used by this function.
+
+    Returns
+    -------
+    (EplacerOutputTableDirFormat, pandas.DataFrame, pandas.DataFrame)
+        The directory with ``bestGeoPredict.tsv``, then the curated and the
+        consensus taxonomy tables, both indexed by 'Feature ID' with one
+        'Taxon' column.
+    """
+    aligned_fasta_path = str(fasta)
+    blast_path = os.path.join(str(blast.path), 'blast_results.tsv')
+    model_dir_path = str(model.path)
+    # Use the key file that matches the requested level, consistent with
+    # how the grid configuration is selected further down.
+    taxfile = os.path.join(model_dir_path, f"taxa_key_{taxlevel}.tsv")
+    taxonomy_file_path = os.path.join(model_dir_path, "full_taxonomy.tsv")
+    # Map every taxon name to the lineage that leads to it.
+    taxa_lookup = {}
+    with open(taxonomy_file_path, "r") as f:
+        for line in f:
+            parts = line.strip('\n').split('\t')
+            if len(parts) > 1:
+                current_lineage = []
+                for rank in parts[1:]:
+                    rank_clean = rank.strip()
+                    if not rank_clean:
+                        continue
+                    current_lineage.append(rank_clean)
+                    # Create the standard QIIME 2 semicolon-separated format
+                    # and map the specific taxon to its full lineage string
+                    taxa_lookup[rank_clean] = ";".join(current_lineage)
+    geo_df = geodata.to_dataframe()
+    locale_dict = {}
+    for sample_id, row in geo_df.iterrows():
+        locale_dict[sample_id] = (row['Latitude'], row['Longitude'])
+    # convert locale dict to geodict. This tracks what ASVs are
+    # abundant at each site
+    counts_df = counts.to_dataframe(dense=True)
+    geoDict = defaultdict(list)
+    for asv_id, row in counts_df.iterrows():
+        for sample_id, count in row.items():
+            if count > 0:
+                geoDict[asv_id].append(locale_dict[sample_id])
+    # encode the geodict
+    geoEncoder = SpeciesGeoEncoder.load_grid('{}/grid_config_{}.npy'.format(model_dir_path, taxlevel))
+    geoEncoded = geoEncoder.encode_species(geoDict)
+    # in case any ASVs not in dict after encoding. Since this is a default dict, initializing
+    # initializes empty vector
+    # This should probably be addressed upstream in a different version, but mostly arises
+    # from when an ASV is in the count matrix, but has all zeroes.
+    for feature in geoDict:
+        geoEncoded[feature]
+    # The alignment length is taken from the first record; reading stops at
+    # the second header, or at end of file when there is only one record.
+    seq = ''
+    with open(aligned_fasta_path, 'r') as infile:
+        for line in infile:
+            line = line.rstrip()
+            if line.startswith(">"):
+                if seq:
+                    break
+            else:
+                seq += line
+    seqlen = len(seq)
+    if seqlen:
+        sys.stdout.write("INFO: sequence length={}\n".format(seqlen))
+    # The number of classes is the number of distinct label ids in the
+    # first column of the key file.
+    class_set = set()
+    with open(taxfile, 'r') as infile:
+        for line in infile:
+            line = line.rstrip()
+            listall = re.split("\t", line)
+            class_set.add(listall[0])
+    num_classes = len(class_set)
+    sys.stdout.write("INFO: number of classes={}\n".format(num_classes))
+    sys.stdout.write("RUN GEO")
+    output_artifact = EplacerOutputTableDirFormat()
+    out_dir = str(output_artifact.path)
+    high_confidence_predictions = run_model_geoOBIS_bootstrap(
+        out_dir,
+        blast_path,
+        aligned_fasta_path,
+        seqlen,
+        num_classes,
+        taxlevel,
+        taxfile,
+        model_dir_path,
+        geoEncoded,
+        geoEncoder,
+        threads,
+        n_bootstrap=100,
+        sigma=sigma,
+        kernel_size=kernel,
+        maskrate=maskrate,
+        conf_threshold=confidence
+    )
+    curated_records = []
+    raw_records = []
+    tsv_path = os.path.join(out_dir, "bestGeoPredict.tsv")
+    with open(tsv_path, "w") as outfile:
+        outfile.write("ASV\tCurated Taxa\tCurated Taxa Level\tPredicted Taxa\tPredicted Taxa Level\tAll Top Scoring hits\tTop Scoring Mean Probs\tTop Scoring Std Dev Probs\tTop Scoring Geo\tAssignment Note\n")
+        for feature_id, prediction in high_confidence_predictions.items():
+            pt = prediction["predicted taxa"]
+            outfile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
+                feature_id,
+                pt["Curated Taxa"],
+                pt["Curated taxa level"],
+                pt["Consensus Taxa"],
+                pt["Consensus taxa level"],
+                pt["All top scoring hits"],
+                pt["All top scoring probabilities"],
+                pt["All top scoring std deviations"],
+                pt["All top scoring Geography Coverages"],
+                prediction["Homology note"]
+            ))
+            curated_taxon_raw = pt["Curated Taxa"]
+            raw_taxon_raw = pt["Consensus Taxa"]
+            # Fall back to the bare name when the taxon is not in the
+            # taxonomy table.
+            full_curated_lineage = taxa_lookup.get(curated_taxon_raw, curated_taxon_raw)
+            full_raw_lineage = taxa_lookup.get(raw_taxon_raw, raw_taxon_raw)
+            # Each record is keyed by the feature being written, not by a
+            # variable left over from the count-table loop above.
+            curated_records.append({"Feature ID": feature_id, "Taxon": full_curated_lineage})
+            raw_records.append({"Feature ID": feature_id, "Taxon": full_raw_lineage})
+    # Explicit columns keep set_index working when there are no predictions.
+    columns = ["Feature ID", "Taxon"]
+    curated_df = pd.DataFrame(curated_records, columns=columns).set_index("Feature ID")
+    raw_df = pd.DataFrame(raw_records, columns=columns).set_index("Feature ID")
+    return output_artifact, curated_df, raw_df

q2_eplacer/_types.py ADDED Viewed

@@ -0,0 +1,7 @@
+from qiime2.plugin import SemanticType
+# Semantic types declared by the plugin.
+# NOTE(review): both BlastResults and BlastResultsDir are declared; confirm
+# whether both are needed.
+BlastResults = SemanticType('BlastResults')
+BlastResultsDir = SemanticType('BlastResultsDir')
+EplacerModel = SemanticType('EplacerModel')
+EplacerTable = SemanticType('EplacerTable')
+EplacerTableTrain = SemanticType('EplacerTableTrain')

q2_eplacer/_version.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.1.1"

q2_eplacer/citations.bib ADDED Viewed

@@ -0,0 +1,6 @@
+@MISC{Caporaso-Bolyen-2024,
+  title        = "Developing with {QIIME} 2",
+  author       = "{Caporaso, J Gregory and Bolyen, Evan}",
+  year         =  2024,
+  howpublished = "https://develop.qiime2.org"
+}

q2_eplacer/plugin_setup.py ADDED Viewed

@@ -0,0 +1,133 @@
+from qiime2.plugin import Citations, Plugin
+from q2_types.feature_table import FeatureTable, Frequency
+from q2_eplacer import __version__
+from q2_eplacer._methods import run_model, train_model, align_sequences, run_blast
+from q2_eplacer._types import BlastResults, EplacerModel, BlastResultsDir
+from q2_eplacer._formats import BlastOutfmt6Format, EplacerModelDirectoryFormat, BlastOutfmt6DirFormat, EplacerOutputTableDirFormat, EplacerOutputTableTrainDirFormat
+from q2_types.feature_data import DNAFASTAFormat, AlignedDNAFASTAFormat, FeatureData, Sequence, AlignedSequence, Taxonomy
+from q2_types.feature_table import FeatureTable, Frequency
+from ._formats import EplacerModelDirectoryFormat, BlastOutfmt6Format, BlastOutfmt6DirFormat
+from ._types import BlastResults, EplacerModel, EplacerTable, EplacerTableTrain
+from ._formats import BlastOutfmt6DirFormat, EplacerModelDirectoryFormat, EplacerOutputTableFormat
+from qiime2.plugin import Str, Float, Int, Bool, Metadata
+# Load the bibliography shipped inside the q2_eplacer package.
+citations = Citations.load("citations.bib", package="q2_eplacer")
+# The plugin object that every type, format and method below is attached to.
+plugin = Plugin(
+    name="eplacer",
+    version=__version__,
+    website="https://github.com/NEFSC/PEMAD-PBB-ePlacer",
+    package="q2_eplacer",
+    description="ePlacer is a taxonomic classification tool that uses deep-learning approaches to incorporate both sequence information and biogeographic information into taxonomic assignment of DNA sequences.",
+    short_description="ASV classifier with deep-learning and biogeography",
+    # The plugin-level citation of 'Caporaso-Bolyen-2024' is provided as
+    # an example. You can replace this with citations to other references
+    # in citations.bib.
+    citations=[citations['Caporaso-Bolyen-2024']]
+)
+# Register the custom semantic types, the formats, and the mapping from each
+# type to its directory format. BlastResultsDir is registered too: it is
+# mapped to a format below and is the type exchanged between the run_blast
+# and run_model methods.
+plugin.register_semantic_types(BlastResults, BlastResultsDir, EplacerModel, EplacerTable, EplacerTableTrain)
+plugin.register_formats(BlastOutfmt6Format, EplacerModelDirectoryFormat, BlastOutfmt6DirFormat, EplacerOutputTableDirFormat, EplacerOutputTableTrainDirFormat)
+plugin.register_semantic_type_to_format(EplacerModel, artifact_format=EplacerModelDirectoryFormat)
+plugin.register_semantic_type_to_format(BlastResultsDir,artifact_format=BlastOutfmt6DirFormat)
+plugin.register_semantic_type_to_format(EplacerTable, artifact_format=EplacerOutputTableDirFormat)
+plugin.register_semantic_type_to_format(EplacerTableTrain, artifact_format=EplacerOutputTableTrainDirFormat)
+# align-sequences: takes query sequences and a model, returns the aligned
+# query sequences.
+plugin.methods.register_function(
+    function=align_sequences,
+    inputs={'fasta': FeatureData[Sequence],
+           'model': EplacerModel},
+    parameters={'threads': Int},
+    outputs={'aligned_sequences': FeatureData[AlignedSequence]},
+    input_descriptions={'fasta': 'Path to the query file.',
+                       'model': 'Path to the model directory'},
+    parameter_descriptions={'threads': 'number of cores'},
+    output_descriptions={'aligned_sequences': 'aligned fasta file'},
+    name='Align to reference with MAFFT',
+    description="Align query sequences to fasta",
+    citations=[]
+)
+# run-blast: takes query sequences and a model, returns the BLAST results
+# directory.
+plugin.methods.register_function(
+    function=run_blast,
+    inputs={'fasta': FeatureData[Sequence],
+           'model': EplacerModel},
+    parameters={'threads': Int},
+    outputs={'blast': BlastResultsDir},
+    input_descriptions={'fasta': 'Path to the query file.',
+                       'model': 'Path to the model directory'},
+    parameter_descriptions={'threads': 'number of cores'},
+    output_descriptions={'blast': 'blast results'},
+    name='blast',
+    description="blast against reference db",
+    citations=[]
+)
+# run-model: classifies aligned query sequences with a trained model. The
+# descriptions below follow the run_model signature, in which every input
+# is required and maskrate defaults to 0.01.
+plugin.methods.register_function(
+    function=run_model,
+    inputs={'fasta': FeatureData[AlignedSequence],
+        'model': EplacerModel,
+        'blast': BlastResultsDir,
+        'counts': FeatureTable[Frequency]},
+    parameters={'geodata': Metadata,
+        'taxlevel': Str,
+        'maskrate': Float,
+        'sigma': Float,
+        'kernel': Int,
+        'threads': Int,
+        'confidence': Float,
+        'force': Bool},
+    outputs=[('eplacer_table', EplacerTable),
+        ('curated_taxonomy', FeatureData[Taxonomy]),
+        ('raw_taxonomy', FeatureData[Taxonomy])],
+    input_descriptions={'fasta': 'Query sequences aligned to the reference alignment of the model.',
+        'model': 'The pre-trained model artifact to be applied.',
+        'blast': 'Results from blastn of sequences against database (outfmt 6).',
+        'counts': 'Abundance data in the format of a count matrix.'},
+    parameter_descriptions={'geodata': 'Known geographic information (SampleID, Latitude, Longitude) for each sample.',
+        'taxlevel': 'Specify the taxonomic level the model was trained for.',
+        'maskrate': 'Defines the frequency with which each base is masked. Default: 0.01',
+        'sigma': 'Defines the standard deviation of the gaussian kernel.',
+        'kernel': 'Defines the size of the gaussian kernel.',
+        'threads': 'Specify available threads.',
+        'confidence': 'Specify a confidence threshold.',
+        'force': 'Force overwrite of preexisting files. Default: False'},
+    output_descriptions={'eplacer_table': 'The resulting taxonomic assignments in ePlacer format',
+                        'curated_taxonomy': 'Qiime object for curated taxonomy',
+                        'raw_taxonomy': 'Qiime object for raw predictions',},
+    name='Run ePlacer Model',
+    description=("Classify sequence and (if provided) geographic "
+                 "data to a taxonomic assignment using a pre-trained model."),
+    citations=[]
+)
+# train-model: trains a classifier from reference sequences, taxonomy and
+# occurrence data. num_augments is an integer count (train_model uses it in
+# range()), so it is registered as Int.
+plugin.methods.register_function(
+    function=train_model,
+    inputs={'fasta': FeatureData[Sequence],
+            'alignedfasta': FeatureData[AlignedSequence]},
+    parameters={'geodata': Metadata,
+                'taxonomy': Metadata,
+                'taxlevel': Str,
+                'maskrate': Float,
+                'sigma': Float,
+                'kernel': Int,
+                'precision': Int,
+                'num_augments': Int},
+    outputs=[('model', EplacerModel), ('training_stats', EplacerTableTrain)],
+    input_descriptions={'fasta': 'Path to the reference database in fasta format. Accessions must match the IDs in the taxa file. Unaligned',
+        'alignedfasta': 'Path to the reference database in fasta format. Accessions must match the IDs in the taxa file. Aligned'},
+    parameter_descriptions={'geodata': 'Known geographic information (SampleID, Latitude, Longitude) for each taxa.',
+                            'taxonomy': 'Taxonomy Table, tsv.',
+                            'taxlevel': 'Specify the taxonomic level the model was trained for.',
+                            'maskrate': 'Defines the frequency with which each base is masked. Default: 0.01',
+                            'sigma': 'Defines the standard deviation of the gaussian kernel.',
+                            'kernel': 'Defines the size of the gaussian kernel.',
+                            'precision': 'Geohash precision',
+                            'num_augments': 'Number of sequence augments to perform.'},
+    output_descriptions={'model': 'The best performing deep learning epoch', 'training_stats': 'Evaluation statistics generated during training.'},
+    name='Train ePlacer Model',
+    description=("Train a deep learning classifier."),
+    citations=[]
+)

q2_eplacer/tests/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+# flake8: noqa
+# ----------------------------------------------------------------------------
+# Copyright (c) 2024, Christopher Powers.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------

q2_eplacer/tests/data/alignedSeqs.qza ADDED Viewed

Binary file

q2_eplacer/tests/data/full_taxonomy.tsv ADDED Viewed

@@ -0,0 +1,17 @@
+sampleid	Kingdom	Phylum	Class	Order	Family	Genus	Species
+A	Eukaryota	Chordata	Actinopteri	OrderA	FamilyA	GenusA	SpeciesA
+B	Eukaryota	Chordata	Actinopteri	OrderBC	FamilyBC	GenusBC	SpeciesB
+C	Eukaryota	Chordata	Actinopteri	OrderBC	FamilyBC	GenusBC	SpeciesC
+D	Eukaryota	Chordata	Actinopteri	OrderDO	FamilyDO	GenusDO	SpeciesD
+E	Eukaryota	Chordata	Actinopteri	OrderEP	FamilyEP	GenusEP	SpeciesE
+F	Eukaryota	Chordata	Actinopteri	OrderFI	FamilyFI	GenusFI	SpeciesF
+G	Eukaryota	Chordata	Actinopteri	OrderGJ	FamilyGJ	GenusGJ	SpeciesG
+H	Eukaryota	Chordata	Actinopteri	OrderHK	FamilyHK	GenusH	SpeciesH
+I	Eukaryota	Chordata	Actinopteri	OrderFI	FamilyFI	GenusFI	SpeciesI
+J	Eukaryota	Chordata	Actinopteri	OrderGJ	FamilyGJ	GenusGJ	SpeciesG
+K	Eukaryota	Chordata	Actinopteri	OrderHK	FamilyHK	GenusK	SpeciesK
+L	Eukaryota	Chordata	Actinopteri	OrderL	FamilyL	GenusL	SpeciesL
+M	Eukaryota	Chordata	Actinopteri	OrderM	FamilyM	GenusM	SpeciesM
+N	Eukaryota	Chordata	Actinopteri	OrderN	FamilyN	GenusN	SpeciesN
+O	Eukaryota	Chordata	Actinopteri	OrderDO	FamilyDO	GenusDO	SpeciesO
+P	Eukaryota	Chordata	Actinopteri	OrderEP	FamilyEP	GenusEP	SpeciesP

q2_eplacer/tests/data/geoData.tsv ADDED Viewed

@@ -0,0 +1,17 @@
+sampleid	Latitude	Longitude
+A	39.645946	-71.746641
+B	39.645946	-71.746641
+C	39.645946	-71.746641
+D	39.645946	-71.746641
+E	39.645946	-71.746641
+F	39.645946	-71.746641
+G	39.645946	-71.746641
+H	39.645946	-71.746641
+I	46.433867	-126.20164
+J	46.433867	-126.20164
+K	46.433867	-126.20164
+L	46.433867	-126.20164
+M	46.433867	-126.20164
+N	46.433867	-126.20164
+O	46.433867	-126.20164
+P	46.433867	-126.20164

q2_eplacer/tests/data/geoData_run.tsv ADDED Viewed

@@ -0,0 +1,3 @@
+sampleid	Latitude	Longitude
+Sample1	39.645946	-71.746641
+Sample2	39.645946	-71.746641

q2_eplacer/tests/data/seqs.qza ADDED Viewed

Binary file

q2_eplacer/tests/data/testfasta.fa ADDED Viewed

@@ -0,0 +1,6 @@
+>ASV1
+GCCGTAAACTTAGATAAATTAGTACAACAAATATCGGCCCGGGAACTACGAGCGCCAGCTTATAACCCAAAGGACTTGGCGCTGCTTCAGACCCCCCT
+>ASV2
+GCGGTAAACTTAGATATATTAGTACAACAAATATCGGCCCGGGAACTACGAGCGCCTGCTTAAAACCCAAAGGTCTTGGCGGTGCTTCAGACCCCCCT
+>ASV3
+GCGGTAAACTTAGATATATTAGTACAACAAATATCGGCCCGGGAACTACGAGCGCCTGCTTAAAACCCAAAGGTCTTGGCGGTGCTTCAGACCCCCCT

q2_eplacer/tests/data/testfasta.qza ADDED Viewed

Binary file

q2_eplacer/tests/test_methods.py ADDED Viewed

@@ -0,0 +1,113 @@
+import pandas as pd
+import pandas.testing as pdt
+from qiime2.plugin.testing import TestPluginBase
+from qiime2.plugin.util import transform
+from q2_types.feature_table import BIOMV100Format
+import os
+import unittest
+import qiime2
+from q2_eplacer._methods import train_model, run_model
+from q2_eplacer._formats import EplacerModelDirectoryFormat, EplacerOutputTableTrainDirFormat
+from q2_types.feature_data import DNAFASTAFormat, AlignedDNAFASTAFormat
+from q2_types.feature_table import FeatureTable, Frequency
+import biom
+from q2_eplacer._formats import (
+    BlastOutfmt6DirFormat,
+    EplacerOutputTableDirFormat
+)
+import numpy as np
+class TestEplacerTraining(unittest.TestCase):
+    """
+    Simple test case to test training and inference.
+    Checks for output file generation
+    """
+    def setUp(self):
+        self.data_dir = os.path.join(os.path.dirname(__file__), 'data')
+        self.fasta_path = os.path.join(self.data_dir, 'seqs.qza')
+        self.aligned_fasta_path = os.path.join(self.data_dir, 'alignedSeqs.qza')
+        self.taxonomy_path = os.path.join(self.data_dir, 'full_taxonomy.tsv')
+        self.geodata_path = os.path.join(self.data_dir, 'geoData.tsv')
+        self.fasta = qiime2.Artifact.load(self.fasta_path)
+        self.alignedfasta = qiime2.Artifact.load(self.aligned_fasta_path)
+        self.taxonomy = qiime2.Metadata.load(os.path.join(self.data_dir, 'full_taxonomy.tsv'))
+        self.geodata = qiime2.Metadata.load(os.path.join(self.data_dir, 'geoData.tsv'))
+        # inference data
+        self.aligned_fasta_path = os.path.join(self.data_dir, 'alignedSeqs.qza')
+        self.geodata_path = os.path.join(self.data_dir, 'geoData_run.tsv')
+        self.alignedfasta = qiime2.Artifact.load(self.aligned_fasta_path)
+        self.geodata = qiime2.Metadata.load(self.geodata_path)
+        count_matrix = np.array([
+            [15, 0],   # ASV1 is present in Sample1
+            [5,  22],  # ASV2 is present in both
+            [0,  10]   # ASV3 is present in Sample2
+        ])
+        observ_ids = ['ASV1', 'ASV2', 'ASV3']
+        sample_ids = ['Sample1', 'Sample2']
+        self.counts = biom.Table(count_matrix, observ_ids, sample_ids)
+    def test_train_model_execution(self):
+        """Test that train_model executes completely and returns valid directory formats."""
+        model_out, stats_out = train_model(
+            fasta=self.fasta.view(DNAFASTAFormat),
+            alignedfasta=self.alignedfasta.view(AlignedDNAFASTAFormat),
+            taxonomy=self.taxonomy,
+            geodata=self.geodata,
+            taxlevel="SPECIES",
+            num_augments=100,  # Keep it ultra-low (like 2) so test cases execute in 3 seconds!
+            maskrate=0.01,
+            sigma=1,
+            kernel=3,
+            precision=2
+        )
+        blast_dir_artifact = BlastOutfmt6DirFormat()
+        blast_file_path = os.path.join(str(blast_dir_artifact.path), 'blast_results.tsv')
+        with open(blast_file_path, 'w') as bf:
+            bf.write("ASV1\tA\t100.00\t1e-100\t98\t98\t98\t1\t98\t1\t98\n")
+            bf.write("ASV2\tB\t99.00\t1e-100\t98\t98\t98\t1\t98\t1\t98\n")
+            bf.write("ASV3\tC\t100.00\t\1e-100\\t98\t98\t98\t1\t98\t1\t98\n")
+        self.assertIsInstance(model_out, EplacerModelDirectoryFormat)
+        self.assertIsInstance(stats_out, EplacerOutputTableTrainDirFormat)
+        model_path = str(model_out.path)
+        stats_path = str(stats_out.path)
+        self.assertTrue(os.path.exists(os.path.join(model_path, 'config.yml')))
+        self.assertTrue(os.path.exists(os.path.join(model_path, 'geoEncoder.pkl')))
+        self.assertTrue(os.path.exists(os.path.join(stats_path, 'model_geo_stats.tsv')))
+        custom_dir_out, curated_df, consensus_df = run_model(
+            fasta=self.alignedfasta.view(AlignedDNAFASTAFormat),
+            model=model_out,
+            blast=blast_dir_artifact,
+            counts=self.counts,
+            geodata=self.geodata,
+            taxlevel="SPECIES",
+            maskrate=0.01,
+            sigma=1.0,
+            kernel=1,
+            threads=1,
+            confidence=0.9,
+            force=False
+        )
+        self.assertIsInstance(custom_dir_out, EplacerOutputTableDirFormat)
+        self.assertIsInstance(curated_df, pd.DataFrame)
+        self.assertIsInstance(consensus_df, pd.DataFrame)
+        inference_path = str(custom_dir_out.path)
+        expected_prediction_file = os.path.join(inference_path, "bestGeoPredict.tsv")
+        self.assertTrue(os.path.exists(expected_prediction_file))
+# Run this module's tests when the file is executed directly as a script.
+if __name__ == '__main__':
+    unittest.main()

q2_eplacer-0.1.1.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,215 @@
+Metadata-Version: 2.4
+Name: q2-eplacer
+Version: 0.1.1
+Summary: ASV classifier with deep-learning and biogeography
+Author-email: Christopher Powers <christopher.powers@noaa.gov>
+License: Software code created by U.S. Government employees is not subject to copyright in the United States (17 U.S.C. §105).
+        The United States/Department of Commerce reserve all rights to seek and obtain copyright protection in countries other
+        than the United States for Software authored in its entirety by the Department of Commerce. To this end, the Department
+        of Commerce hereby grants to Recipient a royalty-free, nonexclusive license to use, copy, and create derivative works of
+        the Software outside of the United States.
+Project-URL: Homepage, https://github.com/NEFSC/PEMAD-PBB-q2-ePlacer
+Project-URL: Repository, https://github.com/NEFSC/PEMAD-PBB-q2-ePlacer
+Project-URL: Bug Tracker, https://github.com/NEFSC/PEMAD-PBB-q2-ePlacer/issues
+Keywords: qiime2,microbiome,taxonomy,deep-learning,biogeography
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE.txt
+Requires-Dist: qiime2
+Requires-Dist: pandas
+Requires-Dist: biom-format
+Requires-Dist: eplacer
+Dynamic: license-file
+# q2-eplacer
+A [QIIME 2](https://qiime2.org) plugin [developed](https://develop.qiime2.org) by Christopher Powers (christopher.powers@noaa.gov) that allows the [ePlacer taxonomic classifier](https://github.com/NEFSC/PEMAD-PBB-ePlacer/) to interface with QIIME2.
+ePlacer is a taxonomic classification tool that uses deep-learning approaches to incorporate both sequence information and biogeographic information into taxonomic assignment of DNA sequences.
+## Why use ePlacer
+The machine learning architecture of ePlacer enables powerful prediction beyond sequence-only classification tools (e.g. sequence alignment with blast or naive-bayes classifiers) by directly incorporating additional data into the probabilistic estimate of taxonomy, specifically developed for metabarcoding data. This novel application of deep-learning is immensely useful, as there can be many cases in metabarcoding data where two reference species have 100% sequence overlap, but distinct geographic ranges. This tool discriminates these cases and provides additional data for downstream taxonomic curation. Due to this, ePlacer provides enhanced interoperability between metabarcoding datasets.
+Currently, ePlacer offers pre-trained models for two popular metabarcoding regions: the [MiFish](https://doi.org/10.1007/s12562-020-01461-x) and the [ecoPrimer, or Riaz,](https://doi.org/10.1093/nar/gkr732) marker gene regions. For these two regions, ePlacer offers the following benefits:
+* **Interoperability.** ePlacer is trained on global datasets, allowing for direct comparison between metabarcoding datasets, regardless of geographic region.
+* **Portability.** ePlacer has pre-trained models available for both MiFish and Riaz marker gene regions containerized and available for out-of-the-box use
+* **Increased Accuracy.** The ePlacer model architecture provides increased accuracy, precision, and recall as compared to blast, Naive-Bayes, or least common ancestor approaches.
+* **Trainability** In addition to the two provided barcodes, this code repository provides tools for training new models.
+For other barcode regions, there will be significant advantages with the training of new models. If you are interested in training a new model for ePlacer, please do not hesitate to reach out!
+## Installation instructions
+**The following instructions are intended to be a starting point** and should be replaced when `q2-eplacer` is ready to share with others.
+They will enable you to install the most recent *development* version of `q2-eplacer`.
+Remember that *release* versions should be used for all "real" work (i.e., where you're not testing or prototyping) - if there aren't instructions for installing a release version of this plugin, it is probably not yet intended for use in practice.
+### Install Prerequisites
+[Miniconda](https://conda.io/miniconda.html) provides the `conda` environment and package manager, and is currently the only supported way to install QIIME 2.
+Follow the instructions for downloading and installing Miniconda.
+After installing Miniconda and opening a new terminal, make sure you're running the latest version of `conda`:
+```bash
+conda update conda
+```
+You also need to install the base qiime2 as a conda environment. Follow the [install instructions here](https://docs.qiime2.org/2024.10/install/native/).
+###  Install `q2-eplacer`
+Next, you will install the ePlacer qiime plugin from pip
+```bash
+pip install q2-eplacer
+```
+This will also install all other required dependencies.
+## Using `q2-eplacer`
+### Data preparation
+In order to use ePlacer, you must first prepare the data before running it, including prepping input data and collecting a pre-trained model for inference.
+#### Pre-Trained models
+Currently, two pre-trained models are available: the [MiFish](https://doi.org/10.1007/s12562-020-01461-x) and the [ecoPrimer, or Riaz,](https://doi.org/10.1093/nar/gkr732) marker gene regions. These are available in a QIIME2-compatible format:
+```bash
+# Mifish marker
+wget https://zenodo.org/records/20820029/files/mifish.qza
+# Riaz marker
+wget https://zenodo.org/records/20820029/files/riaz.qza
+```
+If desired, users can also train new models, see below in section `Training New Models`. Any new, high performing models may be added to a Zenodo record by reaching out to the maintainers.
+If you trained a new model with the qiime2 plugin, it will be automatically formatted into the `.qza` format. Otherwise, run the following:
+```bash
+qiime tools import \
+    --type EplacerModel \
+    --input-path ./model/ \
+    --output-path model.qza
+```
+##### Prepping input data.
+In addition to the models, users must import their input data properly. Input data formatting requirements may be seen in documentation for the original [ePlacer](https://github.com/NEFSC/PEMAD-PBB-ePlacer) package.
+##### Sequence data
+Input sequence data is required in fasta format, which can be imported into QIIME2 formats with the following:
+```bash
+qiime tools import \
+    --type "FeatureData[Sequence]" \
+    --input-path seqs.fa \
+    --output-path seqs.qza
+```
+The sequence data should also be aligned, which can be done with the q2-eplacer function:
+```bash
+qiime eplacer align-sequences --i-fasta ./seqs.qza \
+    --i-model ./model.qza \
+    --o-aligned-sequences ./aligned_seqs.qza \
+    --p-threads 8
+```
+##### Count data
+Count data must first be converted to a `.biom` format, then to a `.qza` format
+```bash
+biom convert -i ./counts.tsv \
+    -o ./counts.biom \
+    --table-type="OTU table" \
+    --to-hdf5
+qiime tools import --input-path ./counts.biom \
+    --type 'FeatureTable[Frequency]' \
+    --input-format BIOMV210Format \
+    --output-path ./counts.qza
+```
+##### geoData
+The geographic data can be read in as a metadata file, and requires no further transformations.
+##### blast data
+Although not used by the machine learning model, blast results are incredibly useful for screening the results for mismatches when 100% matches are possible. Thus, a function for generating the blast results is also included:
+```bash
+qiime eplacer run-blast \
+    --i-fasta ./seqs.qza \
+    --i-model ./model.qza \
+    --o-blast ./hits.qza \
+    --p-threads 8
+```
+Note the unaligned sequences were used for blast.
+### Running the model
+Congratulations! You are ready to run `ePlacer`!
+```bash
+qiime eplacer run-model-qiime \
+    --i-fasta ./aligned_seqs.qza \
+    --i-model ./model.qza \
+    --i-blast ./hits.qza \
+    --i-counts ./counts.qza \
+    --m-geodata-file ./geoData.tsv \
+    --o-eplacer-table ./ePlacerAssignment.qza \
+    --o-curated-taxonomy ./qiimeAssignmentCurated.qza \
+    --o-raw-taxonomy ./qiimeAssignmentRaw.qza
+qiime tools export --input-path ./ePlacerAssignment.qza \
+    --output-path ./ePlacerAssignment
+qiime tools export --input-path ./qiimeAssignmentCurated.qza \
+    --output-path ./qiimeAssignmentCurated
+qiime tools export --input-path ./qiimeAssignmentRaw.qza \
+    --output-path ./qiimeAssignmentRaw
+```
+You may notice there are three output files present. These are three different output formats. The first, `--o-eplacer-table`, contains the native ePlacer output format described in the [ePlacer repository](https://github.com/NEFSC/PEMAD-PBB-ePlacer). The second, `--o-curated-taxonomy`, outputs the curated assignments in QIIME2 compatible format. The third, `--o-raw-taxonomy`, outputs the raw taxonomic assignments in QIIME2 compatible format.
+##### A special note
+As with all other taxonomic assignment tools, all taxonomic assignments should still be manually curated after assignment. ePlacer exhibits higher accuracy than other tools, but is not perfect.
+### Training the model
+The QIIME2 implementation of ePlacer also supports training new models. File format requirements are detailed in the [ePlacer repository](https://github.com/NEFSC/PEMAD-PBB-ePlacer).
+```bash
+qiime eplacer train-model \
+    --i-fasta ./unalignedSeqs.qza \
+    --i-alignedfasta ./alignedSeqs.qza \
+    --m-taxonomy-file ./taxonomy.tsv \
+    --m-geodata-file ./geoData.tsv \
+    --p-num-augments 100 \
+    --o-model ./toyModel.qza \
+    --o-training-stats ./stats.qza \
+    --verbose
+```
+## About
+The `q2-eplacer` Python package was [created from a template](https://develop.qiime2.org/en/stable/plugins/tutorials/create-from-template.html).
+To learn more about `q2-eplacer`, refer to the [project website](https://github.com/NEFSC/PEMAD-PBB-ePlacer).
+To learn how to use QIIME 2, refer to the [QIIME 2 User Documentation](https://docs.qiime2.org).
+To learn QIIME 2 plugin development, refer to [*Developing with QIIME 2*](https://develop.qiime2.org).
+`q2-eplacer` is a QIIME 2 community plugin, meaning that it is not necessarily developed and maintained by the developers of QIIME 2.
+Please be aware that because community plugins are developed by the QIIME 2 developer community, and not necessarily the QIIME 2 developers themselves, some may not be actively maintained or compatible with current release versions of the QIIME 2 distributions.
+More information on development and support for community plugins can be found [here](https://library.qiime2.org).
+If you need help with a community plugin, first refer to the [project website](https://github.com/NEFSC/PEMAD-PBB-ePlacer).
+If that page doesn't provide information on how to get help, or you need additional help, head to the [Community Plugins category](https://forum.qiime2.org/c/community-contributions/community-plugins/14) on the QIIME 2 Forum where the QIIME 2 developers will do their best to help you.
+==============================================================
+This repository is a scientific product and is not official communication of the National Oceanic and Atmospheric Administration, or the United States Department of Commerce. All NOAA GitHub project code is provided on an ‘as is’ basis and the user assumes responsibility for its use. Any claims against the Department of Commerce or Department of Commerce bureaus stemming from the use of this GitHub project will be governed by all applicable Federal law. Any reference to specific commercial products, processes, or services by service mark, trademark, manufacturer, or otherwise, does not constitute or imply their endorsement, recommendation or favoring by the Department of Commerce. The Department of Commerce seal and logo, or the seal and logo of a DOC bureau, shall not be used in any manner to imply endorsement of any commercial product or activity by DOC or the United States Government.

q2_eplacer-0.1.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,22 @@
+q2_eplacer/__init__.py,sha256=6ORwOQAjsHowxrxrg48pKOzqRNQzOFySMHw8adKlJZQ,122
+q2_eplacer/_formats.py,sha256=Esiwas_qZlit8WNOXDMHk8ehqPrYfjy8rmz5dNo4akI,3628
+q2_eplacer/_methods.py,sha256=66f-rO98J95MxnWt4RB09sXajn7_7ZiqhJgfCXT9K4o,19349
+q2_eplacer/_types.py,sha256=dser_tJu_CdjrH3aZSZKijTDDn62lU11bwbNMQivlWY,275
+q2_eplacer/_version.py,sha256=rnObPjuBcEStqSO0S6gsdS_ot8ITOQjVj_-P1LUUYpg,22
+q2_eplacer/citations.bib,sha256=SQQV_2H8nQ1XH5nRDJXZeKUqBVeD4n_MjF11pObZT-U,205
+q2_eplacer/plugin_setup.py,sha256=QNl4cBB5EJn6W_wPMrARX55V8ccpy9VsOnV8UsS23H4,7387
+q2_eplacer/tests/__init__.py,sha256=FBJL0kNtj1xr82tEEaR_C8UxijJPsTYyLF8xuctS7ok,353
+q2_eplacer/tests/test_methods.py,sha256=LZeemrRMJq-W4kYIpH33k6Vl85ycEv54zShNHA29nik,4626
+q2_eplacer/tests/data/alignedSeqs.qza,sha256=DYfKgn6cHnBs2DcId6xmy4fb5IcxFBVpvkP-XeDkheU,16174
+q2_eplacer/tests/data/full_taxonomy.tsv,sha256=vRvZ-ZadTSieuzNFyMUU5ae1KG4l9esn3T8XEK3I-kg,1115
+q2_eplacer/tests/data/geoData.tsv,sha256=w5CIberQCftmZB24ixNlOL9jQM-pnyP8m46Wq4lTjYw,396
+q2_eplacer/tests/data/geoData_run.tsv,sha256=SoMG5fdjs0KOH2Jnnv7Upp8OEsOACrhfATnZtGbriaE,86
+q2_eplacer/tests/data/seqs.qza,sha256=srsqDU9enAbRdWWViZJgHq27UUp8S_pCx7Wt_hnNIDg,16133
+q2_eplacer/tests/data/testfasta.fa,sha256=iN2modoscNH8qNAnXh8rDAqB2FQ_YgEA8cgWYy3u62c,315
+q2_eplacer/tests/data/testfasta.qza,sha256=cJnekUyV0MvZz-INAuHL-4PZP__Qtge3gQjNzLqTc4Y,16037
+q2_eplacer-0.1.1.dist-info/licenses/LICENSE.txt,sha256=8vpWtuzfqNhxsr4BFgox7QcAimDo6ewY1Rt9MdpfUS0,525
+q2_eplacer-0.1.1.dist-info/METADATA,sha256=MkCNX2afMwR0vp6FX5Gju2chd5FaKsn--GHhLhO49kY,12180
+q2_eplacer-0.1.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+q2_eplacer-0.1.1.dist-info/entry_points.txt,sha256=9lLaVQHujRxUs0NStaU5m6pACjoZ7dgIIYhXETrlXGU,61
+q2_eplacer-0.1.1.dist-info/top_level.txt,sha256=3Arq1r1V9XJKEZXixwht-AMDFXNnt9Vlr0FiffxtsUc,11
+q2_eplacer-0.1.1.dist-info/RECORD,,

q2_eplacer-0.1.1.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

q2_eplacer-0.1.1.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [qiime2.plugins]
2	+ q2-eplacer = q2_eplacer.plugin_setup:plugin

q2_eplacer-0.1.1.dist-info/licenses/LICENSE.txt ADDED Viewed

@@ -0,0 +1,5 @@
+Software code created by U.S. Government employees is not subject to copyright in the United States (17 U.S.C. §105).
+The United States/Department of Commerce reserve all rights to seek and obtain copyright protection in countries other
+than the United States for Software authored in its entirety by the Department of Commerce. To this end, the Department
+of Commerce hereby grants to Recipient a royalty-free, nonexclusive license to use, copy, and create derivative works of
+the Software outside of the United States.

q2_eplacer-0.1.1.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ q2_eplacer