q2-eplacer 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
q2_eplacer/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # flake8: noqa
2
+
3
+ try:
4
+ from ._version import __version__
5
+ except ModuleNotFoundError:
6
+ __version__ = '0.0.0+notfound'
q2_eplacer/_formats.py ADDED
@@ -0,0 +1,80 @@
1
+ import qiime2.plugin.model as model
2
+ import csv
3
class BlastOutfmt6Format(model.TextFileFormat):
    """Tab-separated blastn output (outfmt 6) with twelve columns per record.

    Expected column order: qseqid, sseqid, pident, evalue, length, qlen,
    slen, qstart, qend, sstart, send, sseq.
    """

    def validate(self, level):
        """Check the column count and the numeric columns of each record.

        Parameters
        ----------
        level : str
            When equal to ``'min'`` only the first ten lines of the file are
            inspected; any other value inspects the whole file.

        Raises
        ------
        model.ValidationError
            If a non-empty line does not have exactly 12 columns, or if
            columns 3-5 (pident, evalue, length) are not numeric.
        """
        row_limit = 10 if level == 'min' else None
        # Converters applied, in order, to pident, evalue and length.
        numeric_checks = ((2, float), (3, float), (4, int))

        with open(str(self.path), 'r') as handle:
            for index, fields in enumerate(csv.reader(handle, delimiter='\t')):
                # The limit counts physical lines, blank ones included.
                if row_limit is not None and index >= row_limit:
                    break
                if not fields:
                    continue

                n_fields = len(fields)
                if n_fields != 12:
                    raise model.ValidationError(
                        f"Invalid BLAST format on line {index+1}. "
                        f"Expected exactly 12 columns, but found {n_fields}."
                    )

                try:
                    for column, convert in numeric_checks:
                        convert(fields[column])
                except ValueError:
                    raise model.ValidationError(
                        f"Invalid data type on line {index+1}. "
                        "Columns like 'pident', 'evalue', and 'length' must be numeric."
                    )
38
+
39
class BlastOutfmt6DirFormat(model.DirectoryFormat):
    """Directory format containing exactly one tabular BLAST output file."""
    # The member file is matched by the pattern below (the dot is escaped, so
    # the expected name is ``blast_results.tsv``) and is associated with
    # BlastOutfmt6Format.
    blast_file = model.File(r'blast_results\.tsv', format=BlastOutfmt6Format)
43
+
44
+
45
class ePlacerTextFileFormat(model.TextFileFormat):
    """Text member of an ePlacer model directory; its content is not checked."""

    def validate(self, level):
        """Accept the file unconditionally, whatever *level* is requested."""
        return None
48
+
49
class ePlacerBinaryFileFormat(model.BinaryFileFormat):
    """Binary member of an ePlacer model directory; its content is not checked."""

    def validate(self, level):
        """Accept the file unconditionally, whatever *level* is requested."""
        return None
52
+
53
class EplacerModelDirectoryFormat(model.DirectoryFormat):
    """Format representing the eplacer pre-trained model directory."""
    # Training hyper-parameters dumped as YAML.
    config = model.File('config.yml', format=ePlacerTextFileFormat)
    # Pickled geographic encoder object.
    geo_encoder = model.File('geoEncoder.pkl', format=ePlacerBinaryFileFormat)
    # Pickled mapping from taxon name to integer class label.
    accession_dict = model.File('accessionKeyDict.pkl', format=ePlacerBinaryFileFormat)
    # The next four patterns contain a wildcard; in this package the wildcard
    # part is filled with the taxonomic level the model was trained for.
    grid_config = model.File(r'grid_config_.*\.npy', format=ePlacerBinaryFileFormat)
    best_model = model.File(r'best_geo_model_.*\.pth', format=ePlacerBinaryFileFormat)
    best_param = model.File(r'best_geo_param_.*\.pth', format=ePlacerBinaryFileFormat)
    taxa_key = model.File(r'taxa_key_.*\.tsv', format=ePlacerTextFileFormat)
    # Reference alignment, unaligned reference sequences and taxonomy table.
    # NOTE(review): the dots in these three patterns are not escaped.
    alignment = model.File(r'alignment.fa', format=ePlacerTextFileFormat)
    fasta = model.File(r'reference.fa', format=ePlacerTextFileFormat)
    taxfile = model.File(r'full_taxonomy.tsv', format=ePlacerTextFileFormat)
    # Pickled geographic encodings and class labels of the training set.
    geopkl = model.File(r'geoTrain.pkl', format=ePlacerBinaryFileFormat)
    labelpkl = model.File(r'labelTrain.pkl', format=ePlacerBinaryFileFormat)
67
+
68
class EplacerOutputTableFormat(model.TextFileFormat):
    """Text table of ePlacer predictions; its content is not checked."""

    def validate(self, level):
        """Accept the file unconditionally, whatever *level* is requested."""
        return None
71
+
72
class EplacerOutputTableTrainFormat(model.TextFileFormat):
    """Text table of ePlacer training statistics; its content is not checked."""

    def validate(self, level):
        """Accept the file unconditionally, whatever *level* is requested."""
        return None
75
+
76
class EplacerOutputTableDirFormat(model.DirectoryFormat):
    """Directory format holding the ePlacer prediction table."""
    # Single member file, expected under the name ``bestGeoPredict.tsv``.
    predictions = model.File('bestGeoPredict.tsv', format=EplacerOutputTableFormat)
78
+
79
class EplacerOutputTableTrainDirFormat(model.DirectoryFormat):
    """Directory format holding the ePlacer training statistics table."""
    # Single member file, expected under the name ``model_geo_stats.tsv``.
    predictions = model.File('model_geo_stats.tsv', format=EplacerOutputTableTrainFormat)
q2_eplacer/_methods.py ADDED
@@ -0,0 +1,472 @@
1
+ import pandas as pd
2
+ import eplacer as ep
3
+ from eplacer import external
4
+ from eplacer.geographicRep import SpeciesGeoEncoder
5
+ from eplacer.run_model import run_model_geoOBIS_bootstrap
6
+ from eplacer.train_evaluate import mask_sequence, check_data_loader_geo, train_and_evaluate_geo
7
+ import subprocess
8
+ import qiime2
9
+ from q2_types.feature_data import DNAFASTAFormat, AlignedDNAFASTAFormat, FeatureData, Sequence, AlignedSequence
10
+ from q2_types.feature_table import FeatureTable, Frequency
11
+ from ._formats import EplacerModelDirectoryFormat, BlastOutfmt6Format, BlastOutfmt6DirFormat, EplacerOutputTableFormat, EplacerOutputTableTrainFormat
12
+ from ._types import BlastResults, EplacerModel, BlastResultsDir, EplacerTableTrain, EplacerTable
13
+ from ._formats import BlastOutfmt6DirFormat, EplacerModelDirectoryFormat, EplacerOutputTableDirFormat, EplacerOutputTableTrainDirFormat
14
+ from qiime2.plugin import Str, Float, Int, Bool, Metadata
15
+ import biom
16
+ import os
17
+ import tempfile
18
+ from collections import defaultdict
19
+ import sys
20
+ import re
21
+ import numpy as np
22
+ import pickle
23
+ import yaml
24
+ import time
25
+ import random
26
+ from tqdm import tqdm
27
+ from sklearn.metrics import accuracy_score, f1_score
28
+ import shutil
29
+ from eplacer.models import build_dataloaders_geo, CNNWithSpatialEncoding, GeographicAuxiliaryLoss
30
+ import torch
31
+ from torch import nn
32
+ import torch.nn.functional as F
33
+ import torch.optim as optim
34
+ import typing
35
+
36
def align_sequences(fasta: DNAFASTAFormat,
                    model: EplacerModelDirectoryFormat,
                    threads: int = 1) -> AlignedDNAFASTAFormat:
    """Align query sequences against the model's reference alignment.

    Parameters
    ----------
    fasta : DNAFASTAFormat
        Query sequences; the file path is handed to MAFFT.
    model : EplacerModelDirectoryFormat
        Model directory; its ``alignment.fa`` is used as the reference.
    threads : int
        Thread count, passed to ``external.run_mafft`` as a string.

    Returns
    -------
    AlignedDNAFASTAFormat
        New format object whose path is given to ``external.run_mafft`` as
        ``subset_output``.
    """
    result = AlignedDNAFASTAFormat()
    reference = os.path.join(str(model.path), "alignment.fa")

    # The full MAFFT output is only an intermediate file, so it is written to
    # a directory that is removed once the call returns.
    with tempfile.TemporaryDirectory() as scratch:
        full_output = os.path.join(scratch, 'mafft_total_output.afa')
        print("Invoking MAFFT via ePlacer...")
        external.run_mafft(
            input=str(fasta),
            threads=str(threads),
            reference=reference,
            moutput=full_output,
            subset_output=str(result.path)
        )

    return result
56
+
57
def run_blast(fasta: DNAFASTAFormat,
              model: EplacerModelDirectoryFormat,
              threads: int = 1) -> BlastOutfmt6DirFormat:
    """Run blastn of the query sequences against the model's reference.

    A nucleotide BLAST database is built from ``reference.fa`` in the model
    directory, then ``blastn`` is run with a 12-column tabular output format.

    Parameters
    ----------
    fasta : DNAFASTAFormat
        Query sequences.
    model : EplacerModelDirectoryFormat
        Model directory containing ``reference.fa``.
    threads : int
        Value passed to blastn ``-num_threads``.

    Returns
    -------
    BlastOutfmt6DirFormat
        Directory format containing ``blast_results.tsv``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``makeblastdb`` or ``blastn`` exits with a non-zero status.
    """
    input_fasta_path = str(fasta)
    reference_path = os.path.join(str(model.path), "reference.fa")
    blast_artifact = BlastOutfmt6DirFormat()
    output_blast_path = os.path.join(str(blast_artifact.path), 'blast_results.tsv')

    with tempfile.TemporaryDirectory() as temp_dir:
        # Build the database inside the temporary directory so that nothing is
        # left behind in the working directory and concurrent runs cannot
        # overwrite each other's database files.
        db_prefix = os.path.join(temp_dir, "blastdb")

        print("making blast db...")
        db_cmd = [
            "makeblastdb",
            "-in", reference_path,
            "-dbtype", "nucl",
            "-out", db_prefix
        ]
        subprocess.run(db_cmd, check=True)

        blast_cmd = [
            "blastn",
            "-query", input_fasta_path,
            "-db", db_prefix,
            "-outfmt", "6 qseqid sseqid pident evalue length qlen slen qstart qend sstart send sseq",
            "-out", output_blast_path,
            "-num_threads", str(threads)
        ]

        print(f"Running BLAST using: {' '.join(blast_cmd)}")
        subprocess.run(blast_cmd, check=True)
        print(f"BLAST search completed. Results saved to {output_blast_path}")

    return blast_artifact
91
+
92
def _read_fasta_records(path):
    """Yield ``(name, sequence)`` for every FASTA record, sequence upper-cased."""
    name = None
    chunks = []
    with open(path, 'r') as handle:
        for line in handle:
            line = line.rstrip()
            if line.startswith(">"):
                if name is not None:
                    yield name, "".join(chunks).upper()
                name = line[1:]
                chunks = []
            else:
                chunks.append(line)
    # The final record has no following header line to trigger the yield.
    if name is not None:
        yield name, "".join(chunks).upper()


def _load_occurrences(path, wanted):
    """Collect ``(latitude, longitude)`` tuples per taxon name found in *wanted*."""
    occurrences = defaultdict(lambda: [])
    obis = False
    with open(path, "r") as infile:
        for count, line in enumerate(infile, start=1):
            fields = re.split("\t", line.rstrip())
            if count == 1:
                # The layout is decided once, from the header line only.
                obis = len(fields) > 2 and fields[2] == 'decimallongitude'
                continue
            if obis:
                # OBIS layout: column 9 is the taxon, column 4 is read as
                # latitude and column 3 as longitude.
                if len(fields) > 8 and fields[8] in wanted:
                    occurrences[fields[8]].append((float(fields[3]), float(fields[2])))
            elif len(fields) > 2 and fields[0] in wanted:
                # Plain layout: taxon, latitude, longitude.
                occurrences[fields[0]].append((float(fields[1]), float(fields[2])))
            if count % 1000000 == 0:
                print("{} Entries read in".format(count))
    return occurrences


def train_model(fasta: DNAFASTAFormat,
                alignedfasta: AlignedDNAFASTAFormat,
                taxonomy: qiime2.Metadata,
                geodata: qiime2.Metadata,
                taxlevel: str = "SPECIES",
                num_augments: int = 10,
                maskrate: float = 0.01,
                sigma: float = 1,
                kernel: int = 3,
                precision: int = 2
                ) -> (EplacerModelDirectoryFormat, EplacerOutputTableTrainDirFormat):
    """Train an ePlacer classifier and package it as a model directory.

    Parameters
    ----------
    fasta : DNAFASTAFormat
        Unaligned reference sequences; stored in the model as ``reference.fa``.
    alignedfasta : AlignedDNAFASTAFormat
        Aligned reference sequences; stored as ``alignment.fa`` and used as
        the training sequences.
    taxonomy : qiime2.Metadata
        Taxonomy table; column ``levels[taxlevel]`` holds the label.
    geodata : qiime2.Metadata
        Occurrence table, either OBIS-style (third header column is
        ``decimallongitude``) or ``taxon, latitude, longitude``.
    taxlevel : str
        One of SPECIES, GENUS, FAMILY, ORDER, CLASS, PHYLUM.
    num_augments : int
        Number of masked copies generated per sequence.
    maskrate : float
        Mask rate passed to ``mask_sequence``.
    sigma, kernel : float, int
        Passed through to ``train_and_evaluate_geo``.
    precision : int
        Precision passed to ``SpeciesGeoEncoder``.

    Returns
    -------
    (EplacerModelDirectoryFormat, EplacerOutputTableTrainDirFormat)
        The model directory and the directory receiving training statistics.

    Raises
    ------
    KeyError
        If *taxlevel* is not a known level, or a sequence identifier is
        missing from the taxonomy table.
    ValueError
        If the aligned FASTA has no records or no training sequence remains.
    """
    output_artifact = EplacerModelDirectoryFormat()
    out = str(output_artifact.path)

    # Copy the inputs into the model directory.
    taxa_tsv_path = os.path.join(out, "full_taxonomy.tsv")
    taxonomy.save(taxa_tsv_path)
    fasta.save(os.path.join(out, "reference.fa"))
    alignedfasta_path = os.path.join(out, "alignment.fa")
    alignedfasta.save(alignedfasta_path)

    # The value is used with range(); the plugin may hand it over as a float.
    n_augments = int(num_augments)

    config_dict = {"Augments": n_augments, "Mask Rate": maskrate, "Sigma": sigma,
                   "Kernel": kernel, "Precision": precision}
    with open(os.path.join(out, "config.yml"), "w") as outfile:
        yaml.dump(config_dict, outfile, sort_keys=False)

    # Map every identifier of the taxonomy table to its label at `taxlevel`.
    levels = {"SPECIES": 7, "GENUS": 6, "FAMILY": 5, "ORDER": 4, "CLASS": 3, "PHYLUM": 2}
    level_column = levels[taxlevel]
    taxDict = {}
    with open(taxa_tsv_path, 'r') as f:
        for line in f:
            listall = re.split("\t", line.rstrip())
            if len(listall) <= level_column:
                continue  # blank or truncated row
            if listall[0] != "accession":
                taxDict[listall[0]] = listall[level_column]

    # Read the ALIGNED sequences: lengths are compared with gaps stripped and
    # the first training sequence fixes the model's input length below.
    seqDict = {}
    lengths = []
    unique = defaultdict(lambda: 0)
    for name, seq in _read_fasta_records(alignedfasta_path):
        seqDict[name] = seq
        unique[seq] += 1
        lengths.append(len(seq.replace("-", "")))
    if not lengths:
        raise ValueError("The aligned FASTA input contains no sequences.")

    mean_length = sum(lengths) / len(lengths)
    print(mean_length)
    # Drop sequences whose ungapped length is at most 70% of the mean.
    seqDict = {name: seq for name, seq in seqDict.items()
               if len(seq.replace("-", "")) > mean_length * 0.7}
    print(f"Sequences after removing outlier length sequences: {len(seqDict)}")

    # Create augmented sequences; only sequences not seen before are kept.
    augmented_seqDict = {}
    augmented_taxDict = {}
    added = 0
    print("Augmenting sequences with a mask rate of {}".format(str(maskrate)))
    for key, seq in seqDict.items():
        for i in range(n_augments):
            augmented_seq = mask_sequence(seq, mask_rate=maskrate)
            unique[augmented_seq] += 1
            if unique[augmented_seq] > 1:
                continue
            new_key = f"{key}_aug_{i+1}"
            added += 1
            augmented_seqDict[new_key] = augmented_seq
            augmented_taxDict[new_key] = taxDict[key]

    # From here on only the augmented sequences are used.
    seqDict = augmented_seqDict
    taxDict = augmented_taxDict
    print(f"{added} augmented sequences added to the dataset!")
    print(f"{len(seqDict)} sequences remaining after downsampling")
    if not seqDict:
        raise ValueError("No training sequences remain after filtering and augmentation.")

    # Assign an integer class label to every taxon, in order of first
    # appearance, and group the sequences by taxon.
    accessionKeyDict = {}
    taxaSeqDict = defaultdict(lambda: [])
    for name, taxon in taxDict.items():
        if taxon not in accessionKeyDict:
            accessionKeyDict[taxon] = len(accessionKeyDict)
        taxaSeqDict[taxon].append(seqDict[name])

    # Store the label key in the model directory for later reference.
    with open(os.path.join(out, f"taxa_key_{taxlevel}.tsv"), "w") as outfile:
        for name, taxon in taxDict.items():
            outfile.write("{}\t{}\t{}\n".format(accessionKeyDict[taxon], name, taxon))
    with open(f"{out}/accessionKeyDict.pkl", "wb") as outfile:
        pickle.dump(accessionKeyDict, outfile, pickle.HIGHEST_PROTOCOL)

    # Read the occurrence data through a temporary TSV copy of the metadata.
    with tempfile.TemporaryDirectory() as temp_dir:
        geo_tsv_path = os.path.join(temp_dir, "geodata_temp.tsv")
        geodata.save(geo_tsv_path)
        geoDict = _load_occurrences(geo_tsv_path, accessionKeyDict)

    print("confirming precision is set at: {}".format(precision))
    geoEncoder = SpeciesGeoEncoder(precision=precision)
    geoEncoded = geoEncoder.encode_species(geoDict)
    with open(f"{out}/geoEncoder.pkl", "wb") as outfile:
        pickle.dump(geoEncoder, outfile, pickle.HIGHEST_PROTOCOL)
    geoEncoder.save_grid('{}/grid_config_{}.npy'.format(out, taxlevel))

    # Build the training and validation sets. The first sequence of every
    # taxon always goes to training; each further sequence goes to the
    # validation set when random.random() >= 0.7, otherwise to training.
    train_seq, train_labels, train_geo = [], [], []
    test_seq, test_labels, test_geo = [], [], []
    geoDictionary = {}
    for taxon, label in accessionKeyDict.items():
        encoded = geoEncoded[taxon]
        geoDictionary[taxon] = encoded
        sequences = taxaSeqDict[taxon]
        train_seq.append(sequences[0])
        train_labels.append(label)
        train_geo.append(encoded)
        for seq in sequences[1:]:
            if random.random() >= 0.7:
                test_seq.append(seq)
                test_labels.append(label)
                test_geo.append(encoded)
            else:
                train_seq.append(seq)
                train_labels.append(label)
                train_geo.append(encoded)

    with open(f"{out}/geoTrain.pkl", "wb") as outfile:
        pickle.dump(train_geo, outfile, pickle.HIGHEST_PROTOCOL)
    with open(f"{out}/labelTrain.pkl", "wb") as outfile:
        pickle.dump(train_labels, outfile, pickle.HIGHEST_PROTOCOL)

    sys.stdout.write("INFO: Number of sequences in the training dataset: {}\n".format(len(train_seq)))
    sys.stdout.write("INFO: Number of labels in the training dataset: {}\n".format(len(train_labels)))
    sys.stdout.write("INFO: Number of labels in the geo dataset: {}\n".format(len(train_geo)))

    seq_len = len(train_seq[0])
    batch = 256
    train_dl_geo, val_dl_geo = build_dataloaders_geo(train_seq,
                                                     train_labels,
                                                     test_seq,
                                                     test_labels,
                                                     train_geo,
                                                     test_geo,
                                                     batch)
    num_classes_seq = check_data_loader_geo(train_dl_geo) + 1
    model = CNNWithSpatialEncoding(
        seq_len=seq_len,
        num_classes=num_classes_seq,
        spatial_dim=geoEncoder.get_feature_dimension()
    )

    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=5, gamma=0.5)
    num_epochs = 100
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    stats_artifact = EplacerOutputTableTrainDirFormat()
    stats_dir = str(stats_artifact.path)

    model = train_and_evaluate_geo(scheduler, sigma, kernel, accessionKeyDict, geoDictionary, stats_dir, taxlevel, model, train_dl_geo, val_dl_geo, criterion, optimizer, num_epochs, device, mask_prob=0.5, geo_encoder=geoEncoder)

    torch.save(model, '{}/best_geo_model_{}.pth'.format(out, taxlevel))
    # If a parameter file exists in the statistics directory, move it into the
    # model directory, where the directory format declares it.
    weights_path = os.path.join(stats_dir, f'best_geo_param_{taxlevel}.pth')
    permanent_weights_path = os.path.join(out, f'best_geo_param_{taxlevel}.pth')
    if os.path.exists(weights_path):
        shutil.move(weights_path, permanent_weights_path)

    return output_artifact, stats_artifact
330
+
331
+
332
+
333
def _first_record_length(fasta_path):
    """Return the number of residues read before the second FASTA header.

    Lines are stripped of trailing whitespace and counted until a header line
    is met after at least one residue, or until the end of the file.
    """
    residues = 0
    with open(fasta_path, 'r') as infile:
        for line in infile:
            line = line.rstrip()
            if line.startswith(">"):
                if residues:
                    break
            else:
                residues += len(line)
    return residues


def run_model(fasta: AlignedDNAFASTAFormat,
              model: EplacerModelDirectoryFormat,
              blast: BlastOutfmt6DirFormat,
              counts: biom.Table,
              geodata: qiime2.Metadata,
              taxlevel: str = "SPECIES",
              maskrate: float = 0.01,
              sigma: float = 1.0,  # Set your actual defaults
              kernel: int = 1,  # Set your actual defaults
              threads: int = 1,
              confidence: float = 0.9,
              force: bool = False
              ) -> typing.Tuple[EplacerOutputTableDirFormat, pd.DataFrame, pd.DataFrame]:
    """Classify sequences with a trained ePlacer model.

    Parameters
    ----------
    fasta : AlignedDNAFASTAFormat
        Aligned sequences; the length of the first record is passed on as the
        sequence length.
    model : EplacerModelDirectoryFormat
        Trained model directory.
    blast : BlastOutfmt6DirFormat
        Directory containing ``blast_results.tsv``.
    counts : biom.Table
        Count table; rows of its dense dataframe are treated as ASVs and
        columns as samples.
    geodata : qiime2.Metadata
        Per-sample table with ``Latitude`` and ``Longitude`` columns.
    taxlevel : str
        Level used to select the ``grid_config_*`` and ``taxa_key_*`` files.
    maskrate, sigma, kernel, threads, confidence
        Passed through to ``run_model_geoOBIS_bootstrap``.
    force : bool
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    (EplacerOutputTableDirFormat, pandas.DataFrame, pandas.DataFrame)
        The prediction table directory, and the curated and consensus
        taxonomy tables indexed by ``Feature ID`` with a ``Taxon`` column.

    Raises
    ------
    KeyError
        If a sample with a positive count has no entry in *geodata*.
    """
    aligned_fasta_path = str(fasta)
    blast_path = os.path.join(str(blast.path), 'blast_results.tsv')
    model_dir_path = str(model.path)

    # Use the key file written for the requested level; fall back to the
    # SPECIES file name when the level-specific file does not exist.
    taxfile = os.path.join(model_dir_path, "taxa_key_{}.tsv".format(taxlevel))
    if not os.path.exists(taxfile):
        taxfile = os.path.join(model_dir_path, "taxa_key_SPECIES.tsv")

    # Map every taxon name to the semicolon-joined lineage leading to it.
    taxonomy_file_path = os.path.join(model_dir_path, "full_taxonomy.tsv")
    taxa_lookup = {}
    with open(taxonomy_file_path, "r") as f:
        for line in f:
            parts = line.strip('\n').split('\t')
            if len(parts) <= 1:
                continue
            current_lineage = []
            for rank in parts[1:]:
                rank_clean = rank.strip()
                if not rank_clean:
                    continue
                current_lineage.append(rank_clean)
                # Standard QIIME 2 semicolon-separated lineage string.
                taxa_lookup[rank_clean] = ";".join(current_lineage)

    geo_df = geodata.to_dataframe()
    locale_dict = {}
    for sample_id, row in geo_df.iterrows():
        locale_dict[sample_id] = (row['Latitude'], row['Longitude'])

    # For every ASV, collect the coordinates of the samples it occurs in.
    counts_df = counts.to_dataframe(dense=True)
    geoDict = defaultdict(lambda: [])
    for asv_id, row in counts_df.iterrows():
        for sample_id, count in row.items():
            if count > 0:
                geoDict[asv_id].append(locale_dict[sample_id])

    # Encode the per-ASV coordinates with the grid stored in the model.
    geoEncoder = SpeciesGeoEncoder.load_grid('{}/grid_config_{}.npy'.format(model_dir_path, taxlevel))
    geoEncoded = geoEncoder.encode_species(geoDict)
    # Touch every key so that a missing entry is created.
    # NOTE(review): this relies on encode_species returning a defaultdict, and
    # ASVs whose counts are all zero never enter geoDict -- confirm upstream.
    for key in geoDict:
        geoEncoded[key]

    # Also correct when the file holds a single record.
    seqlen = _first_record_length(aligned_fasta_path)
    if seqlen:
        sys.stdout.write("INFO: sequence length={}\n".format(seqlen))

    # The number of classes is the number of distinct labels in column 1.
    class_set = set()
    with open(taxfile, 'r') as infile:
        for line in infile:
            class_set.add(re.split("\t", line.rstrip())[0])
    num_classes = len(class_set)

    sys.stdout.write("INFO: number of classes={}\n".format(num_classes))
    sys.stdout.write("RUN GEO")

    output_artifact = EplacerOutputTableDirFormat()
    out_dir = str(output_artifact.path)

    high_confidence_predictions = run_model_geoOBIS_bootstrap(
        out_dir,
        blast_path,
        aligned_fasta_path,
        seqlen,
        num_classes,
        taxlevel,
        taxfile,
        model_dir_path,
        geoEncoded,
        geoEncoder,
        threads,
        n_bootstrap=100,
        sigma=sigma,
        kernel_size=kernel,
        maskrate=maskrate,
        conf_threshold=confidence
    )

    curated_records = []
    raw_records = []

    tsv_path = os.path.join(out_dir, "bestGeoPredict.tsv")
    with open(tsv_path, "w") as outfile:
        outfile.write("ASV\tCurated Taxa\tCurated Taxa Level\tPredicted Taxa\tPredicted Taxa Level\tAll Top Scoring hits\tTop Scoring Mean Probs\tTop Scoring Std Dev Probs\tTop Scoring Geo\tAssignment Note\n")
        for feature_id, prediction in high_confidence_predictions.items():
            pt = prediction["predicted taxa"]
            outfile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                feature_id,
                pt["Curated Taxa"],
                pt["Curated taxa level"],
                pt["Consensus Taxa"],
                pt["Consensus taxa level"],
                pt["All top scoring hits"],
                pt["All top scoring probabilities"],
                pt["All top scoring std deviations"],
                pt["All top scoring Geography Coverages"],
                prediction["Homology note"]
            ))
            curated_taxon_raw = pt["Curated Taxa"]
            raw_taxon_raw = pt["Consensus Taxa"]
            # Taxa absent from the lookup are reported under their bare name.
            full_curated_lineage = taxa_lookup.get(curated_taxon_raw, curated_taxon_raw)
            full_raw_lineage = taxa_lookup.get(raw_taxon_raw, raw_taxon_raw)

            # Each record is keyed by the prediction it belongs to.
            curated_records.append({"Feature ID": feature_id, "Taxon": full_curated_lineage})
            raw_records.append({"Feature ID": feature_id, "Taxon": full_raw_lineage})

    # Explicit columns keep set_index working when there are no predictions.
    columns = ["Feature ID", "Taxon"]
    curated_df = pd.DataFrame(curated_records, columns=columns).set_index("Feature ID")
    raw_df = pd.DataFrame(raw_records, columns=columns).set_index("Feature ID")

    return output_artifact, curated_df, raw_df
q2_eplacer/_types.py ADDED
@@ -0,0 +1,7 @@
1
+ from qiime2.plugin import SemanticType
2
+
3
# Semantic types defined by this package. Each call creates a new type with
# the given name.
BlastResults = SemanticType('BlastResults')
BlastResultsDir = SemanticType('BlastResultsDir')
EplacerModel = SemanticType('EplacerModel')
EplacerTable = SemanticType('EplacerTable')
EplacerTableTrain = SemanticType('EplacerTableTrain')
q2_eplacer/_version.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.1"
@@ -0,0 +1,6 @@
1
+ @MISC{Caporaso-Bolyen-2024,
2
+ title = "Developing with {QIIME} 2",
3
+ author = "{Caporaso, J Gregory and Bolyen, Evan}",
4
+ year = 2024,
5
+ howpublished = "https://develop.qiime2.org"
6
+ }
@@ -0,0 +1,133 @@
1
+ from qiime2.plugin import Citations, Plugin
2
+ from q2_types.feature_table import FeatureTable, Frequency
3
+ from q2_eplacer import __version__
4
+ from q2_eplacer._methods import run_model, train_model, align_sequences, run_blast
5
+ from q2_eplacer._types import BlastResults, EplacerModel, BlastResultsDir
6
+ from q2_eplacer._formats import BlastOutfmt6Format, EplacerModelDirectoryFormat, BlastOutfmt6DirFormat, EplacerOutputTableDirFormat, EplacerOutputTableTrainDirFormat
7
+ from q2_types.feature_data import DNAFASTAFormat, AlignedDNAFASTAFormat, FeatureData, Sequence, AlignedSequence, Taxonomy
8
+ from q2_types.feature_table import FeatureTable, Frequency
9
+ from ._formats import EplacerModelDirectoryFormat, BlastOutfmt6Format, BlastOutfmt6DirFormat
10
+ from ._types import BlastResults, EplacerModel, EplacerTable, EplacerTableTrain
11
+ from ._formats import BlastOutfmt6DirFormat, EplacerModelDirectoryFormat, EplacerOutputTableFormat
12
+ from qiime2.plugin import Str, Float, Int, Bool, Metadata
13
# Load the BibTeX entries shipped as citations.bib in the q2_eplacer package.
citations = Citations.load("citations.bib", package="q2_eplacer")

# Plugin object on which the types, formats and methods below are registered.
plugin = Plugin(
    name="eplacer",
    version=__version__,
    website="https://github.com/NEFSC/PEMAD-PBB-ePlacer",
    package="q2_eplacer",
    description="ePlacer is a taxonomic classification tool that uses deep-learning approaches to incorporate both sequence information and biogeographic information into taxonomic assignment of DNA sequences.",
    short_description="ASV classifier with deep-learning and biogeography",
    # The plugin-level citation of 'Caporaso-Bolyen-2024' is provided as
    # an example. You can replace this with citations to other references
    # in citations.bib.
    citations=[citations['Caporaso-Bolyen-2024']]
)
27
+
28
# Register the custom semantic types and formats, then bind each artifact
# type to its directory format. BlastResultsDir is included because it is
# bound to a format below and used as an input/output type of the methods.
plugin.register_semantic_types(BlastResults, BlastResultsDir, EplacerModel, EplacerTable, EplacerTableTrain)
plugin.register_formats(BlastOutfmt6Format, EplacerModelDirectoryFormat, BlastOutfmt6DirFormat, EplacerOutputTableDirFormat, EplacerOutputTableTrainDirFormat)
plugin.register_semantic_type_to_format(EplacerModel, artifact_format=EplacerModelDirectoryFormat)
plugin.register_semantic_type_to_format(BlastResultsDir, artifact_format=BlastOutfmt6DirFormat)
plugin.register_semantic_type_to_format(EplacerTable, artifact_format=EplacerOutputTableDirFormat)
plugin.register_semantic_type_to_format(EplacerTableTrain, artifact_format=EplacerOutputTableTrainDirFormat)
35
+
36
+
37
# Register align_sequences as a plugin method: query sequences plus a model
# in, aligned sequences out.
plugin.methods.register_function(
    function=align_sequences,
    inputs={'fasta': FeatureData[Sequence],
            'model': EplacerModel},
    parameters={'threads': Int},
    outputs={'aligned_sequences': FeatureData[AlignedSequence]},
    input_descriptions={'fasta': 'Path to the query file.',
                        'model': 'Path to the model directory'},
    parameter_descriptions={'threads': 'number of cores'},
    output_descriptions={'aligned_sequences': 'aligned fasta file'},
    name='Align to reference with MAFFT',
    description="Align query sequences to fasta",
    citations=[]
)
51
+
52
# Register run_blast as a plugin method: query sequences plus a model in,
# BLAST results out.
plugin.methods.register_function(
    function=run_blast,
    inputs={'fasta': FeatureData[Sequence],
            'model': EplacerModel},
    parameters={'threads': Int},
    outputs={'blast': BlastResultsDir},
    input_descriptions={'fasta': 'Path to the query file.',
                        'model': 'Path to the model directory'},
    parameter_descriptions={'threads': 'number of cores'},
    output_descriptions={'blast': 'blast results'},
    name='blast',
    description="blast against reference db",
    citations=[]
)
66
+
67
# Register run_model as a plugin method. The description texts follow the
# function signature: `counts` and `geodata` have no default and are
# therefore required, and `maskrate` defaults to 0.01.
plugin.methods.register_function(
    function=run_model,
    inputs={'fasta': FeatureData[AlignedSequence],
            'model': EplacerModel,  # Needs a custom or existing semantic type
            'blast': BlastResultsDir,  # Needs a custom or existing semantic type
            'counts': FeatureTable[Frequency]},
    parameters={'geodata': Metadata,
                'taxlevel': Str,
                'maskrate': Float,
                'sigma': Float,
                'kernel': Int,
                'threads': Int,
                'confidence': Float,
                'force': Bool},
    outputs=[('eplacer_table', EplacerTable),
             ('curated_taxonomy', FeatureData[Taxonomy]),
             ('raw_taxonomy', FeatureData[Taxonomy])],
    input_descriptions={'fasta': 'Path to the reference database in fasta format. Accessions must match the IDs in the taxa file.',
                        'model': 'The pre-trained model artifact to be applied.',
                        'blast': 'Results from blastn of sequences against database (outfmt 6).',
                        'counts': 'Abundance data in the format of a count matrix.'},
    parameter_descriptions={'geodata': 'Known geographic information (SampleID, Latitude, Longitude) for each sample.',
                            'taxlevel': 'Specify the taxonomic level the model was trained for.',
                            'maskrate': 'Defines the frequency with which each base is masked. Default: 0.01',
                            'sigma': 'Defines the standard deviation of the gaussian kernel.',
                            'kernel': 'Defines the size of the gaussian kernel.',
                            'threads': 'Specify available threads.',
                            'confidence': 'Specify a confidence threshold.',
                            'force': 'Force overwrite of preexisting files. Default: False'},
    output_descriptions={'eplacer_table': 'The resulting taxonomic assignments in ePlacer format',
                         'curated_taxonomy': 'Qiime object for curated taxonomy',
                         'raw_taxonomy': 'Qiime object for raw predictions'},
    name='Run ePlacer Model',
    description=("Classify sequence and (if provided) geographic "
                 "data to a taxonomic assignment using a pre-trained model."),
    citations=[]
)
104
+
105
+
106
# Register train_model as a plugin method. `num_augments` is an Int because
# the function annotates it as int and uses it as a loop count.
plugin.methods.register_function(
    function=train_model,
    inputs={'fasta': FeatureData[Sequence],
            'alignedfasta': FeatureData[AlignedSequence]},
    parameters={'geodata': Metadata,
                'taxonomy': Metadata,
                'taxlevel': Str,
                'maskrate': Float,
                'sigma': Float,
                'kernel': Int,
                'precision': Int,
                'num_augments': Int},
    outputs=[('model', EplacerModel), ('training_stats', EplacerTableTrain)],
    input_descriptions={'fasta': 'Path to the reference database in fasta format. Accessions must match the IDs in the taxa file. Unaligned',
                        'alignedfasta': 'Path to the reference database in fasta format. Accessions must match the IDs in the taxa file. Aligned'},
    parameter_descriptions={'geodata': 'Known geographic information (SampleID, Latitude, Longitude) for each taxa.',
                            'taxonomy': 'Taxonomy Table, tsv.',
                            'taxlevel': 'Specify the taxonomic level the model was trained for.',
                            'maskrate': 'Defines the frequency with which each base is masked. Default: 0.01',
                            'sigma': 'Defines the standard deviation of the gaussian kernel.',
                            'kernel': 'Defines the size of the gaussian kernel.',
                            'precision': 'Geohash precision',
                            'num_augments': 'Number of sequence augments to perform.'},
    output_descriptions={'model': 'The best performing deep learning epoch', 'training_stats': 'Evaluation statistics generated during training.'},
    name='Train ePlacer Model',
    description=("Train a deep learning classifier."),
    citations=[]
)
@@ -0,0 +1,8 @@
1
+ # flake8: noqa
2
+ # ----------------------------------------------------------------------------
3
+ # Copyright (c) 2024, Christopher Powers.
4
+ #
5
+ # Distributed under the terms of the Modified BSD License.
6
+ #
7
+ # The full license is in the file LICENSE, distributed with this software.
8
+ # ----------------------------------------------------------------------------
Binary file
@@ -0,0 +1,17 @@
1
+ sampleid Kingdom Phylum Class Order Family Genus Species
2
+ A Eukaryota Chordata Actinopteri OrderA FamilyA GenusA SpeciesA
3
+ B Eukaryota Chordata Actinopteri OrderBC FamilyBC GenusBC SpeciesB
4
+ C Eukaryota Chordata Actinopteri OrderBC FamilyBC GenusBC SpeciesC
5
+ D Eukaryota Chordata Actinopteri OrderDO FamilyDO GenusDO SpeciesD
6
+ E Eukaryota Chordata Actinopteri OrderEP FamilyEP GenusEP SpeciesE
7
+ F Eukaryota Chordata Actinopteri OrderFI FamilyFI GenusFI SpeciesF
8
+ G Eukaryota Chordata Actinopteri OrderGJ FamilyGJ GenusGJ SpeciesG
9
+ H Eukaryota Chordata Actinopteri OrderHK FamilyHK GenusH SpeciesH
10
+ I Eukaryota Chordata Actinopteri OrderFI FamilyFI GenusFI SpeciesI
11
+ J Eukaryota Chordata Actinopteri OrderGJ FamilyGJ GenusGJ SpeciesG
12
+ K Eukaryota Chordata Actinopteri OrderHK FamilyHK GenusK SpeciesK
13
+ L Eukaryota Chordata Actinopteri OrderL FamilyL GenusL SpeciesL
14
+ M Eukaryota Chordata Actinopteri OrderM FamilyM GenusM SpeciesM
15
+ N Eukaryota Chordata Actinopteri OrderN FamilyN GenusN SpeciesN
16
+ O Eukaryota Chordata Actinopteri OrderDO FamilyDO GenusDO SpeciesO
17
+ P Eukaryota Chordata Actinopteri OrderEP FamilyEP GenusEP SpeciesP
@@ -0,0 +1,17 @@
1
+ sampleid Latitude Longitude
2
+ A 39.645946 -71.746641
3
+ B 39.645946 -71.746641
4
+ C 39.645946 -71.746641
5
+ D 39.645946 -71.746641
6
+ E 39.645946 -71.746641
7
+ F 39.645946 -71.746641
8
+ G 39.645946 -71.746641
9
+ H 39.645946 -71.746641
10
+ I 46.433867 -126.20164
11
+ J 46.433867 -126.20164
12
+ K 46.433867 -126.20164
13
+ L 46.433867 -126.20164
14
+ M 46.433867 -126.20164
15
+ N 46.433867 -126.20164
16
+ O 46.433867 -126.20164
17
+ P 46.433867 -126.20164
@@ -0,0 +1,3 @@
1
+ sampleid Latitude Longitude
2
+ Sample1 39.645946 -71.746641
3
+ Sample2 39.645946 -71.746641
Binary file
@@ -0,0 +1,6 @@
1
+ >ASV1
2
+ GCCGTAAACTTAGATAAATTAGTACAACAAATATCGGCCCGGGAACTACGAGCGCCAGCTTATAACCCAAAGGACTTGGCGCTGCTTCAGACCCCCCT
3
+ >ASV2
4
+ GCGGTAAACTTAGATATATTAGTACAACAAATATCGGCCCGGGAACTACGAGCGCCTGCTTAAAACCCAAAGGTCTTGGCGGTGCTTCAGACCCCCCT
5
+ >ASV3
6
+ GCGGTAAACTTAGATATATTAGTACAACAAATATCGGCCCGGGAACTACGAGCGCCTGCTTAAAACCCAAAGGTCTTGGCGGTGCTTCAGACCCCCCT
Binary file
@@ -0,0 +1,113 @@
1
+ import pandas as pd
2
+ import pandas.testing as pdt
3
+
4
+ from qiime2.plugin.testing import TestPluginBase
5
+ from qiime2.plugin.util import transform
6
+ from q2_types.feature_table import BIOMV100Format
7
+
8
+ import os
9
+ import unittest
10
+ import qiime2
11
+ from q2_eplacer._methods import train_model, run_model
12
+ from q2_eplacer._formats import EplacerModelDirectoryFormat, EplacerOutputTableTrainDirFormat
13
+ from q2_types.feature_data import DNAFASTAFormat, AlignedDNAFASTAFormat
14
+ from q2_types.feature_table import FeatureTable, Frequency
15
+ import biom
16
+ from q2_eplacer._formats import (
17
+ BlastOutfmt6DirFormat,
18
+ EplacerOutputTableDirFormat
19
+ )
20
+ import numpy as np
21
+
22
+ class TestEplacerTraining(unittest.TestCase):
23
+ """
24
+ Simple test case to test training and inference.
25
+ Checks for output file generation
26
+ """
27
+ def setUp(self):
28
+ self.data_dir = os.path.join(os.path.dirname(__file__), 'data')
29
+ self.fasta_path = os.path.join(self.data_dir, 'seqs.qza')
30
+ self.aligned_fasta_path = os.path.join(self.data_dir, 'alignedSeqs.qza')
31
+ self.taxonomy_path = os.path.join(self.data_dir, 'full_taxonomy.tsv')
32
+ self.geodata_path = os.path.join(self.data_dir, 'geoData.tsv')
33
+ self.fasta = qiime2.Artifact.load(self.fasta_path)
34
+ self.alignedfasta = qiime2.Artifact.load(self.aligned_fasta_path)
35
+ self.taxonomy = qiime2.Metadata.load(os.path.join(self.data_dir, 'full_taxonomy.tsv'))
36
+ self.geodata = qiime2.Metadata.load(os.path.join(self.data_dir, 'geoData.tsv'))
37
+
38
+ # inference data
39
+ self.aligned_fasta_path = os.path.join(self.data_dir, 'alignedSeqs.qza')
40
+ self.geodata_path = os.path.join(self.data_dir, 'geoData_run.tsv')
41
+
42
+ self.alignedfasta = qiime2.Artifact.load(self.aligned_fasta_path)
43
+ self.geodata = qiime2.Metadata.load(self.geodata_path)
44
+ count_matrix = np.array([
45
+ [15, 0], # ASV1 is present in Sample1
46
+ [5, 22], # ASV2 is present in both
47
+ [0, 10] # ASV3 is present in Sample2
48
+ ])
49
+ observ_ids = ['ASV1', 'ASV2', 'ASV3']
50
+ sample_ids = ['Sample1', 'Sample2']
51
+ self.counts = biom.Table(count_matrix, observ_ids, sample_ids)
52
+
53
+ def test_train_model_execution(self):
54
+ """Test that train_model executes completely and returns valid directory formats."""
55
+
56
+ model_out, stats_out = train_model(
57
+ fasta=self.fasta.view(DNAFASTAFormat),
58
+ alignedfasta=self.alignedfasta.view(AlignedDNAFASTAFormat),
59
+ taxonomy=self.taxonomy,
60
+ geodata=self.geodata,
61
+ taxlevel="SPECIES",
62
+ num_augments=100, # Keep it ultra-low (like 2) so test cases execute in 3 seconds!
63
+ maskrate=0.01,
64
+ sigma=1,
65
+ kernel=3,
66
+ precision=2
67
+ )
68
+
69
+ blast_dir_artifact = BlastOutfmt6DirFormat()
70
+ blast_file_path = os.path.join(str(blast_dir_artifact.path), 'blast_results.tsv')
71
+
72
+ with open(blast_file_path, 'w') as bf:
73
+ bf.write("ASV1\tA\t100.00\t1e-100\t98\t98\t98\t1\t98\t1\t98\n")
74
+ bf.write("ASV2\tB\t99.00\t1e-100\t98\t98\t98\t1\t98\t1\t98\n")
75
+ bf.write("ASV3\tC\t100.00\t\1e-100\\t98\t98\t98\t1\t98\t1\t98\n")
76
+
77
+ self.assertIsInstance(model_out, EplacerModelDirectoryFormat)
78
+ self.assertIsInstance(stats_out, EplacerOutputTableTrainDirFormat)
79
+
80
+ model_path = str(model_out.path)
81
+ stats_path = str(stats_out.path)
82
+
83
+ self.assertTrue(os.path.exists(os.path.join(model_path, 'config.yml')))
84
+ self.assertTrue(os.path.exists(os.path.join(model_path, 'geoEncoder.pkl')))
85
+ self.assertTrue(os.path.exists(os.path.join(stats_path, 'model_geo_stats.tsv')))
86
+
87
+ custom_dir_out, curated_df, consensus_df = run_model(
88
+ fasta=self.alignedfasta.view(AlignedDNAFASTAFormat),
89
+ model=model_out,
90
+ blast=blast_dir_artifact,
91
+ counts=self.counts,
92
+ geodata=self.geodata,
93
+ taxlevel="SPECIES",
94
+ maskrate=0.01,
95
+ sigma=1.0,
96
+ kernel=1,
97
+ threads=1,
98
+ confidence=0.9,
99
+ force=False
100
+ )
101
+
102
+ self.assertIsInstance(custom_dir_out, EplacerOutputTableDirFormat)
103
+
104
+ self.assertIsInstance(curated_df, pd.DataFrame)
105
+ self.assertIsInstance(consensus_df, pd.DataFrame)
106
+
107
+ inference_path = str(custom_dir_out.path)
108
+ expected_prediction_file = os.path.join(inference_path, "bestGeoPredict.tsv")
109
+
110
+ self.assertTrue(os.path.exists(expected_prediction_file))
111
+
112
+ if __name__ == '__main__':  # run the tests when this file is executed directly as a script
113
+ unittest.main()
@@ -0,0 +1,215 @@
1
+ Metadata-Version: 2.4
2
+ Name: q2-eplacer
3
+ Version: 0.1.1
4
+ Summary: ASV classifier with deep-learning and biogeography
5
+ Author-email: Christopher Powers <christopher.powers@noaa.gov>
6
+ License: Software code created by U.S. Government employees is not subject to copyright in the United States (17 U.S.C. §105).
7
+ The United States/Department of Commerce reserve all rights to seek and obtain copyright protection in countries other
8
+ than the United States for Software authored in its entirety by the Department of Commerce. To this end, the Department
9
+ of Commerce hereby grants to Recipient a royalty-free, nonexclusive license to use, copy, and create derivative works of
10
+ the Software outside of the United States.
11
+ Project-URL: Homepage, https://github.com/NEFSC/PEMAD-PBB-q2-ePlacer
12
+ Project-URL: Repository, https://github.com/NEFSC/PEMAD-PBB-q2-ePlacer
13
+ Project-URL: Bug Tracker, https://github.com/NEFSC/PEMAD-PBB-q2-ePlacer/issues
14
+ Keywords: qiime2,microbiome,taxonomy,deep-learning,biogeography
15
+ Classifier: Development Status :: 4 - Beta
16
+ Classifier: Intended Audience :: Science/Research
17
+ Classifier: License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
23
+ Requires-Python: >=3.10
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE.txt
26
+ Requires-Dist: qiime2
27
+ Requires-Dist: pandas
28
+ Requires-Dist: biom-format
29
+ Requires-Dist: eplacer
30
+ Dynamic: license-file
31
+
32
+ # q2-eplacer
33
+
34
+ A [QIIME 2](https://qiime2.org) plugin [developed](https://develop.qiime2.org) by Christopher Powers (christopher.powers@noaa.gov) that allows the [ePlacer taxonomic classifier](https://github.com/NEFSC/PEMAD-PBB-ePlacer/) to interface with QIIME2.
35
+
36
+ ePlacer is a taxonomic classification tool that uses deep-learning approaches to incorporate both sequence information and biogeographic information into taxonomic assignment of DNA sequences.
37
+
38
+ ## Why use ePlacer
39
+
40
+ The machine learning architecture of ePlacer enables powerful prediction beyond sequence-only classification tools (e.g. sequence alignment with blast or naive-bayes classifiers) by directly incorporating additional data into the probabilistic estimate of taxonomy, specifically developed for metabarcoding data. This novel application of deep-learning is immensely useful, as there can be many cases in metabarcoding data where two reference species have 100% sequence overlap, but distinct geographic ranges. This tool discriminates these cases and provides additional data for downstream taxonomic curation. Due to this, ePlacer provides enhanced interoperability between metabarcoding datasets.
41
+
42
+ Currently, ePlacer offers pre-trained models for two popular metabarcoding regions: the [MiFish](https://doi.org/10.1007/s12562-020-01461-x) and the [ecoPrimer, or Riaz,](https://doi.org/10.1093/nar/gkr732) marker gene regions. For these two regions, ePlacer offers the following benefits:
43
+
44
+ * **Interoperability.** ePlacer is trained on global datasets, allowing for direct comparison between metabarcoding datasets, regardless of geographic region.
45
+ * **Portability.** ePlacer has pre-trained models available for both MiFish and Riaz marker gene regions containerized and available for out-of-the-box use
46
+ * **Increased Accuracy.** The ePlacer model architecture provides increased accuracy, precision, and recall as compared to blast, Naive-Bayes, or least common ancestor approaches.
47
+ * **Trainability.** In addition to the two provided barcodes, this code repository provides tools for training new models.
48
+
49
+ For other barcode regions, there will be significant advantages with the training of new models. If you are interested in training a new model for ePlacer, please do not hesitate to reach out!
50
+
51
+ ## Installation instructions
52
+
53
+ **The following instructions are intended to be a starting point** and should be replaced when `q2-eplacer` is ready to share with others.
54
+ They will enable you to install the most recent *development* version of `q2-eplacer`.
55
+ Remember that *release* versions should be used for all "real" work (i.e., where you're not testing or prototyping) - if there aren't instructions for installing a release version of this plugin, it is probably not yet intended for use in practice.
56
+
57
+ ### Install Prerequisites
58
+
59
+ [Miniconda](https://conda.io/miniconda.html) provides the `conda` environment and package manager, and is currently the only supported way to install QIIME 2.
60
+ Follow the instructions for downloading and installing Miniconda.
61
+
62
+ After installing Miniconda and opening a new terminal, make sure you're running the latest version of `conda`:
63
+
64
+ ```bash
65
+ conda update conda
66
+ ```
67
+
68
+ You also need to install the base qiime2 as a conda environment. Follow the [install instructions here](https://docs.qiime2.org/2024.10/install/native/).
69
+
70
+ ### Install `q2-eplacer`
71
+
72
+ Next, you will install the ePlacer qiime plugin from pip
73
+ ```bash
74
+ pip install q2-eplacer
75
+ ```
76
+ This will also install all other required dependencies.
77
+
78
+ ## Using `q2-eplacer`
79
+
80
+ ### Data preparation
81
+
82
+ In order to use ePlacer, you must first prepare the data before running it, including preparing the input data and collecting a pre-trained model for inference.
83
+
84
+ #### Pre-Trained models
85
+
86
+ Currently, two pre-trained models are available: the [MiFish](https://doi.org/10.1007/s12562-020-01461-x) and the [ecoPrimer, or Riaz,](https://doi.org/10.1093/nar/gkr732) marker gene regions. These are available in a QIIME2-compatible format:
87
+ ```bash
88
+ # Mifish marker
89
+ wget https://zenodo.org/records/20820029/files/mifish.qza
90
+ # Riaz marker
91
+ wget https://zenodo.org/records/20820029/files/riaz.qza
92
+ ```
93
+
94
+ If desired, users can also train new models, see below in section `Training New Models`. Any new, high performing models may be added to a Zenodo record by reaching out to the maintainers.
95
+
96
+ If you trained a new model with the qiime2 plugin, it will be automatically formatted into the `.qza` format. Otherwise, run the following:
97
+ ```bash
98
+ qiime tools import \
99
+ --type EplacerModel \
100
+ --input-path ./model/ \
101
+ --output-path model.qza
102
+ ```
103
+
104
+ #### Prepping input data
105
+
106
+ In addition to the models, users must import their input data properly. Input data formatting requirements may be seen in documentation for the original [ePlacer](https://github.com/NEFSC/PEMAD-PBB-ePlacer) package.
107
+
108
+ ##### Sequence data
109
+
110
+ Input sequence data is required in fasta format, which can be imported into QIIME2 formats with the following:
111
+ ```bash
112
+ qiime tools import \
113
+ --type "FeatureData[Sequence]" \
114
+ --input-path seqs.fa \
115
+ --output-path seqs.qza
116
+ ```
117
+
118
+ The sequence data should also be aligned, which can be done with the q2-eplacer function:
119
+ ```bash
120
+ qiime eplacer align-sequences --i-fasta ./seqs.qza \
121
+ --i-model ./model.qza \
122
+ --o-aligned-sequences ./aligned_seqs.qza \
123
+ --p-threads 8
124
+ ```
125
+
126
+ ##### Count data
127
+
128
+ Count data must first be converted to a `.biom` format, then to a `.qza` format
129
+ ```bash
130
+ biom convert -i ./counts.tsv \
131
+ -o ./counts.biom \
132
+ --table-type="OTU table" \
133
+ --to-hdf5
134
+ qiime tools import --input-path ./counts.biom \
135
+ --type 'FeatureTable[Frequency]' \
136
+ --input-format BIOMV210Format \
137
+ --output-path ./counts.qza
138
+ ```
139
+
140
+ ##### geoData
141
+
142
+ The geographic data can be read in as a metadata file, and requires no further transformations.
143
+
144
+ ##### blast data
145
+
146
+ Although not used by the machine learning model, blast results are incredibly useful for screening the results for mismatches when 100% matches are possible. Thus, a function for generating the blast results is also included:
147
+ ```bash
148
+ qiime eplacer run-blast \
149
+ --i-fasta ./seqs.qza \
150
+ --i-model ./model.qza \
151
+ --o-blast ./hits.qza \
152
+ --p-threads 8
153
+ ```
154
+
155
+ Note the unaligned sequences were used for blast.
156
+
157
+ ### Running the model
158
+
159
+ Congratulations! You are ready to run `ePlacer`!
160
+ ```bash
161
+ qiime eplacer run-model-qiime \
162
+ --i-fasta ./aligned_seqs.qza \
163
+ --i-model ./model.qza \
164
+ --i-blast ./hits.qza \
165
+ --i-counts ./counts.qza \
166
+ --m-geodata-file ./geoData.tsv \
167
+ --o-eplacer-table ./ePlacerAssignment.qza \
168
+ --o-curated-taxonomy ./qiimeAssignmentCurated.qza \
169
+ --o-raw-taxonomy ./qiimeAssignmentRaw.qza
170
+ qiime tools export --input-path ./ePlacerAssignment.qza \
171
+ --output-path ./ePlacerAssignment
172
+ qiime tools export --input-path ./qiimeAssignmentCurated.qza \
173
+ --output-path ./qiimeAssignmentCurated
174
+ qiime tools export --input-path ./qiimeAssignmentRaw.qza \
175
+ --output-path ./qiimeAssignmentRaw
176
+ ```
177
+
178
+ You may notice there are three output files present. These are three different output formats. The first, `--o-eplacer-table`, contains the native ePlacer output format described in the [ePlacer repository](https://github.com/NEFSC/PEMAD-PBB-ePlacer). The second, `--o-curated-taxonomy`, outputs the curated assignments in QIIME2 compatible format. The third, `--o-raw-taxonomy`, outputs the raw taxonomic assignments in QIIME2 compatible format.
179
+
180
+ ##### A special note
181
+
182
+ As with all other taxonomic assignment tools, all taxonomic assignments should still be manually curated after assignment. ePlacer exhibits higher accuracy than other tools, but is not perfect.
183
+
184
+ ### Training the model
185
+
186
+ The QIIME2 implementation of ePlacer also supports training new models. File format requirements are detailed in the [ePlacer repository](https://github.com/NEFSC/PEMAD-PBB-ePlacer).
187
+ ```bash
188
+ qiime eplacer train-model \
189
+ --i-fasta ./unalignedSeqs.qza \
190
+ --i-alignedfasta ./alignedSeqs.qza \
191
+ --m-taxonomy-file ./taxonomy.tsv \
192
+ --m-geodata-file ./geoData.tsv \
193
+ --p-num-augments 100 \
194
+ --o-model ./toyModel.qza \
195
+ --o-training-stats ./stats.qza \
196
+ --verbose
197
+ ```
198
+
199
+ ## About
200
+
201
+ The `q2-eplacer` Python package was [created from a template](https://develop.qiime2.org/en/stable/plugins/tutorials/create-from-template.html).
202
+ To learn more about `q2-eplacer`, refer to the [project website](https://github.com/NEFSC/PEMAD-PBB-ePlacer).
203
+ To learn how to use QIIME 2, refer to the [QIIME 2 User Documentation](https://docs.qiime2.org).
204
+ To learn QIIME 2 plugin development, refer to [*Developing with QIIME 2*](https://develop.qiime2.org).
205
+
206
+ `q2-eplacer` is a QIIME 2 community plugin, meaning that it is not necessarily developed and maintained by the developers of QIIME 2.
207
+ Please be aware that because community plugins are developed by the QIIME 2 developer community, and not necessarily the QIIME 2 developers themselves, some may not be actively maintained or compatible with current release versions of the QIIME 2 distributions.
208
+ More information on development and support for community plugins can be found [here](https://library.qiime2.org).
209
+ If you need help with a community plugin, first refer to the [project website](https://github.com/NEFSC/PEMAD-PBB-ePlacer).
210
+ If that page doesn't provide information on how to get help, or you need additional help, head to the [Community Plugins category](https://forum.qiime2.org/c/community-contributions/community-plugins/14) on the QIIME 2 Forum where the QIIME 2 developers will do their best to help you.
211
+
212
+
213
+ ==============================================================
214
+
215
+ This repository is a scientific product and is not official communication of the National Oceanic and Atmospheric Administration, or the United States Department of Commerce. All NOAA GitHub project code is provided on an ‘as is’ basis and the user assumes responsibility for its use. Any claims against the Department of Commerce or Department of Commerce bureaus stemming from the use of this GitHub project will be governed by all applicable Federal law. Any reference to specific commercial products, processes, or services by service mark, trademark, manufacturer, or otherwise, does not constitute or imply their endorsement, recommendation or favoring by the Department of Commerce. The Department of Commerce seal and logo, or the seal and logo of a DOC bureau, shall not be used in any manner to imply endorsement of any commercial product or activity by DOC or the United States Government.
@@ -0,0 +1,22 @@
1
+ q2_eplacer/__init__.py,sha256=6ORwOQAjsHowxrxrg48pKOzqRNQzOFySMHw8adKlJZQ,122
2
+ q2_eplacer/_formats.py,sha256=Esiwas_qZlit8WNOXDMHk8ehqPrYfjy8rmz5dNo4akI,3628
3
+ q2_eplacer/_methods.py,sha256=66f-rO98J95MxnWt4RB09sXajn7_7ZiqhJgfCXT9K4o,19349
4
+ q2_eplacer/_types.py,sha256=dser_tJu_CdjrH3aZSZKijTDDn62lU11bwbNMQivlWY,275
5
+ q2_eplacer/_version.py,sha256=rnObPjuBcEStqSO0S6gsdS_ot8ITOQjVj_-P1LUUYpg,22
6
+ q2_eplacer/citations.bib,sha256=SQQV_2H8nQ1XH5nRDJXZeKUqBVeD4n_MjF11pObZT-U,205
7
+ q2_eplacer/plugin_setup.py,sha256=QNl4cBB5EJn6W_wPMrARX55V8ccpy9VsOnV8UsS23H4,7387
8
+ q2_eplacer/tests/__init__.py,sha256=FBJL0kNtj1xr82tEEaR_C8UxijJPsTYyLF8xuctS7ok,353
9
+ q2_eplacer/tests/test_methods.py,sha256=LZeemrRMJq-W4kYIpH33k6Vl85ycEv54zShNHA29nik,4626
10
+ q2_eplacer/tests/data/alignedSeqs.qza,sha256=DYfKgn6cHnBs2DcId6xmy4fb5IcxFBVpvkP-XeDkheU,16174
11
+ q2_eplacer/tests/data/full_taxonomy.tsv,sha256=vRvZ-ZadTSieuzNFyMUU5ae1KG4l9esn3T8XEK3I-kg,1115
12
+ q2_eplacer/tests/data/geoData.tsv,sha256=w5CIberQCftmZB24ixNlOL9jQM-pnyP8m46Wq4lTjYw,396
13
+ q2_eplacer/tests/data/geoData_run.tsv,sha256=SoMG5fdjs0KOH2Jnnv7Upp8OEsOACrhfATnZtGbriaE,86
14
+ q2_eplacer/tests/data/seqs.qza,sha256=srsqDU9enAbRdWWViZJgHq27UUp8S_pCx7Wt_hnNIDg,16133
15
+ q2_eplacer/tests/data/testfasta.fa,sha256=iN2modoscNH8qNAnXh8rDAqB2FQ_YgEA8cgWYy3u62c,315
16
+ q2_eplacer/tests/data/testfasta.qza,sha256=cJnekUyV0MvZz-INAuHL-4PZP__Qtge3gQjNzLqTc4Y,16037
17
+ q2_eplacer-0.1.1.dist-info/licenses/LICENSE.txt,sha256=8vpWtuzfqNhxsr4BFgox7QcAimDo6ewY1Rt9MdpfUS0,525
18
+ q2_eplacer-0.1.1.dist-info/METADATA,sha256=MkCNX2afMwR0vp6FX5Gju2chd5FaKsn--GHhLhO49kY,12180
19
+ q2_eplacer-0.1.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
20
+ q2_eplacer-0.1.1.dist-info/entry_points.txt,sha256=9lLaVQHujRxUs0NStaU5m6pACjoZ7dgIIYhXETrlXGU,61
21
+ q2_eplacer-0.1.1.dist-info/top_level.txt,sha256=3Arq1r1V9XJKEZXixwht-AMDFXNnt9Vlr0FiffxtsUc,11
22
+ q2_eplacer-0.1.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [qiime2.plugins]
2
+ q2-eplacer = q2_eplacer.plugin_setup:plugin
@@ -0,0 +1,5 @@
1
+ Software code created by U.S. Government employees is not subject to copyright in the United States (17 U.S.C. §105).
2
+ The United States/Department of Commerce reserve all rights to seek and obtain copyright protection in countries other
3
+ than the United States for Software authored in its entirety by the Department of Commerce. To this end, the Department
4
+ of Commerce hereby grants to Recipient a royalty-free, nonexclusive license to use, copy, and create derivative works of
5
+ the Software outside of the United States.
@@ -0,0 +1 @@
1
+ q2_eplacer