PyPI - eplacer - Versions diffs - 0.1.0__py3-none-any.whl - Mend

eplacer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

eplacer/__init__.py +0 -0
eplacer/__main__.py +176 -0
eplacer/data_prep.py +97 -0
eplacer/external.py +60 -0
eplacer/geographicRep.py +133 -0
eplacer/models.py +234 -0
eplacer/run_command.py +62 -0
eplacer/run_model.py +610 -0
eplacer/train_command.py +53 -0
eplacer/train_evaluate.py +478 -0
eplacer-0.1.0.dist-info/METADATA +143 -0
eplacer-0.1.0.dist-info/RECORD +16 -0
eplacer-0.1.0.dist-info/WHEEL +5 -0
eplacer-0.1.0.dist-info/entry_points.txt +2 -0
eplacer-0.1.0.dist-info/licenses/LICENSE.txt +5 -0
eplacer-0.1.0.dist-info/top_level.txt +1 -0

eplacer/__init__.py ADDED Viewed

File without changes

eplacer/__main__.py ADDED Viewed

@@ -0,0 +1,176 @@
+#! /usr/bin/env python
+'''
+Usage:
+    eplacer [--version] [--help] <command> [<args>...]
+Options:
+    -h, --help          Generate Help Screen
+    -v, --version       Get Version Number
+General Commands:
+    train-model         Trains a convolutional neural network to perform
+                        a classification task to a specific taxonomic
+                        group.
+                        Options for classification are the following
+                        - sequence-only
+                        - sequence-geo
+    run-model           Runs the convolutional neural network on new
+                        data to assign taxonomy
+See 'eplacer <command> --help' for more information on a command
+'''
+import sys
+import os
+from docopt import docopt
+import multiprocessing
+def _resolve_out_dir(args, default):
+    """
+    Fill in the default output directory and refuse to reuse an existing one.
+    Mutates ``args['--out']`` in place when no directory was supplied.
+    Raises Exception when the directory exists and ``--force`` was not given.
+    """
+    if not args['--out']:
+        args['--out'] = default
+    if os.path.exists(args['--out']) and not args['--force']:
+        raise Exception('The path already exists! Exiting...\n')
+def _require_path(path, missing_msg, unspecified_msg):
+    """
+    Ensure a required input path was supplied and exists on disk.
+    Raises Exception(unspecified_msg) when ``path`` is empty/None and
+    Exception(missing_msg) when it does not exist.
+    """
+    if not path:
+        raise Exception(unspecified_msg)
+    if not os.path.exists(path):
+        raise Exception(missing_msg)
+def main():
+    """
+    Command line entry point: parse the sub-command and dispatch to it.
+    ``train-model`` validates its inputs and calls
+    ``eplacer.train_evaluate.train_sequence`` (no ``--geoData``) or
+    ``eplacer.train_evaluate.train_sequenceOBIS`` (with ``--geoData``).
+    ``run-model`` validates its inputs and calls
+    ``eplacer.run_model.gen_model_output_OBIS``; ``--geoData`` and
+    ``--counts`` are mandatory there.
+    Raises Exception for every failed input check and SystemExit for an
+    unknown command or after training finishes.
+    """
+    args = docopt(__doc__,
+                  version='',
+                  options_first=True)
+    command = args['<command>']
+    argv = [command] + args['<args>']
+    if command == 'train-model':
+        import eplacer.train_command
+        args = docopt(eplacer.train_command.__doc__, argv=argv)
+        # Output directory: default to data/models/ and never overwrite
+        # an existing directory unless --force was given
+        _resolve_out_dir(args, "data/models/")
+        # Check for existence of required input
+        _require_path(args['--fasta'],
+                      'No fasta file exists at this location! Exiting...\n',
+                      'No fasta file specified! Exiting...\n')
+        _require_path(args['--taxa'],
+                      'No taxa file exists at this location! Exiting...\n',
+                      'No taxa file specified! Exiting...\n')
+        # Default to the species level. Which may or may not work
+        if not args['--taxlevel']:
+            args['--taxlevel'] = "SPECIES"
+        # Check for the geo data. Set mode of running based on this.
+        if args['--geoData']:
+            if not os.path.exists(args['--geoData']):
+                raise Exception("GeoData path does not exist! Exiting\n")
+            sys.stdout.write("Setting mode to train on "
+                             "sequence and geographic data\n")
+            mode = 'sequence_geo'
+            kernel = int(args['--kernel']) if args['--kernel'] else 3
+            sigma = int(args['--sigma']) if args['--sigma'] else 1
+            precision = float(args['--precision']) if args['--precision'] else 3
+        else:
+            sys.stdout.write("Setting mode to train on "
+                             "sequence data only\n")
+            mode = 'sequence'
+        maskrate = float(args['--maskrate']) if args['--maskrate'] else 0
+        if args['--augments']:
+            augments = int(args['--augments'])
+            print("augments: ", augments)
+        else:
+            augments = 5
+        # Import explicitly instead of relying on train_command to pull the
+        # training module in as a side effect (mirrors the run-model branch)
+        import eplacer.train_evaluate
+        if mode == 'sequence':
+            # Pass the parsed augments/maskrate (with their defaults) rather
+            # than the raw command line strings, as the geo branch does
+            eplacer.train_evaluate.train_sequence(args['--fasta'], args['--taxa'], args['--taxlevel'], args['--out'],
+                                                  augments, maskrate)
+        else:
+            eplacer.train_evaluate.train_sequenceOBIS(args['--fasta'], args['--taxa'], args['--taxlevel'], args['--out'], args['--geoData'],
+                                                      augments, maskrate, sigma, kernel, precision)
+        sys.exit()
+    elif command == 'run-model':
+        import eplacer.run_command
+        import eplacer.run_model
+        args = docopt(eplacer.run_command.__doc__, argv=argv)
+        # Output directory: default to result/models/ and never overwrite
+        # an existing directory unless --force was given
+        _resolve_out_dir(args, "result/models/")
+        _require_path(args['--fasta'],
+                      'No fasta file exists at this location! Exiting...\n',
+                      'No fasta file specified! Exiting...\n')
+        _require_path(args['--blast'],
+                      'No blast result file exists at this location! Exiting...\n',
+                      'No blast result file specified! Exiting...\n')
+        _require_path(args['--model'],
+                      'No model file exists at this location! Exiting...\n',
+                      'No model file specified! Exiting...\n')
+        # NOTE(review): the key file name is fixed to SPECIES even when a
+        # different --taxlevel is requested -- confirm this is intended
+        args['--taxfile'] = str(args['--model']) + "/taxa_key_SPECIES.tsv"
+        if not os.path.exists(args['--taxfile']):
+            raise Exception('No taxfile file exists at this location! Your model directory may be corrupted. Exiting...\n')
+        if not args['--taxlevel']:
+            args['--taxlevel'] = "SPECIES"
+        # Always hand an int to the model runner, capped at the CPU count
+        cpu_count = multiprocessing.cpu_count()
+        if args['--threads']:
+            threads = int(args['--threads'])
+            if threads > cpu_count:
+                threads = cpu_count
+                print(f"Too many threads requested. setting to {cpu_count}")
+        else:
+            threads = cpu_count
+            print(f"No threads requested. setting to {cpu_count}")
+        args['--threads'] = threads
+        maskrate = float(args['--maskrate']) if args['--maskrate'] else 0
+        # Geographic data is mandatory when running a model
+        if not args['--geoData']:
+            raise Exception('No geoData available. Exiting...\n')
+        if not os.path.exists(args['--geoData']):
+            raise Exception("GeoData path does not exist! Exiting\n")
+        sys.stdout.write("Setting mode to train on "
+                         "sequence and geographic data\n")
+        if not args['--counts']:
+            raise Exception('Abundance matrix not specified! Exiting\n')
+        if not os.path.exists(args['--counts']):
+            raise Exception('The path to the count matrix does not exist! Exiting\n')
+        kernel = int(args['--kernel']) if args['--kernel'] else 3
+        sigma = int(args['--sigma']) if args['--sigma'] else 1
+        # The default is a float, so convert a user supplied value as well
+        args['--confidence'] = float(args['--confidence']) if args['--confidence'] else 0.9
+        eplacer.run_model.gen_model_output_OBIS(args['--confidence'], args['--blast'], args['--out'], args['--fasta'], args['--model'], args['--taxlevel'], args['--taxfile'], args['--geoData'], args['--counts'], maskrate, sigma, kernel, args['--threads'])
+    else:
+        # Previously an unknown command fell through and exited silently
+        sys.exit(f"'{command}' is not an eplacer command. See 'eplacer --help'.")

eplacer/data_prep.py ADDED Viewed

@@ -0,0 +1,97 @@
+"""
+This script defines some useful code for prepping datasets
+for ePlacer
+Author: Christopher Powers
+Institution: NOAA NEFSC PEMAD PBB
+"""
+import numpy as np
+from torch.utils.data import Dataset
+import torch
+import numpy as np
+def get_degenerate_bases():
+    """
+    Build the IUPAC nucleotide lookup table.
+    Returns a new dictionary on every call. Each key is an IUPAC code (or
+    the gap character '-') and each value is a list of the symbols that
+    code stands for. 'N' is listed as a gap.
+    """
+    iupac_table = (
+        ('A', 'A'),
+        ('C', 'C'),
+        ('G', 'G'),
+        ('T', 'T'),
+        ('R', 'AG'),     # Purine
+        ('Y', 'CT'),     # Pyrimidine
+        ('M', 'AC'),     # Amino
+        ('K', 'GT'),     # Keto
+        ('S', 'CG'),     # Strong
+        ('W', 'AT'),     # Weak
+        ('H', 'ACT'),    # not G
+        ('B', 'CGT'),    # not A
+        ('V', 'ACG'),    # not T
+        ('D', 'AGT'),    # not C
+        ('N', '-'),      # any base is not informative. Encode as gap
+        ('-', '-'),      # gap
+    )
+    # list() splits each string of symbols into single-character entries
+    return {code: list(symbols) for code, symbols in iupac_table}
+def encode_onehot(seq, mask_token = "N"):
+    """
+    One-hot encode a single nucleotide sequence.
+    A, C, G and T each get their own channel, a gap ('-') is all zeros and
+    ``mask_token`` is encoded as 0.25 in every channel. Any other IUPAC code
+    from get_degenerate_bases() is encoded as the average of the bases it
+    can stand for. Symbols that are not in the table (including lower case
+    letters) are encoded as all zeros.
+    Args:
+        seq: iterable of single-character symbols, e.g. a string.
+        mask_token: symbol that is encoded as the uniform vector.
+    Returns:
+        numpy float array of shape (len(seq), 4); (0, 4) for an empty
+        sequence.
+    """
+    mapping = {
+        "A": [1., 0., 0., 0.],
+        "C": [0., 1., 0., 0.],
+        "G": [0., 0., 1., 0.],
+        "T": [0., 0., 0., 1.],
+        "-": [0., 0., 0., 0.],
+        mask_token: [0.25, 0.25, 0.25, 0.25]
+    }
+    # Add the averaged encodings for the degenerate codes. Codes that are
+    # already present (canonical bases, gap, mask token) are left untouched.
+    for base, possibilities in get_degenerate_bases().items():
+        if base in mapping or not possibilities:
+            continue
+        avg_encoding = np.zeros(4)
+        for canonical_base in possibilities:
+            if canonical_base in mapping:
+                avg_encoding += np.array(mapping[canonical_base])
+        mapping[base] = (avg_encoding / len(possibilities)).tolist()
+    unknown = [0., 0., 0., 0.]
+    rows = [mapping.get(base, unknown) for base in seq]
+    # reshape keeps the four channel axis even when the sequence is empty
+    return np.array(rows, dtype=float).reshape(-1, 4)
+class SeqGeoDataset(Dataset):
+    """
+    Dataset that stores the one-hot encoded data alongside
+    the geographic data
+    Args:
+        sequences: indexable collection of sequences, each passed to
+            encode_onehot(). All encodings are stacked, so they presumably
+            need equal length (e.g. aligned sequences) -- confirm with caller.
+        labels: indexable collection of numeric class labels.
+        geo_data: indexable collection of numpy arrays, one per sequence.
+    Each item is the tuple (encoded sequence, geographic tensor, label).
+    """
+    def __init__(self, sequences, labels, geo_data):
+        # Copy by position, exactly as the inputs are indexed elsewhere
+        self.seqs = [sequences[i] for i in range(len(sequences))]
+        self.taxa_labels = [labels[i] for i in range(len(labels))]
+        self.geo = [geo_data[i] for i in range(len(geo_data))]
+        # Encode once; the previous version built ohe_seqs and labels twice
+        self.ohe_seqs = torch.stack([torch.from_numpy(encode_onehot(x)).float() for x in self.seqs])
+        self.ohe_geo = torch.stack([torch.from_numpy(x).float() for x in self.geo])
+        self.labels = torch.Tensor(self.taxa_labels).long()
+    def __len__(self):
+        """Number of sequences held by the dataset."""
+        return len(self.seqs)
+    def __getitem__(self, idx):
+        """Return (one-hot sequence, geographic tensor, label) for ``idx``."""
+        return self.ohe_seqs[idx], self.ohe_geo[idx], self.labels[idx]

eplacer/external.py ADDED Viewed

@@ -0,0 +1,60 @@
+"""
+This script runs mafft as a subprocess and generates
+an alignment.
+Author: Christopher Powers
+Institution: NOAA NEFSC PEMAD PBB
+"""
+import subprocess
+def run_mafft(input, reference, moutput, subset_output, threads):
+    """
+    Run the MAFFT alignment to add your sequences to a new fasta
+    Args:
+        input: fasta file with the sequences to add.
+        reference: fasta file passed to MAFFT as the existing alignment.
+        moutput: path that receives the full MAFFT output.
+        subset_output: path that receives only the records whose header
+            matches a header of ``input``.
+        threads: value for MAFFT's --thread option.
+    Returns:
+        dict mapping every header in ``moutput`` (without '>' and without
+        the '_R_' prefix) to its upper-cased sequence.
+    A failing or missing MAFFT is reported on stdout and the (then empty)
+    output is still parsed, so the result is an empty dict in that case.
+    """
+    # Get initial IDs; a set keeps the membership test below O(1)
+    with open(input, "r") as infile:
+        names = {line[1:].rstrip() for line in infile if line.startswith(">")}
+    # Argument list instead of a shell string: paths with spaces or shell
+    # metacharacters are passed through verbatim, and a missing executable
+    # really raises FileNotFoundError
+    command = ["mafft", "--add", input, "--adjustdirection", "--thread", str(threads),
+               "--keeplength", "--reorder", reference]
+    with open(moutput, "w") as aligned:
+        try:
+            print("Beginning subprocess...")
+            print("Aligning with mafft...")
+            subprocess.run(command, stdout=aligned, check=True)
+        except subprocess.CalledProcessError as e:
+            print(f"MAFFT exection failed with error: {e}")
+        except FileNotFoundError:
+            print("MAFFT not found. Is it installed/in the path?")
+    # subset the new file
+    key = ''
+    chunks = []
+    seqdict = {}
+    with open(moutput, "r") as infile:
+        for line in infile:
+            line = line.rstrip()
+            if line.startswith(">"):
+                if key != '':
+                    seqdict[key] = "".join(chunks).upper()
+                    chunks = []
+                # Drop the '_R_' prefix that marks reverse-complemented records
+                key = line[4:] if line.startswith(">_R_") else line[1:]
+            else:
+                chunks.append(line)
+    # Store the final record; it has no following header to trigger the save
+    if key != '':
+        seqdict[key] = "".join(chunks).upper()
+    with open(subset_output, "w") as outfile:
+        for s in seqdict:
+            if s in names:
+                outfile.write(f">{s}\n{seqdict[s]}\n")
+    return seqdict

eplacer/geographicRep.py ADDED Viewed

@@ -0,0 +1,133 @@
+"""
+This module contains utilities for representing
+the geographic distribution of a species.
+Author: Christopher Powers
+Institution: NOAA NEFSC PEMAD PBB
+"""
+import numpy as np
+import pygeohash
+from collections import defaultdict
+from scipy.spatial import cKDTree
+class SpeciesGeoEncoder:
+    """
+    Encode species occurrence coordinates as a fixed-length frequency vector.
+    The encoder lays an evenly spaced latitude/longitude grid over the
+    bounding box, and every occurrence is assigned to its nearest grid
+    point through a KD-tree.
+    """
+    def __init__(self, precision=3, min_lat=-90, max_lat=90, min_lon=-180, max_lon=180):
+        """
+        Initialize the encoder with a fixed geographic grid
+        Args:
+            precision: geohash precision; also sets the grid size, which is
+                int(32 * precision / 2) points per axis.
+            min_lat, max_lat, min_lon, max_lon: bounding box of the grid.
+        """
+        self.precision = precision
+        self.min_lat = min_lat
+        self.max_lat = max_lat
+        self.min_lon = min_lon
+        self.max_lon = max_lon
+        self.total_precision = 32*precision
+        self.lat_divisions = int(self.total_precision/2)
+        self.lon_divisions = int(self.total_precision/2)
+        # Create fixed grid of geohashes and corresponding lat/lon points
+        self.grid_geohashes, self.grid_points = self._create_geohash_grid()
+        self.feature_dimension = len(self.grid_geohashes)
+        # Create KD-tree for efficient nearest neighbor search
+        self.kdtree = cKDTree(self.grid_points)
+        # Create mapping from geohash to index for faster lookup.
+        # When several grid points share a geohash the last index wins.
+        self.geohash_to_index = {ghash: idx for idx, ghash in enumerate(self.grid_geohashes)}
+    def _create_geohash_grid(self):
+        """
+        Create a fixed grid of geohashes covering the area of interest
+        Returns (geohashes, points): a list of geohash strings and an array
+        of shape (lat_divisions * lon_divisions, 2) holding [lat, lon] rows.
+        """
+        geohashes = []
+        points = []
+        # Create evenly spaced grid points
+        lats = np.linspace(self.min_lat, self.max_lat, self.lat_divisions)
+        lons = np.linspace(self.min_lon, self.max_lon, self.lon_divisions)
+        hash_precision = int(self.precision)
+        # Generate geohash for each grid point
+        for lat in lats:
+            for lon in lons:
+                geohashes.append(pygeohash.encode(lat, lon, precision=hash_precision))
+                points.append([lat, lon])
+        return geohashes, np.array(points)
+    def encode_species(self, species_locations, min_frequency=0.00165):
+        """
+        Encode species location data using the fixed geohash grid
+        Args:
+            species_locations: mapping of species to a sequence of
+                coordinate pairs (list or numpy array); None or an empty
+                sequence yields an all-zero vector. The pairs are compared
+                against [lat, lon] grid points.
+            min_frequency: normalized frequencies below this value are set
+                to zero. The vector is not re-normalized afterwards.
+        Returns:
+            defaultdict mapping each species to a vector of length
+            ``feature_dimension``; unknown species give a zero vector.
+        Prints the species and its non-zero frequencies before and after
+        thresholding.
+        """
+        encoded_data = defaultdict(lambda:np.zeros(self.feature_dimension))
+        for species, locations in species_locations.items():
+            # Initialize array for counts
+            counts = np.zeros(self.feature_dimension)
+            # len() instead of truthiness so numpy arrays are accepted too
+            if locations is None or len(locations) == 0:
+                encoded_data[species] = counts
+                continue
+            # Convert locations to numpy array for batch processing
+            locations_array = np.array(locations)
+            # Find nearest grid points for all locations at once
+            _, indices = self.kdtree.query(locations_array)
+            # Count occurrences using numpy
+            unique_indices, occurrence_counts = np.unique(indices, return_counts=True)
+            counts[unique_indices] = occurrence_counts
+            # Normalize counts
+            total = np.sum(counts)
+            if total > 0:
+                counts = counts / total
+            print(species)
+            print(counts[np.nonzero(counts)])
+            # null distribution. This is 1/100 of the the null distribution
+            # NOTE(review): derivation of the 0.00165 default is not shown here
+            counts[counts < min_frequency] = 0
+            print(counts[np.nonzero(counts)])
+            encoded_data[species] = counts
+        return encoded_data
+    def get_feature_dimension(self):
+        """Return the fixed dimension of the feature space"""
+        return self.feature_dimension
+    def save_grid(self, filepath):
+        """
+        Save the grid information to a file
+        The information is written with numpy.save as a pickled dictionary;
+        numpy appends '.npy' to ``filepath`` when it is missing.
+        """
+        grid_info = {
+            'precision': self.precision,
+            'min_lat': self.min_lat,
+            'max_lat': self.max_lat,
+            'min_lon': self.min_lon,
+            'max_lon': self.max_lon,
+            'lat_divisions': self.lat_divisions,
+            'lon_divisions': self.lon_divisions,
+            'grid_geohashes': self.grid_geohashes,
+            'grid_points': self.grid_points,
+            'feature_dimension': self.feature_dimension
+        }
+        print("Saving Grid Info: ")
+        for i in grid_info:
+            if i != "grid_geohashes":
+                print("{}:{}".format(i, grid_info[i]))
+        np.save(filepath, grid_info)
+    @classmethod
+    def load_grid(cls, filepath):
+        """
+        Load a saved grid
+        Rebuilds the encoder from the stored precision and bounding box and
+        checks that the regenerated grid points equal the stored ones.
+        Raises ValueError when they differ.
+        """
+        # SECURITY: allow_pickle=True executes pickled code; only load grid
+        # files that come from a trusted source.
+        try:
+            stored = np.load(filepath, allow_pickle=True)
+        except FileNotFoundError:
+            # save_grid() lets numpy append '.npy', so retry with the suffix
+            if str(filepath).endswith(".npy"):
+                raise
+            stored = np.load(str(filepath) + ".npy", allow_pickle=True)
+        grid_info = stored.item()
+        encoder = cls(
+            precision=grid_info['precision'],
+            min_lat=grid_info['min_lat'],
+            max_lat=grid_info['max_lat'],
+            min_lon=grid_info['min_lon'],
+            max_lon=grid_info['max_lon']
+        )
+        # Verify grid matches; an explicit raise survives ``python -O``
+        if not np.array_equal(encoder.grid_points, grid_info['grid_points']):
+            raise ValueError("Saved grid points do not match the regenerated grid")
+        return encoder