PyPI - PyamilySeq - Versions diffs - 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl - Mend

PyamilySeq 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

PyamilySeq/Constants.py +1 -1
PyamilySeq/Group_Splitter.py +24 -9
PyamilySeq/PyamilySeq.py +8 -5
PyamilySeq/utils.py +19 -4
{PyamilySeq-0.8.0.dist-info → PyamilySeq-0.8.1.dist-info}/METADATA +12 -9
PyamilySeq-0.8.1.dist-info/RECORD +15 -0
PyamilySeq-0.8.0.dist-info/RECORD +0 -15
{PyamilySeq-0.8.0.dist-info → PyamilySeq-0.8.1.dist-info}/LICENSE +0 -0
{PyamilySeq-0.8.0.dist-info → PyamilySeq-0.8.1.dist-info}/WHEEL +0 -0
{PyamilySeq-0.8.0.dist-info → PyamilySeq-0.8.1.dist-info}/entry_points.txt +0 -0
{PyamilySeq-0.8.0.dist-info → PyamilySeq-0.8.1.dist-info}/top_level.txt +0 -0

PyamilySeq/Constants.py CHANGED Viewed

@@ -1,2 +1,2 @@
-PyamilySeq_Version = 'v0.8.0'
+PyamilySeq_Version = 'v0.8.1'

PyamilySeq/Group_Splitter.py CHANGED Viewed

@@ -110,11 +110,13 @@ def read_cd_hit_output(clustering_output):
                     clustered_header = clustered_info.split('>')[1].split('...')[0]
                     clustered_header = '>' + clustered_header
-                    if 'at +' in clustered_info:
-                        percent_identity = float(clustered_info.split('at +/')[1].strip().replace('%', ''))
+                    if 'at' in clustered_info:
+                        percent_identity = extract_identity(line)
-                    if '*' in line:
+                    elif '*' in line:
                         percent_identity = 100.0
+                    else:
+                        raise ValueError("Percent identity not found in the string.")
                     clusters[current_cluster_id].append({
                         'header': clustered_header,
@@ -130,7 +132,6 @@ def separate_groups(input_fasta, options, clustering_mode):
     paralog_groups = defaultdict(int)  # To track number of paralog groups
     for group_header, sequences in groups.items():
         group_name = group_header.split('|')[0]  # Get the group part (e.g., '>Group_n')
@@ -149,11 +150,12 @@ def separate_groups(input_fasta, options, clustering_mode):
         group_file_name = group_name.replace('>','')
-        temp_fasta = f"{options.output_dir}{group_file_name}.fasta"
+        temp_fasta = f"{options.output_dir}/{group_file_name}.fasta"
         write_fasta(sequences, temp_fasta)
         # Run cd-hit on the individual group
         clustering_output = f"{options.output_dir}/{group_file_name}_clustering"
         run_cd_hit(options, temp_fasta, clustering_output, clustering_mode)
         # Read the clustering results to find subgroups
@@ -255,7 +257,7 @@ def separate_groups(input_fasta, options, clustering_mode):
                 # Determine the next subgroup for this genome
                 subgroup_id = genome_count[genome] % num_subgroups
-                new_header = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}|{genome}|{header.split('|')[2]}"
+                new_header = f"{group_file_name}_subgroup_{subgroup_id}|{genome}|{header.split('|')[2]}"
                 subgroup_sequences[subgroup_id].append((new_header, seq))
                 # Increment the count for this genome
@@ -266,6 +268,12 @@ def separate_groups(input_fasta, options, clustering_mode):
                 subgroup_file = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}.fasta"
                 write_fasta(seqs, subgroup_file)
+                # Increment subgroup ID globally for the next subgroup
+                subgroup_id += 1
+                paralog_groups[group_name] += 1  # Count this group as a paralog group
         # Clean up temporary fasta file if the option is set
         if options.delete_temp_files:
             if temp_fasta and os.path.exists(temp_fasta):
@@ -288,6 +296,9 @@ def main():
     required.add_argument('-input_fasta', action='store', dest='input_fasta',
                           help='Input FASTA file containing gene groups.',
                           required=True)
+    required.add_argument('-sequence_type', action='store', dest='sequence_type', default='DNA',choices=['AA', 'DNA'],
+                          help='Default - DNA: Are groups "DNA" or "AA" sequences?',
+                          required=False)
     required.add_argument('-output_dir', action='store', dest='output_dir',
                           help='Output directory.',
                           required=True)
@@ -305,8 +316,8 @@ def main():
     optional.add_argument('-percent_threshold', action='store', dest='percent_threshold', type=float, default=80,
                           help='Minimum percentage of genomes with paralogs (default: 80.0)')
     optional.add_argument('-verbose', action='store_true', dest='verbose', help='Print verbose output.')
-    optional.add_argument('-delete_temp_files', action='store_true', dest='delete_temp_files',
-                          help='Delete all temporary files after processing.')
+    optional.add_argument('-no_delete_temp_files', action='store_false', dest='delete_temp_files',
+                          help='Default: Delete all temporary files after processing.')
     misc = parser.add_argument_group('Misc Arguments')
     misc.add_argument('-v', action='store_true', dest='version',
@@ -325,7 +336,11 @@ def main():
     if not os.path.exists(options.output_dir):
         os.makedirs(options.output_dir)
-    clustering_mode = 'cd-hit-est'
+    if options.sequence_type == 'DNA':
+        clustering_mode = 'cd-hit-est'
+    else:
+        clustering_mode = 'cd-hit'
     separate_groups(options.input_fasta, options, clustering_mode)
     print("Done")

PyamilySeq/PyamilySeq.py CHANGED Viewed

@@ -27,7 +27,7 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
         '-o', clustering_output,
         '-c', str(options.pident),
         '-s', str(options.len_diff),
-        '-T', str(options.clustering_threads),
+        '-T', str(options.threads),
         '-M', str(options.clustering_memory),
         '-d', "0",
         '-sc', "1",
@@ -84,8 +84,8 @@ def main():
     clustering_args.add_argument("-mem", action="store", dest="clustering_memory", type=int, default=4000,
                           help="Default 4000: Memory to be allocated for clustering (in MBs).",
                           required=False)
-    clustering_args.add_argument("-t", action="store", dest="clustering_threads", type=int, default=4,
-                          help="Default 4: Threads to be allocated for clustering.",
+    clustering_args.add_argument("-t", action="store", dest="threads", type=int, default=8,
+                          help="Default 8: Threads to be allocated for clustering and/or alignment.",
                           required=False)
     ###Partial-Mode Arguments
@@ -130,8 +130,9 @@ def main():
     ### Misc Arguments
     misc = parser.add_argument_group('Misc')
-    misc.add_argument('-verbose', action='store_true', dest='verbose', default=None,                        help='Default - False: Print out runtime messages',
-                        required = False)
+    misc.add_argument('-verbose', action='store_true', dest='verbose', default=None,
+                      help='Default - False: Print out runtime messages',
+                      required = False)
     misc.add_argument('-v', action='store_true', dest='version',
                         help='Default - False: Print out version number and exit',
                         required=False)
@@ -254,6 +255,7 @@ def main():
                 self.output_dir = options.output_dir
                 self.gene_presence_absence_out = options.gene_presence_absence_out
                 self.write_groups = options.write_groups
+                self.threads = options.threads
                 self.align_core = options.align_core
                 self.fasta = combined_out_file
                 self.verbose = options.verbose
@@ -272,6 +274,7 @@ def main():
                 self.output_dir = options.output_dir
                 self.gene_presence_absence_out = options.gene_presence_absence_out
                 self.write_groups = options.write_groups
+                self.threads = options.threads
                 self.align_core = options.align_core
                 self.fasta = options.original_fasta
                 self.verbose = options.verbose

PyamilySeq/utils.py CHANGED Viewed

@@ -6,6 +6,7 @@ import collections
 from tempfile import NamedTemporaryFile
 import sys
 from line_profiler_pycharm import profile
+import re
 ################### We are currently fixed using Table 11
@@ -110,12 +111,23 @@ def reverse_complement(seq):
     complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
     return ''.join(complement[base] for base in reversed(seq))
 def fix_path(path):
     fixed_path = os.path.normpath(path)
     fixed_path = os.path.realpath(fixed_path)
     return fixed_path
+def extract_identity(clustered_info):
+    # Use regular expressions to capture the percentage value at the end of the line
+    match = re.search(r'at ([-+]*)(\d+\.\d+)%', clustered_info)
+    if match:
+        percent_identity = float(match.group(2))  # Extract the percentage value
+        return percent_identity
+    else:
+        raise ValueError("Percent identity not found in the string.")
 def wrap_sequence(sequence, width=60):
     wrapped_sequence = []
     for i in range(0, len(sequence), width):
@@ -172,14 +184,15 @@ def run_mafft_on_sequences(options, sequences, output_file):
         with open(output_file, 'w') as output_f:
             if options.verbose == True:
                 subprocess.run(
-                    ['mafft', '--auto', temp_input_file_path],
+                    ['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
                     stdout=output_f,
                     stderr=sys.stderr,
                     check=True
                 )
             else:
                 subprocess.run(
-                    ['mafft', '--auto', temp_input_file_path],
+                    ['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
                     stdout=output_f,
                     stderr=subprocess.DEVNULL,  # Suppress stderr
                     check=True
@@ -385,7 +398,7 @@ def process_gene_families(options, directory, output_file):
     # Iterate over each gene family file
     for gene_file in os.listdir(directory):
-        if gene_file.endswith('.fasta'):
+        if gene_file.endswith('.fasta') and not gene_file.endswith('combined_group_sequences.fasta'):
             gene_path = os.path.join(directory, gene_file)
             # Read sequences from the gene family file
@@ -395,13 +408,15 @@ def process_gene_families(options, directory, output_file):
             longest_sequences = select_longest_gene(sequences)
             # Run mafft on the longest sequences
-            aligned_file = f"{gene_file}_aligned.fasta"
+            aligned_file = f"{directory}/{gene_file}_aligned.fasta.tmp"
             run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
             # Read aligned sequences and concatenate them
             aligned_sequences = read_fasta(aligned_file)
             for genome, aligned_seq in aligned_sequences.items():
                 genome_name = genome.split('|')[0]
+                if 'Group' in genome_name:
+                    print(2)
                 if genome_name not in concatenated_sequences:
                     concatenated_sequences[genome_name] = ""
                 concatenated_sequences[genome_name] += aligned_seq

{PyamilySeq-0.8.0.dist-info → PyamilySeq-0.8.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: PyamilySeq
-Version: 0.8.0
+Version: 0.8.1
 Summary: PyamilySeq - A a tool to look for sequence-based gene groups identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
 Home-page: https://github.com/NickJD/PyamilySeq
 Author: Nicholas Dimonaco
@@ -58,7 +58,7 @@ Escherichia_coli_110957|ENSB:TIZS9kbTvShDvyX	Escherichia_coli_110957|ENSB:TIZS9k
 ```
 ### Example output:
 ```
-Running PyamilySeq v0.8.0
+Running PyamilySeq v0.8.1
 Calculating Groups
 Gene Groups:
 First_core_99: 2682
@@ -80,7 +80,7 @@ PyamilySeq -run_mode Partial -group_mode Genus -clustering_format CD-HIT -output
  -cluster_file .../test_data/genus/CD-HIT/combined_cds_cd-hit_80_60.clstr -gpa
 ```
 ```commandline
-Running PyamilySeq v0.8.0
+Running PyamilySeq v0.8.1
 Calculating Groups
 Genus Groups:
 First_genera_1:	28549
@@ -137,14 +137,14 @@ Please report any issues to: https://github.com/NickJD/PyamilySeq/issues
 ## PyamilySeq - Menu:
 ### PyamilySeq is separated into two main 'run modes', Full and Partial. They each have their own set of required and optional arguments.
 ```
-Running PyamilySeq v0.8.0
+Running PyamilySeq v0.8.1
 usage: PyamilySeq.py [-h] -run_mode {Full,Partial} -group_mode {Species,Genus} -clustering_format {CD-HIT,TSV,CSV} -output_dir OUTPUT_DIR
                      [-input_type {separate,combined}] [-input_dir INPUT_DIR] [-name_split NAME_SPLIT] [-sequence_type {AA,DNA}] [-gene_ident GENE_IDENT]
                      [-pid PIDENT] [-len_diff LEN_DIFF] [-mem CLUSTERING_MEMORY] [-t CLUSTERING_THREADS] [-cluster_file CLUSTER_FILE]
                      [-reclustered RECLUSTERED] [-seq_tag SEQUENCE_TAG] [-core_groups CORE_GROUPS] [-genus_groups GENUS_GROUPS] [-w WRITE_GROUPS] [-a]
                      [-original_fasta ORIGINAL_FASTA] [-gpa] [-verbose] [-v]
-PyamilySeq v0.8.0: A tool that groups genes into unique clusters.
+PyamilySeq v0.8.1: A tool that groups genes into unique clusters.
 options:
   -h, --help            show this help message and exit
@@ -176,8 +176,9 @@ Full-Mode Arguments - Required when "-run_mode Full" is used:
 Clustering Runtime Arguments - Optional when "-run_mode Full" is used:
   -mem CLUSTERING_MEMORY
                         Default 4000: Memory to be allocated for clustering (in MBs).
-  -t CLUSTERING_THREADS
-                        Default 4: Threads to be allocated for clustering.
+  -t THREADS            Default 8: Threads to be allocated for clustering
+                        and/or alignment.
 Partial-Mode Arguments - Required when "-run_mode Partial" is used:
   -cluster_file CLUSTER_FILE
@@ -221,7 +222,7 @@ Seq-Combiner -input_dir .../test_data/genomes -name_split _combined.gff3 -output
 ```
 usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} -name_split NAME_SPLIT -output_dir OUTPUT_DIR -output_name OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
-Seq-Combiner v0.8.0: A tool to extract sequences from GFF/FASTA files.
+Seq-Combiner v0.8.1: A tool to extract sequences from GFF/FASTA files.
 options:
   -h, --help            show this help message and exit
@@ -254,7 +255,7 @@ Misc Arguments:
 usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -output_dir OUTPUT_DIR [-pident PIDENT] [-len_diff LEN_DIFF] [-clustering_threads CLUSTERING_THREADS]
                          [-clustering_memory CLUSTERING_MEMORY] [-percent_threshold PERCENT_THRESHOLD] [-verbose] [-delete_temp_files] [-v]
-Group-Splitter: v0.8.0: A tool to split "paralogous" groups identified by PyamilySeq.
+Group-Splitter: v0.8.1: A tool to split "paralogous" groups identified by PyamilySeq.
 options:
   -h, --help            show this help message and exit
@@ -262,6 +263,8 @@ options:
 Required Arguments:
   -input_fasta INPUT_FASTA
                         Input FASTA file containing gene groups.
+  -sequence_type {AA,DNA}
+                        Default - DNA: Are groups "DNA" or "AA" sequences?
   -output_dir OUTPUT_DIR
                         Output directory.

PyamilySeq-0.8.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,15 @@
+PyamilySeq/Constants.py,sha256=J_jZheqHCbmFVCLrY8nMe4T5VZQOQ7PbT_HmYSi58WM,31
+PyamilySeq/Group_Splitter.py,sha256=wrz-vcQ2gJ40MLLczFY8te35_uYrOBuh2v-fJSIVsWo,15578
+PyamilySeq/PyamilySeq.py,sha256=OAtz6b7dnvA-Qg0dnf2JXImiOtsDrDfVit7Q6DFbuPU,15265
+PyamilySeq/PyamilySeq_Genus.py,sha256=hC34cHIFu8YaXXgcPyVwuWENlsxx-7mT-Qr6PAdio4U,12414
+PyamilySeq/PyamilySeq_Species.py,sha256=spgS-h-lrySZBiOiB6jX6pPRaL5j8f5V1Hq3XOjBOko,14404
+PyamilySeq/Seq_Combiner.py,sha256=dPDu6LlT3B-ZDn3wKZ3AeWraDgv2Tub_16l9CLc3tQ0,3353
+PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+PyamilySeq/clusterings.py,sha256=rcWFv0IiWoS4aUNRjDDwNEL86l1wIKa4vK4htAxy8Hg,18787
+PyamilySeq/utils.py,sha256=vjPSIua4E72JTWlzH4CUaRcR-Z6Nr-RQ9N_92tfZI_w,19686
+PyamilySeq-0.8.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+PyamilySeq-0.8.1.dist-info/METADATA,sha256=weIjFQkc7ggqkPlPkSA5an8eFiUzhDyxGl9t7-rJPsA,14555
+PyamilySeq-0.8.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+PyamilySeq-0.8.1.dist-info/entry_points.txt,sha256=15BsozBN6vRWvZeQon05dY4YQT7DqP5i2TUqFWRGCvc,150
+PyamilySeq-0.8.1.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
+PyamilySeq-0.8.1.dist-info/RECORD,,

PyamilySeq-0.8.0.dist-info/RECORD DELETED Viewed

@@ -1,15 +0,0 @@
-PyamilySeq/Constants.py,sha256=lbVZv4vDHroA83KCDTIGuVb6bubKYZbwLmhYHxedXQc,31
-PyamilySeq/Group_Splitter.py,sha256=raZMV9SN7Qqw5Hci5qpkaahR66JMQf6dX8TvThjh3kU,14986
-PyamilySeq/PyamilySeq.py,sha256=0607A9nqafoQ8IhBxGgGJ-v3DVV6C6-LgzdDIXb2C-c,15179
-PyamilySeq/PyamilySeq_Genus.py,sha256=hC34cHIFu8YaXXgcPyVwuWENlsxx-7mT-Qr6PAdio4U,12414
-PyamilySeq/PyamilySeq_Species.py,sha256=spgS-h-lrySZBiOiB6jX6pPRaL5j8f5V1Hq3XOjBOko,14404
-PyamilySeq/Seq_Combiner.py,sha256=dPDu6LlT3B-ZDn3wKZ3AeWraDgv2Tub_16l9CLc3tQ0,3353
-PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-PyamilySeq/clusterings.py,sha256=rcWFv0IiWoS4aUNRjDDwNEL86l1wIKa4vK4htAxy8Hg,18787
-PyamilySeq/utils.py,sha256=6UtYJW3_0rDhEhvrJi6R3smvKu2n_bjqUkuzr5DcJM4,19061
-PyamilySeq-0.8.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-PyamilySeq-0.8.0.dist-info/METADATA,sha256=ZnpQvAQy5EXGrzS0G9y5qH2Rhmb0LW2HvOT-b5WJLoo,14436
-PyamilySeq-0.8.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-PyamilySeq-0.8.0.dist-info/entry_points.txt,sha256=15BsozBN6vRWvZeQon05dY4YQT7DqP5i2TUqFWRGCvc,150
-PyamilySeq-0.8.0.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
-PyamilySeq-0.8.0.dist-info/RECORD,,

{PyamilySeq-0.8.0.dist-info → PyamilySeq-0.8.1.dist-info}/LICENSE RENAMED Viewed

File without changes

{PyamilySeq-0.8.0.dist-info → PyamilySeq-0.8.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{PyamilySeq-0.8.0.dist-info → PyamilySeq-0.8.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{PyamilySeq-0.8.0.dist-info → PyamilySeq-0.8.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

PyamilySeq 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl

PyamilySeq 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl