PyamilySeq 0.0.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyamilySeq/Constants.py CHANGED
@@ -1 +1 @@
1
- PyamilySeq_Version = 'v0.0.1'
1
+ PyamilySeq_Version = 'v0.2.0'
@@ -57,6 +57,27 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
57
57
  # edge_list_outfile.write(line + '\n')
58
58
 
59
59
 
60
def wrap_sequence(sequence, width=60):
    """Return *sequence* broken into lines of at most *width* characters.

    Args:
        sequence: The sequence string to wrap.
        width: Maximum number of characters per output line (default 60).

    Returns:
        The wrapped sequence with lines joined by newlines and no trailing
        newline. An empty sequence yields an empty string.

    Raises:
        ValueError: If *width* is smaller than 1.
    """
    # A negative width would make range() empty and silently return "",
    # dropping the whole sequence; a zero width would fail inside range()
    # with an unrelated-looking message. Reject both explicitly.
    if width < 1:
        raise ValueError(f"width must be a positive integer, got {width!r}")
    return "\n".join(
        sequence[start:start + width]
        for start in range(0, len(sequence), width)
    )
65
+
66
+
67
def read_fasta(fasta_file):
    """Read a FASTA file into a dictionary of header -> sequence.

    Blank lines are skipped and surrounding whitespace is stripped from
    every line. The full header line (minus the leading '>') is used as the
    key. If a header occurs more than once, the last record replaces the
    earlier one.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A dict mapping each header to its sequence, with the sequence lines
        concatenated into a single string.

    Raises:
        ValueError: If sequence data appears before the first '>' header.
    """
    sequence_chunks = {}
    current_header = None
    with open(fasta_file, 'r') as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            line = raw_line.strip()
            if not line:
                continue  # Skip empty lines
            if line.startswith('>'):
                current_header = line[1:]  # Remove '>' character
                sequence_chunks[current_header] = []
            elif current_header is None:
                # Without this guard the data would be looked up under a
                # None key and surface as an unhelpful KeyError.
                raise ValueError(
                    f"{fasta_file}: sequence data found before the first "
                    f"'>' header (line {line_number})"
                )
            else:
                sequence_chunks[current_header].append(line)
    # Join once per record instead of growing a string line by line.
    return {header: ''.join(chunks) for header, chunks in sequence_chunks.items()}
60
81
 
61
82
 
62
83
  def reorder_dict_by_keys(original_dict, sorted_keys):
@@ -77,25 +98,25 @@ def get_cores(options,genome_dict):
77
98
  first = False
78
99
  prev_top = calculated_floor
79
100
  first_core_group = 'first_core_' + group
80
- cores[first_core_group] = 0
101
+ cores[first_core_group] = []
81
102
  if options.reclustered != None:
82
103
  extended_core_group = 'extended_core_' + group
83
- cores[extended_core_group] = 0
104
+ cores[extended_core_group] = []
84
105
  combined_core_group = 'combined_core_' + group
85
- cores[combined_core_group] = 0
106
+ cores[combined_core_group] = []
86
107
  second_core_group = 'second_core_' + group
87
- cores[second_core_group] = 0
108
+ cores[second_core_group] = []
88
109
  only_second_core_group = 'only_second_core_' + group
89
- cores[only_second_core_group] = 0
110
+ cores[only_second_core_group] = []
90
111
  return cores, groups
91
112
 
92
113
  #@profile
93
- def calc_First_only_core(pep_num, groups, cores):
114
+ def calc_First_only_core(cluster, pep_num, groups, cores):
94
115
  groups_as_list = list(groups.values())
95
116
  for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num <= fir):
96
117
  res = idx
97
118
  family_group = list(groups)[res]
98
- cores['first_core_'+family_group] +=1
119
+ cores['first_core_'+family_group].append(cluster)
99
120
 
100
121
  #@profile
101
122
  def calc_single_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count gene families extended with StORFs
@@ -103,7 +124,7 @@ def calc_single_First_extended_Second_only_core(pep_num, groups, cores, second_n
103
124
  for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num+second_num <= fir):
104
125
  res = idx
105
126
  family_group = list(groups)[res]
106
- cores['extended_core_' + family_group] += 1
127
+ cores['extended_core_' + family_group].append(pep_num)
107
128
 
108
129
 
109
130
  #@profile
@@ -484,10 +505,10 @@ def cluster(options):
484
505
  for cluster, numbers in pangenome_clusters_Type_sorted.items():
485
506
  ############################### Calculate First only
486
507
  if numbers[0] == 1 and numbers[1] >=2:
487
- calc_First_only_core(numbers[1],groups,cores)
508
+ calc_First_only_core(cluster, numbers[1],groups,cores)
488
509
  counter +=1
489
510
  elif numbers[0] >1 and numbers[1] >=2:
490
- calc_First_only_core(numbers[2][0],groups,cores)
511
+ calc_First_only_core(cluster, numbers[2][0],groups,cores)
491
512
  counter += 1
492
513
 
493
514
  if options.reclustered != None:
@@ -517,11 +538,35 @@ def cluster(options):
517
538
  for key_prefix in key_order:
518
539
  for key, value in cores.items():
519
540
  if key.startswith(key_prefix):
520
- print(f"{key}: {value}")
541
+ print(f"{key}: {len(value)}")
521
542
 
522
543
  if options.gene_presence_absence_out != None:
523
544
  gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
524
545
 
546
+ if options.write_families != None and options.fasta != None:
547
+ sequences = read_fasta(options.fasta)
548
+ input_dir = os.path.dirname(os.path.abspath(options.clusters))
549
+ output_dir = os.path.join(input_dir, 'Gene_Families_Output')
550
+
551
+ # Create output directory if it doesn't exist
552
+ if not os.path.exists(output_dir):
553
+ os.makedirs(output_dir)
554
+ for key_prefix in key_order:
555
+ for key, values in cores.items():
556
+ if any(part in options.write_families.split(',') for part in key.split('_')):
557
+ if key.startswith(key_prefix):
558
+ for value in values:
559
+ output_filename = f"{key}_{value}.fasta"
560
+ sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
561
+ # Write sequences to output file that are in the sequences dictionary
562
+ with open(os.path.join(output_dir, output_filename), 'w') as outfile:
563
+ for header in sequences_to_write:
564
+ if header in sequences:
565
+ outfile.write(f">{header}\n")
566
+ wrapped_sequence = wrap_sequence(sequences[header])
567
+ outfile.write(f"{wrapped_sequence}\n")
568
+
569
+
525
570
 
526
571
  def main():
527
572
 
@@ -534,13 +579,20 @@ def main():
534
579
  required.add_argument('-f', action='store', dest='format', choices=['CD-HIT', 'CSV', 'TSV'],
535
580
  help='Which format to use (CD-HIT or Comma/Tab Separated Edge-List (such as MMseqs2 tsv output))', required=True)
536
581
 
582
+ output_args = parser.add_argument_group('Output Parameters')
583
+ output_args.add_argument('-w', action="store", dest='write_families', default="99",
584
+ help='Default - No output: Output sequences of identified families (provide levels at which to output "-w 99 95"'
585
+ ' - Must provide FASTA file with -fasta')
586
+ output_args.add_argument('-fasta', action='store', dest='fasta',
587
+ help='FASTA file to use in conjunction with "-w"',
588
+ required=False)
537
589
 
538
590
  optional = parser.add_argument_group('Optional Arguments')
539
591
  optional.add_argument('-rc', action='store', dest='reclustered', help='Clustering output file from secondary round of clustering',
540
592
  required=False)
541
593
  optional.add_argument('-st', action='store', dest='sequence_tag', help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
542
594
  required=False)
543
- optional.add_argument('-groups', action="store", dest='core_groups', default="99,80,15",
595
+ optional.add_argument('-groups', action="store", dest='core_groups', default="99,95,90,80,15",
544
596
  help='Default - (\'99,95,90,80,15\'): Gene family groups to use')
545
597
  optional.add_argument('-gpa', action='store', dest='gene_presence_absence_out', help='Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
546
598
  required=False)
@@ -562,6 +614,11 @@ def main():
562
614
  if options.sequence_tag == None:
563
615
  options.sequence_tag = 'StORF'
564
616
 
617
+
618
+ if options.write_families != None and options.fasta == False:
619
+ exit("-fasta must br provided if -w is used")
620
+
621
+
565
622
  options.clusters = os.path.normpath(options.clusters)
566
623
  options.clusters = os.path.realpath(options.clusters)
567
624
  if options.reclustered:
@@ -573,6 +630,10 @@ def main():
573
630
 
574
631
  cluster(options)
575
632
 
633
+
634
+
635
+
636
+
576
637
  print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
577
638
  "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
578
639
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PyamilySeq
3
- Version: 0.0.2
3
+ Version: 0.2.0
4
4
  Summary: PyamilySeq - A tool to look for sequence-based gene families identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
5
5
  Home-page: https://github.com/NickJD/PyamilySeq
6
6
  Author: Nicholas Dimonaco
@@ -31,7 +31,37 @@ PyamilySeq requires Python 3.6 or higher. Install dependencies using pip:
31
31
  pip install PyamilySeq
32
32
  ```
33
33
 
34
- ## Usage
34
+ ## Usage - Menu
35
+ ```
36
+ PyamilySeq_Species.py -h
37
+ usage: PyamilySeq_Species.py [-h] -c CLUSTERS -f {CD-HIT,CSV,TSV} [-w WRITE_FAMILIES] [-fasta FASTA] [-rc RECLUSTERED] [-st SEQUENCE_TAG]
38
+ [-groups CORE_GROUPS] [-gpa GENE_PRESENCE_ABSENCE_OUT] [-verbose {True,False}] [-v]
39
+
40
+ PyamilySeq v0.2.0: PyamilySeq Run Parameters.
41
+
42
+ Required Arguments:
43
+ -c CLUSTERS Clustering output file from CD-HIT, TSV or CSV Edge List
44
+ -f {CD-HIT,CSV,TSV} Which format to use (CD-HIT or Comma/Tab Separated Edge-List (such as MMseqs2 tsv output))
45
+
46
+ Output Parameters:
47
+ -w WRITE_FAMILIES Default - No output: Output sequences of identified families (provide levels at which to output "-w 99 95" - Must provide
48
+ FASTA file with -fasta
49
+ -fasta FASTA FASTA file to use in conjunction with "-w"
50
+
51
+ Optional Arguments:
52
+ -rc RECLUSTERED Clustering output file from secondary round of clustering
53
+ -st SEQUENCE_TAG Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences
54
+ -groups CORE_GROUPS Default - ('99,95,90,80,15'): Gene family groups to use
55
+ -gpa GENE_PRESENCE_ABSENCE_OUT
56
+ Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other
57
+ downstream tools
58
+
59
+ Misc:
60
+ -verbose {True,False}
61
+ Default - False: Print out runtime messages
62
+ -v Default - False: Print out version number and exit
63
+
64
+ ```
35
65
 
36
66
  ### Clustering Analysis
37
67
 
@@ -58,6 +88,7 @@ Replace `reclustered_file` with the path to the file containing additional seque
58
88
  PyamilySeq generates various outputs, including:
59
89
 
60
90
  - **Gene Presence-Absence File**: This CSV file details the presence and absence of genes across genomes.
91
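+ - **FASTA Files for Each Gene Family**: When `-w` and `-fasta` are provided, the sequences of each selected gene family are written to a separate FASTA file in a `Gene_Families_Output` directory created next to the clustering input file.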
61
92
 
62
93
  ## Gene Family Groups
63
94
 
@@ -0,0 +1,11 @@
1
+ PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py,sha256=UzQ5iOKCNfurxmj1pnkowF11YfWBO5vnBCKxQK6goB8,26538
2
+ PyamilySeq/Constants.py,sha256=3Nr6JfUVt2eZT4M7fV-sz_bPXIvPgxIBT5nR76kCPIo,30
3
+ PyamilySeq/PyamilySeq_Species.py,sha256=SCWeK7bEfnKLrfzliiOx7Jtmie8vvAXGtQE_PpJD5hY,31040
4
+ PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ PyamilySeq/combine_FASTA_with_genome_IDs.py,sha256=aMUVSk6jKnKX0g04RMM360QueZS83lRLqLLysBtQbLo,2009
6
+ PyamilySeq-0.2.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
7
+ PyamilySeq-0.2.0.dist-info/METADATA,sha256=FUiZzxQzqnOwokb7MflZCMUzK9JgFVUVzEvLBPAlpgk,4144
8
+ PyamilySeq-0.2.0.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
9
+ PyamilySeq-0.2.0.dist-info/entry_points.txt,sha256=zGtA2Ycf0LG3PR7zuuT0wjaAKLFxtyGgBc0O_W7E250,66
10
+ PyamilySeq-0.2.0.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
11
+ PyamilySeq-0.2.0.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py,sha256=UzQ5iOKCNfurxmj1pnkowF11YfWBO5vnBCKxQK6goB8,26538
2
- PyamilySeq/Constants.py,sha256=hrbTdmPUFEzLfGZOPoQPV0NsAG-VnfIX51291vqb1C8,30
3
- PyamilySeq/PyamilySeq_Species.py,sha256=34NHcViENyAdvGRltNUbfWjEcNCYnsmbuhDdl8__mH0,28209
4
- PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- PyamilySeq/combine_FASTA_with_genome_IDs.py,sha256=aMUVSk6jKnKX0g04RMM360QueZS83lRLqLLysBtQbLo,2009
6
- PyamilySeq-0.0.2.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
7
- PyamilySeq-0.0.2.dist-info/METADATA,sha256=v6hOL3kekqt8H5YhjpS6uQOF1QSFcBh4Zy-jNW3xDTk,2550
8
- PyamilySeq-0.0.2.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
9
- PyamilySeq-0.0.2.dist-info/entry_points.txt,sha256=zGtA2Ycf0LG3PR7zuuT0wjaAKLFxtyGgBc0O_W7E250,66
10
- PyamilySeq-0.0.2.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
11
- PyamilySeq-0.0.2.dist-info/RECORD,,