PyamilySeq 0.0.2__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Constants.py +1 -1
- PyamilySeq/PyamilySeq_Species.py +73 -12
- {PyamilySeq-0.0.2.dist-info → PyamilySeq-0.2.0.dist-info}/METADATA +33 -2
- PyamilySeq-0.2.0.dist-info/RECORD +11 -0
- PyamilySeq-0.0.2.dist-info/RECORD +0 -11
- {PyamilySeq-0.0.2.dist-info → PyamilySeq-0.2.0.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.0.2.dist-info → PyamilySeq-0.2.0.dist-info}/WHEEL +0 -0
- {PyamilySeq-0.0.2.dist-info → PyamilySeq-0.2.0.dist-info}/entry_points.txt +0 -0
- {PyamilySeq-0.0.2.dist-info → PyamilySeq-0.2.0.dist-info}/top_level.txt +0 -0
PyamilySeq/Constants.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
PyamilySeq_Version = 'v0.0
|
|
1
|
+
PyamilySeq_Version = 'v0.2.0'
|
PyamilySeq/PyamilySeq_Species.py
CHANGED
|
@@ -57,6 +57,27 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
|
|
|
57
57
|
# edge_list_outfile.write(line + '\n')
|
|
58
58
|
|
|
59
59
|
|
|
60
|
+
def wrap_sequence(sequence, width=60):
    """Break *sequence* into newline-separated lines of at most *width* characters.

    Args:
        sequence: The sequence string to wrap.
        width: Maximum number of characters per output line (default 60).

    Returns:
        The wrapped text, lines joined by a newline character with no
        trailing newline. An empty *sequence* yields an empty string.

    Raises:
        ValueError: If *width* is less than 1.
    """
    if width < 1:
        # A zero step makes range() fail with an unrelated message, and a
        # negative step produces no chunks at all, which would silently
        # drop the entire sequence from the output.
        raise ValueError(f"width must be a positive integer, got {width!r}")
    return "\n".join(
        sequence[start:start + width]
        for start in range(0, len(sequence), width)
    )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def read_fasta(fasta_file):
    """Read a FASTA file into a dictionary mapping header to sequence.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A dict whose keys are the header lines with the leading '>' removed
        (surrounding whitespace stripped) and whose values are the record's
        sequence lines concatenated without separators. Blank lines are
        ignored. If a header occurs more than once, the later record
        replaces the earlier one.

    Raises:
        ValueError: If sequence data appears before the first header line.
        OSError: If the file cannot be opened.
    """
    # Collect sequence lines per header and join once at the end rather than
    # growing each sequence string with repeated concatenation.
    parts_by_header = {}
    current_header = None
    with open(fasta_file, 'r') as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            line = raw_line.strip()
            if not line:
                continue  # Skip empty lines
            if line.startswith('>'):
                current_header = line[1:]  # Remove '>' character
                parts_by_header[current_header] = []
            elif current_header is None:
                # Without this guard the data would be filed under a None key
                # and fail with an uninformative KeyError.
                raise ValueError(
                    f"{fasta_file}: sequence data found on line {line_number} "
                    "before any '>' header"
                )
            else:
                parts_by_header[current_header].append(line)
    return {header: ''.join(parts) for header, parts in parts_by_header.items()}
|
|
60
81
|
|
|
61
82
|
|
|
62
83
|
def reorder_dict_by_keys(original_dict, sorted_keys):
|
|
@@ -77,25 +98,25 @@ def get_cores(options,genome_dict):
|
|
|
77
98
|
first = False
|
|
78
99
|
prev_top = calculated_floor
|
|
79
100
|
first_core_group = 'first_core_' + group
|
|
80
|
-
cores[first_core_group] =
|
|
101
|
+
cores[first_core_group] = []
|
|
81
102
|
if options.reclustered != None:
|
|
82
103
|
extended_core_group = 'extended_core_' + group
|
|
83
|
-
cores[extended_core_group] =
|
|
104
|
+
cores[extended_core_group] = []
|
|
84
105
|
combined_core_group = 'combined_core_' + group
|
|
85
|
-
cores[combined_core_group] =
|
|
106
|
+
cores[combined_core_group] = []
|
|
86
107
|
second_core_group = 'second_core_' + group
|
|
87
|
-
cores[second_core_group] =
|
|
108
|
+
cores[second_core_group] = []
|
|
88
109
|
only_second_core_group = 'only_second_core_' + group
|
|
89
|
-
cores[only_second_core_group] =
|
|
110
|
+
cores[only_second_core_group] = []
|
|
90
111
|
return cores, groups
|
|
91
112
|
|
|
92
113
|
#@profile
|
|
93
|
-
def calc_First_only_core(pep_num, groups, cores):
|
|
114
|
+
def calc_First_only_core(cluster, pep_num, groups, cores):
|
|
94
115
|
groups_as_list = list(groups.values())
|
|
95
116
|
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num <= fir):
|
|
96
117
|
res = idx
|
|
97
118
|
family_group = list(groups)[res]
|
|
98
|
-
cores['first_core_'+family_group]
|
|
119
|
+
cores['first_core_'+family_group].append(cluster)
|
|
99
120
|
|
|
100
121
|
#@profile
|
|
101
122
|
def calc_single_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count gene families extended with StORFs
|
|
@@ -103,7 +124,7 @@ def calc_single_First_extended_Second_only_core(pep_num, groups, cores, second_n
|
|
|
103
124
|
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num+second_num <= fir):
|
|
104
125
|
res = idx
|
|
105
126
|
family_group = list(groups)[res]
|
|
106
|
-
cores['extended_core_' + family_group]
|
|
127
|
+
cores['extended_core_' + family_group].append(pep_num)
|
|
107
128
|
|
|
108
129
|
|
|
109
130
|
#@profile
|
|
@@ -484,10 +505,10 @@ def cluster(options):
|
|
|
484
505
|
for cluster, numbers in pangenome_clusters_Type_sorted.items():
|
|
485
506
|
############################### Calculate First only
|
|
486
507
|
if numbers[0] == 1 and numbers[1] >=2:
|
|
487
|
-
calc_First_only_core(numbers[1],groups,cores)
|
|
508
|
+
calc_First_only_core(cluster, numbers[1],groups,cores)
|
|
488
509
|
counter +=1
|
|
489
510
|
elif numbers[0] >1 and numbers[1] >=2:
|
|
490
|
-
calc_First_only_core(numbers[2][0],groups,cores)
|
|
511
|
+
calc_First_only_core(cluster, numbers[2][0],groups,cores)
|
|
491
512
|
counter += 1
|
|
492
513
|
|
|
493
514
|
if options.reclustered != None:
|
|
@@ -517,11 +538,35 @@ def cluster(options):
|
|
|
517
538
|
for key_prefix in key_order:
|
|
518
539
|
for key, value in cores.items():
|
|
519
540
|
if key.startswith(key_prefix):
|
|
520
|
-
print(f"{key}: {value}")
|
|
541
|
+
print(f"{key}: {len(value)}")
|
|
521
542
|
|
|
522
543
|
if options.gene_presence_absence_out != None:
|
|
523
544
|
gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
|
|
524
545
|
|
|
546
|
+
if options.write_families != None and options.fasta != None:
|
|
547
|
+
sequences = read_fasta(options.fasta)
|
|
548
|
+
input_dir = os.path.dirname(os.path.abspath(options.clusters))
|
|
549
|
+
output_dir = os.path.join(input_dir, 'Gene_Families_Output')
|
|
550
|
+
|
|
551
|
+
# Create output directory if it doesn't exist
|
|
552
|
+
if not os.path.exists(output_dir):
|
|
553
|
+
os.makedirs(output_dir)
|
|
554
|
+
for key_prefix in key_order:
|
|
555
|
+
for key, values in cores.items():
|
|
556
|
+
if any(part in options.write_families.split(',') for part in key.split('_')):
|
|
557
|
+
if key.startswith(key_prefix):
|
|
558
|
+
for value in values:
|
|
559
|
+
output_filename = f"{key}_{value}.fasta"
|
|
560
|
+
sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
|
|
561
|
+
# Write sequences to output file that are in the sequences dictionary
|
|
562
|
+
with open(os.path.join(output_dir, output_filename), 'w') as outfile:
|
|
563
|
+
for header in sequences_to_write:
|
|
564
|
+
if header in sequences:
|
|
565
|
+
outfile.write(f">{header}\n")
|
|
566
|
+
wrapped_sequence = wrap_sequence(sequences[header])
|
|
567
|
+
outfile.write(f"{wrapped_sequence}\n")
|
|
568
|
+
|
|
569
|
+
|
|
525
570
|
|
|
526
571
|
def main():
|
|
527
572
|
|
|
@@ -534,13 +579,20 @@ def main():
|
|
|
534
579
|
required.add_argument('-f', action='store', dest='format', choices=['CD-HIT', 'CSV', 'TSV'],
|
|
535
580
|
help='Which format to use (CD-HIT or Comma/Tab Separated Edge-List (such as MMseqs2 tsv output))', required=True)
|
|
536
581
|
|
|
582
|
+
output_args = parser.add_argument_group('Output Parameters')
|
|
583
|
+
output_args.add_argument('-w', action="store", dest='write_families', default="99",
|
|
584
|
+
help='Default - No output: Output sequences of identified families (provide levels at which to output "-w 99 95"'
|
|
585
|
+
' - Must provide FASTA file with -fasta')
|
|
586
|
+
output_args.add_argument('-fasta', action='store', dest='fasta',
|
|
587
|
+
help='FASTA file to use in conjunction with "-w"',
|
|
588
|
+
required=False)
|
|
537
589
|
|
|
538
590
|
optional = parser.add_argument_group('Optional Arguments')
|
|
539
591
|
optional.add_argument('-rc', action='store', dest='reclustered', help='Clustering output file from secondary round of clustering',
|
|
540
592
|
required=False)
|
|
541
593
|
optional.add_argument('-st', action='store', dest='sequence_tag', help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
|
|
542
594
|
required=False)
|
|
543
|
-
optional.add_argument('-groups', action="store", dest='core_groups', default="99,80,15",
|
|
595
|
+
optional.add_argument('-groups', action="store", dest='core_groups', default="99,95,90,80,15",
|
|
544
596
|
help='Default - (\'99,95,90,80,15\'): Gene family groups to use')
|
|
545
597
|
optional.add_argument('-gpa', action='store', dest='gene_presence_absence_out', help='Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
|
|
546
598
|
required=False)
|
|
@@ -562,6 +614,11 @@ def main():
|
|
|
562
614
|
if options.sequence_tag == None:
|
|
563
615
|
options.sequence_tag = 'StORF'
|
|
564
616
|
|
|
617
|
+
|
|
618
|
+
if options.write_families != None and options.fasta == False:
|
|
619
|
+
exit("-fasta must br provided if -w is used")
|
|
620
|
+
|
|
621
|
+
|
|
565
622
|
options.clusters = os.path.normpath(options.clusters)
|
|
566
623
|
options.clusters = os.path.realpath(options.clusters)
|
|
567
624
|
if options.reclustered:
|
|
@@ -573,6 +630,10 @@ def main():
|
|
|
573
630
|
|
|
574
631
|
cluster(options)
|
|
575
632
|
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
|
|
576
637
|
print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
|
|
577
638
|
"Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
|
|
578
639
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: PyamilySeq
|
|
3
|
-
Version: 0.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: PyamilySeq - A tool to look for sequence-based gene families identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
|
|
5
5
|
Home-page: https://github.com/NickJD/PyamilySeq
|
|
6
6
|
Author: Nicholas Dimonaco
|
|
@@ -31,7 +31,37 @@ PyamilySeq requires Python 3.6 or higher. Install dependencies using pip:
|
|
|
31
31
|
pip install PyamilySeq
|
|
32
32
|
```
|
|
33
33
|
|
|
34
|
-
## Usage
|
|
34
|
+
## Usage - Menu
|
|
35
|
+
```
|
|
36
|
+
PyamilySeq_Species.py -h
|
|
37
|
+
usage: PyamilySeq_Species.py [-h] -c CLUSTERS -f {CD-HIT,CSV,TSV} [-w WRITE_FAMILIES] [-fasta FASTA] [-rc RECLUSTERED] [-st SEQUENCE_TAG]
|
|
38
|
+
[-groups CORE_GROUPS] [-gpa GENE_PRESENCE_ABSENCE_OUT] [-verbose {True,False}] [-v]
|
|
39
|
+
|
|
40
|
+
PyamilySeq v0.2.0: PyamilySeq Run Parameters.
|
|
41
|
+
|
|
42
|
+
Required Arguments:
|
|
43
|
+
-c CLUSTERS Clustering output file from CD-HIT, TSV or CSV Edge List
|
|
44
|
+
-f {CD-HIT,CSV,TSV} Which format to use (CD-HIT or Comma/Tab Separated Edge-List (such as MMseqs2 tsv output))
|
|
45
|
+
|
|
46
|
+
Output Parameters:
|
|
47
|
+
-w WRITE_FAMILIES Default - No output: Output sequences of identified families (provide levels at which to output "-w 99 95" - Must provide
|
|
48
|
+
FASTA file with -fasta
|
|
49
|
+
-fasta FASTA FASTA file to use in conjunction with "-w"
|
|
50
|
+
|
|
51
|
+
Optional Arguments:
|
|
52
|
+
-rc RECLUSTERED Clustering output file from secondary round of clustering
|
|
53
|
+
-st SEQUENCE_TAG Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences
|
|
54
|
+
-groups CORE_GROUPS Default - ('99,95,90,80,15'): Gene family groups to use
|
|
55
|
+
-gpa GENE_PRESENCE_ABSENCE_OUT
|
|
56
|
+
Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other
|
|
57
|
+
downstream tools
|
|
58
|
+
|
|
59
|
+
Misc:
|
|
60
|
+
-verbose {True,False}
|
|
61
|
+
Default - False: Print out runtime messages
|
|
62
|
+
-v Default - False: Print out version number and exit
|
|
63
|
+
|
|
64
|
+
```
|
|
35
65
|
|
|
36
66
|
### Clustering Analysis
|
|
37
67
|
|
|
@@ -58,6 +88,7 @@ Replace `reclustered_file` with the path to the file containing additional seque
|
|
|
58
88
|
PyamilySeq generates various outputs, including:
|
|
59
89
|
|
|
60
90
|
- **Gene Presence-Absence File**: This CSV file details the presence and absence of genes across genomes.
|
|
91
|
+
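- **FASTA Files for Each Gene Family**: When `-w` is used together with `-fasta`, the sequences of each gene family in the selected groups are written to separate FASTA files in a `Gene_Families_Output` directory created next to the input clusters file.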
|
|
61
92
|
|
|
62
93
|
## Gene Family Groups
|
|
63
94
|
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py,sha256=UzQ5iOKCNfurxmj1pnkowF11YfWBO5vnBCKxQK6goB8,26538
|
|
2
|
+
PyamilySeq/Constants.py,sha256=3Nr6JfUVt2eZT4M7fV-sz_bPXIvPgxIBT5nR76kCPIo,30
|
|
3
|
+
PyamilySeq/PyamilySeq_Species.py,sha256=SCWeK7bEfnKLrfzliiOx7Jtmie8vvAXGtQE_PpJD5hY,31040
|
|
4
|
+
PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
PyamilySeq/combine_FASTA_with_genome_IDs.py,sha256=aMUVSk6jKnKX0g04RMM360QueZS83lRLqLLysBtQbLo,2009
|
|
6
|
+
PyamilySeq-0.2.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
7
|
+
PyamilySeq-0.2.0.dist-info/METADATA,sha256=FUiZzxQzqnOwokb7MflZCMUzK9JgFVUVzEvLBPAlpgk,4144
|
|
8
|
+
PyamilySeq-0.2.0.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
|
|
9
|
+
PyamilySeq-0.2.0.dist-info/entry_points.txt,sha256=zGtA2Ycf0LG3PR7zuuT0wjaAKLFxtyGgBc0O_W7E250,66
|
|
10
|
+
PyamilySeq-0.2.0.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
|
|
11
|
+
PyamilySeq-0.2.0.dist-info/RECORD,,
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py,sha256=UzQ5iOKCNfurxmj1pnkowF11YfWBO5vnBCKxQK6goB8,26538
|
|
2
|
-
PyamilySeq/Constants.py,sha256=hrbTdmPUFEzLfGZOPoQPV0NsAG-VnfIX51291vqb1C8,30
|
|
3
|
-
PyamilySeq/PyamilySeq_Species.py,sha256=34NHcViENyAdvGRltNUbfWjEcNCYnsmbuhDdl8__mH0,28209
|
|
4
|
-
PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
-
PyamilySeq/combine_FASTA_with_genome_IDs.py,sha256=aMUVSk6jKnKX0g04RMM360QueZS83lRLqLLysBtQbLo,2009
|
|
6
|
-
PyamilySeq-0.0.2.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
7
|
-
PyamilySeq-0.0.2.dist-info/METADATA,sha256=v6hOL3kekqt8H5YhjpS6uQOF1QSFcBh4Zy-jNW3xDTk,2550
|
|
8
|
-
PyamilySeq-0.0.2.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
|
|
9
|
-
PyamilySeq-0.0.2.dist-info/entry_points.txt,sha256=zGtA2Ycf0LG3PR7zuuT0wjaAKLFxtyGgBc0O_W7E250,66
|
|
10
|
-
PyamilySeq-0.0.2.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
|
|
11
|
-
PyamilySeq-0.0.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|