phykit 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. phykit/__init__.py +0 -0
  2. phykit/__main__.py +6 -0
  3. phykit/helpers/__init__.py +0 -0
  4. phykit/helpers/boolean_argument_parsing.py +12 -0
  5. phykit/helpers/caching.py +201 -0
  6. phykit/helpers/files.py +125 -0
  7. phykit/helpers/parallel.py +305 -0
  8. phykit/helpers/stats_summary.py +64 -0
  9. phykit/helpers/streaming.py +152 -0
  10. phykit/phykit.py +2862 -0
  11. phykit/services/__init__.py +0 -0
  12. phykit/services/alignment/__init__.py +17 -0
  13. phykit/services/alignment/alignment_length.py +16 -0
  14. phykit/services/alignment/alignment_length_no_gaps.py +69 -0
  15. phykit/services/alignment/alignment_recoding.py +89 -0
  16. phykit/services/alignment/base.py +103 -0
  17. phykit/services/alignment/column_score.py +66 -0
  18. phykit/services/alignment/compositional_bias_per_site.py +98 -0
  19. phykit/services/alignment/create_concatenation_matrix.py +254 -0
  20. phykit/services/alignment/dna_threader.py +145 -0
  21. phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
  22. phykit/services/alignment/faidx.py +21 -0
  23. phykit/services/alignment/gc_content.py +94 -0
  24. phykit/services/alignment/pairwise_identity.py +159 -0
  25. phykit/services/alignment/parsimony_informative_sites.py +81 -0
  26. phykit/services/alignment/rcv.py +14 -0
  27. phykit/services/alignment/rcvt.py +47 -0
  28. phykit/services/alignment/rename_fasta_entries.py +53 -0
  29. phykit/services/alignment/sum_of_pairs_score.py +157 -0
  30. phykit/services/alignment/variable_sites.py +54 -0
  31. phykit/services/base.py +9 -0
  32. phykit/services/tree/__init__.py +29 -0
  33. phykit/services/tree/base.py +178 -0
  34. phykit/services/tree/bipartition_support_stats.py +48 -0
  35. phykit/services/tree/branch_length_multiplier.py +37 -0
  36. phykit/services/tree/collapse_branches.py +27 -0
  37. phykit/services/tree/covarying_evolutionary_rates.py +272 -0
  38. phykit/services/tree/dvmc.py +37 -0
  39. phykit/services/tree/evolutionary_rate.py +17 -0
  40. phykit/services/tree/hidden_paralogy_check.py +128 -0
  41. phykit/services/tree/internal_branch_stats.py +77 -0
  42. phykit/services/tree/internode_labeler.py +33 -0
  43. phykit/services/tree/last_common_ancestor_subtree.py +35 -0
  44. phykit/services/tree/lb_score.py +196 -0
  45. phykit/services/tree/monophyly_check.py +106 -0
  46. phykit/services/tree/nearest_neighbor_interchange.py +140 -0
  47. phykit/services/tree/patristic_distances.py +113 -0
  48. phykit/services/tree/polytomy_test.py +546 -0
  49. phykit/services/tree/print_tree.py +28 -0
  50. phykit/services/tree/prune_tree.py +40 -0
  51. phykit/services/tree/rename_tree_tips.py +64 -0
  52. phykit/services/tree/rf_distance.py +136 -0
  53. phykit/services/tree/root_tree.py +35 -0
  54. phykit/services/tree/saturation.py +209 -0
  55. phykit/services/tree/spurious_sequence.py +75 -0
  56. phykit/services/tree/terminal_branch_stats.py +87 -0
  57. phykit/services/tree/tip_labels.py +18 -0
  58. phykit/services/tree/tip_to_tip_distance.py +41 -0
  59. phykit/services/tree/tip_to_tip_node_distance.py +41 -0
  60. phykit/services/tree/total_tree_length.py +25 -0
  61. phykit/services/tree/treeness.py +16 -0
  62. phykit/services/tree/treeness_over_rcv.py +40 -0
  63. phykit/version.py +1 -0
  64. phykit-2.1.0.dist-info/METADATA +150 -0
  65. phykit-2.1.0.dist-info/RECORD +69 -0
  66. phykit-2.1.0.dist-info/WHEEL +5 -0
  67. phykit-2.1.0.dist-info/entry_points.txt +121 -0
  68. phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
  69. phykit-2.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,254 @@
1
+ import sys
2
+ import os
3
+ from textwrap import dedent
4
+ from typing import Dict, List, Tuple
5
+ from Bio import SeqIO
6
+ from Bio.SeqRecord import SeqRecord
7
+ from concurrent.futures import ProcessPoolExecutor, as_completed
8
+ from functools import partial
9
+ import multiprocessing as mp
10
+ from collections import defaultdict
11
+
12
+ from .base import Alignment
13
+ from ...helpers.files import read_single_column_file_to_list
14
+
15
+
16
class CreateConcatenationMatrix(Alignment):
    """Concatenate several FASTA alignments into one supermatrix.

    Given a file listing alignment paths (one per line) and an output
    prefix, three files are written:

    * ``<prefix>.fa``        - concatenated sequences, one entry per taxon
    * ``<prefix>.partition`` - coordinates of each alignment in the matrix
    * ``<prefix>.occupancy`` - per-alignment taxon occupancy report

    Taxa absent from an alignment are padded with ``?`` characters, one per
    column of that alignment.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self) -> None:
        """Build the concatenation matrix from the parsed arguments."""
        self.create_concatenation_matrix(
            self.alignment_list_path,
            self.prefix,
        )

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments onto the keyword arguments of the base class."""
        return dict(alignment_list_path=args.alignment_list, prefix=args.prefix)

    def read_alignment_paths(self, alignment_list_path: str) -> List[str]:
        """Return the alignment paths listed in ``alignment_list_path``.

        Prints a message and exits with status 2 when the list file is missing.
        """
        try:
            return read_single_column_file_to_list(alignment_list_path)
        except FileNotFoundError:
            print("Alignment list file (-a) is not found. Please check pathing.")
            sys.exit(2)

    @staticmethod
    def _get_taxa_from_alignment(alignment_path: str) -> set:
        """Extract taxa names from a single alignment file."""
        return {seq_record.id for seq_record in SeqIO.parse(alignment_path, "fasta")}

    def get_taxa_names(self, alignment_paths: List[str]) -> List[str]:
        """Return the sorted union of record ids found in all alignments.

        Files are scanned in worker processes when more than ten alignments
        are given, and sequentially otherwise.
        """
        taxa = set()

        if len(alignment_paths) > 10:
            n_workers = min(mp.cpu_count(), len(alignment_paths))
            with ProcessPoolExecutor(max_workers=n_workers) as executor:
                futures = [
                    executor.submit(self._get_taxa_from_alignment, path)
                    for path in alignment_paths
                ]
                for future in as_completed(futures):
                    taxa.update(future.result())
        else:
            for alignment_path in alignment_paths:
                taxa.update(self._get_taxa_from_alignment(alignment_path))

        return sorted(taxa)

    def print_start_message(
        self,
        taxa: List[str],
        alignment_paths: List[str],
        file_partition: str,
        fasta_output: str,
        file_occupancy: str,
    ) -> None:
        """Print a summary of the input size and the output file names."""
        start_message = dedent(f"""
            --------------------
            | General features |
            --------------------
            Total number of taxa: {len(taxa)}
            Total number of alignments: {len(alignment_paths)}


            ----------------
            | Output files |
            ----------------
            Partition file output: {file_partition}
            Concatenated fasta output: {fasta_output}
            Occupancy report: {file_occupancy}
        """)
        print(start_message)

    def get_list_of_taxa_and_records(
        self, alignment_path: str
    ) -> Tuple[set, List[SeqRecord]]:
        """Parse one FASTA file and return ``(record ids, records)``."""
        records = list(SeqIO.parse(alignment_path, "fasta"))
        og_taxa = {record.id for record in records}
        return og_taxa, records

    def create_missing_seq_str(self, records: List[SeqRecord]) -> Tuple[str, int]:
        """Return ``(placeholder, length)`` for taxa missing from an alignment.

        The placeholder is a run of ``?`` as long as the first record.
        Prints a message and exits with status 2 when ``records`` is empty.
        """
        if not records:
            print("No sequence records found. Exiting...")
            sys.exit(2)

        og_len = len(records[0].seq)
        missing_seq = '?' * og_len
        return missing_seq, og_len

    def process_taxa_sequences(
        self,
        records: List[SeqRecord],
        taxa: List[str],
        concatenated_seqs: Dict[str, List[str]],
        missing_seq: str,
    ) -> None:
        """Append this alignment's sequences to ``concatenated_seqs`` in place.

        Every record contributes its own sequence; every taxon in ``taxa``
        without a record contributes ``missing_seq``.
        """
        present_taxa = {record.id for record in records}

        for record in records:
            concatenated_seqs[record.id].append(str(record.seq))

        # Sorted so that the order in which previously unseen taxa are first
        # inserted (and therefore the FASTA output order) does not depend on
        # set iteration order, which varies between interpreter runs.
        for taxon in sorted(set(taxa) - present_taxa):
            concatenated_seqs[taxon].append(missing_seq)

    def add_to_partition_info(
        self,
        partition_info: List[str],
        og_len: int,
        field_one: str,
        fasta: str,
        first_len: int,
        second_len: int,
    ) -> Tuple[List[str], int, int]:
        """Append one partition line and advance the running coordinates.

        Returns ``(partition_info, next_first_len, new_second_len)`` where the
        appended line covers columns ``first_len`` to ``second_len + og_len``.
        """
        second_len += og_len
        partition_info.append(f"{field_one}, {fasta}={first_len}-{second_len}\n")
        return partition_info, second_len + 1, second_len

    def add_to_occupancy_info(
        self,
        occupancy_info: List[str],
        present_taxa: set,
        taxa: List[str],
        fasta: str,
    ) -> List[str]:
        """Append one tab-separated occupancy line for ``fasta``.

        Columns: alignment name, number of taxa present, number missing,
        fraction present, and the ``;``-joined sorted names of missing taxa.
        """
        missing_taxa = sorted(set(taxa) - present_taxa)
        num_present = len(present_taxa)
        num_missing = len(missing_taxa)
        # Guard against an empty taxon list rather than dividing by zero.
        percent_occupancy = num_present / len(taxa) if taxa else 0.0
        occupancy_info.append(
            f"{fasta}\t{num_present}\t{num_missing}\t{percent_occupancy:.4f}\t{';'.join(missing_taxa)}\n"
        )
        return occupancy_info

    def fasta_file_write(self, fasta_output: str, concatenated_seqs: Dict[str, List[str]]) -> None:
        """Write one FASTA entry per taxon, joining its per-alignment pieces."""
        with open(fasta_output, "w", buffering=8192) as final_fasta_file:
            for taxon, sequences in concatenated_seqs.items():
                concatenated = ''.join(sequences)
                final_fasta_file.write(f">{taxon}\n{concatenated}\n")

    def write_occupancy_or_partition_file(self, info: List[str], output_file_name: str) -> None:
        """Write the pre-formatted lines in ``info`` to ``output_file_name``."""
        with open(output_file_name, "w") as f:
            f.writelines(info)

    @staticmethod
    def _process_alignment_file(alignment_path: str, taxa: List[str]) -> Tuple[str, Dict[str, str], set, int]:
        """Process a single alignment file and return its data.

        Returns ``(alignment_path, seq_dict, present_taxa, og_len)``.
        ``seq_dict`` maps every taxon in ``taxa`` to its sequence in this
        alignment, or to a ``?`` placeholder when absent. For a file without
        records ``seq_dict`` is empty and ``og_len`` is 0.
        """
        records = list(SeqIO.parse(alignment_path, "fasta"))
        present_taxa = {record.id for record in records}

        if not records:
            return alignment_path, {}, present_taxa, 0

        og_len = len(records[0].seq)
        missing_seq = '?' * og_len

        # Index records by id once instead of rescanning the record list for
        # every taxon; setdefault keeps the first record of a duplicated id.
        seq_by_id: Dict[str, str] = {}
        for record in records:
            seq_by_id.setdefault(record.id, str(record.seq))

        seq_dict = {taxon: seq_by_id.get(taxon, missing_seq) for taxon in taxa}

        return alignment_path, seq_dict, present_taxa, og_len

    def create_concatenation_matrix(self, alignment_list_path: str, prefix: str) -> None:
        """Concatenate all listed alignments and write the three output files.

        Alignments are parsed in worker processes when more than two are
        listed, and sequentially otherwise. In both cases an alignment that
        contains no records aborts the run with exit status 2.
        """
        alignment_paths = self.read_alignment_paths(alignment_list_path)
        taxa = self.get_taxa_names(alignment_paths)

        # Create the output directory if the prefix points into one.
        output_dir = os.path.dirname(prefix)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        file_partition = f"{prefix}.partition"
        fasta_output = f"{prefix}.fa"
        file_occupancy = f"{prefix}.occupancy"

        self.print_start_message(taxa, alignment_paths, file_partition, fasta_output, file_occupancy)

        # Running 1-based start / end coordinates for the partition file.
        first_len, second_len = 1, 0
        partition_info, occupancy_info = [], []
        concatenated_seqs = defaultdict(list)

        if len(alignment_paths) > 2:
            with ProcessPoolExecutor(max_workers=min(mp.cpu_count(), 8)) as executor:
                process_func = partial(self._process_alignment_file, taxa=taxa)
                # Results are keyed by path so they can be consumed in the
                # order of the input list rather than in completion order.
                futures = {executor.submit(process_func, path): path for path in alignment_paths}
                results = {}

                for future in as_completed(futures):
                    results[futures[future]] = future.result()

            for alignment_path in alignment_paths:
                _, seq_dict, present_taxa, og_len = results[alignment_path]

                # An alignment without records has an empty seq_dict; fail
                # the same way the sequential path does instead of raising
                # KeyError on the first taxon lookup.
                if not present_taxa:
                    print("No sequence records found. Exiting...")
                    sys.exit(2)

                for taxon in taxa:
                    concatenated_seqs[taxon].append(seq_dict[taxon])

                partition_info, first_len, second_len = self.add_to_partition_info(
                    partition_info, og_len, "AUTO", alignment_path, first_len, second_len
                )
                occupancy_info = self.add_to_occupancy_info(occupancy_info, present_taxa, taxa, alignment_path)
        else:
            for alignment_path in alignment_paths:
                present_taxa, records = self.get_list_of_taxa_and_records(alignment_path)
                missing_seq, og_len = self.create_missing_seq_str(records)

                self.process_taxa_sequences(records, taxa, concatenated_seqs, missing_seq)

                partition_info, first_len, second_len = self.add_to_partition_info(
                    partition_info, og_len, "AUTO", alignment_path, first_len, second_len
                )
                occupancy_info = self.add_to_occupancy_info(occupancy_info, present_taxa, taxa, alignment_path)

        # Write output files (a plain dict is handed to the writer).
        self.fasta_file_write(fasta_output, dict(concatenated_seqs))
        self.write_occupancy_or_partition_file(occupancy_info, file_occupancy)
        self.write_occupancy_or_partition_file(partition_info, file_partition)

        print("Complete!\n")
@@ -0,0 +1,145 @@
1
+ import sys
2
+ from typing import Dict, List
3
+ import numpy as np
4
+ from Bio import SeqIO
5
+ from Bio.Seq import Seq
6
+ from .base import Alignment
7
+
8
+
9
class DNAThreader(Alignment):
    """
    Threads DNA on top of protein alignment
    """

    # Protein characters that do not consume a codon from the nucleotide
    # sequence and are emitted as '---' in the threaded output.
    _GAP_CHARS = frozenset({'-', '?', '*', 'X', 'x'})

    def __init__(self, args) -> None:
        self.process_args(args)

    def process_args(self, args):
        """Copy the parsed CLI arguments onto the instance."""
        self.remove_stop_codon = args.stop
        self.protein_file_path = args.protein
        self.nucleotide_file_path = args.nucleotide
        self.clipkit_log_file = args.clipkit_log_file

    @property
    def clipkit_log_data(self) -> List[List[str]]:
        """Rows of the ClipKIT log, each split on single spaces.

        Returns ``None`` when no log file was given. The file is re-read on
        every access, so callers should bind the result to a local.
        """
        if self.clipkit_log_file:
            with open(self.clipkit_log_file) as f:
                return [line.rstrip("\n").split(" ") for line in f.readlines()]
        return None

    def run(self) -> None:
        """Thread the nucleotide sequences and print them as FASTA."""
        prot_records = SeqIO.parse(self.protein_file_path, "fasta")
        pal2nal = self.thread(prot_records)

        for gene_id, sequence in pal2nal.items():
            print(f">{gene_id}")
            print(f"{sequence}")

    def create_mask(self, length: int) -> List[bool]:
        """Return the per-nucleotide keep mask.

        Without a ClipKIT log every one of the ``length`` positions is kept.
        With a log, each row's second field ('keep' or not) is replicated
        three times, once per nucleotide of the corresponding codon.
        """
        # Bind once: the property re-reads the log file on each access.
        log_data = self.clipkit_log_data
        if not log_data:
            return [True] * length

        return [
            row[1] == "keep"
            for row in log_data
            for _ in range(3)
        ]

    def normalize_p_seq(self, p_seq: Seq) -> str:
        """Triplicate each amino acid so the protein has nucleotide length."""
        return ''.join(c * 3 for c in p_seq)

    def normalize_n_seq(self, n_seq: Seq, p_seq: Seq) -> str:
        """Lay the codons of ``n_seq`` out along the aligned protein ``p_seq``.

        ``p_seq`` must hold one character per aligned residue (NOT the
        triplicated form). Gap-like residues yield '---' without consuming a
        codon; every other residue consumes the next codon, or yields '---'
        once the codons are exhausted. The result is ``3 * len(p_seq)`` long.
        """
        # Pad a trailing partial codon so every piece is exactly three
        # characters and downstream columns stay aligned.
        codons = [str(n_seq[i:i + 3]).ljust(3, '-') for i in range(0, len(n_seq), 3)]
        normalized_n_seq = []

        codon_idx = 0
        for aa in p_seq:
            if aa not in self._GAP_CHARS and codon_idx < len(codons):
                normalized_n_seq.append(codons[codon_idx])
                codon_idx += 1
            else:
                # Gap in the protein, or fallback in case of misalignment.
                normalized_n_seq.append("---")

        return ''.join(normalized_n_seq)

    def thread(self, prot_records) -> Dict[str, str]:
        """Thread each nucleotide sequence onto its aligned protein.

        Returns a mapping of record id to threaded nucleotide sequence.
        Prints a message and exits with status 2 when the protein input is
        empty or a protein has no nucleotide record with the same id.
        """
        pal2nal = dict()
        prot_dict = SeqIO.to_dict(prot_records)

        if not prot_dict:
            print("Protein file is empty or incorrectly formatted.")
            sys.exit(2)

        # Only keep nucleotide records that have a matching protein.
        nucl_records = {
            record.id: record
            for record in SeqIO.parse(self.nucleotide_file_path, "fasta")
            if record.id in prot_dict
        }

        length = len(next(iter(prot_dict.values())).seq)
        # Explicit bool dtype so an empty mask still supports '&'.
        keep_mask_arr = np.array(self.create_mask(length * 3), dtype=bool)
        gap_chars = list(self._GAP_CHARS)

        for gene_id, protein_seq_record in prot_dict.items():
            if gene_id not in nucl_records:
                print(f"Nucleotide sequence for {gene_id} not found.")
                sys.exit(2)

            p_seq = protein_seq_record.seq
            n_seq = nucl_records[gene_id].seq

            normalized_p_seq = self.normalize_p_seq(p_seq)
            # Pass the per-residue protein: normalize_n_seq consumes one
            # codon per character, so the triplicated string would place
            # codons at the wrong columns whenever the protein has a gap.
            normalized_n_seq = self.normalize_n_seq(n_seq, p_seq)

            p_arr = np.array(list(normalized_p_seq), dtype='U1')
            n_arr = np.array(list(normalized_n_seq), dtype='U1')

            # p_arr is already at nucleotide resolution, so the non-gap mask
            # needs no further expansion.
            non_gap_mask = ~np.isin(p_arr, gap_chars)

            # Trim everything to a common length so the masks line up.
            min_len = min(len(non_gap_mask), len(keep_mask_arr), len(n_arr))
            non_gap_mask = non_gap_mask[:min_len]
            keep_mask_arr_trimmed = keep_mask_arr[:min_len]
            n_arr = n_arr[:min_len]

            final_mask = keep_mask_arr_trimmed & non_gap_mask
            result = np.where(final_mask, n_arr, '-')

            # NOTE(review): '*' is treated as a gap character, so n_arr holds
            # '---' at a terminal stop and this restore appears to be a
            # no-op; confirm the intended -s/--stop semantics.
            if self.remove_stop_codon and len(p_seq) > 0 and p_seq[-1] == "*":
                kept_indices = np.where(keep_mask_arr_trimmed)[0]
                if len(kept_indices) >= 3:
                    for idx in kept_indices[-3:]:
                        result[idx] = n_arr[idx]

            # Only keep positions marked in keep_mask.
            pal2nal[gene_id] = ''.join(result[keep_mask_arr_trimmed])

        return pal2nal
@@ -0,0 +1,85 @@
1
+ from collections import Counter
2
+ from typing import List, Dict
3
+ import numpy as np
4
+
5
+ from Bio.Align import MultipleSeqAlignment
6
+
7
+ from .base import Alignment
8
+
9
+
10
class EvolutionaryRatePerSite(Alignment):
    """Report a per-column diversity value for an alignment.

    For each column the value is one minus the sum of squared character
    frequencies among non-gap characters; columns with only gap characters
    score 0.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Print ``<1-based column index>\\t<value rounded to 4 places>`` per site."""
        alignment, _, _ = self.get_alignment_and_format()
        pic_values = self.calculate_evolutionary_rate_per_site(alignment)

        for idx, value in enumerate(pic_values):
            print(f"{idx + 1}\t{round(value, 4)}")

    def process_args(self, args):
        """Map parsed CLI arguments onto the keyword arguments of the base class."""
        return dict(alignment_file_path=args.alignment)

    def remove_gap_characters(self, seq: str, gap_chars: List[str]) -> str:
        """Drop every character found in ``gap_chars``, then upper-case the rest."""
        return ''.join([char for char in seq if char not in gap_chars]).upper()

    def get_number_of_occurrences_per_character(
        self,
        alignment: MultipleSeqAlignment,
        idx: int,
        gap_chars: List[str]
    ) -> Dict[str, int]:
        """Count the non-gap characters in column ``idx`` of ``alignment``."""
        seq_at_position = alignment[:, idx]
        clean_seq = self.remove_gap_characters(seq_at_position, gap_chars)

        return Counter(clean_seq)

    def calculate_pic(
        self,
        num_occurrences: Dict[str, int],
    ) -> float:
        """Return one minus the sum of squared frequencies of the counts.

        Returns 0 when there is nothing to count (e.g. an all-gap column),
        matching ``calculate_evolutionary_rate_per_site``.
        """
        total_frequencies = sum(num_occurrences.values())
        if total_frequencies == 0:
            # Avoid dividing by zero for an empty set of counts.
            return 0

        sum_of_frequencies = sum(
            (frequency / total_frequencies) ** 2
            for frequency in num_occurrences.values()
        )
        return 1 - sum_of_frequencies

    def calculate_evolutionary_rate_per_site(
        self,
        alignment: MultipleSeqAlignment,
    ) -> List[float]:
        """Return the per-column values for every column of ``alignment``."""
        aln_len = alignment.get_alignment_length()
        # Built once: the gap characters do not change between columns.
        gap_chars = list(set(self.get_gap_chars()))

        # Upper-cased character matrix, one row per record.
        alignment_array = np.array([
            [c.upper() for c in str(record.seq)]
            for record in alignment
        ], dtype='U1')

        pic_values = []

        for col_idx in range(aln_len):
            column = alignment_array[:, col_idx]
            filtered_column = column[~np.isin(column, gap_chars)]

            if len(filtered_column) > 0:
                _, counts = np.unique(filtered_column, return_counts=True)
                pic = 1 - np.sum((counts / len(filtered_column)) ** 2)
            else:
                # Nothing but gap characters in this column.
                pic = 0

            pic_values.append(pic)

        return pic_values
@@ -0,0 +1,21 @@
1
+ from typing import Dict
2
+
3
+ from Bio import SeqIO
4
+
5
+ from .base import Alignment
6
+
7
+
8
class Faidx(Alignment):
    """Print selected entries of a FASTA file by record id."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self) -> None:
        """Print each requested entry as ``>name`` followed by its sequence.

        ``self.entry`` is a comma-separated list of record ids; surrounding
        whitespace is stripped and empty items (e.g. from a trailing comma)
        are skipped. An id that is not in the file raises ``KeyError``.
        """
        record_dict = SeqIO.index(self.fasta, "fasta")
        try:
            for e in map(str.strip, self.entry.split(",")):
                if not e:
                    continue
                record = record_dict[e]
                print(f">{record.name}\n{record.seq}")
        finally:
            # The index keeps the FASTA file open; release it even when a
            # lookup fails.
            record_dict.close()

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments onto the keyword arguments of the base class."""
        return dict(fasta=args.fasta, entry=args.entry)
@@ -0,0 +1,94 @@
1
+ from enum import Enum
2
+ import re
3
+ import sys
4
+ from typing import Dict, Tuple
5
+ from collections import Counter
6
+ import numpy as np
7
+
8
+ from Bio.Align import MultipleSeqAlignment
9
+
10
+ from .base import Alignment
11
+ from ...helpers.files import get_alignment_and_format
12
+
13
+
14
class FileFormat(Enum):
    """Alignment file format names; each value is the format's string name."""

    fasta = "fasta"
    clustal = "clustal"
    maf = "maf"
    mauve = "mauve"
    phylip = "phylip"
    # The member name differs from its value because of the hyphen.
    phylip_seq = "phylip-sequential"
    stockholm = "stockholm"
22
+
23
+
24
class GCContent(Alignment):
    """Report GC content of nucleotide sequences, overall or per sequence."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Print GC content; exits with status 2 for protein input."""
        records, _, is_protein = get_alignment_and_format(self.fasta)

        if is_protein:
            print("GC content can't be calculated for protein sequences")
            sys.exit(2)

        if self.verbose:
            self.calculate_gc_per_sequence(records)
        else:
            self.calculate_gc_total(records)

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments onto the keyword arguments of the base class."""
        return dict(fasta=args.fasta, verbose=args.verbose)

    @staticmethod
    def _count_gc(seq: str, gap_chars: list) -> Tuple[int, int]:
        """Return ``(gc_count, non_gap_length)`` for one sequence.

        The sequence is upper-cased first; characters found in ``gap_chars``
        are then excluded from both numbers.
        """
        seq_arr = np.array(list(seq.upper()), dtype='U1')
        cleaned_seq = seq_arr[~np.isin(seq_arr, gap_chars)]
        gc_count = int(np.sum((cleaned_seq == 'G') | (cleaned_seq == 'C')))
        return gc_count, len(cleaned_seq)

    def calculate_gc_per_sequence(self, records: MultipleSeqAlignment) -> None:
        """Print ``<record id>\\t<GC fraction rounded to 4 places>`` per record.

        A record with no non-gap characters is reported as 0.
        """
        gap_chars = list(set(self.get_gap_chars()))

        for record in records:
            gc_count, seq_len = self._count_gc(str(record.seq), gap_chars)
            gc_content = gc_count / seq_len if seq_len > 0 else 0

            try:
                print(f"{record.id}\t{round(gc_content, 4)}")
            except BrokenPipeError:
                # The reader closed the pipe (e.g. `| head`); nothing more
                # can be written, so stop instead of retrying every record.
                return

    def calculate_gc_total(self, records: MultipleSeqAlignment) -> None:
        """Print the GC fraction over all records, rounded to 4 places.

        Prints an error and exits with status 2 when no non-gap characters
        are found, which includes an input without any records.
        """
        gap_chars = list(set(self.get_gap_chars()))

        # Accumulate per record; this also copes with zero records, where
        # concatenating an empty list of arrays would raise ValueError.
        total_gc = 0
        total_len = 0
        for record in records:
            gc_count, seq_len = self._count_gc(str(record.seq), gap_chars)
            total_gc += gc_count
            total_len += seq_len

        if total_len > 0:
            gc_content = round(total_gc / total_len, 4)
            print(gc_content)
        else:
            print(
                "Input file has an unacceptable format. Please check input file argument."
            )
            sys.exit(2)

    def remove_gaps_and_count_gc(self, seq: str) -> Tuple[str, int]:
        """Return ``(seq without gap characters, number of G/C characters)``.

        Gap characters are removed case-sensitively; G and C are counted
        case-insensitively. The returned sequence keeps its original case.
        """
        gap_chars = self.get_gap_chars()
        if gap_chars:
            pattern = "[" + "".join(re.escape(char) for char in gap_chars) + "]"
            cleaned_seq = re.sub(pattern, "", seq)
        else:
            # An empty character class "[]" is not a valid pattern.
            cleaned_seq = seq

        upper_seq = cleaned_seq.upper()
        gc_count = upper_seq.count("G") + upper_seq.count("C")

        return cleaned_seq, gc_count