dayhoff-tools 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dayhoff_tools/fasta.py ADDED
@@ -0,0 +1,1082 @@
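+ """Utilities for working with protein FASTA files: cleaning non-canonical
+ residues, combining and splitting files, subsetting by ID, de-duplicating,
+ converting to SQLite, and fetching sequences from UniProt."""
+ 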
1
+ import gzip
2
+ import logging
3
+ import math
4
+ import multiprocessing
5
+ import os
6
+ import re
7
+ import sqlite3
8
+ import time
9
+ from functools import partial
10
+ from pathlib import Path
11
+ from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
12
+
13
+ import requests
14
+ from Bio import SeqIO
15
+ from Bio.SeqRecord import SeqRecord
16
+ from tqdm import tqdm
17
+ from tqdm.notebook import tqdm_notebook
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ def _clean_noncanonical_fasta(
23
+ input_path: str,
24
+ output_path: Optional[str] = None,
25
+ split_char: str = " ",
26
+ id_field: int = 0,
27
+ ) -> Optional[dict[str, str]]:
28
+ """
29
+ Read in a FASTA file containing multiple sequences, replace non-canonical amino acids,
30
+ remove empty sequences, and either write the sequences to a new FASTA file or return them as a dictionary.
31
+
32
+ Args:
33
+ input_path (str): Path to the input FASTA file.
34
+ output_path (Optional[str]): Path to the output FASTA file. If None, the sequences are returned as a dictionary.
35
+ split_char (str): Character used to split the identifier in the header.
36
+ id_field (int): Field index for the identifier after splitting.
37
+
38
+ Returns:
39
+ Optional[dict[str, str]]: A dictionary with sequence identifiers as keys and cleaned sequences as values if output_path is None.
40
+ """
41
+ logger.info("Reading FASTA file: %s", input_path)
42
+ if output_path:
43
+ logger.info("Writing FASTA file: %s", output_path)
44
+
45
+ sequences = {}
46
+ with open(input_path, "r") as fasta_file:
47
+ seq_id = ""
48
+ seq_lines = []
49
+
50
+ for line in fasta_file:
51
+ if line.startswith(">"):
52
+ if seq_id and seq_lines:
53
+ seq = "".join(seq_lines).translate(str.maketrans("OJUZB", "XLCED"))
54
+ if seq.strip(): # Only process non-empty sequences
55
+ sequences[seq_id] = seq
56
+ if output_path:
57
+ with open(output_path, "a") as output_file:
58
+ output_file.write(f">{seq_id}\n{seq}\n")
59
+ seq_lines = []
60
+ seq_id = line[1:].strip().split(split_char)[id_field]
61
+ else:
62
+ seq_lines.append(line.strip().replace("-", "").upper())
63
+
64
+ # Process the last sequence
65
+ if seq_id and seq_lines:
66
+ seq = "".join(seq_lines).translate(str.maketrans("OJUZB", "XLCED"))
67
+ if seq.strip(): # Only process non-empty sequences
68
+ sequences[seq_id] = seq
69
+ if output_path:
70
+ with open(output_path, "a") as output_file:
71
+ output_file.write(f">{seq_id}\n{seq}\n")
72
+
73
+ logger.info("FASTA file processing completed.")
74
+ if not output_path:
75
+ return sequences
76
+
77
+
78
+ def _check_output_file(output_path: str) -> None:
79
+ """
80
+ Check if the output file already exists and raise an error if it does.
81
+
82
+ Args:
83
+ output_path (str): Path to the output file.
84
+
85
+ Raises:
86
+ FileExistsError: If the output file already exists.
87
+ """
88
+ if os.path.exists(output_path):
89
+ raise FileExistsError(f"Output file already exists: {output_path}")
90
+
91
+
92
+ def clean_noncanonical_fasta(
93
+ input_path: str, output_path: str, split_char: str = " ", id_field: int = 0
94
+ ):
95
+ """
96
+ Read in a FASTA file containing multiple sequences and write the sequences to a new FASTA file.
97
+ Replace non-canonical amino acids along the way.
98
+
99
+ Args:
100
+ input_path (str): Path to the input FASTA file.
101
+ output_path (str): Path to the output FASTA file.
102
+ split_char (str): Character used to split the identifier in the header.
103
+ id_field (int): Field index for the identifier after splitting.
104
+
105
+ Raises:
106
+ FileExistsError: If the output file already exists.
107
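+ 
+ Example (illustrative; file names are placeholders):
+ clean_noncanonical_fasta("raw_proteins.fasta", "cleaned_proteins.fasta")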
+ """
108
+ _check_output_file(output_path)
109
+ _clean_noncanonical_fasta(input_path, output_path, split_char, id_field)
110
+
111
+
112
+ def clean_noncanonical_fasta_to_dict(
113
+ input_path: str, split_char: str = " ", id_field: int = 0
114
+ ) -> dict[str, str]:
115
+ """
116
+ Read in a FASTA file containing multiple sequences and return the sequences as a dictionary.
117
+ Replace non-canonical amino acids along the way.
118
+
119
+ Args:
120
+ input_path (str): Path to the input FASTA file.
121
+ split_char (str): Character used to split the identifier in the header.
122
+ id_field (int): Field index for the identifier after splitting.
123
+
124
+ Returns:
125
+ dict[str, str]: A dictionary with sequence identifiers as keys and cleaned sequences as values.
126
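+ 
+ Example (illustrative; the path is a placeholder):
+ seqs = clean_noncanonical_fasta_to_dict("raw_proteins.fasta")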
+ """
127
+ ans = _clean_noncanonical_fasta(input_path, None, split_char, id_field)
128
+ if not ans:
129
+ return {}
130
+
131
+ return ans
132
+
133
+
134
+ def combine_fasta_files(input_path: Union[str, List[str]], output_path: str) -> None:
135
+ """Combine several FASTA files into one.
136
+ Args:
137
+ input_path (Union[str, List[str]]): Folder of fasta files or list of file paths to be combined.
138
+ output_path (str): Output path for the combined fasta file.
139
+
140
+ Raises:
141
+ FileExistsError: If the output file already exists.
142
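+ 
+ Example (illustrative paths; pass either a folder or an explicit list):
+ combine_fasta_files("fasta_parts/", "combined.fasta")
+ combine_fasta_files(["part_1.fasta", "part_2.faa"], "combined.fasta")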
+ """
143
+ _check_output_file(output_path)
144
+
145
+ if isinstance(input_path, str):
146
+ # If input_path is a string, treat it as a folder path
147
+ fasta_files = sorted(
148
+ [
149
+ os.path.join(input_path, file)
150
+ for file in os.listdir(input_path)
151
+ if file.endswith((".fasta", ".faa"))
152
+ ]
153
+ )
154
+ else:
155
+ # If input_path is a list, use it directly
156
+ fasta_files = input_path
157
+
158
+ total_size = sum(os.path.getsize(file) for file in fasta_files)
159
+
160
+ with open(output_path, "w") as outfile:
161
+ with tqdm(
162
+ total=total_size, unit="B", unit_scale=True, desc="Combining files"
163
+ ) as pbar:
164
+ for fasta_file in fasta_files:
165
+ file_size = os.path.getsize(fasta_file)
+ chars_before = pbar.n
166
+ with open(fasta_file, "r") as infile:
167
+ for chunk in iter(lambda: infile.read(8192), ""):
168
+ outfile.write(chunk)
169
+ pbar.update(len(chunk))
170
+ outfile.write("\n")
+ # Correct for any drift between characters read in text mode and bytes on disk.
171
+ pbar.update(file_size - (pbar.n - chars_before))
172
+
173
+ print(f"Combined {len(fasta_files)} .fasta files into {output_path}.")
174
+
175
+
176
+ def extract_uniprot_dat_to_fasta(
177
+ dat_file_path: str, fasta_file_path: str, max_entries: int | None = None
178
+ ):
179
+ """Extract all the sequences from a Uniprot DAT file into a FASTA file.
180
+
181
+ Args:
182
+ dat_file_path (str): Path to the input Uniprot DAT file.
183
+ fasta_file_path (str): Path to the output FASTA file.
184
+ max_entries (int | None, optional): Maximum number of entries to extract.
185
+ If None, all entries are processed.
186
+
187
+ Raises:
188
+ FileExistsError: If the output file already exists.
189
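+ 
+ Example (illustrative paths):
+ extract_uniprot_dat_to_fasta("uniprot_sprot.dat", "uniprot_sprot.fasta", max_entries=10000)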
+ """
190
+ _check_output_file(fasta_file_path)
191
+ start_time = time.time()
192
+
193
+ with open(dat_file_path, "r") as dat_file:
194
+ print("Calculating file length...")
195
+ total_lines = sum(1 for _ in dat_file)
196
+ print(f"File has {total_lines:,} lines.")
197
+
198
+ with open(dat_file_path, "r") as dat_file:
199
+ with open(fasta_file_path, "w") as fasta_file:
200
+ current_id = ""
201
+ sequence = ""
202
+ recording_sequence = False
203
+ entries_count = 0
204
+ batch_size = 1000
205
+ buffer = []
206
+ processed_lines = 0
207
+ update_interval = 1000000 # Update progress every 1 million lines
208
+
209
+ for line in dat_file:
210
+ processed_lines += 1
211
+
212
+ if line.startswith("AC"):
213
+ # Extract the ID
214
+ current_id = line.strip().split()[1].rstrip(";")
215
+ elif line.startswith("SQ"):
216
+ # Start recording the sequence lines after this
217
+ recording_sequence = True
218
+ sequence = ""
219
+ elif recording_sequence and line.startswith(" "):
220
+ # Concatenate sequence lines directly
221
+ sequence += line.strip()
222
+ elif line.startswith("//"):
223
+ # End of an entry; write to FASTA file if we have a sequence
224
+ if sequence and current_id:
225
+ buffer.append(f">{current_id}\n{sequence.replace(' ', '')}\n")
226
+ entries_count += 1
227
+
228
+ if len(buffer) >= batch_size:
229
+ fasta_file.write("".join(buffer))
230
+ buffer = []
231
+
232
+ # Reset for the next entry
233
+ recording_sequence = False
234
+ current_id = ""
235
+ sequence = ""
236
+
237
+ # Check if we've reached the maximum number of entries to extract
238
+ if max_entries and entries_count >= max_entries:
239
+ break
240
+
241
+ # Print progress update every update_interval lines
242
+ if processed_lines % update_interval == 0:
243
+ elapsed_time = time.time() - start_time
244
+ progress_percentage = (processed_lines / total_lines) * 100
245
+ print(
246
+ f"Done with {progress_percentage:.2f}% : {entries_count:,} sequences in {elapsed_time:.2f} seconds."
247
+ )
248
+
249
+ # Write any remaining entries in the buffer
250
+ if buffer:
251
+ fasta_file.write("".join(buffer))
252
+
253
+ end_time = time.time()
254
+ execution_time = end_time - start_time
255
+
256
+ # Print final count and execution time
257
+ print(f"\nTotal sequences processed: {entries_count:,}")
258
+ print(f"Total lines processed: {processed_lines:,}")
259
+ print(f"Execution time: {execution_time:.2f} seconds")
260
+
261
+
262
+ def split_fasta(
263
+ fasta_file: str,
264
+ target_folder: str,
265
+ base_name: str,
266
+ sequences_per_file: int = 1000,
267
+ max_files: Optional[int] = None,
268
+ ) -> int:
269
+ """Split a FASTA file into multiple smaller files within a target folder.
270
+
271
+ Args:
272
+ fasta_file (str): Path to the input FASTA file.
273
+ target_folder (str): Path to the folder where output files will be saved.
274
+ base_name (str): Used to make output filenames: eg, basename_1.fasta.
275
+ sequences_per_file (int): Number of sequences per output file.
276
+ max_files (int, optional): Maximum number of files to create. If None, all sequences are processed.
277
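+ 
+ Returns:
+ int: The number of output files created.
+ 
+ Example (illustrative; writes chunks/big_1.fasta, chunks/big_2.fasta, ...):
+ n_files = split_fasta("big.fasta", "chunks", base_name="big", sequences_per_file=5000)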
+ """
278
+ # Ensure the target folder exists
279
+ os.makedirs(target_folder, exist_ok=True)
280
+
281
+ # Initialize counters
282
+ file_count = 1
283
+ sequence_count = 0
284
+
285
+ # Open the large FASTA file for reading
286
+ with open(fasta_file, "r", buffering=1024 * 1024) as fasta:
287
+ # Prepare the first output file
288
+ output_file_path = os.path.join(
289
+ target_folder, f"{base_name}_{file_count}.fasta"
290
+ )
291
+ output_file = open(output_file_path, "w", buffering=1024 * 1024)
292
+
293
+ for line in fasta:
294
+ # Check if we've reached the maximum number of files, if specified
295
+ if max_files is not None and file_count > max_files:
296
+ break
297
+
298
+ # If line starts with ">", it's the beginning of a new sequence
299
+ if line.startswith(">"):
300
+ sequence_count += 1
301
+
302
+ # If we reached the limit, start a new file
303
+ if sequence_count > sequences_per_file:
304
+ # Close current file and open a new one
305
+ output_file.close()
306
+ print(f"File written: {output_file_path}")
307
+ file_count += 1
308
+ sequence_count = 1 # Reset sequence count for the new file
309
+
310
+ # Check again after incrementing file_count
311
+ if max_files is not None and file_count > max_files:
312
+ break
313
+
314
+ output_file_path = os.path.join(
315
+ target_folder, f"{base_name}_{file_count}.fasta"
316
+ )
317
+ output_file = open(output_file_path, "w", buffering=1024 * 1024)
318
+
319
+ # Write the line to the current output file
320
+ output_file.write(line)
321
+
322
+ # Close the last output file
323
+ output_file.close()
324
+
325
+ return file_count
326
+
327
+
328
+ def subtract_fasta_files(file1: str, file2: str, output_file: str):
329
+ """Load two fasta files and get their set subtraction as a new fasta file.
330
+ The tqdm progress bars shown while loading have no known total, so they only
331
+ report running counts, but that still gives some feedback.
332
+
333
+ Args:
334
+ file1 (str): File with everything you want
335
+ file2 (str): File with everything you don't want
336
+ output_file (str): Everything in file1, unless it's in file2
337
+
338
+ Raises:
339
+ FileExistsError: If the output file already exists.
340
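+ 
+ Example (illustrative paths):
+ subtract_fasta_files("all_hits.fasta", "training_set.fasta", "novel_hits.fasta")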
+ """
341
+ _check_output_file(output_file)
342
+
343
+ # Load sequences from file1 with a progress bar
344
+ print(f"Loading sequences from {file1}")
345
+ sequences_file1 = {
346
+ record.id: record
347
+ for record in tqdm(
348
+ SeqIO.parse(file1, "fasta"), desc="Loading sequences from file1"
349
+ )
350
+ }
351
+ print(f"Number of sequences in {file1}: {len(sequences_file1)}")
352
+
353
+ # Load sequences from file2 with a progress bar
354
+ print(f"Loading sequences from {file2}")
355
+ sequences_file2 = {
356
+ record.id: record
357
+ for record in tqdm(
358
+ SeqIO.parse(file2, "fasta"), desc="Loading sequences from file2"
359
+ )
360
+ }
361
+
362
+ print(f"Number of sequences in {file2}: {len(sequences_file2)}")
363
+
364
+ # Find sequences that are in file1 but not in file2
365
+ unique_sequences = [
366
+ record for id, record in sequences_file1.items() if id not in sequences_file2
367
+ ]
368
+
369
+ print(f"Number of UNIQUE sequences in {file1}: {len(unique_sequences)}")
370
+
371
+ # Write unique sequences to the output file with a progress bar
372
+ with open(output_file, "w") as output_handle:
373
+ for record in tqdm(
374
+ unique_sequences, desc="Writing unique sequences to output file"
375
+ ):
376
+ SeqIO.write(record, output_handle, "fasta")
377
+
378
+
379
+ def simplify_fasta_ids(
380
+ input_fasta: str, output_fasta: str, progress_interval=100000
381
+ ) -> None:
382
+ """Take a fasta file with either full IDs that UNIPROT normally publishes
383
+ or IDs in the format 'eco:b0002 description', and rewrite it into a file
384
+ with the same sequences but simplified IDs that are just the accession numbers.
385
+
386
+ For UNIPROT-style IDs, the accession number is assumed to be the second part between '|' characters.
387
+ For 'eco:b0002' style IDs, the entire 'eco:b0002' is considered the accession number.
388
+
389
+ Args:
390
+ input_fasta (str): path to the input file
391
+ output_fasta (str): path where to write the output
392
+ progress_interval (int, optional): print out progress every n sequences. Defaults to 100k.
393
+
394
+ Raises:
395
+ FileExistsError: If the output file already exists.
396
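+ 
+ Example (illustrative paths):
+ simplify_fasta_ids("uniprot_full_headers.fasta", "accessions_only.fasta")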
+ """
397
+ _check_output_file(output_fasta)
398
+
399
+ count = 0
400
+
401
+ with (
402
+ open(input_fasta, "r") as input_handle,
403
+ open(output_fasta, "w") as output_handle,
404
+ ):
405
+ for record in SeqIO.parse(input_handle, "fasta"):
406
+ # Check if the ID contains '|' characters (UNIPROT style)
407
+ if "|" in record.id:
408
+ accession = record.id.split("|")[1]
409
+ else:
410
+ # For 'eco:b0002' style, extract everything up to the first space
411
+ accession = record.id.split()[0]
412
+
413
+ # Update the record id and description
414
+ record.id = accession
415
+ record.description = ""
416
+ # Write the updated record to the output file
417
+ SeqIO.write(record, output_handle, "fasta")
418
+
419
+ # Print progress
420
+ count += 1
421
+ if count % progress_interval == 0:
422
+ print(f"Processed {count} sequences")
423
+
424
+ # Final progress update
425
+ print(f"Processed {count} sequences in total")
426
+
427
+
428
+ def estimate_sequences(fasta_file: str, sample_size: int = 100000) -> int:
429
+ """
430
+ Estimate the number of sequences in a FASTA file based on file size and a sample.
431
+
432
+ Args:
433
+ fasta_file (str): Path to the input FASTA file (can be .fasta or .fasta.gz).
434
+ sample_size (int): Number of bytes to sample for estimation.
435
+
436
+ Returns:
437
+ int: Estimated number of sequences in the FASTA file.
438
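+ 
+ Example (illustrative; the path is a placeholder):
+ approx_count = estimate_sequences("uniref50.fasta.gz")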
+ """
439
+ print("Estimating number of sequences in FASTA file...")
440
+ file_size = os.path.getsize(fasta_file)
441
+ if file_size == 0:
442
+ return 0
443
+
444
+ is_gzipped = fasta_file.endswith(".gz")
445
+ open_func = gzip.open if is_gzipped else open
446
+
447
+ with open_func(fasta_file, "rt") as handle:
448
+ sample = handle.read(min(sample_size, file_size))
449
+ sample_sequences = sample.count(">")
450
+
451
+ if len(sample) == 0:
452
+ return 0
453
+
454
+ # Estimate total sequences based on the sample
455
+ estimated_sequences = int((sample_sequences / len(sample)) * file_size)
456
+
457
+ # Adjust for potential underestimation due to short sequences
458
+ adjustment_factor = 1.1 # 10% increase
459
+ return max(int(estimated_sequences * adjustment_factor), 1)
460
+
461
+
462
+ def extract_ids_from_fasta(fasta_file: str) -> Set[str]:
463
+ """
464
+ Extract all sequence IDs from a large FASTA file (compressed or uncompressed) using BioPython.
465
+
466
+ Args:
467
+ fasta_file (str): Path to the input FASTA file (can be .fasta or .fasta.gz).
468
+
469
+ Returns:
470
+ Set[str]: A set containing all unique sequence IDs found in the FASTA file.
471
+
472
+ Raises:
473
+ ValueError: If there's an issue reading or parsing the input file.
474
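+ 
+ Example (illustrative path):
+ ids = extract_ids_from_fasta("uniref50.fasta.gz")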
+ """
475
+ sequence_ids: Set[str] = set()
476
+ try:
477
+ estimated_records = estimate_sequences(fasta_file)
478
+
479
+ if estimated_records == 0:
480
+ return sequence_ids # Return empty set for empty files
481
+
482
+ is_gzipped = fasta_file.endswith(".gz")
483
+ open_func = gzip.open if is_gzipped else open
484
+
485
+ with open_func(fasta_file, "rt") as handle:
486
+ with tqdm(total=estimated_records, desc="Extracting IDs") as pbar:
487
+ for record in SeqIO.parse(handle, "fasta"):
488
+ sequence_ids.add(record.id)
489
+ pbar.update(1)
490
+
491
+ except Exception as e:
492
+ print(f"An error occurred while processing the file: {e}")
493
+ raise ValueError(f"Error parsing FASTA file: {e}")
494
+
495
+ print(f"\nExtracted {len(sequence_ids)} unique sequence IDs")
496
+ return sequence_ids
497
+
498
+
499
+ def process_chunk(
500
+ chunk: List[str], target_ids_lower: Set[str], exclude: bool
501
+ ) -> Tuple[List[str], Set[str]]:
502
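+ """Filter one chunk of FASTA lines against a set of target IDs.
+ 
+ Helper for subset_fasta, run in worker processes. A record is kept when its
+ match status differs from `exclude`: matching records are kept by default,
+ and non-matching records are kept when exclude is True.
+ 
+ Args:
+ chunk (List[str]): Raw FASTA lines, starting at a record boundary.
+ target_ids_lower (Set[str]): Lower-cased IDs to match against.
+ exclude (bool): If True, keep non-matching records; if False, keep matches.
+ 
+ Returns:
+ Tuple[List[str], Set[str]]: FASTA-formatted entries to write and the headers that were kept.
+ """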
+ output_sequences = []
503
+ written_ids = set()
504
+ current_id = ""
505
+ current_seq = []
506
+
507
+ def id_matches(seq_id: str) -> bool:
508
+ return any(part.lower() in target_ids_lower for part in seq_id.split("|"))
509
+
510
+ for line in chunk:
511
+ line = line.strip()
512
+ if line.startswith(">"):
513
+ if current_id and current_seq:
514
+ if id_matches(current_id) != exclude:
515
+ output_sequences.append(f">{current_id}\n{''.join(current_seq)}\n")
516
+ written_ids.add(current_id)
517
+ current_id = line[1:]
518
+ current_seq = []
519
+ elif current_id:
520
+ current_seq.append(line)
521
+
522
+ # Process the last sequence in the chunk
523
+ if current_id and current_seq and id_matches(current_id) != exclude:
524
+ output_sequences.append(f">{current_id}\n{''.join(current_seq)}\n")
525
+ written_ids.add(current_id)
526
+
527
+ return output_sequences, written_ids
528
+
529
+
530
+ def subset_fasta(
531
+ fasta_file: str,
532
+ output_path: str,
533
+ target_ids: Set[str],
534
+ exclude: bool = False,
535
+ return_written_ids: bool = False,
536
+ ) -> Optional[Set[str]]:
537
+ """
538
+ Create a new FASTA file with sequences that either match or don't match the target IDs.
539
+ Optimized for very large files and uses all available CPU cores.
540
+
541
+ Args:
542
+ fasta_file (str): Path to the input FASTA file.
543
+ output_path (str): Path to the output FASTA file.
544
+ target_ids (Set[str]): A set of sequence IDs to match.
545
+ exclude (bool): If True, write sequences that don't match target_ids. If False, write matching sequences.
546
+ return_written_ids (bool): If True, return the set of sequence IDs that were written to the output file.
547
+
548
+ Returns:
549
+ Optional[Set[str]]: A set of sequence IDs that were written to the output file if return_written_ids is True,
550
+ otherwise None.
551
+
552
+ Raises:
553
+ FileExistsError: If the output file already exists.
554
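+ 
+ Example (a minimal sketch; paths and IDs are placeholders):
+ written = subset_fasta("all.fasta", "subset.fasta", {"P12345", "Q67890"}, return_written_ids=True)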
+ """
555
+ _check_output_file(output_path)
556
+
557
+ target_ids_lower = {id.lower() for id in target_ids}
558
+ total_size = os.path.getsize(fasta_file)
559
+ chunk_size = max(
560
+ 1, total_size // (multiprocessing.cpu_count() * 2)
561
+ ) # Adjust chunk size based on CPU count
562
+
563
+ def chunk_reader(file_obj, chunk_size: int):
564
+ chunk = []
565
+ chunk_bytes = 0
566
+ for line in file_obj:
567
+ chunk.append(line)
568
+ chunk_bytes += len(line)
569
+ if chunk_bytes >= chunk_size and line.startswith(">"):
570
+ yield chunk
571
+ chunk = [line]
572
+ chunk_bytes = len(line)
573
+ if chunk:
574
+ yield chunk
575
+
576
+ open_func = gzip.open if fasta_file.endswith(".gz") else open
577
+ mode = "rt" if fasta_file.endswith(".gz") else "r"
578
+
579
+ with open_func(fasta_file, mode) as input_file:
580
+ with multiprocessing.Pool() as pool:
581
+ process_func = partial(
582
+ process_chunk, target_ids_lower=target_ids_lower, exclude=exclude
583
+ )
584
+ results = list(
585
+ tqdm(
586
+ pool.imap(process_func, chunk_reader(input_file, chunk_size)),
587
+ total=total_size // chunk_size,
588
+ desc="Processing FASTA",
589
+ )
590
+ )
591
+
592
+ all_written_ids = set()
593
+ with open(output_path, "w") as output_file:
594
+ for output_sequences, written_ids in results:
595
+ output_file.writelines(output_sequences)
596
+ all_written_ids.update(written_ids)
597
+
598
+ print(f"Wrote {len(all_written_ids)} sequences to {output_path}")
599
+ return all_written_ids if return_written_ids else None
600
+
601
+
602
+ def load_fasta_as_dict(fasta_file: str) -> Dict[str, SeqRecord]:
603
+ """
604
+ Load a FASTA file into a dictionary with record IDs as keys.
605
+ Keep only the first instance of each identifier.
606
+
607
+ Args:
608
+ fasta_file (str): Path to the FASTA file.
609
+
610
+ Returns:
611
+ Dict[str, SeqRecord]: A dictionary with record IDs as keys and SeqRecord objects as values.
612
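+ 
+ Example (illustrative; the path and ID are placeholders):
+ records = load_fasta_as_dict("proteins.fasta")
+ seq = str(records["P12345"].seq)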
+ """
613
+ record_dict: Dict[str, SeqRecord] = {}
614
+ estimated_sequences = estimate_sequences(fasta_file)
615
+
616
+ with tqdm(total=estimated_sequences, desc="Loading FASTA") as pbar:
617
+ for record in SeqIO.parse(fasta_file, "fasta"):
618
+ if record.id not in record_dict:
619
+ record_dict[record.id] = record
620
+ pbar.update(1)
621
+
622
+ return record_dict
623
+
624
+
625
+ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> None:
626
+ """
627
+ Convert a FASTA file to a SQLite database containing protein IDs and sequences.
628
+
629
+ This function performs the following steps:
630
+ 1. Creates a new SQLite database with a 'proteins' table.
631
+ 2. Estimates the number of sequences in the FASTA file.
632
+ 3. Reads the FASTA file and extracts protein IDs and sequences.
633
+ 4. Inserts the protein data into the SQLite database in batches.
634
+
635
+ Args:
636
+ fasta_file (str): Path to the input FASTA file.
637
+ db_file (str): Path to the output SQLite database file.
638
+ batch_size (int, optional): Number of records to insert in each batch. Defaults to 1000.
639
+
640
+ Raises:
641
+ FileNotFoundError: If the input FASTA file doesn't exist.
642
+ sqlite3.Error: If there's an error in database operations.
643
+ FileExistsError: If the output database file already exists.
644
+
645
+ Example:
646
+ fasta_to_sqlite("proteins.fasta", "proteins.db")
647
+ """
648
+ _check_output_file(db_file)
649
+
650
+ if not os.path.exists(fasta_file):
651
+ raise FileNotFoundError(f"FASTA file not found: {fasta_file}")
652
+
653
+ print(f"Starting conversion of {fasta_file} to SQLite database {db_file}")
654
+
655
+ # Create the SQLite database and table
656
+ print("Creating SQLite database...")
657
+ with sqlite3.connect(db_file) as conn:
658
+ conn.execute(
659
+ """
660
+ CREATE TABLE IF NOT EXISTS proteins (
661
+ protein_id TEXT PRIMARY KEY,
662
+ sequence TEXT NOT NULL
663
+ )
664
+ """
665
+ )
666
+ print("Database created successfully.")
667
+
668
+ # Estimate number of records for progress bar
669
+ estimated_records = estimate_sequences(fasta_file)
670
+ print(f"Estimated number of sequences: {estimated_records}")
671
+
672
+ # Insert protein data
673
+ print("Inserting protein data into the database...")
674
+ with sqlite3.connect(db_file) as conn:
675
+ cursor = conn.cursor()
676
+ batch = []
677
+
678
+ for protein_id, sequence in tqdm(
679
+ _protein_generator(fasta_file),
680
+ total=estimated_records,
681
+ desc="Processing proteins",
682
+ ):
683
+ batch.append((protein_id, sequence))
684
+
685
+ if len(batch) >= batch_size:
686
+ cursor.executemany(
687
+ "INSERT OR IGNORE INTO proteins (protein_id, sequence) VALUES (?, ?)",
688
+ batch,
689
+ )
690
+ conn.commit()
691
+ batch.clear()
692
+
693
+ # Insert any remaining records
694
+ if batch:
695
+ cursor.executemany(
696
+ "INSERT OR IGNORE INTO proteins (protein_id, sequence) VALUES (?, ?)",
697
+ batch,
698
+ )
699
+ conn.commit()
700
+
701
+ print(f"Conversion completed. SQLite database saved to {db_file}")
702
+
703
+
704
+ def _protein_generator(fasta_path: Union[str, Path]) -> Iterator[tuple[str, str]]:
705
+ """
706
+ Generate protein data from a FASTA file.
707
+
708
+ Args:
709
+ fasta_path (Union[str, Path]): Path to the FASTA file.
710
+
711
+ Yields:
712
+ tuple[str, str]: A tuple containing protein_id and sequence.
713
+ """
714
+ for record in SeqIO.parse(fasta_path, "fasta"):
715
+ protein_id = record.id.split()[0]  # Assumes the first part of the id is the protein_id
718
+ sequence = str(record.seq)
719
+ yield protein_id, sequence
720
+
721
+
722
+ def check_fasta_duplicates(fasta_path: str) -> tuple[set[str], set[str]]:
723
+ """
724
+ Check a FASTA file for duplicate IDs, optimized for very large files.
725
+ Uses a memory-efficient approach and displays progress.
726
+
727
+ Args:
728
+ fasta_path: Path to the FASTA file to check.
729
+
730
+ Returns:
731
+ A tuple containing:
732
+ - set of duplicate IDs found
733
+ - empty set (for API compatibility)
734
+
735
+ Raises:
736
+ FileNotFoundError: If the input file doesn't exist
737
+ ValueError: If the FASTA file is malformed
738
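+ 
+ Example (illustrative path):
+ duplicate_ids, _ = check_fasta_duplicates("proteins.fasta")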
+ """
739
+ if not os.path.exists(fasta_path):
740
+ raise FileNotFoundError(f"FASTA file not found: {fasta_path}")
741
+
742
+ # Get estimated number of sequences for progress bar
743
+ estimated_sequences = estimate_sequences(fasta_path)
744
+
745
+ seen_ids: dict[str, int] = {}
746
+ duplicate_ids: set[str] = set()
747
+ processed_sequences = 0
748
+
749
+ try:
750
+ with open(fasta_path, "rt") as handle:
751
+ with tqdm(
752
+ total=estimated_sequences, desc="Checking for duplicates"
753
+ ) as pbar:
754
+ for record in SeqIO.parse(handle, "fasta"):
755
+ if record.id in seen_ids:
756
+ duplicate_ids.add(record.id)
757
+ seen_ids[record.id] += 1
758
+ else:
759
+ seen_ids[record.id] = 1
760
+
761
+ processed_sequences += 1
762
+ pbar.update(1)
763
+
764
+ # Print summary of findings
765
+ total_ids = len(seen_ids)
766
+ total_duplicates = len(duplicate_ids)
767
+ if total_duplicates > 0:
768
+ print(
769
+ f"\nFound {total_duplicates:,} duplicate IDs out of {total_ids:,} total IDs"
770
+ )
771
+ # Print details for the first few duplicates
772
+ sample_size = min(5, len(duplicate_ids))
773
+ if sample_size > 0:
774
+ print("\nExample duplicates (showing first 5):")
775
+ for dup_id in list(duplicate_ids)[:sample_size]:
776
+ print(f" {dup_id}: appears {seen_ids[dup_id]} times")
777
+ else:
778
+ print(f"\nNo duplicates found in {total_ids:,} sequences")
779
+
780
+ except ValueError as e:
781
+ raise ValueError(f"Malformed FASTA file: {str(e)}")
782
+ except Exception as e:
783
+ raise ValueError(f"Error parsing FASTA file: {str(e)}")
784
+
785
+ return duplicate_ids, set() # Return empty set for sequences to maintain API
786
+
787
+
788
+ def clean_fasta_duplicates(
789
+ input_path: str, output_path: str
790
+ ) -> tuple[set[str], set[str]]:
791
+ """Clean duplicate entries from a FASTA file.
792
+
793
+ For each duplicate ID found:
794
+ - If all sequences for that ID are identical, keep only the first occurrence
795
+ - If sequences differ, skip that ID and report it as a conflict
796
+
797
+ Optimized for very large files by:
798
+ - Using memory-efficient data structures
799
+ - Processing in chunks
800
+ - Using generators where possible
801
+
802
+ Args:
803
+ input_path (str): Path to input FASTA file
804
+ output_path (str): Path to write cleaned FASTA file
805
+
806
+ Returns:
807
+ tuple[set[str], set[str]]: A tuple containing:
808
+ - set of IDs that were deduplicated (had identical sequences)
809
+ - set of IDs that had sequence conflicts
810
+
811
+ Raises:
812
+ FileExistsError: If the output file already exists
813
+ FileNotFoundError: If the input file doesn't exist
814
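+ 
+ Example (illustrative paths):
+ deduped_ids, conflict_ids = clean_fasta_duplicates("raw.fasta", "deduped.fasta")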
+ """
815
+ _check_output_file(output_path)
816
+
817
+ # First pass: collect sequence hashes for each ID
818
+ # Using hashes instead of full sequences saves memory
819
+ id_hashes: dict[str, set[str]] = {}
820
+ id_count: dict[str, int] = {}
821
+ estimated_sequences = estimate_sequences(input_path)
822
+
823
+ print("Analyzing sequences...")
824
+ with open(input_path, "rt", buffering=1024 * 1024) as handle: # 1MB buffer
825
+ for record in tqdm(SeqIO.parse(handle, "fasta"), total=estimated_sequences):
826
+ seq_hash = str(hash(str(record.seq))) # Hash the sequence
827
+ if record.id not in id_hashes:
828
+ id_hashes[record.id] = {seq_hash}
829
+ id_count[record.id] = 1
830
+ else:
831
+ id_hashes[record.id].add(seq_hash)
832
+ id_count[record.id] += 1
833
+
834
+ # Identify duplicates and conflicts
835
+ duplicate_ids = {id for id, count in id_count.items() if count > 1}
836
+ if not duplicate_ids:
837
+ print("No duplicates found. Creating identical copy of input file...")
838
+ # Use shutil.copyfile for efficient file copying
839
+ import shutil
840
+
841
+ shutil.copyfile(input_path, output_path)
842
+ return set(), set()
843
+
844
+ cleaned_ids = {id for id in duplicate_ids if len(id_hashes[id]) == 1}
845
+ conflict_ids = {id for id in duplicate_ids if len(id_hashes[id]) > 1}
846
+
847
+ # Print summary
848
+ if conflict_ids:
849
+ print("\nFound sequence conflicts for these IDs:")
850
+ for id in conflict_ids:
851
+ print(f" {id}: {len(id_hashes[id])} different sequences")
852
+
853
+ if cleaned_ids:
854
+ print(f"\nCleaning {len(cleaned_ids)} IDs with identical duplicates...")
855
+
856
+ # Second pass: write cleaned file
857
+ # Use a larger buffer size for better I/O performance
858
+ seen_ids = set()
859
+ with (
860
+ open(input_path, "rt", buffering=1024 * 1024) as infile,
861
+ open(output_path, "wt", buffering=1024 * 1024) as outfile,
862
+ ):
863
+ for record in tqdm(SeqIO.parse(infile, "fasta"), total=estimated_sequences):
864
+ # Skip if we've seen this ID before and it's a duplicate we're cleaning
865
+ if record.id in seen_ids and record.id in cleaned_ids:
866
+ continue
867
+ # Skip if this ID has conflicting sequences
868
+ if record.id in conflict_ids:
869
+ continue
870
+ # Write the record and mark as seen
871
+ SeqIO.write(record, outfile, "fasta")
872
+ seen_ids.add(record.id)
873
+
874
+ print(f"\nWrote cleaned FASTA to {output_path}")
875
+ if cleaned_ids:
876
+ print(f"Removed duplicates for {len(cleaned_ids)} IDs")
877
+ if conflict_ids:
878
+ print(f"Skipped {len(conflict_ids)} IDs with sequence conflicts")
879
+
880
+ return cleaned_ids, conflict_ids
881
+
882
+
883
+ def fetch_uniprot_fasta(
884
+ accession_set,
885
+ batch_size=100,
886
+ output_file=None,
887
+ show_preview=True,
888
+ simple_headers=True,
889
+ ):
890
+ """
891
+ Retrieve FASTA sequences for a set of UniProt accession numbers with progress reporting.
892
+
893
+ Args:
894
+ accession_set (set): Set of UniProt accession numbers
895
+ batch_size (int): Number of accessions to process per batch
896
+ output_file (str): Path to output FASTA file. If None, will use "uniprot_sequences.fasta"
897
+ show_preview (bool): Whether to show the first few lines of the output file
898
+ simple_headers (bool): If True, replace FASTA headers with just the accession number
899
+
900
+ Returns:
901
+ tuple: (success_count, failed_count, output_filepath, failed_accessions)
902
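+ 
+ Example (illustrative accessions; requires network access to UniProt):
+ ok, failed, path, missing = fetch_uniprot_fasta({"P69905", "P68871"}, output_file="globins.fasta")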
+ """
903
+ # Convert set to list for batch processing
904
+ accession_list = list(accession_set)
905
+
906
+ # Set default output file if not provided
907
+ if output_file is None:
908
+ output_file = "uniprot_sequences.fasta"
909
+
910
+ print(f"Starting download of {len(accession_list)} UniProt sequences")
911
+ print(f"Output file: {os.path.abspath(output_file)}")
912
+ print(f"Using {'simple' if simple_headers else 'full'} FASTA headers")
913
+
914
+ start_time = time.time()
915
+
916
+ # Open the output file
917
+ with open(output_file, "w") as f:
918
+ # Initialize counters
919
+ success_count = 0
920
+ failed_count = 0
921
+ failed_accessions = set()
922
+
923
+ # Calculate number of batches for progress bar
924
+ num_batches = math.ceil(len(accession_list) / batch_size)
925
+
926
+ # Process accessions in batches with progress bar
927
+ for i in tqdm_notebook(
928
+ range(0, len(accession_list), batch_size),
929
+ desc="Downloading sequences",
930
+ total=num_batches,
931
+ ):
932
+
933
+ batch = accession_list[i : i + batch_size]
934
+ batch_size_actual = len(batch)
935
+ batch_set = set(batch)
936
+
937
+ # Construct the query string with OR operators
938
+ accession_query = " OR ".join([f"accession:{acc}" for acc in batch])
939
+
940
+ # Define the API endpoint and parameters
941
+ url = "https://rest.uniprot.org/uniprotkb/stream"
942
+ params = {"query": accession_query, "format": "fasta"}
943
+
944
+ # Set headers
945
+ headers = {"Accept": "text/fasta"}
946
+
947
+ # Make the API request with retry logic
948
+ max_retries = 3
949
+ for attempt in range(max_retries):
950
+ try:
951
+ response = requests.get(
952
+ url, params=params, headers=headers, timeout=30
953
+ )
954
+ response.raise_for_status() # Raise exception for 4XX/5XX responses
955
+
956
+ # Check if we got FASTA content
957
+ if response.text and ">" in response.text:
958
+ # Process the FASTA content
959
+ if simple_headers:
960
+ # Simplify headers to just the accession number
961
+ fasta_content = []
962
+ current_accession = None
963
+ sequence_lines = []
964
+
965
+ for line in response.text.splitlines():
966
+ if line.startswith(">"):
967
+ # If we have collected sequence lines for a previous accession, write them
968
+ if current_accession and sequence_lines:
969
+ f.write(f">{current_accession}\n")
970
+ f.write("\n".join(sequence_lines) + "\n")
971
+ sequence_lines = []
972
+
973
+ # Extract accession from the header line
974
+ # UniProt FASTA headers typically follow format: >db|ACCESSION|NAME
975
+ parts = line.split("|")
976
+ if len(parts) >= 2:
977
+ current_accession = parts[1]
978
+ else:
979
+ # Fallback if header format is unexpected
980
+ match = re.search(r">(\S+)", line)
981
+ current_accession = (
982
+ match.group(1) if match else line[1:]
983
+ )
984
+ else:
985
+ # Collect sequence lines
986
+ sequence_lines.append(line)
987
+
988
+ # Write the last sequence
989
+ if current_accession and sequence_lines:
990
+ f.write(f">{current_accession}\n")
991
+ f.write("\n".join(sequence_lines) + "\n")
992
+ else:
993
+ # Write original FASTA content
994
+ f.write(response.text)
995
+
996
+ # Count successful retrievals by parsing FASTA headers
997
+ retrieved_accessions = set()
998
+ for line in response.text.splitlines():
999
+ if line.startswith(">"):
1000
+ # Extract accession number from FASTA header line
1001
+ parts = line.split("|")
1002
+ if len(parts) >= 2:
1003
+ retrieved_accessions.add(parts[1])
1004
+
1005
+ # Determine which accessions weren't retrieved
1006
+ missing_accessions = batch_set - retrieved_accessions
1007
+ failed_accessions.update(missing_accessions)
1008
+
1009
+ success_in_batch = len(retrieved_accessions)
1010
+ failed_in_batch = batch_size_actual - success_in_batch
1011
+
1012
+ if failed_in_batch > 0:
1013
+ print(
1014
+ f"Warning: Batch {i//batch_size + 1} missing {failed_in_batch} sequences"
1015
+ )
1016
+
1017
+ success_count += success_in_batch
1018
+ failed_count += failed_in_batch
1019
+ else:
1020
+ print(
1021
+ f"Warning: Batch {i//batch_size + 1} returned no valid FASTA data"
1022
+ )
1023
+ failed_count += batch_size_actual
1024
+ failed_accessions.update(batch_set)
1025
+
1026
+ # Successful request, break the retry loop
1027
+ break
1028
+
1029
+ except requests.exceptions.RequestException as e:
1030
+ if attempt < max_retries - 1:
1031
+ wait_time = 2**attempt # Exponential backoff
1032
+ print(
1033
+ f"Request failed: {e}. Retrying in {wait_time} seconds..."
1034
+ )
1035
+ time.sleep(wait_time)
1036
+ else:
1037
+ print(
1038
+ f"Failed to retrieve batch {i//batch_size + 1} after {max_retries} attempts: {e}"
1039
+ )
1040
+ failed_count += batch_size_actual
1041
+ failed_accessions.update(batch_set)
1042
+
1043
+ # Brief pause to avoid overloading the server
1044
+ time.sleep(0.5)
1045
+
1046
+ # Report results
1047
+ elapsed_time = time.time() - start_time
1048
+ print(f"\nDownload completed in {elapsed_time:.1f} seconds")
1049
+ print(f"Successfully retrieved: {success_count} sequences")
1050
+ print(f"Failed to retrieve: {failed_count} sequences")
1051
+
1052
+ # Calculate download rate
1053
+ if elapsed_time > 0:
1054
+ rate = success_count / elapsed_time
1055
+ print(f"Download rate: {rate:.1f} sequences/second")
1056
+
1057
+ # Display first few lines of the FASTA file for verification
1058
+ if (
1059
+ show_preview
1060
+ and os.path.exists(output_file)
1061
+ and os.path.getsize(output_file) > 0
1062
+ ):
1063
+ print("\nFirst 5 lines of the FASTA file:")
1064
+ with open(output_file, "r") as f:
1065
+ for i, line in enumerate(f):
1066
+ if i < 5:
1067
+ print(line.strip())
1068
+ else:
1069
+ break
1070
+
1071
+ # Return the set of failed accessions for potential retry
1072
+ if failed_count > 0:
1073
+ print("\nFailed accessions:")
1074
+ # Print only first 10 if there are many
1075
+ if len(failed_accessions) > 10:
1076
+ print(
1077
+ f"{list(failed_accessions)[:10]} ... and {len(failed_accessions)-10} more"
1078
+ )
1079
+ else:
1080
+ print(failed_accessions)
1081
+
1082
+ return success_count, failed_count, os.path.abspath(output_file), failed_accessions