dayhoff-tools 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,164 @@
+ import csv
+ from typing import Dict, List, Set
+
+ import pandas as pd
+ from tqdm import tqdm
+
+
+ def pick_mmseqs_cluster_representatives(
+     tsv_file: str,
+     priority_sets: List[Set[str]],
+     avoid_sets: List[Set[str]] | None = None,
+ ) -> Dict[str, Set[str]]:
+     """
+     Select representative protein IDs from clusters and return full cluster information.
+
+     This function reads a TSV file containing protein cluster information, where each line
+     represents a cluster with a representative sequence and a member. It then selects a
+     representative for each cluster using the following priority:
+     1. IDs from priority_sets (in order of the sets)
+     2. IDs not in avoid_sets (if avoid_sets is provided)
+     3. The original representative, as a fallback
+
+     Progress is displayed using tqdm progress bars.
+
+     Args:
+         tsv_file (str): Path to the TSV file containing cluster information.
+             Each line should be in the format: representative_id\tmember_id
+         priority_sets (List[Set[str]]): An ordered list of sets of protein IDs to prioritize
+             as representatives. Earlier sets in the list have higher priority.
+         avoid_sets (List[Set[str]] | None, optional): An ordered list of sets of protein IDs
+             to avoid when selecting representatives. Only used if no priority IDs are found.
+             Defaults to None.
+
+     Returns:
+         Dict[str, Set[str]]: A dictionary where keys are selected representative protein IDs
+             and values are sets of all members in the cluster (including the representative).
+
+     Raises:
+         FileNotFoundError: If the specified TSV file does not exist.
+         ValueError: If the TSV file is not properly formatted.
+     """
+     cluster_dict: Dict[str, Set[str]] = {}
+     final_clusters: Dict[str, Set[str]] = {}
+
+     print("Counting TSV lines")
+     try:
+         # First pass: count lines for the tqdm progress bar
+         with open(tsv_file, "r") as file:
+             total_lines = sum(1 for _ in file)
+
+         # Second pass: read the TSV file and build the cluster dictionary
+         with (
+             open(tsv_file, "r") as file,
+             tqdm(total=total_lines, desc="Reading TSV", unit="lines") as pbar,
+         ):
+             for line in file:
+                 parts = line.strip().split("\t")
+                 if len(parts) != 2:
+                     raise ValueError(f"Invalid line format in TSV file: {line}")
+                 rep, member = parts
+                 if rep not in cluster_dict:
+                     cluster_dict[rep] = set()
+                 cluster_dict[rep].add(member)
+                 # Ensure the representative is also in the cluster set
+                 cluster_dict[rep].add(rep)
+                 pbar.update(1)
+     except FileNotFoundError:
+         raise FileNotFoundError(f"The TSV file '{tsv_file}' was not found.")
+
+     # Process each cluster and select a representative
+     for rep, cluster in tqdm(
+         cluster_dict.items(), desc="Selecting representatives", unit="clusters"
+     ):
+         selected_rep = None
+
+         # First, try to find IDs from the priority sets
+         for priority_set in priority_sets:
+             priority_rep = cluster.intersection(priority_set)
+             if priority_rep:
+                 # Choose the lexicographically first ID
+                 selected_rep = min(priority_rep)
+                 break
+
+         # If no priority ID was found and avoid_sets was provided, try non-avoided IDs
+         if selected_rep is None and avoid_sets is not None:
+             # Create a set of all IDs to avoid
+             all_avoid_ids = set().union(*avoid_sets)
+             # Find IDs that are not in any avoid set
+             non_avoided_ids = cluster - all_avoid_ids
+             if non_avoided_ids:
+                 # Choose the lexicographically first non-avoided ID
+                 selected_rep = min(non_avoided_ids)
+             elif rep not in all_avoid_ids:
+                 # If all members are avoided but the original rep is not, use it
+                 selected_rep = rep
+
+         # If still no representative was found, use the original representative
+         if selected_rep is None:
+             selected_rep = rep
+
+         final_clusters[selected_rep] = cluster
+
+     return final_clusters
+
+
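As a minimal sketch of the selection priority (the file name and ID sets below are hypothetical; the TSV is the rep<TAB>member format that, e.g., MMseqs2's createtsv emits):

    # clusters.tsv contains:
    # A	A
    # A	B
    # A	C
    clusters = pick_mmseqs_cluster_representatives(
        "clusters.tsv",
        priority_sets=[{"B"}],   # prefer curated IDs first
        avoid_sets=[{"A"}],      # only consulted if no priority ID matches
    )
    # clusters == {"B": {"A", "B", "C"}}
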
+ def replace_proteins_with_representatives(
+     df: pd.DataFrame, reps: Dict[str, Set[str]]
+ ) -> pd.DataFrame:
+     """
+     Replace protein IDs in a DataFrame with their cluster representatives.
+
+     This function takes a DataFrame containing protein IDs and a dictionary of
+     cluster representatives. It replaces each protein ID in the DataFrame with
+     its corresponding cluster representative.
+
+     Args:
+         df (pd.DataFrame): Input DataFrame with columns ["pr_id", "reaction_id", "protein_id"].
+         reps (Dict[str, Set[str]]): A dictionary where keys are representative protein IDs
+             and values are sets of all members in the cluster (including the representative).
+
+     Returns:
+         pd.DataFrame: A new DataFrame with protein IDs replaced by their cluster representatives.
+
+     Raises:
+         ValueError: If the input DataFrame doesn't have the required columns.
+     """
+     if not all(col in df.columns for col in ["pr_id", "reaction_id", "protein_id"]):
+         raise ValueError(
+             "Input DataFrame must have columns: 'pr_id', 'reaction_id', 'protein_id'"
+         )
+
+     print("Starting protein ID replacement process...")
+
+     # Create a mapping of all proteins to their representatives
+     protein_to_rep = {}
+     for rep, cluster in reps.items():
+         for protein in cluster:
+             protein_to_rep[protein] = rep
+
+     print("Protein to representative mapping created.")
+
+     # Create a copy of the input DataFrame to avoid modifying the original
+     df_copy = df.copy()
+
+     # Replace protein IDs with their representatives
+     tqdm.pandas(desc="Replacing protein IDs")
+     df_copy["protein_id"] = df_copy["protein_id"].progress_map(
+         lambda x: protein_to_rep.get(x, x)
+     )
+
+     print("Protein ID replacement completed.")
+     return df_copy
+
+
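A quick illustration of the replacement behavior, on toy data (all values below are made up):

    import pandas as pd

    df = pd.DataFrame(
        {"pr_id": [1, 2], "reaction_id": ["R1", "R2"], "protein_id": ["B", "D"]}
    )
    reps = {"A": {"A", "B", "C"}}
    out = replace_proteins_with_representatives(df, reps)
    # out["protein_id"] == ["A", "D"]: "B" maps to its representative "A";
    # "D" is in no cluster and passes through unchanged.
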
+ def write_clusters_to_tsv(
+     clusters: Dict[str, Set[str]], output_file: str
+ ) -> None:
+     """Write clusters to a TSV file, one (member, representative) pair per row."""
+     with open(output_file, "w", newline="") as f:
+         writer = csv.writer(f, delimiter="\t")
+         writer.writerow(["protein_id", "cluster_rep"])  # Write header
+
+         for cluster_rep, members in tqdm(clusters.items(), desc="Writing clusters"):
+             for protein_id in members:
+                 writer.writerow([protein_id, cluster_rep])
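Taken together, a plausible round trip through this module might look like this (paths are illustrative):

    clusters = pick_mmseqs_cluster_representatives("clusters.tsv", priority_sets=[])
    write_clusters_to_tsv(clusters, "member_to_rep.tsv")
    # member_to_rep.tsv gains a header row plus one protein_id<TAB>cluster_rep
    # line per cluster member.
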
@@ -0,0 +1,516 @@
+ import os
+ import sqlite3
+ from pathlib import Path
+ from typing import Any, Iterable, Literal, Optional, Tuple, Union
+
+ import pandas as pd
+ from sqlalchemy import create_engine, inspect, text
+ from tqdm import tqdm
+
+
+ def sql_query(db_file: str, query: str, params: tuple[Any, ...] = ()) -> list[Any]:
+     """Perform a SQL query on the specified SQLite file.
+
+     Args:
+         db_file (str): Path to the SQLite file.
+         query (str): SQL query to perform.
+         params (tuple[Any, ...], optional): Query parameters. Defaults to ().
+
+     Returns:
+         list[Any]: Results of the query.
+
+     Raises:
+         sqlite3.DatabaseError: If the query fails.
+     """
+     with sqlite3.connect(db_file) as conn:
+         try:
+             return conn.execute(query, params).fetchall()
+         except sqlite3.Error as e:
+             raise sqlite3.DatabaseError(f"Query: {query}\n failed with error:\n {e}")
+
+
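For example, a parameterized lookup against the pairs table defined later in this module might look like this (the database path and reaction ID are illustrative):

    rows = sql_query(
        "pairs.db",
        "SELECT protein_id FROM protein_to_reaction WHERE reaction_id = ?",
        ("R00703",),
    )
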
+ def sql_check_columns(
+     db_file: str,
+     table_name: str,
+     columns: str | Iterable[str] | None = None,
+ ) -> list[str]:
+     """Check that the table and columns exist in the specified SQLite file.
+     Returns a list of verified columns.
+
+     Args:
+         db_file (str): Path to the SQLite file.
+         table_name (str): Name of the table in the database that contains the data.
+         columns (str | list[str] | tuple[str, ...] | None, optional): Names of the
+             columns in the database that will be returned as features. If None, then
+             all columns will be returned.
+
+     Returns:
+         list[str]: List of verified columns.
+
+     Raises:
+         sqlite3.DatabaseError: If the table or columns do not exist.
+     """
+     tables = [
+         table[0]
+         for table in sql_query(
+             db_file, "SELECT name FROM sqlite_master WHERE type='table'"
+         )
+     ]
+     if table_name not in tables:
+         raise sqlite3.DatabaseError(f"Table {table_name} does not exist.")
+
+     existing_columns = [
+         row[1] for row in sql_query(db_file, f"PRAGMA table_info({table_name})")
+     ]
+     if columns is not None:
+         columns = [columns] if isinstance(columns, str) else list(columns)
+         missing_columns = set(columns) - set(existing_columns)
+         if missing_columns:
+             raise sqlite3.DatabaseError(
+                 "One or more specified columns do not exist.\n"
+                 + f"Missing columns: {missing_columns}\n"
+                 + f"Existing columns: {existing_columns}"
+             )
+         return columns
+     else:
+         return existing_columns
+
+
+ def sql_check_index(db_file: str, table_name: str, column_name: str) -> bool:
+     """Verify that an index exists for the given column in the specified table.
+
+     Returns True if the column is indexed; raises an Exception otherwise.
+     """
+     conn = sqlite3.connect(db_file)
+     cursor = conn.cursor()
+
+     query = f"PRAGMA index_list('{table_name}')"
+     cursor.execute(query)
+     indexes = cursor.fetchall()
+
+     # Collect all indexed columns from the retrieved indexes
+     indexed_columns = []
+     for index in indexes:
+         index_name = index[1]  # The index name is the second field of index_list
+         index_info_query = f"PRAGMA index_info('{index_name}')"
+         cursor.execute(index_info_query)
+         columns = cursor.fetchall()
+         indexed_columns.extend(col[2] for col in columns if col[2] == column_name)
+
+     conn.close()
+     if column_name not in indexed_columns:
+         raise Exception(
+             f"Column '{column_name}' is not indexed in table '{table_name}'."
+         )
+
+     return True
+
+
+ def sql_check_primary_key(
+     db_file: str, table_name: str, expected_primary_key: str
+ ) -> bool:
+     """Check that the expected primary key is set for the specified table."""
+     with sqlite3.connect(db_file) as conn:
+         cursor = conn.cursor()
+         cursor.execute(f"PRAGMA table_info('{table_name}')")
+         columns = cursor.fetchall()
+         # Look for the primary key in the results
+         for col in columns:
+             if col[1] == expected_primary_key and col[5] == 1:  # Check the pk flag
+                 return True
+         return False
+
+
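A sketch of how these three checks compose, using the schema from the write_to_sqlite example below (the database path is hypothetical):

    cols = sql_check_columns(
        "pairs.db", "protein_to_reaction", ["protein_id", "reaction_id"]
    )
    sql_check_index("pairs.db", "protein_to_reaction", "protein_id")  # raises if unindexed
    assert sql_check_primary_key("pairs.db", "protein_to_reaction", "pr_id")
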
+ def write_to_sqlite(
+     input_data: Union[str, pd.DataFrame],
+     sql_filepath: str,
+     table_name: str,
+     columns: list[Tuple[str, Any]],
+     primary_key: str,
+     indexes: list[Tuple[str, str]],
+     if_table_exists: Literal["fail", "replace", "append"] = "append",
+     delete_file_if_exists: bool = True,
+     chunksize: int = 1000,
+ ) -> None:
+     """
+     Write data from a TSV file or DataFrame to a SQLite database file in a memory-efficient way.
+
+     Args:
+         input_data (Union[str, pd.DataFrame]): The input data, either a file path of the TSV file or a DataFrame.
+         sql_filepath (str): The file path of the SQLite database file.
+         table_name (str): The name of the table to be created in the database.
+         columns (list[Tuple[str, Any]]): A list of tuples representing the column names and their data types.
+             Each tuple should be in the format (column_name, data_type), where the data_type is a SQLite data type.
+             Example: [("id", "INTEGER"), ("name", "TEXT"), ("age", "INTEGER")]
+         primary_key (str): The name of the column to be used as the primary key.
+         indexes (list[Tuple[str, str]]): A list of tuples representing the indexes to be created.
+             Each tuple should be in the format (index_name, column_name).
+             Example: [("idx_name", "name"), ("idx_age", "age")]
+         if_table_exists (Literal['fail', 'replace', 'append']): What to do if the table already exists.
+         delete_file_if_exists (bool): Whether to delete the database file itself, if it exists.
+         chunksize (int): Number of lines to read from the TSV file at a time if input_data is a TSV file.
+
+     Examples:
+         columns = [
+             ("pr_id", "INTEGER"),
+             ("reaction_id", "TEXT"),
+             ("protein_id", "TEXT"),
+             ("db_source", "TEXT"),
+         ]
+         primary_key = "pr_id"
+         indexes = [
+             ("idx_on_protein_id", "protein_id"),
+             ("idx_on_reaction_id", "reaction_id"),
+         ]
+         write_to_sqlite(
+             input_data="path/to/data.tsv",
+             sql_filepath="path/to/database.db",
+             table_name="protein_to_reaction",
+             columns=columns,
+             primary_key=primary_key,
+             indexes=indexes,
+             if_table_exists="append",
+             delete_file_if_exists=True,
+             chunksize=1000,
+         )
+
+         # or
+
+         df = pd.DataFrame([...])
+         write_to_sqlite(
+             input_data=df,
+             sql_filepath="path/to/database.db",
+             table_name="protein_to_reaction",
+             columns=columns,
+             primary_key=primary_key,
+             indexes=indexes,
+             if_table_exists="append",
+             delete_file_if_exists=True,
+         )
+     """
+
+     # Optionally delete the database file itself
+     if delete_file_if_exists and os.path.exists(sql_filepath):
+         os.remove(sql_filepath)
+
+     with sqlite3.connect(sql_filepath) as conn:
+         cursor = conn.cursor()
+
+         # Determine action if table exists
+         if if_table_exists == "replace":
+             cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
+         elif if_table_exists == "append":
+             cursor.execute(
+                 "SELECT name FROM sqlite_master WHERE type='table' AND name=?;",
+                 (table_name,),
+             )
+             if cursor.fetchone() is None:
+                 # The table doesn't exist yet, so fall through to creating it
+                 if_table_exists = "replace"
+         else:  # if_table_exists == 'fail'
+             cursor.execute(
+                 "SELECT name FROM sqlite_master WHERE type='table' AND name=?;",
+                 (table_name,),
+             )
+             if cursor.fetchone():
+                 raise ValueError(f"Table {table_name} already exists.")
+
+         # Create the table if 'replace' (or it doesn't exist yet)
+         if if_table_exists == "replace":
+             col_defs = [f"{name} {dtype}" for name, dtype in columns]
+             col_defs.append(f"PRIMARY KEY ({primary_key})")
+             cursor.execute(f"CREATE TABLE {table_name} ({', '.join(col_defs)})")
+
+         # Convert columns to a list of column names
+         column_names = [col[0] for col in columns]
+         placeholders = ", ".join("?" * len(column_names))
+
+         if isinstance(input_data, str):
+             # Read the TSV file in chunks and insert into the table
+             for chunk in pd.read_csv(
+                 input_data, sep="\t", usecols=column_names, chunksize=chunksize
+             ):
+                 data = chunk.values.tolist()
+                 cursor.executemany(
+                     f"INSERT INTO {table_name} VALUES ({placeholders})", data
+                 )
+         elif isinstance(input_data, pd.DataFrame):
+             # Insert DataFrame data into the table
+             data = input_data[column_names].values.tolist()
+             cursor.executemany(
+                 f"INSERT INTO {table_name} VALUES ({placeholders})", data
+             )
+         else:
+             raise ValueError(
+                 "input_data must be either a file path to a TSV file or a DataFrame."
+             )
+
+         # Create indexes if specified
+         for index_name, column_name in indexes:
+             cursor.execute(
+                 f"CREATE INDEX IF NOT EXISTS {index_name} ON {table_name} ({column_name})"
+             )
+
+         conn.commit()
+
+
+ def inspect_sqlite_db(db_path: str) -> None:
+     """
+     Inspect the contents of a SQLite database file.
+
+     This function connects to the specified SQLite database, retrieves information
+     about its tables, columns, and sample data, and prints this information to
+     the console.
+
+     Args:
+         db_path (str): The file path to the SQLite database.
+
+     Returns:
+         None
+
+     Note:
+         Any sqlite3.Error raised while connecting to or querying the database
+         is caught and printed rather than propagated.
+     """
+     try:
+         conn = sqlite3.connect(db_path)
+         cursor = conn.cursor()
+
+         # Get the list of tables
+         cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+         tables = cursor.fetchall()
+
+         for table in tables:
+             table_name = table[0]
+             print(f"\nTable: {table_name}")
+
+             # Get column names
+             cursor.execute(f"PRAGMA table_info({table_name})")
+             columns = [column[1] for column in cursor.fetchall()]
+             print("Columns:", ", ".join(columns))
+
+             # Get sample data (first 5 rows)
+             cursor.execute(f"SELECT * FROM {table_name} LIMIT 5")
+             rows = cursor.fetchall()
+
+             print("\nSample data:")
+             for row in rows:
+                 print(row)
+
+         conn.close()
+
+     except sqlite3.Error as e:
+         print(f"An error occurred: {e}")
+
+
+ def read_sql_table_to_df(
+     db_path: str, table_name: Optional[str] = None, chunksize: int = 10000
+ ) -> pd.DataFrame:
+     """
+     Read an SQL table into a pandas DataFrame with a progress bar.
+
+     This function connects to a SQLite database using the provided file path,
+     reads the specified table (or the only table, if none is specified) in chunks,
+     and displays a progress bar during the reading process.
+
+     Args:
+         db_path (str): The file path to the SQLite database.
+         table_name (Optional[str], optional): The name of the table to read from the database.
+             If None, assumes there's only one table. Defaults to None.
+         chunksize (int, optional): The number of rows to read per chunk. Defaults to 10000.
+
+     Returns:
+         pd.DataFrame: A pandas DataFrame containing the data from the SQL table.
+
+     Raises:
+         ValueError: If the table_name is not provided and there are multiple tables,
+             or if the specified table doesn't exist.
+         SQLAlchemyError: If there's an issue connecting to the database or reading the table.
+     """
+
+     engine_url = f"sqlite:///{db_path}"
+     engine = create_engine(engine_url)
+
+     # Get all table names
+     inspector = inspect(engine)
+     table_names = inspector.get_table_names()
+
+     # Handle table selection
+     if table_name is None:
+         if len(table_names) == 1:
+             table_name = table_names[0]
+         elif len(table_names) == 0:
+             raise ValueError("No tables found in the database.")
+         else:
+             raise ValueError(
+                 f"Multiple tables found. Please specify a table_name. Options are: {', '.join(table_names)}"
+             )
+     elif table_name not in table_names:
+         raise ValueError(f"Table '{table_name}' not found in the database.")
+
+     # Get the number of rows in the table
+     with engine.connect() as connection:
+         result = connection.execute(text(f"SELECT COUNT(*) FROM {table_name}"))
+         row_count = result.scalar()
+
+     # Create a tqdm progress bar
+     pbar = tqdm(total=row_count, desc=f"Reading {table_name}", unit="rows")
+
+     # Read the SQL table in chunks, with progress updates
+     chunks = []
+     for chunk in pd.read_sql(
+         f"SELECT * FROM {table_name}", engine, chunksize=chunksize
+     ):
+         chunks.append(chunk)
+         pbar.update(len(chunk))
+
+     # Close the progress bar
+     pbar.close()
+
+     # Combine all chunks into a single DataFrame
+     df = pd.concat(chunks, ignore_index=True)
+
+     return df
+
+
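Assuming the single-table database produced by the write_to_sqlite example above, a call might look like:

    df = read_sql_table_to_df("path/to/database.db", table_name="protein_to_reaction")
    print(df.shape)
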
+ def _create_optimized_db(dst_path: Path) -> None:
+     """Helper function to create an optimized database."""
+     if dst_path.exists():
+         dst_path.unlink()
+
+     with sqlite3.connect(str(dst_path)) as conn:
+         # Use the default page size but optimize other settings
+         conn.execute("PRAGMA journal_mode=OFF")
+         # Start with synchronous=FULL for safety during creation
+         conn.execute("PRAGMA synchronous=FULL")
+         conn.commit()
+
+
+ def _verify_destination(dst_path: Path) -> None:
+     """Helper function to verify that the destination path is writable."""
+     # Convert to an absolute path for consistent handling
+     dst_path = dst_path.absolute()
+
+     # Walk up to the nearest existing ancestor; if it isn't writable,
+     # we won't be able to create the missing directories beneath it.
+     nearest_existing = dst_path.parent
+     while not nearest_existing.exists() and nearest_existing != nearest_existing.parent:
+         nearest_existing = nearest_existing.parent
+     if not os.access(str(nearest_existing), os.W_OK):
+         raise OSError(f"No write permission for path: {dst_path}")
+
+     try:
+         # Create the parent directory if it doesn't exist
+         if not dst_path.parent.exists():
+             try:
+                 dst_path.parent.mkdir(parents=True, exist_ok=True)
+             except (OSError, PermissionError) as e:
+                 raise OSError(
+                     f"Cannot create parent directory: {dst_path.parent}"
+                 ) from e
+
+         # Check that the parent directory is writable
+         if not os.access(str(dst_path.parent), os.W_OK):
+             raise OSError(
+                 f"No write permission for parent directory: {dst_path.parent}"
+             )
+
+         # If the file exists, check that it's writable
+         if dst_path.exists():
+             if not os.access(str(dst_path), os.W_OK):
+                 raise OSError(f"No write permission for destination: {dst_path}")
+         else:
+             # Try to create and write to the file
+             try:
+                 dst_path.touch()
+                 dst_path.unlink()  # Clean up the test file
+             except (OSError, PermissionError) as e:
+                 raise OSError(f"Cannot write to destination path: {dst_path}") from e
+
+     except Exception as e:
+         # Ensure we always raise OSError
+         if not isinstance(e, OSError):
+             raise OSError(f"Cannot create or write to destination: {dst_path}") from e
+         raise
+
+
+ def optimize_protein_db(src_path: Union[str, Path], dst_path: Union[str, Path]):
+     """
+     Optimize a SQLite database containing protein sequences.
+     """
+     src_path, dst_path = Path(src_path), Path(dst_path)
+
+     # Check that the source file exists
+     if not src_path.exists():
+         raise FileNotFoundError(f"Source database not found: {src_path}")
+
+     # Verify that the destination is writable
+     _verify_destination(dst_path)
+
+     # Create the optimized database
+     _create_optimized_db(dst_path)
+
+     # Copy data from source to destination
+     with sqlite3.connect(str(dst_path)) as dst_conn:
+         with sqlite3.connect(str(src_path)) as src_conn:
+             src_conn.backup(dst_conn)
+
+         # Apply conservative optimizations
+         dst_conn.execute("PRAGMA journal_mode=WAL")
+         dst_conn.execute("PRAGMA synchronous=FULL")  # Keep FULL for protein data
+         dst_conn.execute("PRAGMA cache_size=-2000000")  # 2GB cache
+         dst_conn.execute("PRAGMA temp_store=FILE")
+         dst_conn.commit()
+
+
+ def optimize_reaction_db(src_path: Union[str, Path], dst_path: Union[str, Path]):
+     """
+     Optimize a SQLite database containing reaction data.
+     """
+     src_path, dst_path = Path(src_path), Path(dst_path)
+
+     # Check that the source file exists
+     if not src_path.exists():
+         raise FileNotFoundError(f"Source database not found: {src_path}")
+
+     # Verify that the destination is writable
+     _verify_destination(dst_path)
+
+     # Create the optimized database
+     _create_optimized_db(dst_path)
+
+     # Copy data from source to destination
+     with sqlite3.connect(str(dst_path)) as dst_conn:
+         with sqlite3.connect(str(src_path)) as src_conn:
+             src_conn.backup(dst_conn)
+
+         # Apply conservative optimizations
+         dst_conn.execute("PRAGMA journal_mode=WAL")
+         dst_conn.execute("PRAGMA synchronous=FULL")  # Keep FULL for safety
+         dst_conn.execute("PRAGMA cache_size=-2000000")  # 2GB cache
+         dst_conn.execute("PRAGMA temp_store=FILE")  # Use FILE instead of MEMORY
+         dst_conn.commit()
+
+
+ def optimize_pairs_db(src_path: Union[str, Path], dst_path: Union[str, Path]):
+     """
+     Optimize a SQLite database containing protein-reaction pairs.
+     """
+     src_path, dst_path = Path(src_path), Path(dst_path)
+
+     # Check that the source file exists
+     if not src_path.exists():
+         raise FileNotFoundError(f"Source database not found: {src_path}")
+
+     # Verify that the destination is writable
+     _verify_destination(dst_path)
+
+     # Create the optimized database
+     _create_optimized_db(dst_path)
+
+     # Copy data from source to destination
+     with sqlite3.connect(str(dst_path)) as dst_conn:
+         with sqlite3.connect(str(src_path)) as src_conn:
+             src_conn.backup(dst_conn)
+
+         # Apply conservative optimizations
+         dst_conn.execute("PRAGMA journal_mode=WAL")
+         dst_conn.execute("PRAGMA synchronous=FULL")  # Keep FULL for safety
+         dst_conn.execute("PRAGMA cache_size=-2000000")  # 2GB cache
+         dst_conn.execute("PRAGMA temp_store=FILE")  # Use FILE instead of MEMORY
+         dst_conn.execute("PRAGMA mmap_size=30000000000")  # 30GB memory mapping
+         dst_conn.commit()
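
As a sketch of how the three optimizers are invoked (paths are illustrative):

    optimize_pairs_db("pairs.db", "pairs_optimized.db")
    # Each optimizer copies the source with SQLite's backup API, then reopens
    # the copy with WAL journaling, a ~2 GB page cache (a negative cache_size
    # is in KiB), and, for the pairs database only, a 30 GB mmap window.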