dayhoff_tools-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dayhoff_tools/__init__.py +0 -0
- dayhoff_tools/chemistry/standardizer.py +297 -0
- dayhoff_tools/chemistry/utils.py +63 -0
- dayhoff_tools/cli/__init__.py +0 -0
- dayhoff_tools/cli/main.py +90 -0
- dayhoff_tools/cli/swarm_commands.py +156 -0
- dayhoff_tools/cli/utility_commands.py +244 -0
- dayhoff_tools/deployment/base.py +434 -0
- dayhoff_tools/deployment/deploy_aws.py +458 -0
- dayhoff_tools/deployment/deploy_gcp.py +176 -0
- dayhoff_tools/deployment/deploy_utils.py +781 -0
- dayhoff_tools/deployment/job_runner.py +153 -0
- dayhoff_tools/deployment/processors.py +125 -0
- dayhoff_tools/deployment/swarm.py +591 -0
- dayhoff_tools/embedders.py +893 -0
- dayhoff_tools/fasta.py +1082 -0
- dayhoff_tools/file_ops.py +261 -0
- dayhoff_tools/gcp.py +85 -0
- dayhoff_tools/h5.py +542 -0
- dayhoff_tools/kegg.py +37 -0
- dayhoff_tools/logs.py +27 -0
- dayhoff_tools/mmseqs.py +164 -0
- dayhoff_tools/sqlite.py +516 -0
- dayhoff_tools/structure.py +751 -0
- dayhoff_tools/uniprot.py +434 -0
- dayhoff_tools/warehouse.py +418 -0
- dayhoff_tools-1.0.0.dist-info/METADATA +122 -0
- dayhoff_tools-1.0.0.dist-info/RECORD +30 -0
- dayhoff_tools-1.0.0.dist-info/WHEEL +4 -0
- dayhoff_tools-1.0.0.dist-info/entry_points.txt +3 -0
dayhoff_tools/mmseqs.py
ADDED
@@ -0,0 +1,164 @@
+import csv
+from typing import Dict, List, Set
+
+import pandas as pd
+from tqdm import tqdm
+
+
+def pick_mmseqs_cluster_representatives(
+    tsv_file: str,
+    priority_sets: List[Set[str]],
+    avoid_sets: List[Set[str]] | None = None,
+) -> Dict[str, Set[str]]:
+    """
+    Select representative protein IDs from clusters and return full cluster information.
+
+    This function reads a TSV file containing protein cluster information, where each line
+    represents a cluster with a representative sequence and a member. It then selects a
+    representative for each cluster using the following priority:
+    1. IDs from priority_sets (in order of the sets)
+    2. IDs not in avoid_sets (if avoid_sets is provided)
+    3. Original representative as fallback
+
+    Progress is displayed using TQDM progress bars.
+
+    Args:
+        tsv_file (str): Path to the TSV file containing cluster information.
+            Each line should be in the format: representative_id\tmember_id
+        priority_sets (List[Set[str]]): An ordered list of sets of protein IDs to prioritize
+            as representatives. Earlier sets in the list have higher priority.
+        avoid_sets (List[Set[str]] | None, optional): An ordered list of sets of protein IDs to avoid
+            when selecting representatives. Only used if no priority IDs are found.
+            Defaults to None.
+
+    Returns:
+        Dict[str, Set[str]]: A dictionary where keys are selected representative protein IDs
+            and values are sets of all members in the cluster (including the representative).
+
+    Raises:
+        FileNotFoundError: If the specified TSV file does not exist.
+        ValueError: If the TSV file is not properly formatted.
+    """
+    cluster_dict: Dict[str, Set[str]] = {}
+    final_clusters: Dict[str, Set[str]] = {}
+
+    # First pass: count lines for TQDM
+    print("Counting TSV lines")
+    total_lines = sum(1 for _ in open(tsv_file, "r"))
+
+    # Read the TSV file and build the cluster dictionary
+    try:
+        with (
+            open(tsv_file, "r") as file,
+            tqdm(total=total_lines, desc="Reading TSV", unit="lines") as pbar,
+        ):
+            for line in file:
+                parts = line.strip().split("\t")
+                if len(parts) != 2:
+                    raise ValueError(f"Invalid line format in TSV file: {line}")
+                rep, member = parts
+                if rep not in cluster_dict:
+                    cluster_dict[rep] = set()
+                cluster_dict[rep].add(member)
+                cluster_dict[rep].add(
+                    rep
+                )  # Ensure the representative is also in the cluster set
+                pbar.update(1)
+    except FileNotFoundError:
+        raise FileNotFoundError(f"The TSV file '{tsv_file}' was not found.")
+
+    # Process each cluster and select a representative
+    for rep, cluster in tqdm(
+        cluster_dict.items(), desc="Selecting representatives", unit="clusters"
+    ):
+        selected_rep = None
+
+        # First try to find IDs from priority sets
+        for priority_set in priority_sets:
+            priority_rep = cluster.intersection(priority_set)
+            if priority_rep:
+                selected_rep = min(
+                    priority_rep
+                )  # Choose the lexicographically first ID
+                break
+
+        # If no priority ID found and avoid_sets provided, try to find non-avoided IDs
+        if selected_rep is None and avoid_sets is not None:
+            # Create a set of all IDs to avoid
+            all_avoid_ids = set().union(*avoid_sets)
+            # Find IDs that are not in any avoid set
+            non_avoided_ids = cluster - all_avoid_ids
+            if non_avoided_ids:
+                selected_rep = min(
+                    non_avoided_ids
+                )  # Choose the lexicographically first non-avoided ID
+            elif rep not in all_avoid_ids:
+                # If no non-avoided IDs found but original rep is not avoided, use it
+                selected_rep = rep
+
+        # If still no representative found, use the original representative
+        if selected_rep is None:
+            selected_rep = rep
+
+        final_clusters[selected_rep] = cluster
+
+    return final_clusters
+
+
+def replace_proteins_with_representatives(
+    df: pd.DataFrame, reps: Dict[str, Set[str]]
+) -> pd.DataFrame:
+    """
+    Replace protein IDs in a DataFrame with their cluster representatives.
+
+    This function takes a DataFrame containing protein IDs and a dictionary of
+    cluster representatives. It replaces each protein ID in the DataFrame with
+    its corresponding cluster representative.
+
+    Args:
+        df (pd.DataFrame): Input DataFrame with columns ["pr_id", "reaction_id", "protein_id"].
+        reps (Dict[str, Set[str]]): A dictionary where keys are representative protein IDs
+            and values are sets of all members in the cluster (including the representative).
+
+    Returns:
+        pd.DataFrame: A new DataFrame with protein IDs replaced by their cluster representatives.
+
+    Raises:
+        ValueError: If the input DataFrame doesn't have the required columns.
+    """
+    if not all(col in df.columns for col in ["pr_id", "reaction_id", "protein_id"]):
+        raise ValueError(
+            "Input DataFrame must have columns: 'pr_id', 'reaction_id', 'protein_id'"
+        )
+
+    print("Starting protein ID replacement process...")
+
+    # Create a mapping of all proteins to their representatives
+    protein_to_rep = {}
+    for rep, cluster in reps.items():
+        for protein in cluster:
+            protein_to_rep[protein] = rep
+
+    print("Protein to representative mapping created.")
+
+    # Create a copy of the input DataFrame to avoid modifying the original
+    df_copy = df.copy()
+
+    # Replace protein IDs with their representatives
+    tqdm.pandas(desc="Replacing protein IDs")
+    df_copy["protein_id"] = df_copy["protein_id"].progress_map(
+        lambda x: protein_to_rep.get(x, x)
+    )
+
+    print("Protein ID replacement completed.")
+    return df_copy
+
+
+def write_clusters_to_tsv(clusters, output_file):
+    with open(output_file, "w", newline="") as f:
+        writer = csv.writer(f, delimiter="\t")
+        writer.writerow(["protein_id", "cluster_rep"])  # Write header
+
+        for cluster_rep, members in tqdm(clusters.items(), desc="Writing clusters"):
+            for protein_id in members:
+                writer.writerow([protein_id, cluster_rep])
dayhoff_tools/sqlite.py
ADDED
@@ -0,0 +1,516 @@
+import os
+import shutil
+import sqlite3
+from pathlib import Path
+from typing import Any, Iterable, Literal, Optional, Tuple, Union
+
+import pandas as pd
+from sqlalchemy import create_engine, inspect, text
+from tqdm import tqdm
+from typing_extensions import Literal
+
+
+def sql_query(db_file: str, query: str, params: tuple[Any, ...] = ()) -> list[Any]:
+    """Perform a SQL query on the specified SQLite file.
+
+    Args:
+        db_file (str): Path to the SQLite file.
+        query (str): SQL query to perform.
+        params (tuple[Any, ...], optional): Query parameters. Defaults to ().
+
+    Returns:
+        list[Any]: Results of the query.
+
+    Raises:
+        sqlite3.DatabaseError: If the query fails.
+    """
+    with sqlite3.connect(db_file) as conn:
+        try:
+            return conn.execute(query, params).fetchall()
+        except sqlite3.Error as e:
+            raise sqlite3.DatabaseError(f"Query: {query}\n failed with error:\n {e}")
+
+
+def sql_check_columns(
+    db_file: str,
+    table_name: str,
+    columns: str | Iterable[str] | None = None,
+) -> list[str]:
+    """Check that the table and columns exist in the specified SQLite file.
+    Returns a list of verified columns.
+
+    Args:
+        db_file (str): Path to the SQLite file.
+        table_name (str): Name of the table in the database that contains the data.
+        columns (str | list[str] | tuple[str, ...] | None, optional): Names of the
+            columns in the database that will be returned as features. If None then
+            all columns will be returned.
+
+    Returns:
+        list[str]: List of verified columns.
+
+    Raises:
+        sqlite3.DatabaseError: If the table, or columns do not exist.
+    """
+    tables = [
+        table[0]
+        for table in sql_query(
+            db_file, "SELECT name FROM sqlite_master WHERE type='table'"
+        )
+    ]
+    if table_name not in tables:
+        raise sqlite3.DatabaseError(f"Table {table_name} does not exist.")
+
+    existing_columns = [
+        row[1] for row in sql_query(db_file, f"PRAGMA table_info({table_name})")
+    ]
+    if columns is not None:
+        columns = [columns] if isinstance(columns, str) else list(columns)
+        missing_columns = set(columns) - set(existing_columns)
+        if missing_columns:
+            raise sqlite3.DatabaseError(
+                "One or more specified columns does not exist.\n"
+                + f"Missing columns: {missing_columns}\n"
+                + f"Existing columns: {existing_columns}"
+            )
+        return columns
+    else:
+        return existing_columns
+
+
+def sql_check_index(db_file: str, table_name: str, column_name: str) -> bool:
+    """Verify that an index exists for the given column in the specified table."""
+    conn = sqlite3.connect(db_file)
+    cursor = conn.cursor()
+
+    query = f"PRAGMA index_list('{table_name}')"
+    cursor.execute(query)
+    indexes = cursor.fetchall()
+
+    # Collect all indexed columns from the retrieved indexes
+    indexed_columns = []
+    for index in indexes:
+        index_name = index[1]  # Adjust according to the structure returned
+        index_info_query = f"PRAGMA index_info('{index_name}')"
+        cursor.execute(index_info_query)
+        columns = cursor.fetchall()
+        indexed_columns.extend(col[2] for col in columns if col[2] == column_name)
+
+    conn.close()
+    if column_name not in indexed_columns:
+        raise Exception(
+            f"Column '{column_name}' is not indexed in table '{table_name}'."
+        )
+
+    return True
+
+
+def sql_check_primary_key(db_file: str, table_name: str, expected_primary_key: str):
+    """Check that the expected primary key is set for the specified table."""
+    with sqlite3.connect(db_file) as conn:
+        cursor = conn.cursor()
+        cursor.execute(f"PRAGMA table_info('{table_name}')")
+        columns = cursor.fetchall()
+        # Look for the primary key in the results
+        for col in columns:
+            if col[1] == expected_primary_key and col[5] == 1:  # Check the pk flag
+                return True
+        return False
+
+
+def write_to_sqlite(
+    input_data: Union[str, pd.DataFrame],
+    sql_filepath: str,
+    table_name: str,
+    columns: list[Tuple[str, Any]],
+    primary_key: str,
+    indexes: list[Tuple[str, str]],
+    if_table_exists: Literal["fail", "replace", "append"] = "append",
+    delete_file_if_exists: bool = True,
+    chunksize: int = 1000,
+) -> None:
+    """
+    Write data from a TSV file or DataFrame to a SQLite database file in a memory-efficient way.
+
+    Args:
+        input_data (Union[str, pd.DataFrame]): The input data, either a file path of the TSV file or a DataFrame.
+        sql_filepath (str): The file path of the SQLite database file.
+        table_name (str): The name of the table to be created in the database.
+        columns (list[Tuple[str, Any]]): A list of tuples representing the column names and their data types.
+            Each tuple should be in the format (column_name, data_type), where the data_type is an sqlite data type.
+            Example: [("id", "INTEGER"), ("name", "TEXT"), ("age", "INTEGER")]
+        primary_key (str): The name of the column to be used as the primary key.
+        indexes (list[Tuple[str, str]]): A list of tuples representing the indexes to be created.
+            Each tuple should be in the format (index_name, column_name).
+            Example: [("idx_name", "name"), ("idx_age", "age")]
+        if_table_exists (Literal['fail', 'replace', 'append']): What to do if the table exists already.
+        delete_file_if_exists (bool): Whether to delete the database file itself, if it exists.
+        chunksize (int): Number of lines to read from the TSV file at a time if input_data is a TSV file.
+
+    Examples:
+        columns = [
+            ("pr_id", "INTEGER"),
+            ("reaction_id", "TEXT"),
+            ("protein_id", "TEXT"),
+            ("db_source", "TEXT"),
+        ]
+        primary_key = "pr_id"
+        indexes = [
+            ("idx_on_protein_id", "protein_id"),
+            ("idx_on_reaction_id", "reaction_id"),
+        ]
+        write_to_sqlite(
+            input_data="path/to/data.tsv",
+            sql_filepath="path/to/database.db",
+            table_name="protein_to_reaction",
+            columns=columns,
+            primary_key=primary_key,
+            indexes=indexes,
+            if_table_exists="append",
+            delete_file_if_exists=True,
+            chunksize=1000
+        )
+
+        # or
+
+        df = pd.DataFrame([...])
+        write_to_sqlite(
+            input_data=df,
+            sql_filepath="path/to/database.db",
+            table_name="protein_to_reaction",
+            columns=columns,
+            primary_key=primary_key,
+            indexes=indexes,
+            if_table_exists="append",
+            delete_file_if_exists=True,
+        )
+    """
+
+    # Optionally delete the database file itself
+    if delete_file_if_exists and os.path.exists(sql_filepath):
+        os.remove(sql_filepath)
+
+    with sqlite3.connect(sql_filepath) as conn:
+        cursor = conn.cursor()
+
+        # Determine action if table exists
+        if if_table_exists == "replace":
+            cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
+        elif if_table_exists == "append":
+            cursor.execute(
+                f"SELECT name FROM sqlite_master WHERE type='table' AND name=?;",
+                (table_name,),
+            )
+            if cursor.fetchone() is None:
+                if_table_exists = "replace"
+        else:  # if_table_exists == 'fail'
+            cursor.execute(
+                f"SELECT name FROM sqlite_master WHERE type='table' AND name=?;",
+                (table_name,),
+            )
+            if cursor.fetchone():
+                raise ValueError(f"Table {table_name} already exists.")
+
+        # Create table if 'replace' or not exists
+        if if_table_exists == "replace":
+            col_defs = [f"{name} {dtype}" for name, dtype in columns]
+            col_defs.append(f"PRIMARY KEY ({primary_key})")
+            cursor.execute(f"CREATE TABLE {table_name} ({', '.join(col_defs)})")
+
+        # Convert columns to list of column names
+        column_names = [col[0] for col in columns]
+        placeholders = ", ".join("?" * len(column_names))
+
+        if isinstance(input_data, str):
+            # Read TSV file in chunks and insert into the table
+            for chunk in pd.read_csv(
+                input_data, sep="\t", usecols=column_names, chunksize=chunksize
+            ):
+                data = chunk.values.tolist()
+                cursor.executemany(
+                    f"INSERT INTO {table_name} VALUES ({placeholders})", data
+                )
+        elif isinstance(input_data, pd.DataFrame):
+            # Insert DataFrame data into the table
+            data = input_data[column_names].values.tolist()
+            cursor.executemany(
+                f"INSERT INTO {table_name} VALUES ({placeholders})", data
+            )
+        else:
+            raise ValueError(
+                "input_data must be either a file path to a TSV file or a DataFrame."
+            )
+
+        # Create indexes if specified
+        for index_name, column_name in indexes:
+            cursor.execute(
+                f"CREATE INDEX IF NOT EXISTS {index_name} ON {table_name} ({column_name})"
+            )
+
+        conn.commit()
+
+
+def inspect_sqlite_db(db_path: str) -> None:
+    """
+    Inspect the contents of a SQLite database file.
+
+    This function connects to the specified SQLite database, retrieves information
+    about its tables, columns, and sample data, and prints this information to
+    the console.
+
+    Args:
+        db_path (str): The file path to the SQLite database.
+
+    Returns:
+        None
+
+    Raises:
+        sqlite3.Error: If there's an error connecting to or querying the database.
+    """
+    try:
+        conn = sqlite3.connect(db_path)
+        cursor = conn.cursor()
+
+        # Get list of tables
+        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+        tables = cursor.fetchall()
+
+        for table in tables:
+            table_name = table[0]
+            print(f"\nTable: {table_name}")
+
+            # Get column names
+            cursor.execute(f"PRAGMA table_info({table_name})")
+            columns = [column[1] for column in cursor.fetchall()]
+            print("Columns:", ", ".join(columns))
+
+            # Get sample data (first 5 rows)
+            cursor.execute(f"SELECT * FROM {table_name} LIMIT 5")
+            rows = cursor.fetchall()
+
+            print("\nSample data:")
+            for row in rows:
+                print(row)
+
+        conn.close()
+
+    except sqlite3.Error as e:
+        print(f"An error occurred: {e}")
+
+
+def read_sql_table_to_df(
+    db_path: str, table_name: Optional[str] = None, chunksize: int = 10000
+) -> pd.DataFrame:
+    """
+    Read an SQL table into a pandas DataFrame with a progress bar.
+
+    This function connects to a SQLite database using the provided file path,
+    reads the specified table (or the only table if not specified) in chunks,
+    and displays a progress bar during the reading process.
+
+    Args:
+        db_path (str): The file path to the SQLite database.
+        table_name (Optional[str], optional): The name of the table to read from the database.
+            If None, assumes there's only one table. Defaults to None.
+        chunksize (int, optional): The number of rows to read per chunk. Defaults to 10000.
+
+    Returns:
+        pd.DataFrame: A pandas DataFrame containing the data from the SQL table.
+
+    Raises:
+        ValueError: If the table_name is not provided and there are multiple tables,
+            or if the specified table doesn't exist.
+        SQLAlchemyError: If there's an issue connecting to the database or reading the table.
+    """
+
+    engine_url = f"sqlite:///{db_path}"
+    engine = create_engine(engine_url)
+
+    # Get all table names
+    with engine.connect() as connection:
+        inspector = inspect(engine)
+        table_names = inspector.get_table_names()
+
+    # Handle table selection
+    if table_name is None:
+        if len(table_names) == 1:
+            table_name = table_names[0]
+        elif len(table_names) == 0:
+            raise ValueError("No tables found in the database.")
+        else:
+            raise ValueError(
+                f"Multiple tables found. Please specify a table_name. Options are: {', '.join(table_names)}"
+            )
+    elif table_name not in table_names:
+        raise ValueError(f"Table '{table_name}' not found in the database.")
+
+    # Get the number of rows in the table
+    with engine.connect() as connection:
+        result = connection.execute(text(f"SELECT COUNT(*) FROM {table_name}"))
+        row_count = result.scalar()
+
+    # Create a tqdm progress bar
+    pbar = tqdm(total=row_count, desc=f"Reading {table_name}", unit="rows")
+
+    # Read the SQL table in chunks with progress updates
+    chunks = []
+    for chunk in pd.read_sql(
+        f"SELECT * FROM {table_name}", engine, chunksize=chunksize
+    ):
+        chunks.append(chunk)
+        pbar.update(len(chunk))
+
+    # Close the progress bar
+    pbar.close()
+
+    # Combine all chunks into a single DataFrame
+    df = pd.concat(chunks, ignore_index=True)
+
+    return df
+
+
+def _create_optimized_db(dst_path: Path) -> None:
+    """Helper function to create an optimized database."""
+    if dst_path.exists():
+        dst_path.unlink()
+
+    with sqlite3.connect(str(dst_path)) as conn:
+        # Use default page size but optimize other settings
+        conn.execute("PRAGMA journal_mode=OFF")
+        conn.execute(
+            "PRAGMA synchronous=FULL"
+        )  # Start with FULL for safety during creation
+        conn.commit()
+
+
+def _verify_destination(dst_path: Path) -> None:
+    """Helper function to verify destination path is writable."""
+    # Convert to absolute path for consistent handling
+    dst_path = dst_path.absolute()
+
+    # Check if path is under root and we don't have root access
+    if str(dst_path).startswith("/") and not os.access("/", os.W_OK):
+        if any(not os.access(p, os.W_OK) for p in dst_path.parents):
+            raise OSError(f"No write permission for path: {dst_path}")
+
+    try:
+        # Check if parent directory exists
+        if not dst_path.parent.exists():
+            try:
+                dst_path.parent.mkdir(parents=True, exist_ok=True)
+            except (OSError, PermissionError) as e:
+                raise OSError(
+                    f"Cannot create parent directory: {dst_path.parent}"
+                ) from e
+
+        # Check if parent directory is writable
+        if not os.access(str(dst_path.parent), os.W_OK):
+            raise OSError(
+                f"No write permission for parent directory: {dst_path.parent}"
+            )
+
+        # If file exists, check if it's writable
+        if dst_path.exists():
+            if not os.access(str(dst_path), os.W_OK):
+                raise OSError(f"No write permission for destination: {dst_path}")
+        else:
+            # Try to create and write to the file
+            try:
+                dst_path.touch()
+                dst_path.unlink()  # Clean up the test file
+            except (OSError, PermissionError) as e:
+                raise OSError(f"Cannot write to destination path: {dst_path}") from e
+
+    except Exception as e:
+        # Ensure we always raise OSError
+        if not isinstance(e, OSError):
+            raise OSError(f"Cannot create or write to destination: {dst_path}") from e
+        raise
+
+
+def optimize_protein_db(src_path: Union[str, Path], dst_path: Union[str, Path]):
+    """
+    Optimize SQLite database containing protein sequences.
+    """
+    src_path, dst_path = Path(src_path), Path(dst_path)
+
+    # Check source file exists
+    if not src_path.exists():
+        raise FileNotFoundError(f"Source database not found: {src_path}")
+
+    # Verify destination is writable
+    _verify_destination(dst_path)
+
+    # Create optimized database
+    _create_optimized_db(dst_path)
+
+    # Copy data from source to destination
+    with sqlite3.connect(str(dst_path)) as dst_conn:
+        with sqlite3.connect(src_path) as src_conn:
+            src_conn.backup(dst_conn)
+
+        # Apply conservative optimizations
+        dst_conn.execute("PRAGMA journal_mode=WAL")
+        dst_conn.execute("PRAGMA synchronous=FULL")  # Keep FULL for protein data
+        dst_conn.execute("PRAGMA cache_size=-2000000")  # 2GB cache
+        dst_conn.execute("PRAGMA temp_store=FILE")
+        dst_conn.commit()
+
+
+def optimize_reaction_db(src_path: Union[str, Path], dst_path: Union[str, Path]):
+    """
+    Optimize SQLite database containing reaction data.
+    """
+    src_path, dst_path = Path(src_path), Path(dst_path)
+
+    # Check source file exists
+    if not src_path.exists():
+        raise FileNotFoundError(f"Source database not found: {src_path}")
+
+    # Verify destination is writable
+    _verify_destination(dst_path)
+
+    # Create optimized database
+    _create_optimized_db(dst_path)
+
+    # Copy data from source to destination
+    with sqlite3.connect(str(dst_path)) as dst_conn:
+        with sqlite3.connect(src_path) as src_conn:
+            src_conn.backup(dst_conn)
+
+        # Apply conservative optimizations
+        dst_conn.execute("PRAGMA journal_mode=WAL")
+        dst_conn.execute("PRAGMA synchronous=FULL")  # Keep FULL for safety
+        dst_conn.execute("PRAGMA cache_size=-2000000")  # 2GB cache
+        dst_conn.execute("PRAGMA temp_store=FILE")  # Use FILE instead of MEMORY
+        dst_conn.commit()
+
+
+def optimize_pairs_db(src_path: Union[str, Path], dst_path: Union[str, Path]):
+    """
+    Optimize SQLite database containing protein-reaction pairs.
+    """
+    src_path, dst_path = Path(src_path), Path(dst_path)
+
+    # Check source file exists
+    if not src_path.exists():
+        raise FileNotFoundError(f"Source database not found: {src_path}")
+
+    # Verify destination is writable
+    _verify_destination(dst_path)
+
+    # Create optimized database
+    _create_optimized_db(dst_path)
+
+    # Copy data from source to destination
+    with sqlite3.connect(str(dst_path)) as dst_conn:
+        with sqlite3.connect(src_path) as src_conn:
+            src_conn.backup(dst_conn)
+
+        # Apply conservative optimizations
+        dst_conn.execute("PRAGMA journal_mode=WAL")
+        dst_conn.execute("PRAGMA synchronous=FULL")  # Keep FULL for safety
+        dst_conn.execute("PRAGMA cache_size=-2000000")  # 2GB cache
+        dst_conn.execute("PRAGMA temp_store=FILE")  # Use FILE instead of MEMORY
+        dst_conn.execute("PRAGMA mmap_size=30000000000")  # 30GB memory mapping
+        dst_conn.commit()