dayhoff-tools 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dayhoff_tools/__init__.py +0 -0
- dayhoff_tools/chemistry/standardizer.py +297 -0
- dayhoff_tools/chemistry/utils.py +63 -0
- dayhoff_tools/cli/__init__.py +0 -0
- dayhoff_tools/cli/main.py +90 -0
- dayhoff_tools/cli/swarm_commands.py +156 -0
- dayhoff_tools/cli/utility_commands.py +244 -0
- dayhoff_tools/deployment/base.py +434 -0
- dayhoff_tools/deployment/deploy_aws.py +458 -0
- dayhoff_tools/deployment/deploy_gcp.py +176 -0
- dayhoff_tools/deployment/deploy_utils.py +781 -0
- dayhoff_tools/deployment/job_runner.py +153 -0
- dayhoff_tools/deployment/processors.py +125 -0
- dayhoff_tools/deployment/swarm.py +591 -0
- dayhoff_tools/embedders.py +893 -0
- dayhoff_tools/fasta.py +1082 -0
- dayhoff_tools/file_ops.py +261 -0
- dayhoff_tools/gcp.py +85 -0
- dayhoff_tools/h5.py +542 -0
- dayhoff_tools/kegg.py +37 -0
- dayhoff_tools/logs.py +27 -0
- dayhoff_tools/mmseqs.py +164 -0
- dayhoff_tools/sqlite.py +516 -0
- dayhoff_tools/structure.py +751 -0
- dayhoff_tools/uniprot.py +434 -0
- dayhoff_tools/warehouse.py +418 -0
- dayhoff_tools-1.0.0.dist-info/METADATA +122 -0
- dayhoff_tools-1.0.0.dist-info/RECORD +30 -0
- dayhoff_tools-1.0.0.dist-info/WHEEL +4 -0
- dayhoff_tools-1.0.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,261 @@
|
|
1
|
+
import gzip
|
2
|
+
import inspect
|
3
|
+
import logging
|
4
|
+
import os
|
5
|
+
import re
|
6
|
+
import shutil
|
7
|
+
import tarfile
|
8
|
+
from pathlib import Path
|
9
|
+
from typing import Any, Set
|
10
|
+
|
11
|
+
from tqdm import tqdm
|
12
|
+
|
13
|
+
logger = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
|
16
|
+
def compress_gz(input_path: str, output_path: str):
    """Compress the input file using gzip, keep the original.

    Args:
        input_path (str): Path of the file to compress.
        output_path (str): Path of the .gz file to create (overwritten if present).
    """
    with open(input_path, "rb") as f_in:
        with gzip.open(output_path, "wb") as f_out:
            # copyfileobj streams in fixed-size chunks. The previous
            # writelines(f_in) iterated "lines", which for binary data with
            # no newline bytes could buffer the entire file in memory.
            shutil.copyfileobj(f_in, f_out)
|
21
|
+
|
22
|
+
|
23
|
+
def decompress_gz(file_path: str) -> str:
    """
    Decompress a .gz file and return the path to the decompressed file.

    Args:
        file_path (str): Path to the .gz file.

    Returns:
        str: Path to the decompressed file.

    Raises:
        ValueError: If the file does not end in .gz.
    """
    source = Path(file_path)
    if source.suffix != ".gz":
        raise ValueError(f"File {file_path} does not have a .gz extension")

    # Dropping the final suffix yields the destination path.
    target = source.with_suffix("")

    with gzip.open(source, "rb") as compressed, open(target, "wb") as plain:
        shutil.copyfileobj(compressed, plain)

    return str(target)
|
44
|
+
|
45
|
+
|
46
|
+
def compress_folder_into_tar_gz(folder_path: str) -> str:
    """
    Compress a folder into a tar.gz file of the same name.

    Only regular files are added (empty directories are not recorded);
    archive member names are relative to the folder root.

    :param folder_path: The path to the folder to compress.
    :return: Path of the created .tar.gz archive.
    """
    archive_path = folder_path + ".tar.gz"
    with tarfile.open(archive_path, "w:gz") as archive:
        for current_dir, _subdirs, filenames in os.walk(folder_path):
            for name in filenames:
                full_path = os.path.join(current_dir, name)
                # Store paths relative to the folder so the archive root
                # is the folder's contents.
                archive.add(full_path, os.path.relpath(full_path, folder_path))

    return archive_path
|
60
|
+
|
61
|
+
|
62
|
+
def decompress_tar(compressed_folder: str):
    """Decompress a .tar.gz or .tar.bz2 folder to the same location
    and return the path of the decompressed folder.

    Args:
        compressed_folder (str): Path to a .tar.gz or .tar.bz2 file, which
            itself is a compression of a multi-level folder.

    Returns:
        str: the path to the decompressed folder

    Raises:
        ValueError: If the file is neither .tar.gz nor .tar.bz2.
    """
    # The archive is extracted next to itself: its parent directory is the
    # extraction target.
    directory = os.path.dirname(compressed_folder)

    # Determine the compression type. removesuffix strips only the trailing
    # extension; the previous str.replace removed EVERY occurrence, which
    # corrupted paths containing ".tar.gz" elsewhere in the string.
    if compressed_folder.endswith(".tar.gz"):
        mode = "r:gz"
        decompressed_folder = compressed_folder.removesuffix(".tar.gz")
    elif compressed_folder.endswith(".tar.bz2"):
        mode = "r:bz2"
        decompressed_folder = compressed_folder.removesuffix(".tar.bz2")
    else:
        raise ValueError(
            "Unsupported compression type. Only .tar.gz and .tar.bz2 are supported."
        )

    # Open the compressed file
    with tarfile.open(compressed_folder, mode) as tar:
        # Remove any stale copy so extraction starts clean.
        if os.path.exists(decompressed_folder):
            shutil.rmtree(decompressed_folder)

        # NOTE(review): extractall on an untrusted archive is vulnerable to
        # path traversal; pass filter="data" (Python 3.12+) if archives may
        # come from outside sources.
        tar.extractall(path=directory)

    return decompressed_folder
|
99
|
+
|
100
|
+
|
101
|
+
def natural_sort_key(s):
    """
    A sorting key function for natural (human) sorting of strings containing numbers.

    Digit runs compare numerically and everything else compares
    case-insensitively, so "file2" sorts before "file10".

    Args:
        s (str): The string to be split into parts for sorting.

    Returns:
        list: A list of strings and integers derived from the input string.
    """
    key = []
    for chunk in re.split("([0-9]+)", s):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key
|
114
|
+
|
115
|
+
|
116
|
+
def list_files_in_directory_to_txt(directory_path: str, output_txt_file: str):
    """
    Lists all files in the specified directory in natural sorted order and writes their names to a .txt file.

    Args:
        directory_path (str): Path to the directory whose files should be listed.
        output_txt_file (str): Path to the .txt file where the list of file names will be written.
    """
    # Ensure the directory exists
    if not os.path.isdir(directory_path):
        print(f"The directory {directory_path} does not exist.")
        return

    # Collect all file names in the directory (subdirectories are skipped)
    file_names = [
        filename
        for filename in os.listdir(directory_path)
        if os.path.isfile(os.path.join(directory_path, filename))
    ]

    # Sort the list of file names using the natural sort key
    sorted_file_names = sorted(file_names, key=natural_sort_key)

    # Write one file name per line.
    with open(output_txt_file, "w") as file_out:
        for filename in sorted_file_names:
            # BUG FIX: previously wrote a literal placeholder string instead
            # of the actual file name, so the listing was useless.
            file_out.write(f"{filename}\n")

    print(f"File names have been written to {output_txt_file}")
|
147
|
+
|
148
|
+
|
149
|
+
def list_files_in_directory(directory_path: str) -> list[str]:
    """
    Make a list of the names for all the files in a specified directory, in natural sorted order.

    Args:
        directory_path (str): Path to the directory whose files should be listed.
    """
    # Keep only regular files; subdirectories are excluded.
    only_files = []
    for entry in os.listdir(directory_path):
        if os.path.isfile(os.path.join(directory_path, entry)):
            only_files.append(entry)

    # Natural sort so e.g. "file2" precedes "file10".
    only_files.sort(key=natural_sort_key)
    return only_files
|
165
|
+
|
166
|
+
|
167
|
+
def write_set_to_file(set_data: Set[Any], filename: str) -> None:
    """
    Write the contents of a set to a text file, one item per line.

    Items are written in the set's iteration order (unordered), each
    converted with str().

    Args:
        set_data (Set[Any]): The set containing items to write to the file.
        filename (str): The name of the file to create or overwrite.

    Returns:
        None
    """
    with open(filename, "w") as out:
        for element in tqdm(set_data, desc="Writing to file", unit="item"):
            out.write(f"{element}\n")
|
181
|
+
|
182
|
+
|
183
|
+
def read_file_to_set(filename: str) -> Set[str]:
    """
    Read all lines from a text file and return them as a set.

    Each line is stripped of surrounding whitespace; duplicates collapse.

    Args:
        filename (str): The name of the file to read from.

    Returns:
        Set[str]: A set containing all unique lines from the file.
    """
    with open(filename, "r") as handle:
        result_set = {line.strip() for line in handle}

    print(f"Item count: {len(result_set)}")
    return result_set
|
200
|
+
|
201
|
+
|
202
|
+
def compare_sets(
|
203
|
+
set1: set, set2: set, set1_name: str | None = None, set2_name: str | None = None
|
204
|
+
) -> None:
|
205
|
+
"""
|
206
|
+
Compare two sets and return a string representation of the comparison.
|
207
|
+
|
208
|
+
This function performs set operations to find the intersection and differences
|
209
|
+
between the two input sets. It then generates a text-based Venn diagram and
|
210
|
+
detailed statistics about the comparison.
|
211
|
+
|
212
|
+
Args:
|
213
|
+
set1 (set): The first set to compare.
|
214
|
+
set2 (set): The second set to compare.
|
215
|
+
set1_name (str, optional): The name of the first set. If None, attempts to extract from variable name.
|
216
|
+
set2_name (str, optional): The name of the second set. If None, attempts to extract from variable name.
|
217
|
+
|
218
|
+
Returns:
|
219
|
+
str: A string containing the text-based Venn diagram and detailed statistics.
|
220
|
+
|
221
|
+
Example:
|
222
|
+
>>> set1 = {1, 2, 3, 4}
|
223
|
+
>>> set2 = {3, 4, 5, 6}
|
224
|
+
>>> print(compare_sets(set1, set2))
|
225
|
+
"""
|
226
|
+
# Try to extract variable names if not provided
|
227
|
+
if set1_name is None or set2_name is None:
|
228
|
+
frame = inspect.currentframe().f_back
|
229
|
+
local_vars = frame.f_locals
|
230
|
+
if set1_name is None:
|
231
|
+
set1_name = next(
|
232
|
+
(var for var, val in local_vars.items() if val is set1), "Set 1"
|
233
|
+
)
|
234
|
+
if set2_name is None:
|
235
|
+
set2_name = next(
|
236
|
+
(var for var, val in local_vars.items() if val is set2), "Set 2"
|
237
|
+
)
|
238
|
+
|
239
|
+
# Perform set operations
|
240
|
+
in_both = set1.intersection(set2)
|
241
|
+
only_in_set1 = set1.difference(set2)
|
242
|
+
only_in_set2 = set2.difference(set1)
|
243
|
+
|
244
|
+
# Calculate totals and percentages
|
245
|
+
total = len(set1.union(set2))
|
246
|
+
in_both_percent = len(in_both) / total * 100
|
247
|
+
only_in_set1_percent = len(only_in_set1) / total * 100
|
248
|
+
only_in_set2_percent = len(only_in_set2) / total * 100
|
249
|
+
|
250
|
+
# Create the output string
|
251
|
+
output = [
|
252
|
+
f"Set Comparison: {set1_name} vs {set2_name}",
|
253
|
+
"---------------------------------------",
|
254
|
+
f"Total unique elements: {total:,}",
|
255
|
+
"",
|
256
|
+
f"Only in {set1_name}: {len(only_in_set1):,} ({only_in_set1_percent:.2f}%)",
|
257
|
+
f"In both sets: {len(in_both):,} ({in_both_percent:.2f}%)",
|
258
|
+
f"Only in {set2_name}: {len(only_in_set2):,} ({only_in_set2_percent:.2f}%)",
|
259
|
+
]
|
260
|
+
|
261
|
+
print("\n".join(output))
|
dayhoff_tools/gcp.py
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
import logging
|
2
|
+
import os
|
3
|
+
from typing import Tuple
|
4
|
+
|
5
|
+
import requests
|
6
|
+
from dayhoff_tools.file_ops import natural_sort_key
|
7
|
+
from google.cloud import storage
|
8
|
+
|
9
|
+
logger = logging.getLogger(__name__)
|
10
|
+
|
11
|
+
|
12
|
+
def upload_folder_to_gcs(
    bucket_name: str,
    source_folder: str,
    destination_path: str,
) -> None:
    """
    Uploads a local folder to Google Cloud Storage

    :param bucket_name: The name of the GCS bucket to upload to
    :param source_folder: The path to the local folder to upload
    :param destination_path: The destination path in the GCS bucket (without leading '/');
        a full gs://bucket/... URI is also accepted and normalized.
    """
    client = storage.Client()
    bucket = client.bucket(bucket_name)

    # Accept a full gs:// URI by stripping the scheme...
    if destination_path.startswith("gs://"):
        destination_path = destination_path[5:]

    # ...and the bucket-name prefix, leaving just the object prefix.
    if destination_path.startswith(bucket_name):
        destination_path = destination_path[len(bucket_name) + 1 :]

    # Iterate through local files and upload them to GCS
    for root, _, files in os.walk(source_folder):
        for file_name in sorted(files, key=natural_sort_key):
            local_file_path = os.path.join(root, file_name)

            # Create a blob object in the destination path. GCS object names
            # must use forward slashes; on Windows os.path.join/relpath emit
            # backslashes, which GCS would treat as literal name characters,
            # so normalize the separators.
            relative_path = os.path.relpath(local_file_path, source_folder)
            blob_path = os.path.join(destination_path, relative_path).replace(
                os.sep, "/"
            )
            blob = bucket.blob(blob_path)

            # Upload the local file to the blob
            blob.upload_from_filename(local_file_path)
            print(f"{local_file_path} uploaded to gs://{bucket_name}/")
|
46
|
+
|
47
|
+
|
48
|
+
def get_vm_name() -> str:
    """Query the Google Compute Engine metadata server to get the name of the current instance.
    Only works on GCE VMs, of course.

    Returns:
        str: The instance name, or a fallback message when the metadata
        server is unreachable or responds with a non-200 status.
    """
    url = "http://metadata.google.internal/computeMetadata/v1/instance/name"
    headers = {"Metadata-Flavor": "Google"}
    try:
        # A timeout prevents hanging indefinitely off-GCE, where the
        # metadata host never answers; previously there was no timeout.
        response = requests.get(url, headers=headers, timeout=5)
        if response.status_code == 200:
            return response.text
    except Exception as e:
        # BUG FIX: the log message said "machine type" (copy-pasted from
        # get_vm_type); this function retrieves the instance name.
        logger.error("Error retrieving instance name: %s", e)

    return "Not a Google Compute Engine VM"
|
62
|
+
|
63
|
+
|
64
|
+
def get_vm_type() -> str:
    """Query the Google Compute Engine metadata server to get the type
    (eg, n1-highmem-8) of the current instance. Only works on GCE VMs.

    Returns:
        str: The machine type, or a fallback message when the metadata
        server is unreachable or responds with a non-200 status.
    """
    metadata_url = (
        "http://metadata.google.internal/computeMetadata/v1/instance/machine-type"
    )
    headers = {"Metadata-Flavor": "Google"}

    try:
        # A timeout prevents hanging indefinitely off-GCE, where the
        # metadata host never answers; previously there was no timeout.
        response = requests.get(metadata_url, headers=headers, timeout=5)
        if response.status_code == 200:
            # The response includes the full path. Extract just the machine type.
            # Example response: projects/123456789/machineTypes/n1-standard-1
            machine_type_path = response.text
            machine_type = machine_type_path.split("/")[-1]
            return machine_type
    except Exception as e:
        logger.error("Error retrieving machine type: %s", e)

    return "Not a Google Compute Engine VM"
|