dayhoff-tools 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dayhoff_tools/file_ops.py ADDED
@@ -0,0 +1,261 @@
+ import gzip
+ import inspect
+ import logging
+ import os
+ import re
+ import shutil
+ import tarfile
+ from pathlib import Path
+ from typing import Any, Set
+
+ from tqdm import tqdm
+
+ logger = logging.getLogger(__name__)
+
+
+ def compress_gz(input_path: str, output_path: str):
+     """Compress the input file using gzip, keep the original."""
+     with open(input_path, "rb") as f_in:
+         with gzip.open(output_path, "wb") as f_out:
+             f_out.writelines(f_in)
+
+
+ def decompress_gz(file_path: str) -> str:
+     """
+     Decompress a .gz file and return the path to the decompressed file.
+
+     Args:
+         file_path (str): Path to the .gz file.
+
+     Returns:
+         str: Path to the decompressed file.
+     """
+     input_path = Path(file_path)
+     if input_path.suffix != ".gz":
+         raise ValueError(f"File {file_path} does not have a .gz extension")
+
+     output_path = input_path.with_suffix("")  # Remove the .gz suffix
+
+     with gzip.open(input_path, "rb") as f_in:
+         with open(output_path, "wb") as f_out:
+             shutil.copyfileobj(f_in, f_out)
+
+     return str(output_path)
+
+
+ def compress_folder_into_tar_gz(folder_path: str) -> str:
+     """
+     Compress a folder into a tar.gz file of the same name.
+
+     :param folder_path: The path to the folder to compress.
+     """
+     output_path = folder_path + ".tar.gz"
+     with tarfile.open(output_path, "w:gz") as tarf:
+         for root, dirs, files in os.walk(folder_path):
+             for file in files:
+                 file_path = os.path.join(root, file)
+                 tarf.add(file_path, os.path.relpath(file_path, folder_path))
+
+     return output_path
+
+
+ def decompress_tar(compressed_folder: str):
+     """Decompress a .tar.gz or .tar.bz2 folder to the same location
+     and return the path of the decompressed folder.
+
+     Args:
+         compressed_folder (str): Path to a .tar.gz or .tar.bz2 file, which
+             itself is a compression of a multi-level folder.
+
+     Returns:
+         str: the path to the decompressed folder
+     """
+     # Get the directory of the compressed file
+     directory = os.path.dirname(compressed_folder)
+
+     # Determine the compression type
+     if compressed_folder.endswith(".tar.gz"):
+         mode = "r:gz"
+         decompressed_folder = compressed_folder.replace(".tar.gz", "")
+     elif compressed_folder.endswith(".tar.bz2"):
+         mode = "r:bz2"
+         decompressed_folder = compressed_folder.replace(".tar.bz2", "")
+     else:
+         raise ValueError(
+             "Unsupported compression type. Only .tar.gz and .tar.bz2 are supported."
+         )
+
+     # Open the compressed file
+     with tarfile.open(compressed_folder, mode) as tar:
+         # Check if the decompressed folder already exists
+         if os.path.exists(decompressed_folder):
+             # Remove the existing decompressed folder
+             shutil.rmtree(decompressed_folder)
+
+         # Extract the contents to the decompressed folder
+         tar.extractall(path=directory)
+
+     return decompressed_folder
+
+
+ def natural_sort_key(s):
+     """
+     A sorting key function for natural (human) sorting of strings containing numbers.
+
+     Args:
+         s (str): The string to be split into parts for sorting.
+
+     Returns:
+         list: A list of strings and integers derived from the input string.
+     """
+     return [
+         int(text) if text.isdigit() else text.lower()
+         for text in re.split("([0-9]+)", s)
+     ]
+
+
+ def list_files_in_directory_to_txt(directory_path: str, output_txt_file: str):
+     """
+     Lists all files in the specified directory in natural sorted order and writes their names to a .txt file.
+
+     Args:
+         directory_path (str): Path to the directory whose files should be listed.
+         output_txt_file (str): Path to the .txt file where the list of file names will be written.
+     """
+     # Ensure the directory exists
+     if not os.path.isdir(directory_path):
+         print(f"The directory {directory_path} does not exist.")
+         return
+
+     # Collect all file names in the directory
+     file_names = [
+         filename
+         for filename in os.listdir(directory_path)
+         if os.path.isfile(os.path.join(directory_path, filename))
+     ]
+
+     # Sort the list of file names using the natural sort key
+     sorted_file_names = sorted(file_names, key=natural_sort_key)
+
+     # Open the output file in write mode
+     with open(output_txt_file, "w") as file_out:
+         # Iterate through the sorted list of file names
+         for filename in sorted_file_names:
+             # Write each file name to the output .txt file
+             file_out.write(f"{filename}\n")
+
+     print(f"File names have been written to {output_txt_file}")
+
+
+ def list_files_in_directory(directory_path: str) -> list[str]:
+     """
+     Make a list of the names for all the files in a specified directory, in natural sorted order.
+
+     Args:
+         directory_path (str): Path to the directory whose files should be listed.
+     """
+     # Collect all file names in the directory
+     file_names = [
+         filename
+         for filename in os.listdir(directory_path)
+         if os.path.isfile(os.path.join(directory_path, filename))
+     ]
+
+     # Sort the list of file names using the natural sort key
+     return sorted(file_names, key=natural_sort_key)
+
+
+ def write_set_to_file(set_data: Set[Any], filename: str) -> None:
+     """
+     Write the contents of a set to a text file, one item per line.
+
+     Args:
+         set_data (Set[Any]): The set containing items to write to the file.
+         filename (str): The name of the file to create or overwrite.
+
+     Returns:
+         None
+     """
+     with open(filename, "w") as file:
+         for item in tqdm(set_data, desc="Writing to file", unit="item"):
+             file.write(str(item) + "\n")
+
+
+ def read_file_to_set(filename: str) -> Set[str]:
+     """
+     Read all lines from a text file and return them as a set.
+
+     Args:
+         filename (str): The name of the file to read from.
+
+     Returns:
+         Set[str]: A set containing all unique lines from the file.
+     """
+     result_set = set()
+     with open(filename, "r") as file:
+         for line in file:
+             result_set.add(line.strip())
+
+     print(f"Item count: {len(result_set)}")
+     return result_set
+
+
+ def compare_sets(
+     set1: set, set2: set, set1_name: str | None = None, set2_name: str | None = None
+ ) -> None:
+     """
+     Compare two sets and print a summary of the comparison.
+
+     This function performs set operations to find the intersection and differences
+     between the two input sets, then prints detailed statistics: the count and
+     percentage of elements found only in each set and in both.
+
+     Args:
+         set1 (set): The first set to compare.
+         set2 (set): The second set to compare.
+         set1_name (str, optional): The name of the first set. If None, attempts to extract from variable name.
+         set2_name (str, optional): The name of the second set. If None, attempts to extract from variable name.
+
+     Returns:
+         None: The comparison summary is printed to stdout rather than returned.
+
+     Example:
+         >>> set1 = {1, 2, 3, 4}
+         >>> set2 = {3, 4, 5, 6}
+         >>> compare_sets(set1, set2)
+     """
+     # Try to extract variable names if not provided
+     if set1_name is None or set2_name is None:
+         frame = inspect.currentframe().f_back
+         local_vars = frame.f_locals
+         if set1_name is None:
+             set1_name = next(
+                 (var for var, val in local_vars.items() if val is set1), "Set 1"
+             )
+         if set2_name is None:
+             set2_name = next(
+                 (var for var, val in local_vars.items() if val is set2), "Set 2"
+             )
+
+     # Perform set operations
+     in_both = set1.intersection(set2)
+     only_in_set1 = set1.difference(set2)
+     only_in_set2 = set2.difference(set1)
+
+     # Calculate totals and percentages
+     total = len(set1.union(set2))
+     in_both_percent = len(in_both) / total * 100
+     only_in_set1_percent = len(only_in_set1) / total * 100
+     only_in_set2_percent = len(only_in_set2) / total * 100
+
+     # Create the output string
+     output = [
+         f"Set Comparison: {set1_name} vs {set2_name}",
+         "---------------------------------------",
+         f"Total unique elements: {total:,}",
+         "",
+         f"Only in {set1_name}: {len(only_in_set1):,} ({only_in_set1_percent:.2f}%)",
+         f"In both sets: {len(in_both):,} ({in_both_percent:.2f}%)",
+         f"Only in {set2_name}: {len(only_in_set2):,} ({only_in_set2_percent:.2f}%)",
+     ]
+
+     print("\n".join(output))
dayhoff_tools/gcp.py ADDED
@@ -0,0 +1,85 @@
+ import logging
+ import os
+ from typing import Tuple
+
+ import requests
+ from dayhoff_tools.file_ops import natural_sort_key
+ from google.cloud import storage
+
+ logger = logging.getLogger(__name__)
+
+
+ def upload_folder_to_gcs(
+     bucket_name: str,
+     source_folder: str,
+     destination_path: str,
+ ) -> None:
+     """
+     Uploads a local folder to Google Cloud Storage.
+
+     :param bucket_name: The name of the GCS bucket to upload to
+     :param source_folder: The path to the local folder to upload
+     :param destination_path: The destination path in the GCS bucket (without leading '/')
+     """
+     client = storage.Client()
+     bucket = client.bucket(bucket_name)
+
+     if destination_path.startswith("gs://"):
+         destination_path = destination_path[5:]
+
+     if destination_path.startswith(bucket_name):
+         destination_path = destination_path[len(bucket_name) + 1 :]
+
+     # Iterate through local files and upload them to GCS
+     for root, _, files in os.walk(source_folder):
+         for file_name in sorted(files, key=natural_sort_key):
+             local_file_path = os.path.join(root, file_name)
+
+             # Create a blob object in the destination path
+             relative_path = os.path.relpath(local_file_path, source_folder)
+             blob_path = os.path.join(destination_path, relative_path)
+             blob = bucket.blob(blob_path)
+
+             # Upload the local file to the blob
+             blob.upload_from_filename(local_file_path)
+             print(f"{local_file_path} uploaded to gs://{bucket_name}/")
+
+
+ def get_vm_name() -> str:
+     """Query the Google Compute Engine metadata server to get the name of the current instance.
+     Only works on GCE VMs, of course.
+     """
+     url = "http://metadata.google.internal/computeMetadata/v1/instance/name"
+     headers = {"Metadata-Flavor": "Google"}
+     try:
+         response = requests.get(url, headers=headers)
+         if response.status_code == 200:
+             return response.text
+     except Exception as e:
+         logger.error("Error retrieving VM name: %s", e)
+
+     return "Not a Google Compute Engine VM"
+
+
+ def get_vm_type() -> str:
+     """Query the Google Compute Engine metadata server to get the type
+     (e.g., n1-highmem-8) of the current instance. Only works on GCE VMs.
+     """
+     metadata_url = (
+         "http://metadata.google.internal/computeMetadata/v1/instance/machine-type"
+     )
+     headers = {"Metadata-Flavor": "Google"}
+
+     try:
+         response = requests.get(metadata_url, headers=headers)
+         if response.status_code == 200:
+             # The response includes the full path. Extract just the machine type.
+             machine_type_path = response.text
+             # Example response: projects/123456789/machineTypes/n1-standard-1
+             # Extract machine type from the last segment of the path
+             machine_type = machine_type_path.split("/")[-1]
+             return machine_type
+     except Exception as e:
+         logger.error("Error retrieving machine type: %s", e)
+
+     return "Not a Google Compute Engine VM"