opsci-toolbox 0.0.2__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +82 -0
- opsci_toolbox/helpers/common.py +566 -191
- opsci_toolbox/helpers/cv.py +298 -123
- opsci_toolbox/helpers/dataviz.py +1005 -216
- opsci_toolbox/helpers/dates.py +55 -8
- opsci_toolbox/helpers/nlp.py +768 -110
- opsci_toolbox/helpers/nlp_cuml.py +280 -0
- opsci_toolbox/helpers/sna.py +101 -10
- opsci_toolbox/helpers/surreaction.py +156 -0
- {opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.6.dist-info}/METADATA +9 -11
- opsci_toolbox-0.0.6.dist-info/RECORD +21 -0
- opsci_toolbox-0.0.2.dist-info/RECORD +0 -19
- {opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.6.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.6.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/common.py
CHANGED
@@ -15,12 +15,27 @@ import pyarrow.parquet as pq
|
|
15
15
|
from datetime import datetime
|
16
16
|
import hashlib
|
17
17
|
import ast
|
18
|
+
import subprocess
|
18
19
|
|
19
20
|
####################################################################################################
|
20
21
|
# FILE LOADERS
|
21
22
|
####################################################################################################
|
22
23
|
|
23
|
-
def load_file(path, delimiter = ";", decimal ="."):
|
24
|
+
def load_file(path: str, delimiter: str = ";", decimal: str = ".") -> pd.DataFrame:
|
25
|
+
"""
|
26
|
+
Load a file into a Pandas DataFrame based on the file extension.
|
27
|
+
|
28
|
+
Parameters:
|
29
|
+
path (str): The file path to load.
|
30
|
+
delimiter (str, optional): The delimiter used in CSV/TSV files. Default is ";".
|
31
|
+
decimal (str, optional): The character used for decimal points in CSV/TSV files. Default is ".".
|
32
|
+
|
33
|
+
Returns:
|
34
|
+
pd.DataFrame: The loaded data as a Pandas DataFrame.
|
35
|
+
|
36
|
+
Raises:
|
37
|
+
ValueError: If the file extension is not supported.
|
38
|
+
"""
|
24
39
|
extension = os.path.splitext(os.path.basename(path))[1]
|
25
40
|
if extension == ".parquet":
|
26
41
|
df = load_parquet(path)
|
@@ -38,9 +53,18 @@ def load_file(path, delimiter = ";", decimal ="."):
|
|
38
53
|
print("Check your input file. Extension isn't supported : .parquet, .pickle, .json, .jsonl, .csv, .tsv")
|
39
54
|
return df
|
40
55
|
|
41
|
-
def load_parquet(path):
|
56
|
+
def load_parquet(path: str) -> pd.DataFrame:
|
42
57
|
"""
|
43
|
-
Load a parquet file into a DataFrame
|
58
|
+
Load a parquet file into a DataFrame.
|
59
|
+
|
60
|
+
Parameters:
|
61
|
+
path (str): The file path to the parquet file.
|
62
|
+
|
63
|
+
Returns:
|
64
|
+
pd.DataFrame: The loaded data as a Pandas DataFrame.
|
65
|
+
|
66
|
+
Raises:
|
67
|
+
Exception: If there is an error reading the parquet file.
|
44
68
|
"""
|
45
69
|
try:
|
46
70
|
table = pq.read_table(path)
|
@@ -50,66 +74,108 @@ def load_parquet(path):
|
|
50
74
|
print(e)
|
51
75
|
return df
|
52
76
|
|
53
|
-
def load_pickle(path: str):
|
54
|
-
"""
|
55
|
-
Load a pickle file into a dataframe
|
77
|
+
def load_pickle(path: str) -> pd.DataFrame:
|
56
78
|
"""
|
79
|
+
Load a pickle file into a DataFrame.
|
57
80
|
|
58
|
-
|
59
|
-
|
60
|
-
|
81
|
+
Parameters:
|
82
|
+
path (str): The file path to the pickle file.
|
83
|
+
|
84
|
+
Returns:
|
85
|
+
pd.DataFrame: The loaded data as a Pandas DataFrame.
|
86
|
+
"""
|
87
|
+
return pd.read_pickle(path)
|
88
|
+
|
61
89
|
|
62
|
-
def load_json(path: str):
|
90
|
+
def load_json(path: str) -> pd.DataFrame:
|
63
91
|
"""
|
64
|
-
Load a
|
92
|
+
Load a JSON file into a DataFrame.
|
93
|
+
|
94
|
+
Parameters:
|
95
|
+
path (str): The file path to the JSON file.
|
96
|
+
|
97
|
+
Returns:
|
98
|
+
pd.DataFrame: The loaded data as a Pandas DataFrame.
|
99
|
+
|
100
|
+
Raises:
|
101
|
+
Exception: If there is an error reading the JSON file.
|
65
102
|
"""
|
66
|
-
df=pd.DataFrame()
|
103
|
+
df = pd.DataFrame()
|
67
104
|
try:
|
68
105
|
with open(path, 'r') as json_file:
|
69
106
|
data = json.load(json_file)
|
70
|
-
df=pd.json_normalize(data)
|
71
|
-
|
107
|
+
df = pd.json_normalize(data)
|
72
108
|
except Exception as e:
|
73
|
-
|
74
|
-
|
109
|
+
print(f"Error reading the JSON file: {e}")
|
110
|
+
raise
|
75
111
|
return df
|
76
112
|
|
77
|
-
def load_jsonl(path: str):
|
113
|
+
def load_jsonl(path: str) -> pd.DataFrame:
|
78
114
|
"""
|
79
|
-
Load a jsonl file into a
|
115
|
+
Load a JSON Lines (jsonl) file into a DataFrame.
|
116
|
+
|
117
|
+
Parameters:
|
118
|
+
path (str): The file path to the jsonl file.
|
119
|
+
|
120
|
+
Returns:
|
121
|
+
pd.DataFrame: The loaded data as a Pandas DataFrame.
|
122
|
+
|
123
|
+
Raises:
|
124
|
+
Exception: If there is an error reading the jsonl file.
|
80
125
|
"""
|
81
126
|
df = pd.DataFrame()
|
82
127
|
try:
|
83
128
|
data = []
|
84
129
|
with open(path, 'r') as json_file:
|
85
|
-
for line in tqdm(json_file):
|
130
|
+
for line in tqdm(json_file, desc="Loading JSON Lines"):
|
86
131
|
try:
|
87
132
|
data.append(json.loads(line))
|
88
|
-
except:
|
89
|
-
|
133
|
+
except json.JSONDecodeError as line_error:
|
134
|
+
print(f"Error decoding line: {line_error}")
|
90
135
|
|
91
136
|
df = pd.json_normalize(data)
|
92
137
|
except Exception as e:
|
93
|
-
|
94
|
-
|
138
|
+
print(f"Error reading the jsonl file: {e}")
|
139
|
+
raise
|
95
140
|
return df
|
96
141
|
|
97
142
|
|
98
|
-
def load_csv(path: str, delimiter: str =";", decimal:str ="."):
|
143
|
+
def load_csv(path: str, delimiter: str = ";", decimal: str = ".") -> pd.DataFrame:
|
99
144
|
"""
|
100
|
-
Load a
|
145
|
+
Load a CSV file into a DataFrame.
|
146
|
+
|
147
|
+
Parameters:
|
148
|
+
path (str): The file path to the CSV file.
|
149
|
+
delimiter (str, optional): The delimiter used in the CSV file. Default is ";".
|
150
|
+
decimal (str, optional): The character used for decimal points in the CSV file. Default is ".".
|
151
|
+
|
152
|
+
Returns:
|
153
|
+
pd.DataFrame: The loaded data as a Pandas DataFrame.
|
154
|
+
|
155
|
+
Raises:
|
156
|
+
Exception: If there is an error reading the CSV file.
|
101
157
|
"""
|
102
|
-
df= pd.DataFrame()
|
158
|
+
df = pd.DataFrame()
|
103
159
|
try:
|
104
160
|
df = pd.read_csv(path, delimiter=delimiter, encoding="utf-8", decimal=decimal)
|
105
161
|
except Exception as e:
|
106
|
-
|
107
|
-
|
162
|
+
print(f"Error reading the CSV file: {e}")
|
163
|
+
raise
|
108
164
|
return df
|
109
165
|
|
110
|
-
def read_txt_to_list(file_path: str):
|
166
|
+
def read_txt_to_list(file_path: str) -> list[str]:
|
111
167
|
"""
|
112
|
-
Read a text file line by line and append to a Python list
|
168
|
+
Read a text file line by line and append to a Python list.
|
169
|
+
|
170
|
+
Parameters:
|
171
|
+
file_path (str): The file path to the text file.
|
172
|
+
|
173
|
+
Returns:
|
174
|
+
list[str]: A list of lines read from the text file.
|
175
|
+
|
176
|
+
Raises:
|
177
|
+
FileNotFoundError: If the file does not exist.
|
178
|
+
Exception: If any other error occurs during file reading.
|
113
179
|
"""
|
114
180
|
|
115
181
|
# Initialize an empty list to store the lines
|
@@ -124,12 +190,22 @@ def read_txt_to_list(file_path: str):
|
|
124
190
|
print(f"File not found: {file_path}")
|
125
191
|
except Exception as e:
|
126
192
|
print(f"An error occurred: {e}")
|
193
|
+
raise
|
127
194
|
return lines
|
128
195
|
|
129
|
-
|
130
|
-
def read_json(path: str):
|
196
|
+
def read_json(path: str) -> dict:
|
131
197
|
"""
|
132
|
-
Read a
|
198
|
+
Read a JSON file and return a dictionary.
|
199
|
+
|
200
|
+
Parameters:
|
201
|
+
path (str): The file path to the JSON file.
|
202
|
+
|
203
|
+
Returns:
|
204
|
+
dict: The data read from the JSON file as a dictionary.
|
205
|
+
|
206
|
+
Raises:
|
207
|
+
FileNotFoundError: If the file does not exist.
|
208
|
+
Exception: If there is an error reading the JSON file.
|
133
209
|
"""
|
134
210
|
with open(path, 'r') as json_file:
|
135
211
|
data = json.load(json_file)
|
@@ -137,25 +213,55 @@ def read_json(path: str):
|
|
137
213
|
|
138
214
|
def read_txt_file(file_path: str) -> str:
|
139
215
|
"""
|
140
|
-
Read a text file
|
216
|
+
Read the content of a text file and return it as a string.
|
217
|
+
|
218
|
+
Parameters:
|
219
|
+
file_path (str): The file path to the text file.
|
220
|
+
|
221
|
+
Returns:
|
222
|
+
str: The content of the text file as a string.
|
223
|
+
|
224
|
+
Raises:
|
225
|
+
FileNotFoundError: If the file does not exist.
|
226
|
+
Exception: If there is an error reading the text file.
|
141
227
|
"""
|
142
|
-
|
143
|
-
|
228
|
+
try:
|
229
|
+
with open(file_path, 'r') as file:
|
230
|
+
content = file.read()
|
231
|
+
except FileNotFoundError:
|
232
|
+
print(f"File not found: {file_path}")
|
233
|
+
raise
|
234
|
+
except Exception as e:
|
235
|
+
print(f"An error occurred while reading the file: {e}")
|
236
|
+
raise
|
144
237
|
return content
|
145
238
|
|
146
|
-
def read_jsonl(path: str):
|
239
|
+
def read_jsonl(path: str) -> list[dict]:
|
147
240
|
"""
|
148
|
-
Load a jsonl file into a
|
241
|
+
Load a JSON Lines (jsonl) file into a list of dictionaries.
|
242
|
+
|
243
|
+
Parameters:
|
244
|
+
path (str): The file path to the jsonl file.
|
245
|
+
|
246
|
+
Returns:
|
247
|
+
list[dict]: A list of dictionaries containing the data read from the JSON Lines file.
|
248
|
+
|
249
|
+
Raises:
|
250
|
+
FileNotFoundError: If the file does not exist.
|
251
|
+
Exception: If there is an error reading the jsonl file.
|
149
252
|
"""
|
150
253
|
json_data = []
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
254
|
+
try:
|
255
|
+
with open(path, 'r') as json_file:
|
256
|
+
for line in tqdm(json_file, desc="Reading JSON Lines"):
|
257
|
+
try:
|
258
|
+
json_data.append(json.loads(line))
|
259
|
+
except Exception as e:
|
260
|
+
print(f"Error decoding line: {e}")
|
261
|
+
raise
|
262
|
+
except FileNotFoundError:
|
263
|
+
print(f"File not found: {path}")
|
264
|
+
raise
|
159
265
|
return json_data
|
160
266
|
|
161
267
|
|
@@ -164,37 +270,55 @@ def read_jsonl(path: str):
|
|
164
270
|
#########################################################################################
|
165
271
|
|
166
272
|
|
167
|
-
def write_pickle(
|
273
|
+
def write_pickle(data: pd.DataFrame, path: str, filename: str) -> str:
|
168
274
|
"""
|
169
|
-
Write a
|
170
|
-
|
171
|
-
|
275
|
+
Write a DataFrame into a pickle file.
|
276
|
+
|
277
|
+
Parameters:
|
278
|
+
data (pd.DataFrame): The DataFrame to be written to the pickle file.
|
279
|
+
path (str): The directory where the pickle file will be saved.
|
280
|
+
filename (str): The name of the pickle file (without the extension).
|
172
281
|
|
282
|
+
Returns:
|
283
|
+
str: The full path to the saved pickle file.
|
284
|
+
"""
|
285
|
+
file_path = os.path.join(path, filename + '.pickle')
|
173
286
|
with open(file_path, 'wb') as f:
|
174
|
-
pickle.dump(
|
287
|
+
pickle.dump(data, f)
|
175
288
|
return file_path
|
176
289
|
|
177
290
|
|
178
|
-
def write_list_to_txt(input_list: list, path: str, name: str):
|
291
|
+
def write_list_to_txt(input_list: list, path: str, name: str) -> str:
|
179
292
|
"""
|
180
293
|
Write a list to a text file, with each item on a new line.
|
181
294
|
|
182
295
|
Parameters:
|
183
|
-
- file_path (str): The path to the text file.
|
184
296
|
- input_list (list): The list to be written to the text file.
|
297
|
+
- path (str): The directory path where the text file will be saved.
|
298
|
+
- name (str): The name of the text file (without the extension).
|
299
|
+
|
300
|
+
Returns:
|
301
|
+
str: The full path to the saved text file.
|
185
302
|
"""
|
186
|
-
file_path=os.path.join(path, name+'.txt')
|
303
|
+
file_path = os.path.join(path, name + '.txt')
|
187
304
|
with open(file_path, 'w') as file:
|
188
305
|
for item in input_list:
|
189
306
|
file.write(str(item) + '\n')
|
190
|
-
|
191
307
|
return file_path
|
192
308
|
|
193
|
-
def write_jsonl(data: list,
|
309
|
+
def write_jsonl(data: list[dict], path: str, name: str) -> str:
|
194
310
|
"""
|
195
|
-
Write a jsonl file.
|
311
|
+
Write data to a JSON Lines (jsonl) file. Each dictionary in the list represents a single JSON object.
|
312
|
+
|
313
|
+
Parameters:
|
314
|
+
- data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
|
315
|
+
- path (str): The directory path where the JSON Lines file will be saved.
|
316
|
+
- name (str): The name of the JSON Lines file (without the extension).
|
317
|
+
|
318
|
+
Returns:
|
319
|
+
str: The full path to the saved JSON Lines file.
|
196
320
|
"""
|
197
|
-
file_path=os.path.join(path, name+'.jsonl')
|
321
|
+
file_path = os.path.join(path, name + '.jsonl')
|
198
322
|
with open(file_path, 'w') as file:
|
199
323
|
for entry in data:
|
200
324
|
json.dump(entry, file)
|
@@ -202,41 +326,67 @@ def write_jsonl(data: list, path: str, name: str):
|
|
202
326
|
return file_path
|
203
327
|
|
204
328
|
|
205
|
-
def write_json(json_dict: dict, path: str, name: str):
|
329
|
+
def write_json(json_dict: dict, path: str, name: str) -> str:
|
206
330
|
"""
|
207
|
-
Write a
|
331
|
+
Write a dictionary to a JSON file.
|
332
|
+
|
333
|
+
Parameters:
|
334
|
+
- json_dict (dict): The dictionary to be written to the JSON file.
|
335
|
+
- path (str): The directory path where the JSON file will be saved.
|
336
|
+
- name (str): The name of the JSON file (without the extension).
|
337
|
+
|
338
|
+
Returns:
|
339
|
+
str: The full path to the saved JSON file.
|
208
340
|
"""
|
209
|
-
file_path=os.path.join(path, name+'.json')
|
341
|
+
file_path = os.path.join(path, name + '.json')
|
210
342
|
with open(file_path, 'w') as outfile:
|
211
343
|
json.dump(json_dict, outfile)
|
212
344
|
return file_path
|
213
345
|
|
214
346
|
|
215
|
-
def write_dataframe_to_json(df: pd.DataFrame, path: str, name: str, orient='records'):
|
347
|
+
def write_dataframe_to_json(df: pd.DataFrame, path: str, name: str, orient: str = 'records') -> str:
|
216
348
|
"""
|
217
|
-
Write a
|
349
|
+
Write a DataFrame to a JSON file.
|
350
|
+
|
351
|
+
Parameters:
|
352
|
+
- df (pd.DataFrame): The DataFrame to be written to the JSON file.
|
353
|
+
- path (str): The directory path where the JSON file will be saved.
|
354
|
+
- name (str): The name of the JSON file (without the extension).
|
355
|
+
- orient (str, optional): The format of the JSON file. Default is 'records'.
|
356
|
+
|
357
|
+
Returns:
|
358
|
+
str: The full path to the saved JSON file.
|
218
359
|
"""
|
219
|
-
file_path=os.path.join(path, name+".json")
|
360
|
+
file_path = os.path.join(path, name + ".json")
|
220
361
|
df.to_json(file_path, orient=orient, lines=True)
|
362
|
+
return file_path
|
221
363
|
|
222
364
|
|
223
|
-
def save_dataframe_excel(df: pd.DataFrame, path: str, name
|
365
|
+
def save_dataframe_excel(df: pd.DataFrame, path: str, name: str, sheet_name: str) -> str:
|
224
366
|
"""
|
225
|
-
Write a
|
367
|
+
Write a DataFrame to an Excel file.
|
368
|
+
|
369
|
+
Parameters:
|
370
|
+
- df (pd.DataFrame): The DataFrame to be written to the Excel file.
|
371
|
+
- path (str): The directory path where the Excel file will be saved.
|
372
|
+
- name (str): The name of the Excel file (without the extension).
|
373
|
+
- sheet_name (str): The name of the Excel sheet.
|
374
|
+
|
375
|
+
Returns:
|
376
|
+
str: The full path to the saved Excel file.
|
226
377
|
"""
|
227
|
-
|
228
|
-
file_path=os.path.join(path, f"{name}.xlsx")
|
378
|
+
file_path = os.path.join(path, f"{name}.xlsx")
|
229
379
|
df.to_excel(file_path, sheet_name=sheet_name, index=False)
|
230
380
|
print(file_path, "- File created")
|
231
381
|
return file_path
|
232
382
|
|
233
|
-
def add_dataframe_to_excel(df: pd.DataFrame, existing_file_path: str, new_sheet_name: str):
|
383
|
+
def add_dataframe_to_excel(df: pd.DataFrame, existing_file_path: str, new_sheet_name: str) -> None:
|
234
384
|
"""
|
235
385
|
Adds a DataFrame to an existing Excel file as a new sheet.
|
236
386
|
|
237
387
|
Parameters:
|
388
|
+
- df (pd.DataFrame): The DataFrame to be added.
|
238
389
|
- existing_file_path (str): Path to the existing Excel file.
|
239
|
-
- dataframe (pd.DataFrame): The DataFrame to be added.
|
240
390
|
- new_sheet_name (str): Name of the new sheet in the Excel file.
|
241
391
|
|
242
392
|
Returns:
|
@@ -245,7 +395,7 @@ def add_dataframe_to_excel(df: pd.DataFrame, existing_file_path: str, new_sheet_
|
|
245
395
|
# Read existing Excel file into a dictionary of DataFrames
|
246
396
|
excel_file = pd.read_excel(existing_file_path, sheet_name=None)
|
247
397
|
|
248
|
-
# Add the new DataFrame to the dictionary with the specified sheet
|
398
|
+
# Add the new DataFrame to the dictionary with the specified sheet name
|
249
399
|
excel_file[new_sheet_name] = df
|
250
400
|
|
251
401
|
# Write the updated dictionary of DataFrames back to the Excel file
|
@@ -253,46 +403,62 @@ def add_dataframe_to_excel(df: pd.DataFrame, existing_file_path: str, new_sheet_
|
|
253
403
|
for sheet_name, df in excel_file.items():
|
254
404
|
df.to_excel(writer, sheet_name=sheet_name, index=False)
|
255
405
|
|
256
|
-
def save_dataframe_csv(df: pd.DataFrame, path: str, name: str):
|
406
|
+
def save_dataframe_csv(df: pd.DataFrame, path: str, name: str) -> str:
|
257
407
|
"""
|
258
|
-
|
259
|
-
|
260
|
-
:
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
:
|
267
|
-
:type name: str
|
408
|
+
Save a DataFrame to a CSV file within a specified directory.
|
409
|
+
|
410
|
+
Parameters:
|
411
|
+
- df (pd.DataFrame): The DataFrame to be saved.
|
412
|
+
- path (str): The directory where the CSV file will be saved.
|
413
|
+
- name (str): The desired name for the CSV file (without extension).
|
414
|
+
|
415
|
+
Returns:
|
416
|
+
str: The full path to the saved CSV file.
|
268
417
|
"""
|
269
|
-
|
418
|
+
file_path = os.path.join(path, f"{name}.csv")
|
270
419
|
df.to_csv(
|
271
|
-
|
272
|
-
header=names,
|
420
|
+
file_path,
|
273
421
|
sep=";",
|
274
422
|
encoding="utf-8",
|
275
423
|
index=False,
|
276
424
|
decimal=",",
|
277
425
|
)
|
278
|
-
print("
|
426
|
+
print("File saved:", file_path)
|
427
|
+
return file_path
|
279
428
|
|
280
|
-
def write_txt_file(data: str,
|
429
|
+
def write_txt_file(data: str, path: str, name: str) -> str:
|
281
430
|
"""
|
282
|
-
Write a text file
|
431
|
+
Write a string to a text file.
|
432
|
+
|
433
|
+
Parameters:
|
434
|
+
- data (str): The string to be written to the text file.
|
435
|
+
- path (str): The directory path where the text file will be saved.
|
436
|
+
- name (str): The name of the text file (without the extension).
|
437
|
+
|
438
|
+
Returns:
|
439
|
+
str: The full path to the saved text file.
|
283
440
|
"""
|
284
|
-
file_path=os.path.join(path, name+'.txt')
|
441
|
+
file_path = os.path.join(path, name + '.txt')
|
285
442
|
with open(file_path, "w") as file:
|
286
443
|
file.write(data)
|
287
444
|
return file_path
|
288
445
|
|
289
|
-
def split_df_into_chunks(df, path, name, chunk_size = 10000):
|
446
|
+
def split_df_into_chunks(df: pd.DataFrame, path: str, name: str, chunk_size: int = 10000) -> list[str]:
|
290
447
|
"""
|
291
|
-
Split a
|
448
|
+
Split a DataFrame into multiple pickle files with a specified chunk size.
|
449
|
+
|
450
|
+
Parameters:
|
451
|
+
- df (pd.DataFrame): The DataFrame to be split.
|
452
|
+
- path (str): The directory path where the pickle files will be saved.
|
453
|
+
- name (str): The base name for the pickle files.
|
454
|
+
- chunk_size (int, optional): The size of each chunk. Default is 10000.
|
455
|
+
|
456
|
+
Returns:
|
457
|
+
list[str]: A list of file paths to the saved pickle files.
|
292
458
|
"""
|
293
459
|
num_chunks = -(-len(df) // chunk_size) # Calculate the number of chunks using ceil division
|
294
460
|
|
295
|
-
file_paths=[]
|
461
|
+
file_paths = []
|
296
462
|
|
297
463
|
# create smaller datasets of chunk_size each
|
298
464
|
for i in range(num_chunks):
|
@@ -305,16 +471,19 @@ def split_df_into_chunks(df, path, name, chunk_size = 10000):
|
|
305
471
|
|
306
472
|
return file_paths
|
307
473
|
|
308
|
-
|
309
|
-
|
310
474
|
###################################################################################################
|
311
475
|
# FOLDERS / FILES HELPERS
|
312
476
|
###################################################################################################
|
313
477
|
|
314
|
-
def create_dir(path:str):
|
478
|
+
def create_dir(path: str) -> str:
|
315
479
|
"""
|
316
|
-
Create a local directory
|
480
|
+
Create a local directory if it doesn't exist.
|
481
|
+
|
482
|
+
Parameters:
|
483
|
+
- path (str): The directory path to be created.
|
317
484
|
|
485
|
+
Returns:
|
486
|
+
str: The path of the created directory.
|
318
487
|
"""
|
319
488
|
if not os.path.exists(path):
|
320
489
|
os.makedirs(path)
|
@@ -322,18 +491,31 @@ def create_dir(path:str):
|
|
322
491
|
return path
|
323
492
|
|
324
493
|
|
325
|
-
def list_files_in_dir(path: str, filetype:str ='*.json'):
|
494
|
+
def list_files_in_dir(path: str, filetype: str = '*.json') -> list[str]:
|
326
495
|
"""
|
327
|
-
List files of a specific format in a directory
|
496
|
+
List files of a specific format in a directory.
|
497
|
+
|
498
|
+
Parameters:
|
499
|
+
- path (str): The directory path to search for files.
|
500
|
+
- filetype (str, optional): The file type pattern to search for. Default is '*.json'.
|
501
|
+
|
502
|
+
Returns:
|
503
|
+
list[str]: A list of file paths matching the specified file type pattern.
|
328
504
|
"""
|
329
505
|
pattern = os.path.join(path, filetype)
|
330
506
|
files = glob.glob(pattern)
|
331
507
|
return files
|
332
508
|
|
333
509
|
|
334
|
-
def list_subdirectories(root_directory: str):
|
510
|
+
def list_subdirectories(root_directory: str) -> list[str]:
|
335
511
|
"""
|
336
|
-
List subdirectories in a root directory
|
512
|
+
List subdirectories in a root directory.
|
513
|
+
|
514
|
+
Parameters:
|
515
|
+
- root_directory (str): The root directory path.
|
516
|
+
|
517
|
+
Returns:
|
518
|
+
list[str]: A list of subdirectory names.
|
337
519
|
"""
|
338
520
|
subdirectories = []
|
339
521
|
for entry in os.scandir(root_directory):
|
@@ -342,9 +524,15 @@ def list_subdirectories(root_directory: str):
|
|
342
524
|
return subdirectories
|
343
525
|
|
344
526
|
|
345
|
-
def list_recursive_subdirectories(root_directory: str):
|
527
|
+
def list_recursive_subdirectories(root_directory: str) -> list[str]:
|
346
528
|
"""
|
347
|
-
List recursively all subdirectories from a root directory
|
529
|
+
List recursively all subdirectories from a root directory.
|
530
|
+
|
531
|
+
Parameters:
|
532
|
+
- root_directory (str): The root directory path.
|
533
|
+
|
534
|
+
Returns:
|
535
|
+
list[str]: A list of subdirectory paths.
|
348
536
|
"""
|
349
537
|
subdirectories = []
|
350
538
|
for root, dirs, files in os.walk(root_directory):
|
@@ -352,9 +540,16 @@ def list_recursive_subdirectories(root_directory: str):
|
|
352
540
|
return subdirectories
|
353
541
|
|
354
542
|
|
355
|
-
def list_files_in_subdirectories(path:str, filetype:str='*.json'):
|
543
|
+
def list_files_in_subdirectories(path: str, filetype: str = '*.json') -> list[str]:
|
356
544
|
"""
|
357
|
-
Walk through subdirectories of a root directory to list files of a specific format
|
545
|
+
Walk through subdirectories of a root directory to list files of a specific format.
|
546
|
+
|
547
|
+
Parameters:
|
548
|
+
- path (str): The root directory path.
|
549
|
+
- filetype (str, optional): The file type pattern to search for. Default is '*.json'.
|
550
|
+
|
551
|
+
Returns:
|
552
|
+
list[str]: A list of file paths matching the specified file type pattern in subdirectories.
|
358
553
|
"""
|
359
554
|
files = []
|
360
555
|
|
@@ -369,21 +564,36 @@ def list_files_in_subdirectories(path:str, filetype:str='*.json'):
|
|
369
564
|
|
370
565
|
return files
|
371
566
|
|
372
|
-
def copy_file(source_path: str, destination_path: str, new_filename:str):
|
567
|
+
def copy_file(source_path: str, destination_path: str, new_filename: str = '') -> str:
|
373
568
|
"""
|
374
|
-
|
569
|
+
Copy a file from a source path to a destination path.
|
570
|
+
|
571
|
+
Parameters:
|
572
|
+
- source_path (str): The path of the source file.
|
573
|
+
- destination_path (str): The path of the destination directory.
|
574
|
+
- new_filename (str, optional): The new filename. If not provided, the original filename is used.
|
575
|
+
|
576
|
+
Returns:
|
577
|
+
str: The path of the copied file.
|
375
578
|
"""
|
376
579
|
if new_filename:
|
377
|
-
file_path=os.path.join(destination_path, new_filename)
|
580
|
+
file_path = os.path.join(destination_path, new_filename)
|
378
581
|
else:
|
379
|
-
filename=os.path.basename(source_path)
|
380
|
-
file_path=os.path.join(destination_path,filename)
|
582
|
+
filename = os.path.basename(source_path)
|
583
|
+
file_path = os.path.join(destination_path, filename)
|
584
|
+
|
381
585
|
shutil.copy(source_path, file_path)
|
382
586
|
return file_path
|
383
587
|
|
384
|
-
def remove_file(file_path):
|
588
|
+
def remove_file(file_path: str) -> None:
|
385
589
|
"""
|
386
|
-
Remove a single file
|
590
|
+
Remove a single file.
|
591
|
+
|
592
|
+
Parameters:
|
593
|
+
- file_path (str): The path of the file to be removed.
|
594
|
+
|
595
|
+
Returns:
|
596
|
+
None
|
387
597
|
"""
|
388
598
|
try:
|
389
599
|
os.remove(file_path)
|
@@ -391,20 +601,33 @@ def remove_file(file_path):
|
|
391
601
|
except OSError as e:
|
392
602
|
print(f"Error removing file {file_path}: {e}")
|
393
603
|
|
394
|
-
def remove_folder(folder_path):
|
604
|
+
def remove_folder(folder_path: str) -> None:
|
395
605
|
"""
|
396
|
-
Remove a folder and all
|
606
|
+
Remove a folder and all its contents.
|
607
|
+
|
608
|
+
Parameters:
|
609
|
+
- folder_path (str): The path of the folder to be removed.
|
610
|
+
|
611
|
+
Returns:
|
612
|
+
None
|
397
613
|
"""
|
398
614
|
try:
|
399
615
|
shutil.rmtree(folder_path)
|
400
616
|
print(f"Folder {folder_path} and its contents removed successfully.")
|
401
617
|
except OSError as e:
|
402
|
-
print(f"Error removing folder {folder_path}: {e}")
|
618
|
+
print(f"Error removing folder {folder_path}: {e}")
|
403
619
|
|
404
620
|
|
405
|
-
def get_file_size(file_path):
|
621
|
+
def get_file_size(file_path: str) -> tuple[int, str]:
|
406
622
|
"""
|
407
|
-
Get a single file
|
623
|
+
Get the size of a single file in a readable format (KB, MB, GB).
|
624
|
+
|
625
|
+
Parameters:
|
626
|
+
- file_path (str): The path of the file.
|
627
|
+
|
628
|
+
Returns:
|
629
|
+
tuple[int, str]: A tuple containing the size of the file in bytes and its formatted size.
|
630
|
+
If the file is not found, returns None.
|
408
631
|
"""
|
409
632
|
try:
|
410
633
|
size = os.path.getsize(file_path)
|
@@ -427,9 +650,16 @@ def get_file_size(file_path):
|
|
427
650
|
print(f"File not found: {file_path}")
|
428
651
|
return None
|
429
652
|
|
430
|
-
def get_folder_size(folder_path):
|
653
|
+
def get_folder_size(folder_path: str) -> tuple[int, str]:
|
431
654
|
"""
|
432
|
-
Get size of all files contained in a folder in a readable format (KB, MB, GB)
|
655
|
+
Get the size of all files contained in a folder in a readable format (KB, MB, GB).
|
656
|
+
|
657
|
+
Parameters:
|
658
|
+
- folder_path (str): The path of the folder.
|
659
|
+
|
660
|
+
Returns:
|
661
|
+
tuple[int, str]: A tuple containing the total size of all files in bytes and its formatted size.
|
662
|
+
If the folder is not found, returns None.
|
433
663
|
"""
|
434
664
|
total_size = 0
|
435
665
|
|
@@ -457,9 +687,16 @@ def get_folder_size(folder_path):
|
|
457
687
|
print(f"Folder not found: {folder_path}")
|
458
688
|
return None
|
459
689
|
|
460
|
-
def file_creation_date(file_path):
|
690
|
+
def file_creation_date(file_path: str) -> datetime:
|
461
691
|
"""
|
462
|
-
Return the last update timestamp
|
692
|
+
Return the last update timestamp of a file.
|
693
|
+
|
694
|
+
Parameters:
|
695
|
+
- file_path (str): The path of the file.
|
696
|
+
|
697
|
+
Returns:
|
698
|
+
datetime: The last update timestamp as a datetime object.
|
699
|
+
If the file does not exist, returns None.
|
463
700
|
"""
|
464
701
|
# Check if the file exists
|
465
702
|
if os.path.exists(file_path):
|
@@ -476,27 +713,34 @@ def file_creation_date(file_path):
|
|
476
713
|
############################################################################
|
477
714
|
|
478
715
|
|
479
|
-
def transform_to_n_items_list(
|
716
|
+
def transform_to_n_items_list(lst: list, n: int) -> list[list]:
|
480
717
|
"""
|
481
718
|
Transform a list into a list of n-items sublists.
|
482
719
|
|
483
720
|
Parameters:
|
484
|
-
-
|
485
|
-
- n: The number of items in each sublist.
|
721
|
+
- lst (list): The input list to be transformed.
|
722
|
+
- n (int): The number of items in each sublist.
|
486
723
|
|
487
724
|
Returns:
|
488
|
-
A list of n-items sublists.
|
725
|
+
list[list]: A list of n-items sublists.
|
489
726
|
"""
|
490
|
-
return [
|
727
|
+
return [lst[i:i + n] for i in range(0, len(lst), n)]
|
491
728
|
|
492
|
-
|
729
|
+
|
730
|
+
def unduplicate_list(lst: list) -> list:
|
493
731
|
"""
|
494
|
-
|
732
|
+
Remove duplicate elements from a list.
|
733
|
+
|
734
|
+
Parameters:
|
735
|
+
- lst (list): The input list with possible duplicate elements.
|
736
|
+
|
737
|
+
Returns:
|
738
|
+
list: A list with duplicate elements removed.
|
495
739
|
"""
|
496
740
|
return list(set(lst))
|
497
741
|
|
498
742
|
|
499
|
-
def sort_list(lst, reverse=False):
|
743
|
+
def sort_list(lst: list, reverse: bool = False) -> list:
|
500
744
|
"""
|
501
745
|
Sort the list in ascending or descending order.
|
502
746
|
|
@@ -506,12 +750,12 @@ def sort_list(lst, reverse=False):
|
|
506
750
|
If False (default), sort the list in ascending order.
|
507
751
|
|
508
752
|
Returns:
|
509
|
-
|
753
|
+
list: A new list sorted based on the specified order.
|
510
754
|
"""
|
511
755
|
return sorted(lst, reverse=reverse)
|
512
756
|
|
513
757
|
|
514
|
-
def map_list(lst, function):
|
758
|
+
def map_list(lst: list, function: callable) -> list:
|
515
759
|
"""
|
516
760
|
Apply a function to each element of the list.
|
517
761
|
|
@@ -520,12 +764,12 @@ def map_list(lst, function):
|
|
520
764
|
- function (callable): The function to apply to each element.
|
521
765
|
|
522
766
|
Returns:
|
523
|
-
|
767
|
+
list: A new list with the function applied to each element.
|
524
768
|
"""
|
525
769
|
return [function(element) for element in lst]
|
526
770
|
|
527
771
|
|
528
|
-
def flatten_list(lst):
|
772
|
+
def flatten_list(lst: list) -> list:
|
529
773
|
"""
|
530
774
|
Flatten a nested list into a single list.
|
531
775
|
|
@@ -533,7 +777,7 @@ def flatten_list(lst):
|
|
533
777
|
- lst (list): The input nested list.
|
534
778
|
|
535
779
|
Returns:
|
536
|
-
|
780
|
+
list: A new list with all nested elements flattened.
|
537
781
|
"""
|
538
782
|
flattened_list = []
|
539
783
|
|
@@ -548,7 +792,7 @@ def flatten_list(lst):
|
|
548
792
|
return flattened_list
|
549
793
|
|
550
794
|
|
551
|
-
def find_occurrences(lst, element):
|
795
|
+
def find_occurrences(lst: list, element) -> int:
|
552
796
|
"""
|
553
797
|
Find the occurrences of a specific element in the list.
|
554
798
|
|
@@ -557,12 +801,12 @@ def find_occurrences(lst, element):
|
|
557
801
|
- element: The element to find occurrences of.
|
558
802
|
|
559
803
|
Returns:
|
560
|
-
|
804
|
+
int: The number of occurrences of the specified element in the list.
|
561
805
|
"""
|
562
806
|
return lst.count(element)
|
563
807
|
|
564
808
|
|
565
|
-
def is_subset(subset, superset):
|
809
|
+
def is_subset(subset: list, superset: list) -> bool:
|
566
810
|
"""
|
567
811
|
Check if one list is a subset of another.
|
568
812
|
|
@@ -571,11 +815,11 @@ def is_subset(subset, superset):
|
|
571
815
|
- superset (list): The superset list.
|
572
816
|
|
573
817
|
Returns:
|
574
|
-
|
818
|
+
bool: True if the subset is a subset of the superset, False otherwise.
|
575
819
|
"""
|
576
820
|
return all(element in superset for element in subset)
|
577
821
|
|
578
|
-
def common_elements(list1, list2):
|
822
|
+
def common_elements(list1: list, list2: list) -> list:
|
579
823
|
"""
|
580
824
|
Find the common elements between two lists.
|
581
825
|
|
@@ -584,12 +828,12 @@ def common_elements(list1, list2):
|
|
584
828
|
- list2 (list): The second list.
|
585
829
|
|
586
830
|
Returns:
|
587
|
-
|
831
|
+
list: A new list containing the common elements between list1 and list2.
|
588
832
|
"""
|
589
833
|
return list(set(list1) & set(list2))
|
590
834
|
|
591
835
|
|
592
|
-
def shuffle_list(lst):
|
836
|
+
def shuffle_list(lst: list) -> list:
|
593
837
|
"""
|
594
838
|
Shuffle the elements of the list randomly.
|
595
839
|
|
@@ -597,14 +841,14 @@ def shuffle_list(lst):
|
|
597
841
|
- lst (list): The input list.
|
598
842
|
|
599
843
|
Returns:
|
600
|
-
|
844
|
+
list: A new list with the elements shuffled randomly.
|
601
845
|
"""
|
602
846
|
shuffled_list = lst.copy()
|
603
847
|
random.shuffle(shuffled_list)
|
604
848
|
return shuffled_list
|
605
849
|
|
606
850
|
|
607
|
-
def sample_list(lst, sample_size):
|
851
|
+
def sample_list(lst: list, sample_size) -> list:
|
608
852
|
"""
|
609
853
|
Sample a list based on an integer or a float representing the sample size.
|
610
854
|
|
@@ -614,7 +858,11 @@ def sample_list(lst, sample_size):
|
|
614
858
|
If a float, the percentage of elements to keep.
|
615
859
|
|
616
860
|
Returns:
|
617
|
-
|
861
|
+
list: A new list containing the sampled elements.
|
862
|
+
|
863
|
+
Raises:
|
864
|
+
- ValueError: If the sample size is invalid (negative integer or float outside [0, 1]).
|
865
|
+
- TypeError: If the sample size is neither an integer nor a float.
|
618
866
|
"""
|
619
867
|
if isinstance(sample_size, int):
|
620
868
|
if sample_size < 0:
|
@@ -628,7 +876,7 @@ def sample_list(lst, sample_size):
|
|
628
876
|
else:
|
629
877
|
raise TypeError("Sample size must be an integer or a float.")
|
630
878
|
|
631
|
-
def count_elements(lst):
|
879
|
+
def count_elements(lst: list) -> dict:
|
632
880
|
"""
|
633
881
|
Count the occurrences of each element in the list.
|
634
882
|
|
@@ -636,46 +884,70 @@ def count_elements(lst):
|
|
636
884
|
- lst (list): The input list.
|
637
885
|
|
638
886
|
Returns:
|
639
|
-
|
887
|
+
dict: A dictionary where keys are unique elements from the list, and values are their counts.
|
640
888
|
"""
|
641
889
|
return dict(Counter(lst))
|
642
890
|
|
643
|
-
def scale_list(lst, min_val=1, max_val=5):
|
891
|
+
def scale_list(lst: list, min_val: float = 1, max_val: float = 5) -> list:
    """
    Scale the values of a list to a specified range (min-max scaling).

    Parameters:
    - lst (list): The input list of numeric values to be scaled.
    - min_val (float): The minimum value of the output range (default is 1).
    - max_val (float): The maximum value of the output range (default is 5).

    Returns:
    - list: A new list with values scaled to the specified range. An empty
      input yields an empty list. When every input value is identical, every
      output value is ``min_val``.
    """
    # min()/max() raise ValueError on an empty sequence, so answer that case up front.
    if len(lst) == 0:
        return []

    min_w = min(lst)
    spread = max(lst) - min_w

    # A zero spread would divide by zero. Test it explicitly instead of catching
    # ZeroDivisionError per element: numpy scalars return nan rather than raising.
    if spread == 0:
        return [min_val for _ in lst]

    return [(x - min_w) / spread * (max_val - min_val) + min_val for x in lst]
|
656
913
|
|
657
|
-
|
914
|
+
|
915
|
+
def df_scale_column(df: pd.DataFrame, col_to_scale: str, col_out: str, min_val: float, max_val: float) -> pd.DataFrame:
    """
    Scale values in a DataFrame column to a specified range (min-max scaling).

    Parameters:
    - df (pd.DataFrame): The input DataFrame. It is modified in place.
    - col_to_scale (str): The name of the column to be scaled.
    - col_out (str): The name of the column that receives the scaled values.
    - min_val (float): The minimum value of the output range.
    - max_val (float): The maximum value of the output range.

    Returns:
    - pd.DataFrame: The same DataFrame object, with ``col_out`` added or
      overwritten. When all values of ``col_to_scale`` are identical, every
      non-missing row of ``col_out`` is set to ``min_val``.
    """
    values = df[col_to_scale]
    min_freq = values.min()
    value_range = values.max() - min_freq

    if value_range == 0:
        # Constant column: dividing by the zero range would give NaN/inf.
        # (values - min) is 0 for every non-missing row, so this maps them to
        # min_val while missing values stay missing.
        df[col_out] = (values - min_freq) * 0.0 + min_val
    else:
        # Vectorised form of the per-row formula, same operation order.
        df[col_out] = (values - min_freq) / value_range * (max_val - min_val) + min_val
    return df
|
662
933
|
|
663
934
|
############################################################################
|
664
935
|
# ZIP HELPERS
|
665
936
|
############################################################################
|
666
937
|
|
667
|
-
def zip_file(source_file_path, zip_file_path, name):
|
938
|
+
def zip_file(source_file_path: str, zip_file_path: str, name: str) -> str:
|
668
939
|
"""
|
669
940
|
Zip a single file.
|
670
941
|
|
671
|
-
|
672
|
-
|
673
|
-
|
942
|
+
Parameters:
|
943
|
+
- source_file_path (str): Path to the file to be zipped.
|
944
|
+
- zip_file_path (str): Path for the resulting zip file.
|
945
|
+
- name (str): Name for the resulting zip file (without extension).
|
674
946
|
|
675
947
|
Returns:
|
676
|
-
|
948
|
+
str: Path to the resulting zip file.
|
677
949
|
"""
|
678
|
-
file_path=os.path.join(zip_file_path, name
|
950
|
+
file_path = os.path.join(zip_file_path, f"{name}.zip")
|
679
951
|
|
680
952
|
with zipfile.ZipFile(file_path, 'w') as zip_file:
|
681
953
|
# The second argument to `arcname` is used to set the name of the file inside the zip
|
@@ -683,18 +955,19 @@ def zip_file(source_file_path, zip_file_path, name):
|
|
683
955
|
|
684
956
|
return file_path
|
685
957
|
|
686
|
-
def zip_folder(source_folder_path, zip_file_path, name):
|
958
|
+
def zip_folder(source_folder_path: str, zip_file_path: str, name: str) -> str:
|
687
959
|
"""
|
688
960
|
Zip an entire folder.
|
689
961
|
|
690
|
-
|
691
|
-
|
692
|
-
|
962
|
+
Parameters:
|
963
|
+
- source_folder_path (str): Path to the folder to be zipped.
|
964
|
+
- zip_file_path (str): Path for the resulting zip file.
|
965
|
+
- name (str): Name for the resulting zip file (without extension).
|
693
966
|
|
694
967
|
Returns:
|
695
|
-
|
968
|
+
str: Path to the resulting zip file.
|
696
969
|
"""
|
697
|
-
file_path=os.path.join(zip_file_path, name
|
970
|
+
file_path = os.path.join(zip_file_path, f"{name}.zip")
|
698
971
|
|
699
972
|
with zipfile.ZipFile(file_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
|
700
973
|
for foldername, subfolders, filenames in os.walk(source_folder_path):
|
@@ -705,13 +978,19 @@ def zip_folder(source_folder_path, zip_file_path, name):
|
|
705
978
|
|
706
979
|
return file_path
|
707
980
|
|
708
|
-
def unzip_file(zip_file_path, destination_path):
|
981
|
+
def unzip_file(zip_file_path: str, destination_path: str) -> None:
    """
    Extract every member of a zip archive into a directory.

    Parameters:
    - zip_file_path (str): Path to the zip archive to read.
    - destination_path (str): Directory that receives the extracted contents.

    Returns:
    None
    """
    archive = zipfile.ZipFile(zip_file_path, mode="r")
    # The context manager closes the archive even if extraction fails.
    with archive:
        archive.extractall(path=destination_path)
|
714
|
-
|
715
994
|
|
716
995
|
|
717
996
|
############################################################################
|
@@ -719,19 +998,32 @@ def unzip_file(zip_file_path, destination_path):
|
|
719
998
|
############################################################################
|
720
999
|
|
721
1000
|
|
722
|
-
def create_google_spreadsheet_client(credentials:str):
|
1001
|
+
def create_google_spreadsheet_client(credentials: str):
    """
    Build a gspread client from a service-account credentials file.

    Parameters:
    - credentials (str): Path to the JSON credentials file, passed to
      ``gspread.service_account`` as ``filename``.

    Returns:
    The object returned by ``gspread.service_account``.
    """
    client = gspread.service_account(filename=credentials)
    return client
|
727
1012
|
|
728
|
-
def read_google_spreadsheet(client, sheet_id: str, worksheet_name: str):
|
1013
|
+
def read_google_spreadsheet(client: gspread.Client, sheet_id: str, worksheet_name: str) -> pd.DataFrame:
|
729
1014
|
"""
|
730
|
-
|
1015
|
+
Read data from a Google spreadsheet and return it as a DataFrame.
|
1016
|
+
|
1017
|
+
Parameters:
|
1018
|
+
- client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
|
1019
|
+
- sheet_id (str): The ID of the Google spreadsheet.
|
1020
|
+
- worksheet_name (str): The name of the worksheet within the spreadsheet.
|
1021
|
+
|
1022
|
+
Returns:
|
1023
|
+
pd.DataFrame: A DataFrame containing the data from the specified worksheet.
|
731
1024
|
"""
|
732
1025
|
try:
|
733
|
-
|
734
|
-
# Open the Google Spreadsheet by name
|
1026
|
+
# Open the Google Spreadsheet by ID
|
735
1027
|
sheet = client.open_by_key(sheet_id)
|
736
1028
|
|
737
1029
|
# Select a specific worksheet by name
|
@@ -751,29 +1043,52 @@ def read_google_spreadsheet(client, sheet_id: str, worksheet_name: str):
|
|
751
1043
|
print(f"An error occurred: {e}")
|
752
1044
|
|
753
1045
|
|
754
|
-
def list_google_worksheets(client, sheet_id:str):
|
1046
|
+
def list_google_worksheets(client: gspread.Client, sheet_id: str) -> list:
    """
    List the titles of all worksheets in a spreadsheet.

    Parameters:
    - client (gspread.Client): Client used to open the spreadsheet.
    - sheet_id (str): Key passed to ``client.open_by_key``.

    Returns:
    list: The ``title`` of each worksheet, in the order ``worksheets()`` returns them.
    """
    spreadsheet = client.open_by_key(sheet_id)
    titles = []
    for worksheet in spreadsheet.worksheets():
        titles.append(worksheet.title)
    return titles
|
762
1061
|
|
763
|
-
def get_spreadsheet_permissions(client, sheet_id:str):
|
1062
|
+
def get_spreadsheet_permissions(client: gspread.Client, sheet_id: str) -> pd.DataFrame:
    """
    Collect the email address and permission type of each spreadsheet permission.

    Parameters:
    - client (gspread.Client): Client used to open the spreadsheet.
    - sheet_id (str): Key passed to ``client.open_by_key``.

    Returns:
    pd.DataFrame: One row per permission entry that has an ``emailAddress``,
    with columns ``email`` and ``type``.
    """
    spreadsheet = client.open_by_key(sheet_id)

    rows = []
    for entry in spreadsheet.list_permissions():
        address = entry.get("emailAddress")
        # Entries without an email address are left out.
        if address is None:
            continue
        rows.append((address, entry.get("type")))

    return pd.DataFrame(rows, columns=['email', 'type'])
|
772
1078
|
|
773
1079
|
|
774
|
-
def create_google_spreadsheet(client, df, filename:str, worksheet_name:str = "Sheet1"):
|
1080
|
+
def create_google_spreadsheet(client: gspread.Client, df: pd.DataFrame, filename: str, worksheet_name: str = "Sheet1") -> gspread.Spreadsheet:
|
775
1081
|
"""
|
776
|
-
|
1082
|
+
Create a new Google spreadsheet and load a DataFrame into it.
|
1083
|
+
|
1084
|
+
Parameters:
|
1085
|
+
- client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
|
1086
|
+
- df (pd.DataFrame): The DataFrame to be loaded into the spreadsheet.
|
1087
|
+
- filename (str): The desired filename for the new spreadsheet.
|
1088
|
+
- worksheet_name (str, optional): The name of the worksheet within the spreadsheet. Defaults to "Sheet1".
|
1089
|
+
|
1090
|
+
Returns:
|
1091
|
+
gspread.Spreadsheet: The created spreadsheet object.
|
777
1092
|
"""
|
778
1093
|
spreadsheet = client.create(filename)
|
779
1094
|
worksheet = spreadsheet.sheet1
|
@@ -783,17 +1098,34 @@ def create_google_spreadsheet(client, df, filename:str, worksheet_name:str = "Sh
|
|
783
1098
|
|
784
1099
|
return spreadsheet
|
785
1100
|
|
786
|
-
def share_google_spreadsheet(spreadsheet, email, user_type="user", user_role="writer", notify=False, email_message=None, with_link=False):
|
1101
|
+
def share_google_spreadsheet(spreadsheet: gspread.Spreadsheet, email: str, user_type: str = "user", user_role: str = "writer", notify: bool = False, email_message: str = None, with_link: bool = False) -> gspread.Spreadsheet:
    """
    Share a spreadsheet by forwarding the arguments to ``spreadsheet.share``.

    Parameters:
    - spreadsheet (gspread.Spreadsheet): The spreadsheet object to share.
    - email (str): Email address passed as the first argument of ``share``.
    - user_type (str, optional): Forwarded as ``perm_type``. Defaults to "user".
    - user_role (str, optional): Forwarded as ``role``. Defaults to "writer".
    - notify (bool, optional): Forwarded as ``notify``. Defaults to False.
    - email_message (str, optional): Forwarded as ``email_message``. Defaults to None.
    - with_link (bool, optional): Forwarded as ``with_link``. Defaults to False.

    Returns:
    gspread.Spreadsheet: The same spreadsheet object that was passed in.
    """
    share_options = {
        "perm_type": user_type,
        "role": user_role,
        "notify": notify,
        "email_message": email_message,
        "with_link": with_link,
    }
    spreadsheet.share(email, **share_options)
    return spreadsheet
|
792
1119
|
|
793
|
-
|
794
|
-
def generate_short_id(variables : dict):
|
1120
|
+
def generate_short_id(variables: dict) -> tuple[str, str]:
|
795
1121
|
"""
|
796
|
-
Generate
|
1122
|
+
Generate an 8-character ID using a dictionary as input.
|
1123
|
+
|
1124
|
+
Parameters:
|
1125
|
+
- variables (dict): A dictionary containing the variables to be serialized.
|
1126
|
+
|
1127
|
+
Returns:
|
1128
|
+
tuple: A tuple containing the generated short ID and the serialized variables.
|
797
1129
|
"""
|
798
1130
|
# Serialize variables into JSON string
|
799
1131
|
serialized_variables = json.dumps(variables, sort_keys=True)
|
@@ -803,7 +1135,7 @@ def generate_short_id(variables : dict):
|
|
803
1135
|
short_id = hash_value[:8]
|
804
1136
|
return short_id, serialized_variables
|
805
1137
|
|
806
|
-
def df_transform_column_as_list(column):
|
1138
|
+
def df_transform_column_as_list(column: pd.Series) -> pd.Series:
|
807
1139
|
def transform(cell):
|
808
1140
|
if isinstance(cell, str):
|
809
1141
|
# Check if it's a list formatted as string, and convert to list
|
@@ -812,9 +1144,7 @@ def df_transform_column_as_list(column):
|
|
812
1144
|
else:
|
813
1145
|
try:
|
814
1146
|
values = ast.literal_eval(cell)
|
815
|
-
|
816
1147
|
except Exception as e:
|
817
|
-
pass
|
818
1148
|
# If it's a single URL as string, make it a list
|
819
1149
|
values = [cell]
|
820
1150
|
elif isinstance(cell, (int, float, bool)):
|
@@ -832,7 +1162,11 @@ def df_transform_column_as_list(column):
|
|
832
1162
|
return column.apply(transform)
|
833
1163
|
|
834
1164
|
|
835
|
-
def top_rows_per_category(df
|
1165
|
+
def top_rows_per_category(df: pd.DataFrame,
|
1166
|
+
col_to_sort: str,
|
1167
|
+
col_to_gb: str,
|
1168
|
+
cols_to_keep: list[str],
|
1169
|
+
top_rows: int) -> pd.DataFrame:
|
836
1170
|
"""
|
837
1171
|
Select top rows for each category in a dataframe
|
838
1172
|
"""
|
@@ -842,3 +1176,44 @@ def top_rows_per_category(df, col_to_sort, col_to_gb, cols_to_keep, top_rows) :
|
|
842
1176
|
.reset_index(drop=True)
|
843
1177
|
)[cols_to_keep]
|
844
1178
|
return df_gb
|
1179
|
+
|
1180
|
+
def format_number(number: int) -> str:
    """
    Format a number with a K (thousands), M (millions) or B (billions) suffix.

    Parameters:
    - number (int): The value to format. Negative values keep their sign.

    Returns:
    str: ``str(number)`` when the absolute value is below 1000, otherwise the
    value divided by the unit and rendered with one decimal and the unit suffix
    (for example ``"1.5K"``, ``"-2.0M"``).
    """
    # Choose the unit from the magnitude so negative numbers are abbreviated too.
    magnitude = abs(number)
    if magnitude < 1000:
        return str(number)

    for divisor, suffix in ((1_000, "K"), (1_000_000, "M")):
        # Move up a unit when one-decimal rounding would display "1000.0"
        # (e.g. 999_999 becomes "1.0M", not "1000.0K").
        if round(magnitude / divisor, 1) < 1000:
            return f"{number / divisor:.1f}{suffix}"

    return f"{number / 1_000_000_000:.1f}B"
|
1192
|
+
|
1193
|
+
|
1194
|
+
|
1195
|
+
def unrar_file(rar_file_path : str, output_dir : str) -> None:
    """
    Extract a .rar file to the specified output directory using the unrar command.

    Failures are reported with ``print`` and never raised. This covers a missing
    ``unrar`` executable, a directory that cannot be created, and a non-zero
    exit status.

    Parameters:
    rar_file_path (str): The path to the .rar file.
    output_dir (str): The directory where the contents should be extracted.
        It is created if it does not exist.

    Returns:
    None
    """
    try:
        # Create the output directory in-process, not by shelling out to
        # `mkdir -p`, which is not available on every platform.
        os.makedirs(output_dir, exist_ok=True)

        # unrar treats the last argument as the destination directory only when
        # it ends with a path separator; joining with "" appends one if missing.
        destination = os.path.join(output_dir, "")

        result = subprocess.run(
            ['unrar', 'x', '-y', rar_file_path, destination],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )

        # A non-zero exit status means unrar reported a failure.
        if result.returncode != 0:
            print(f"Extraction failed. Error: {result.stderr}")

    except Exception as e:
        # Deliberately best-effort: report the problem, do not propagate it.
        print(f"An error occurred: {e}")
|