opsci-toolbox 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +120 -21
- opsci_toolbox/apis/webscraping.py +186 -59
- opsci_toolbox/apis/youtube_helpers.py +103 -16
- opsci_toolbox/helpers/common.py +368 -254
- opsci_toolbox/helpers/cv.py +50 -60
- opsci_toolbox/helpers/dataviz.py +255 -184
- opsci_toolbox/helpers/dates.py +17 -18
- opsci_toolbox/helpers/nlp.py +154 -114
- opsci_toolbox/helpers/nlp_cuml.py +389 -36
- opsci_toolbox/helpers/sna.py +509 -0
- opsci_toolbox/helpers/sql.py +53 -0
- {opsci_toolbox-0.0.6.dist-info → opsci_toolbox-0.0.8.dist-info}/METADATA +14 -9
- opsci_toolbox-0.0.8.dist-info/RECORD +22 -0
- opsci_toolbox-0.0.6.dist-info/RECORD +0 -21
- {opsci_toolbox-0.0.6.dist-info → opsci_toolbox-0.0.8.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.6.dist-info → opsci_toolbox-0.0.8.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/common.py
CHANGED
@@ -16,6 +16,7 @@ from datetime import datetime
 import hashlib
 import ast
 import subprocess
+import chardet
 
 ####################################################################################################
 # FILE LOADERS
@@ -25,16 +26,16 @@ def load_file(path: str, delimiter: str = ";", decimal: str = ".") -> pd.DataFr
     """
     Load a file into a Pandas DataFrame based on the file extension.
 
-
-
-
-
+    Args:
+        path (str): The file path to load.
+        delimiter (str, optional): The delimiter used in CSV/TSV files. Default is ";".
+        decimal (str, optional): The character used for decimal points in CSV/TSV files. Default is ".".
 
     Returns:
-
+        pandas.DataFrame: The loaded data as a Pandas DataFrame.
 
     Raises:
-
+        ValueError: If the file extension is not supported.
     """
     extension = os.path.splitext(os.path.basename(path))[1]
     if extension == ".parquet":
@@ -57,14 +58,14 @@ def load_parquet(path: str) -> pd.DataFrame:
     """
     Load a parquet file into a DataFrame.
 
-
-
+    Args:
+        path (str): The file path to the parquet file.
 
     Returns:
-
+        pandas.DataFrame: The loaded data as a Pandas DataFrame.
 
     Raises:
-
+        Exception: If there is an error reading the parquet file.
     """
     try:
         table = pq.read_table(path)
@@ -74,15 +75,37 @@ def load_parquet(path: str) -> pd.DataFrame:
         print(e)
     return df
 
+def load_excel(path : str, sheet_name : str = ""):
+    """
+    Loads an Excel sheet into a Pandas DataFrame.
+
+    Args:
+        path (str): Path to the Excel file.
+        sheet_name (str, int, list, or None): Name of sheet or sheet number to load.
+            0 (default) - Load first sheet.
+            str - Load sheet with specified name.
+            list - Load multiple sheets, returns a dictionary of DataFrames.
+            None - Load all sheets, returns a dictionary of DataFrames.
+
+    Returns:
+        DataFrame or dict of DataFrames.
+    """
+    try:
+        df = pd.read_excel(path, sheet_name=sheet_name)
+        return df
+    except Exception as e:
+        print(f"Error loading Excel file: {e}")
+        return None
+
 def load_pickle(path: str) -> pd.DataFrame:
     """
     Load a pickle file into a DataFrame.
 
-
-
+    Args:
+        path (str): The file path to the pickle file.
 
     Returns:
-
+        pandas.DataFrame: The loaded data as a Pandas DataFrame.
     """
     return pd.read_pickle(path)
 
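A quick usage sketch for the new `load_excel` helper (file and sheet names are hypothetical; note that the default `sheet_name=""` differs from pandas' own default of `0`, so passing an explicit sheet is the safest call):

```python
from opsci_toolbox.helpers.common import load_excel

# Load a single named sheet into a DataFrame
# (returns None and prints the error on failure)
df = load_excel("survey_results.xlsx", sheet_name="responses")

# A list of sheet names returns a dict of DataFrames keyed by sheet name,
# mirroring pandas.read_excel behavior
sheets = load_excel("survey_results.xlsx", sheet_name=["responses", "metadata"])
```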
@@ -91,14 +114,14 @@ def load_json(path: str) -> pd.DataFrame:
     """
     Load a JSON file into a DataFrame.
 
-
-
+    Args:
+        path (str): The file path to the JSON file.
 
     Returns:
-
+        pd.DataFrame: The loaded data as a Pandas DataFrame.
 
     Raises:
-
+        Exception: If there is an error reading the JSON file.
     """
     df = pd.DataFrame()
     try:
@@ -114,14 +137,14 @@ def load_jsonl(path: str) -> pd.DataFrame:
     """
     Load a JSON Lines (jsonl) file into a DataFrame.
 
-
-
+    Args:
+        path (str): The file path to the jsonl file.
 
     Returns:
-
+        pd.DataFrame: The loaded data as a Pandas DataFrame.
 
     Raises:
-
+        Exception: If there is an error reading the jsonl file.
     """
     df = pd.DataFrame()
     try:
@@ -144,16 +167,16 @@ def load_csv(path: str, delimiter: str = ";", decimal: str = ".") -> pd.DataFram
     """
     Load a CSV file into a DataFrame.
 
-
-
-
-
+    Args:
+        path (str): The file path to the CSV file.
+        delimiter (str, optional): The delimiter used in the CSV file. Default is ";".
+        decimal (str, optional): The character used for decimal points in the CSV file. Default is ".".
 
     Returns:
-
+        pd.DataFrame: The loaded data as a Pandas DataFrame.
 
     Raises:
-
+        Exception: If there is an error reading the CSV file.
     """
     df = pd.DataFrame()
     try:
@@ -167,15 +190,15 @@ def read_txt_to_list(file_path: str) -> list[str]:
     """
     Read a text file line by line and append to a Python list.
 
-
-
+    Args:
+        file_path (str): The file path to the text file.
 
     Returns:
-
+        list[str]: A list of lines read from the text file.
 
     Raises:
-
-
+        FileNotFoundError: If the file does not exist.
+        Exception: If any other error occurs during file reading.
     """
 
     # Initialize an empty list to store the lines
@@ -197,15 +220,15 @@ def read_json(path: str) -> dict:
     """
     Read a JSON file and return a dictionary.
 
-
-
+    Args:
+        path (str): The file path to the JSON file.
 
     Returns:
-
+        dict: The data read from the JSON file as a dictionary.
 
     Raises:
-
-
+        FileNotFoundError: If the file does not exist.
+        Exception: If there is an error reading the JSON file.
     """
     with open(path, 'r') as json_file:
         data = json.load(json_file)
@@ -215,15 +238,15 @@ def read_txt_file(file_path: str) -> str:
     """
     Read the content of a text file and return it as a string.
 
-
-
+    Args:
+        file_path (str): The file path to the text file.
 
     Returns:
-
+        str: The content of the text file as a string.
 
     Raises:
-
-
+        FileNotFoundError: If the file does not exist.
+        Exception: If there is an error reading the text file.
     """
     try:
         with open(file_path, 'r') as file:
@@ -240,15 +263,15 @@ def read_jsonl(path: str) -> list[dict]:
     """
     Load a JSON Lines (jsonl) file into a list of dictionaries.
 
-
-
+    Args:
+        path (str): The file path to the jsonl file.
 
     Returns:
-
+        list[dict]: A list of dictionaries containing the data read from the JSON Lines file.
 
     Raises:
-
-
+        FileNotFoundError: If the file does not exist.
+        Exception: If there is an error reading the jsonl file.
     """
     json_data = []
     try:
@@ -274,13 +297,13 @@ def write_pickle(data: pd.DataFrame, path: str, filename: str) -> str:
     """
     Write a DataFrame into a pickle file.
 
-
-
-
-
+    Args:
+        data (pd.DataFrame): The DataFrame to be written to the pickle file.
+        path (str): The directory where the pickle file will be saved.
+        filename (str): The name of the pickle file (without the extension).
 
     Returns:
-
+        str: The full path to the saved pickle file.
     """
     file_path = os.path.join(path, filename + '.pickle')
     with open(file_path, 'wb') as f:
@@ -292,13 +315,13 @@ def write_list_to_txt(input_list: list, path: str, name: str) -> str:
     """
     Write a list to a text file, with each item on a new line.
 
-
-
-
-
+    Args:
+        input_list (list): The list to be written to the text file.
+        path (str): The directory path where the text file will be saved.
+        name (str): The name of the text file (without the extension).
 
     Returns:
-
+        str: The full path to the saved text file.
     """
     file_path = os.path.join(path, name + '.txt')
     with open(file_path, 'w') as file:
@@ -310,13 +333,13 @@ def write_jsonl(data: list[dict], path: str, name: str) -> str:
     """
     Write data to a JSON Lines (jsonl) file. Each dictionary in the list represents a single JSON object.
 
-
-
-
-
+    Args:
+        data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
+        path (str): The directory path where the JSON Lines file will be saved.
+        name (str): The name of the JSON Lines file (without the extension).
 
    Returns:
-
+        str: The full path to the saved JSON Lines file.
     """
     file_path = os.path.join(path, name + '.jsonl')
     with open(file_path, 'w') as file:
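Taken together, the reader/writer pairs above give a simple round trip; a minimal sketch (path and records are illustrative):

```python
from opsci_toolbox.helpers.common import write_jsonl, read_jsonl

records = [{"id": 1, "text": "hello"}, {"id": 2, "text": "world"}]

# write_jsonl joins the directory, name, and '.jsonl' extension
# and returns the full file path
file_path = write_jsonl(records, path=".", name="sample")

# read_jsonl parses the file back line by line into a list of dicts
assert read_jsonl(file_path) == records
```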
@@ -330,13 +353,13 @@ def write_json(json_dict: dict, path: str, name: str) -> str:
     """
     Write a dictionary to a JSON file.
 
-
-
-
-
+    Args:
+        json_dict (dict): The dictionary to be written to the JSON file.
+        path (str): The directory path where the JSON file will be saved.
+        name (str): The name of the JSON file (without the extension).
 
     Returns:
-
+        str: The full path to the saved JSON file.
     """
     file_path = os.path.join(path, name + '.json')
     with open(file_path, 'w') as outfile:
@@ -348,14 +371,14 @@ def write_dataframe_to_json(df: pd.DataFrame, path: str, name: str, orient: str
     """
     Write a DataFrame to a JSON file.
 
-
-
-
-
-
+    Args:
+        df (pd.DataFrame): The DataFrame to be written to the JSON file.
+        path (str): The directory path where the JSON file will be saved.
+        name (str): The name of the JSON file (without the extension).
+        orient (str, optional): The format of the JSON file. Default is 'records'.
 
     Returns:
-
+        str: The full path to the saved JSON file.
     """
     file_path = os.path.join(path, name + ".json")
     df.to_json(file_path, orient=orient, lines=True)
@@ -366,14 +389,14 @@ def save_dataframe_excel(df: pd.DataFrame, path: str, name: str, sheet_name: str
     """
     Write a DataFrame to an Excel file.
 
-
-
-
-
-
+    Args:
+        df (pd.DataFrame): The DataFrame to be written to the Excel file.
+        path (str): The directory path where the Excel file will be saved.
+        name (str): The name of the Excel file (without the extension).
+        sheet_name (str): The name of the Excel sheet.
 
     Returns:
-
+        str: The full path to the saved Excel file.
     """
     file_path = os.path.join(path, f"{name}.xlsx")
     df.to_excel(file_path, sheet_name=sheet_name, index=False)
@@ -384,13 +407,13 @@ def add_dataframe_to_excel(df: pd.DataFrame, existing_file_path: str, new_sheet_
     """
     Adds a DataFrame to an existing Excel file as a new sheet.
 
-
-
-
-
+    Args:
+        df (pd.DataFrame): The DataFrame to be added.
+        existing_file_path (str): Path to the existing Excel file.
+        new_sheet_name (str): Name of the new sheet in the Excel file.
 
     Returns:
-
+        None
     """
     # Read existing Excel file into a dictionary of DataFrames
     excel_file = pd.read_excel(existing_file_path, sheet_name=None)
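The two Excel writers are complementary: `save_dataframe_excel` creates the workbook and `add_dataframe_to_excel` appends a sheet to it. A sketch under hypothetical data and file names:

```python
import pandas as pd
from opsci_toolbox.helpers.common import save_dataframe_excel, add_dataframe_to_excel

df_main = pd.DataFrame({"user": ["a", "b"], "posts": [10, 3]})
df_extra = pd.DataFrame({"user": ["a", "b"], "likes": [120, 45]})

# Creates ./report.xlsx with a single sheet named "stats" and returns the path
file_path = save_dataframe_excel(df_main, path=".", name="report", sheet_name="stats")

# Re-reads the existing workbook and writes it back with the new sheet added
add_dataframe_to_excel(df_extra, existing_file_path=file_path, new_sheet_name="likes")
```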
@@ -407,13 +430,13 @@ def save_dataframe_csv(df: pd.DataFrame, path: str, name: str) -> str:
     """
     Save a DataFrame to a CSV file within a specified directory.
 
-
-
-
-
+    Args:
+        df (pd.DataFrame): The DataFrame to be saved.
+        path (str): The directory where the CSV file will be saved.
+        name (str): The desired name for the CSV file (without extension).
 
     Returns:
-
+        str: The full path to the saved CSV file.
     """
     file_path = os.path.join(path, f"{name}.csv")
     df.to_csv(
@@ -430,31 +453,31 @@ def write_txt_file(data: str, path: str, name: str) -> str:
     """
     Write a string to a text file.
 
-
-
-
-
+    Args:
+        data (str): The string to be written to the text file.
+        path (str): The directory path where the text file will be saved.
+        name (str): The name of the text file (without the extension).
 
     Returns:
-
+        str: The full path to the saved text file.
     """
     file_path = os.path.join(path, name + '.txt')
     with open(file_path, "w") as file:
         file.write(data)
     return file_path
 
-def split_df_into_chunks(df: pd.DataFrame, path: str, name: str, chunk_size: int = 10000) -> list
+def split_df_into_chunks(df: pd.DataFrame, path: str, name: str, chunk_size: int = 10000) -> list:
     """
     Split a DataFrame into multiple pickle files with a specified chunk size.
 
-
-
-
-
-
+    Args:
+        df (pd.DataFrame): The DataFrame to be split.
+        path (str): The directory path where the pickle files will be saved.
+        name (str): The base name for the pickle files.
+        chunk_size (int, optional): The size of each chunk. Default is 10000.
 
     Returns:
-
+        list[str]: A list of file paths to the saved pickle files.
     """
     num_chunks = -(-len(df) // chunk_size)  # Calculate the number of chunks using ceil division
 
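The chunk count uses negative floor division as a ceiling: `-(-len(df) // chunk_size)` rounds up without importing `math`. For example, 25,000 rows with `chunk_size=10000` gives `-(-25000 // 10000) = 3` files. A usage sketch (the output file naming is not shown in the hunk, only the chunk-count logic and the returned path list):

```python
import pandas as pd
from opsci_toolbox.helpers.common import split_df_into_chunks

df = pd.DataFrame({"x": range(25_000)})

# Ceil division: -(-25000 // 10000) == 3, so three pickle files are written
paths = split_df_into_chunks(df, path=".", name="corpus", chunk_size=10_000)
print(len(paths))  # 3
```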
@@ -479,11 +502,11 @@ def create_dir(path: str) -> str:
     """
     Create a local directory if it doesn't exist.
 
-
-
+    Args:
+        path (str): The directory path to be created.
 
     Returns:
-
+        str: The path of the created directory.
     """
     if not os.path.exists(path):
         os.makedirs(path)
@@ -491,31 +514,31 @@ def create_dir(path: str) -> str:
     return path
 
 
-def list_files_in_dir(path: str, filetype: str = '*.json') -> list
+def list_files_in_dir(path: str, filetype: str = '*.json') -> list:
     """
     List files of a specific format in a directory.
 
-
-
-
+    Args:
+        path (str): The directory path to search for files.
+        filetype (str, optional): The file type pattern to search for.
 
     Returns:
-
+        list: A list of file paths matching the specified file type pattern.
     """
     pattern = os.path.join(path, filetype)
     files = glob.glob(pattern)
     return files
 
 
-def list_subdirectories(root_directory: str) -> list
+def list_subdirectories(root_directory: str) -> list:
     """
     List subdirectories in a root directory.
 
-
-
+    Args:
+        root_directory (str): The root directory path.
 
     Returns:
-
+        list[str]: A list of subdirectory names.
     """
     subdirectories = []
     for entry in os.scandir(root_directory):
@@ -524,15 +547,15 @@ def list_subdirectories(root_directory: str) -> list[str]:
     return subdirectories
 
 
-def list_recursive_subdirectories(root_directory: str) -> list
+def list_recursive_subdirectories(root_directory: str) -> list:
     """
     List recursively all subdirectories from a root directory.
 
-
-
+    Args:
+        root_directory (str): The root directory path.
 
     Returns:
-
+        list[str]: A list of subdirectory paths.
     """
     subdirectories = []
     for root, dirs, files in os.walk(root_directory):
@@ -540,16 +563,16 @@ def list_recursive_subdirectories(root_directory: str) -> list[str]:
     return subdirectories
 
 
-def list_files_in_subdirectories(path: str, filetype: str = '*.json') -> list
+def list_files_in_subdirectories(path: str, filetype: str = '*.json') -> list:
     """
     Walk through subdirectories of a root directory to list files of a specific format.
 
-
-
-
+    Args:
+        path (str): The root directory path.
+        filetype (str, optional): The file type pattern to search for.
 
     Returns:
-
+        list[str]: A list of file paths matching the specified file type pattern in subdirectories.
     """
     files = []
 
@@ -568,13 +591,13 @@ def copy_file(source_path: str, destination_path: str, new_filename: str = '') -
     """
     Copy a file from a source path to a destination path.
 
-
-
-
-
+    Args:
+        source_path (str): The path of the source file.
+        destination_path (str): The path of the destination directory.
+        new_filename (str, optional): The new filename. If not provided, the original filename is used.
 
     Returns:
-
+        str: The path of the copied file.
     """
     if new_filename:
         file_path = os.path.join(destination_path, new_filename)
@@ -589,11 +612,11 @@ def remove_file(file_path: str) -> None:
     """
     Remove a single file.
 
-
-
+    Args:
+        file_path (str): The path of the file to be removed.
 
     Returns:
-
+        None
     """
     try:
         os.remove(file_path)
@@ -605,11 +628,11 @@ def remove_folder(folder_path: str) -> None:
     """
     Remove a folder and all its contents.
 
-
-
+    Args:
+        folder_path (str): The path of the folder to be removed.
 
     Returns:
-
+        None
     """
     try:
         shutil.rmtree(folder_path)
@@ -622,12 +645,11 @@ def get_file_size(file_path: str) -> tuple[int, str]:
     """
     Get the size of a single file in a readable format (KB, MB, GB).
 
-
-
+    Args:
+        file_path (str): The path of the file.
 
     Returns:
-
-        If the file is not found, returns None.
+        tuple[int, str]: A tuple containing the size of the file in bytes and its formatted size. If the file is not found, returns None.
     """
     try:
         size = os.path.getsize(file_path)
@@ -654,12 +676,12 @@ def get_folder_size(folder_path: str) -> tuple[int, str]:
     """
     Get the size of all files contained in a folder in a readable format (KB, MB, GB).
 
-
-
+    Args:
+        folder_path (str): The path of the folder.
 
     Returns:
-
-
+        tuple[int, str]: A tuple containing the total size of all files in bytes and its formatted size.
+        If the folder is not found, returns None.
     """
     total_size = 0
 
@@ -691,12 +713,12 @@ def file_creation_date(file_path: str) -> datetime:
     """
     Return the last update timestamp of a file.
 
-
-
+    Args:
+        file_path (str): The path of the file.
 
     Returns:
-
-
+        datetime: The last update timestamp as a datetime object.
+        If the file does not exist, returns None.
     """
     # Check if the file exists
     if os.path.exists(file_path):
@@ -717,12 +739,12 @@ def transform_to_n_items_list(lst: list, n: int) -> list[list]:
     """
     Transform a list into a list of n-items sublists.
 
-
-
-
+    Args:
+        lst (list): The input list to be transformed.
+        n (int): The number of items in each sublist.
 
     Returns:
-
+        list[list]: A list of n-items sublists.
     """
     return [lst[i:i + n] for i in range(0, len(lst), n)]
 
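The one-line implementation slices the list in strides of `n`, so the final sublist carries any remainder:

```python
from opsci_toolbox.helpers.common import transform_to_n_items_list

# Strides of 3 over 7 items: the last sublist is shorter
print(transform_to_n_items_list([1, 2, 3, 4, 5, 6, 7], 3))
# [[1, 2, 3], [4, 5, 6], [7]]
```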
@@ -731,11 +753,11 @@ def unduplicate_list(lst: list) -> list:
     """
     Remove duplicate elements from a list.
 
-
-
+    Args:
+        lst (list): The input list with possible duplicate elements.
 
     Returns:
-
+        list: A list with duplicate elements removed.
     """
     return list(set(lst))
 
@@ -744,13 +766,13 @@ def sort_list(lst: list, reverse: bool = False) -> list:
     """
     Sort the list in ascending or descending order.
 
-
-
-
+    Args:
+        lst (list): The input list.
+        reverse (bool): If True, sort the list in descending order.
             If False (default), sort the list in ascending order.
 
     Returns:
-
+        list: A new list sorted based on the specified order.
     """
     return sorted(lst, reverse=reverse)
 
@@ -759,12 +781,12 @@ def map_list(lst: list, function: callable) -> list:
     """
     Apply a function to each element of the list.
 
-
-
-
+    Args:
+        lst (list): The input list.
+        function (callable): The function to apply to each element.
 
     Returns:
-
+        list: A new list with the function applied to each element.
     """
     return [function(element) for element in lst]
 
@@ -773,11 +795,11 @@ def flatten_list(lst: list) -> list:
     """
     Flatten a nested list into a single list.
 
-
-
+    Args:
+        lst (list): The input nested list.
 
     Returns:
-
+        list: A new list with all nested elements flattened.
     """
     flattened_list = []
 
@@ -796,12 +818,12 @@ def find_occurrences(lst: list, element) -> int:
     """
     Find the occurrences of a specific element in the list.
 
-
-
-
+    Args:
+        lst (list): The input list.
+        element: The element to find occurrences of.
 
    Returns:
-
+        int: The number of occurrences of the specified element in the list.
     """
     return lst.count(element)
 
@@ -810,12 +832,12 @@ def is_subset(subset: list, superset: list) -> bool:
     """
     Check if one list is a subset of another.
 
-
-
-
+    Args:
+        subset (list): The potential subset list.
+        superset (list): The superset list.
 
     Returns:
-
+        bool: True if the subset is a subset of the superset, False otherwise.
     """
     return all(element in superset for element in subset)
 
@@ -823,12 +845,12 @@ def common_elements(list1: list, list2: list) -> list:
     """
     Find the common elements between two lists.
 
-
-
-
+    Args:
+        list1 (list): The first list.
+        list2 (list): The second list.
 
     Returns:
-
+        list: A new list containing the common elements between list1 and list2.
     """
     return list(set(list1) & set(list2))
 
@@ -837,11 +859,11 @@ def shuffle_list(lst: list) -> list:
     """
     Shuffle the elements of the list randomly.
 
-
-
+    Args:
+        lst (list): The input list.
 
     Returns:
-
+        list: A new list with the elements shuffled randomly.
     """
     shuffled_list = lst.copy()
     random.shuffle(shuffled_list)
@@ -852,17 +874,17 @@ def sample_list(lst: list, sample_size) -> list:
     """
     Sample a list based on an integer or a float representing the sample size.
 
-
-
-
+    Args:
+        lst (list): The input list.
+        sample_size (int or float): If an integer, the number of elements to keep.
             If a float, the percentage of elements to keep.
 
     Returns:
-
+        list: A new list containing the sampled elements.
 
     Raises:
-
-
+        ValueError: If the sample size is invalid (negative integer or float outside [0, 1]).
+        TypeError: If the sample size is neither an integer nor a float.
     """
     if isinstance(sample_size, int):
         if sample_size < 0:
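The int/float dispatch documented above is worth a concrete illustration; per the stated contract, an `int` keeps that many elements and a `float` keeps that fraction (the sampling internals are not visible in the hunk, so the lengths below follow the docstring, not the code):

```python
from opsci_toolbox.helpers.common import sample_list

data = list(range(100))

sample_a = sample_list(data, 10)    # int: keep 10 elements
sample_b = sample_list(data, 0.25)  # float: keep 25% of the elements

print(len(sample_a), len(sample_b))  # 10 25

# sample_list(data, -5) would raise ValueError per the documented contract
```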
@@ -880,11 +902,11 @@ def count_elements(lst: list) -> dict:
     """
     Count the occurrences of each element in the list.
 
-
-
+    Args:
+        lst (list): The input list.
 
     Returns:
-
+        dict: A dictionary where keys are unique elements from the list, and values are their counts.
     """
     return dict(Counter(lst))
 
@@ -892,13 +914,13 @@ def scale_list(lst: list, min_val: float = 1, max_val: float = 5) -> list:
     """
     Scale the values of a list to a specified range.
 
-
-
-
-
+    Args:
+        lst (list): The input list of values to be scaled.
+        min_val (float): The minimum value of the output range (default is 1).
+        max_val (float): The maximum value of the output range (default is 5).
 
     Returns:
-
+        list: A new list with values scaled to the specified range.
     """
     min_w = min(lst)
     max_w = max(lst)
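`scale_list` (and `df_scale_column` below) is standard min-max scaling: with `min_w = min(lst)` and `max_w = max(lst)`, each value maps as `scaled = (x - min_w) / (max_w - min_w) * (max_val - min_val) + min_val`. The formula is inferred from the captured `min_w`/`max_w` lines and the documented output range:

```python
# Min-max scaling of [2, 4, 10] onto the default range [1, 5]:
# x=2  -> (2-2)/(10-2)*(5-1)+1 = 1.0
# x=4  -> (4-2)/(10-2)*(5-1)+1 = 2.0
# x=10 -> (10-2)/(10-2)*(5-1)+1 = 5.0
from opsci_toolbox.helpers.common import scale_list

print(scale_list([2, 4, 10], min_val=1, max_val=5))  # [1.0, 2.0, 5.0]
```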
@@ -916,15 +938,15 @@ def df_scale_column(df: pd.DataFrame, col_to_scale: str, col_out: str, min_val:
     """
     Scale values in a DataFrame column to a specified range.
 
-
-
-
-
-
-
+    Args:
+        df (pd.DataFrame): The input DataFrame.
+        col_to_scale (str): The name of the column to be scaled.
+        col_out (str): The name of the new column to store scaled values.
+        min_val (float): The minimum value of the output range.
+        max_val (float): The maximum value of the output range.
 
     Returns:
-
+        pd.DataFrame: The DataFrame with a new column containing scaled values.
     """
     min_freq = df[col_to_scale].min()
     max_freq = df[col_to_scale].max()
@@ -939,13 +961,13 @@ def zip_file(source_file_path: str, zip_file_path: str, name: str) -> str:
     """
     Zip a single file.
 
-
-
-
-
+    Args:
+        source_file_path (str): Path to the file to be zipped.
+        zip_file_path (str): Path for the resulting zip file.
+        name (str): Name for the resulting zip file (without extension).
 
     Returns:
-
+        str: Path to the resulting zip file.
     """
     file_path = os.path.join(zip_file_path, f"{name}.zip")
 
@@ -959,13 +981,13 @@ def zip_folder(source_folder_path: str, zip_file_path: str, name: str) -> str:
     """
     Zip an entire folder.
 
-
-
-
-
+    Args:
+        source_folder_path (str): Path to the folder to be zipped.
+        zip_file_path (str): Path for the resulting zip file.
+        name (str): Name for the resulting zip file (without extension).
 
     Returns:
-
+        str: Path to the resulting zip file.
     """
     file_path = os.path.join(zip_file_path, f"{name}.zip")
 
@@ -982,12 +1004,12 @@ def unzip_file(zip_file_path: str, destination_path: str) -> None:
     """
     Unzip a zip file.
 
-
-
-
+    Args:
+        zip_file_path (str): Path to the zip file to be unzipped.
+        destination_path (str): Path where the contents of the zip file will be extracted.
 
     Returns:
-
+        None
     """
     with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
         zip_ref.extractall(destination_path)
@@ -1002,11 +1024,11 @@ def create_google_spreadsheet_client(credentials: str):
     """
     Create a Gspread client to interact with Google Sheets.
 
-
-
+    Args:
+        credentials (str): Path to the JSON file containing Google Service Account credentials.
 
     Returns:
-
+        gspread.Client: A client object for interacting with Google Sheets.
     """
     return gspread.service_account(filename=credentials)
 
@@ -1014,13 +1036,13 @@ def read_google_spreadsheet(client: gspread.Client, sheet_id: str, worksheet_nam
     """
     Read data from a Google spreadsheet and return it as a DataFrame.
 
-
-
-
-
+    Args:
+        client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
+        sheet_id (str): The ID of the Google spreadsheet.
+        worksheet_name (str): The name of the worksheet within the spreadsheet.
 
     Returns:
-
+        pd.DataFrame: A DataFrame containing the data from the specified worksheet.
     """
     try:
         # Open the Google Spreadsheet by ID
@@ -1047,12 +1069,12 @@ def list_google_worksheets(client: gspread.Client, sheet_id: str) -> list:
     """
     Return a list of worksheet names for a spreadsheet ID.
 
-
-
-
+    Args:
+        client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
+        sheet_id (str): The ID of the Google spreadsheet.
 
     Returns:
-
+        list: A list of worksheet names.
     """
     sheet = client.open_by_key(sheet_id)
     worksheet_obj = sheet.worksheets()
@@ -1063,12 +1085,12 @@ def get_spreadsheet_permissions(client: gspread.Client, sheet_id: str) -> pd.Dat
     """
     Return a DataFrame with the list of user email and type that can access the document.
 
-
-
-
+    Args:
+        client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
+        sheet_id (str): The ID of the Google spreadsheet.
 
     Returns:
-
+        pd.DataFrame: A DataFrame containing the list of user email addresses and their access types.
     """
     sheet = client.open_by_key(sheet_id)
     permissions = sheet.list_permissions()
@@ -1081,14 +1103,14 @@ def create_google_spreadsheet(client: gspread.Client, df: pd.DataFrame, filename
     """
     Create a new Google spreadsheet and load a DataFrame into it.
 
-
-
-
-
-
+    Args:
+        client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
+        df (pd.DataFrame): The DataFrame to be loaded into the spreadsheet.
+        filename (str): The desired filename for the new spreadsheet.
+        worksheet_name (str, optional): The name of the worksheet within the spreadsheet. Defaults to "Sheet1".
 
     Returns:
-
+        gspread.Spreadsheet: The created spreadsheet object.
     """
     spreadsheet = client.create(filename)
     worksheet = spreadsheet.sheet1
@@ -1102,17 +1124,17 @@ def share_google_spreadsheet(spreadsheet: gspread.Spreadsheet, email: str, user_
     """
     Share a spreadsheet with a user.
 
-
-
-
-
-
-
-
-
+    Args:
+        spreadsheet (gspread.Spreadsheet): The Google spreadsheet object to be shared.
+        email (str): The email address of the user with whom the spreadsheet will be shared.
+        user_type (str, optional): The permission type for the user. Defaults to "user".
+        user_role (str, optional): The role assigned to the user. Defaults to "writer".
+        notify (bool, optional): Whether to notify the user about the sharing. Defaults to False.
+        email_message (str, optional): The message to include in the notification email.
+        with_link (bool, optional): Whether to include a link to the shared document in the notification email. Defaults to False.
 
     Returns:
-
+        gspread.Spreadsheet: The updated spreadsheet object.
     """
     spreadsheet.share(email, perm_type=user_type, role=user_role, notify=notify, email_message=email_message, with_link=with_link)
     return spreadsheet
@@ -1121,11 +1143,11 @@ def generate_short_id(variables: dict) -> tuple[str, str]:
     """
     Generate an 8-character ID using a dictionary as input.
 
-
-
+    Args:
+        variables (dict): A dictionary containing the variables to be serialized.
 
     Returns:
-
+        tuple: A tuple containing the generated short ID and the serialized variables.
     """
     # Serialize variables into JSON string
     serialized_variables = json.dumps(variables, sort_keys=True)
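The hunk shows only the serialization step; the 8-character ID itself is presumably derived from a hash of that JSON string (`hashlib` is imported at the top of the module). A sketch of the documented call shape, with the hashing detail treated as an assumption:

```python
from opsci_toolbox.helpers.common import generate_short_id

params = {"model": "all-MiniLM-L6-v2", "min_topic_size": 20}

# sort_keys=True makes the JSON string, and therefore the ID, order-independent
short_id, serialized = generate_short_id(params)
print(short_id)  # e.g. '3f2a9c1b' (8 characters)
```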
@@ -1136,6 +1158,16 @@ def generate_short_id(variables: dict) -> tuple[str, str]:
     return short_id, serialized_variables
 
 def df_transform_column_as_list(column: pd.Series) -> pd.Series:
+    """
+    Transform a pandas Series where each cell is a string representation of a list,
+    a single value, or already a list into a pandas Series with each cell as a list.
+
+    Args:
+        column (pd.Series): The input pandas Series to transform.
+
+    Returns:
+        pd.Series: A pandas Series with each cell as a list.
+    """
     def transform(cell):
         if isinstance(cell, str):
             # Check if it's a list formatted as string, and convert to list
@@ -1168,7 +1200,17 @@ def top_rows_per_category(df: pd.DataFrame,
                           cols_to_keep: list[str],
                           top_rows: int) -> pd.DataFrame:
     """
-    Select top rows for each category in a dataframe
+    Select the top rows for each category in a dataframe.
+
+    Args:
+        df (pd.DataFrame): The input dataframe.
+        col_to_sort (str): The column name by which to sort the rows.
+        col_to_gb (str): The column name to group by.
+        cols_to_keep (List[str]): The list of columns to keep in the final output.
+        top_rows (int): The number of top rows to select for each group.
+
+    Returns:
+        pd.DataFrame: A dataframe containing the top rows for each category.
     """
     df_gb = (df.sort_values(by=col_to_sort, ascending=False)
              .groupby(col_to_gb)
@@ -1179,7 +1221,13 @@ def top_rows_per_category(df: pd.DataFrame,
 
 def format_number(number: int) -> str:
     """
-
+    Format a number into a human-readable string with K, M, or B suffixes.
+
+    Args:
+        number (int): The number to format.
+
+    Returns:
+        str: The formatted number as a string with an appropriate suffix.
     """
     if number < 1000:
         return str(number)
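Only the `< 1000` branch is visible in the hunk; assuming the conventional thresholds implied by the new docstring (K, M, B), the behavior would look like:

```python
from opsci_toolbox.helpers.common import format_number

print(format_number(532))        # '532'
print(format_number(12_400))     # '12.4K'  (assumed formatting)
print(format_number(3_000_000))  # '3.0M'   (assumed formatting)
```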
@@ -1196,12 +1244,12 @@ def unrar_file(rar_file_path : str, output_dir : str) -> None:
     """
     Extracts a .rar file to the specified output directory using the unrar command.
 
-
-
-
+    Args:
+        rar_file_path (str): The path to the .rar file.
+        output_dir (str): The directory where the contents should be extracted.
 
     Returns:
-
+        None
     """
     try:
         # Ensure the output directory exists
@@ -1216,4 +1264,70 @@ def unrar_file(rar_file_path : str, output_dir : str) -> None:
             print(f"Extraction failed. Error: {result.stderr}")
 
     except Exception as e:
-        print(f"An error occurred: {e}")
+        print(f"An error occurred: {e}")
+
+
+def fill_nan(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Fill missing values in a DataFrame with appropriate defaults based on the column data type.
+
+    For string columns, missing values are replaced with an empty string.
+    For numeric columns, missing values are replaced with zero.
+    For datetime columns, missing values are replaced with the default date '1970-01-01'.
+    For other types, missing values are filled with NaN.
+
+    Args:
+        df (DataFrame): The DataFrame in which missing values will be filled.
+
+    Returns:
+        DataFrame: The DataFrame with missing values filled.
+    """
+    mixed_columns = df.columns[df.isna().any()]
+
+    for col in mixed_columns:
+        if df[col].dtype == 'object':
+            # For string columns, replace NaN with an empty string
+            df[col] = df[col].fillna('')
+        elif pd.api.types.is_numeric_dtype(df[col]):
+            # For numeric columns, replace NaN with zero
+            df[col] = df[col].fillna(0)
+        elif pd.api.types.is_datetime64_any_dtype(df[col]):
+            # For datetime columns, replace NaN with a default date
+            default_date = pd.Timestamp('1970-01-01')
+            df[col] = df[col].fillna(default_date)
+        else:
+            # For other types, we can use a general approach, such as fill with None or NaN
+            df[col] = df[col].fillna(None)
+
+    return df
+
+def detect_encoding(file_path : str) -> str:
+    """
+    Detect the encoding of a file.
+
+    Args:
+        file_path (str): The path to the file whose encoding needs to be detected.
+
+    Returns:
+        str: The detected encoding of the file.
+    """
+    with open(file_path, 'rb') as file:
+        raw_data = file.read()
+        result = chardet.detect(raw_data)
+        return result['encoding']
+
+def remove_empty_folders(path: str):
+    """
+    Recursively remove empty folders from the specified directory.
+
+    Parameters:
+    - path (str): Path to the directory to scan for empty folders.
+    """
+    # Iterate over the directory tree
+    for root, dirs, files in os.walk(path, topdown=False):
+        for dir_name in dirs:
+            dir_path = os.path.join(root, dir_name)
+            # If the directory is empty, remove it
+            if not os.listdir(dir_path):
+                os.rmdir(dir_path)
+                print(f"Removed empty folder: {dir_path}")