opsci-toolbox 0.0.7__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,6 +16,7 @@ from datetime import datetime
  import hashlib
  import ast
  import subprocess
+ import chardet

  ####################################################################################################
  # FILE LOADERS
@@ -25,16 +26,16 @@ def load_file(path: str, delimiter: str = ";", decimal: str = ".") -> pd.DataFr
  """
  Load a file into a Pandas DataFrame based on the file extension.

- Parameters:
- path (str): The file path to load.
- delimiter (str, optional): The delimiter used in CSV/TSV files. Default is ";".
- decimal (str, optional): The character used for decimal points in CSV/TSV files. Default is ".".
+ Args:
+ path (str): The file path to load.
+ delimiter (str, optional): The delimiter used in CSV/TSV files. Default is ";".
+ decimal (str, optional): The character used for decimal points in CSV/TSV files. Default is ".".

  Returns:
- pd.DataFrame: The loaded data as a Pandas DataFrame.
+ pandas.DataFrame: The loaded data as a Pandas DataFrame.

  Raises:
- ValueError: If the file extension is not supported.
+ ValueError: If the file extension is not supported.
  """
  extension = os.path.splitext(os.path.basename(path))[1]
  if extension == ".parquet":
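As context for the hunk above, a minimal sketch of how the load_file dispatcher is typically called (the module path and file names are assumptions, not taken from the diff):

```python
from opsci_toolbox.helpers.common import load_file  # module path assumed

# The loader is picked from the file extension; delimiter and decimal are only
# used by the CSV/TSV branch.
df_parquet = load_file("data/posts.parquet")
df_csv = load_file("data/posts.csv", delimiter=";", decimal=",")
print(df_csv.shape)
```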
@@ -57,14 +58,14 @@ def load_parquet(path: str) -> pd.DataFrame:
  """
  Load a parquet file into a DataFrame.

- Parameters:
- path (str): The file path to the parquet file.
+ Args:
+ path (str): The file path to the parquet file.

  Returns:
- pd.DataFrame: The loaded data as a Pandas DataFrame.
+ pandas.DataFrame: The loaded data as a Pandas DataFrame.

  Raises:
- Exception: If there is an error reading the parquet file.
+ Exception: If there is an error reading the parquet file.
  """
  try:
  table = pq.read_table(path)
@@ -74,15 +75,37 @@ def load_parquet(path: str) -> pd.DataFrame:
  print(e)
  return df

+ def load_excel(path : str, sheet_name : str = ""):
+ """
+ Loads an Excel sheet into a Pandas DataFrame.
+
+ Args:
+ file_path (str): Path to the Excel file.
+ sheet_name (str, int, list, or None): Name of sheet or sheet number to load.
+ 0 (default) - Load first sheet.
+ str - Load sheet with specified name.
+ list - Load multiple sheets, returns a dictionary of DataFrames.
+ None - Load all sheets, returns a dictionary of DataFrames.
+
+ Returns:
+ DataFrame or dict of DataFrames.
+ """
+ try:
+ df = pd.read_excel(path, sheet_name=sheet_name)
+ return df
+ except Exception as e:
+ print(f"Error loading Excel file: {e}")
+ return None
+
  def load_pickle(path: str) -> pd.DataFrame:
  """
  Load a pickle file into a DataFrame.

- Parameters:
- path (str): The file path to the pickle file.
+ Args:
+ path (str): The file path to the pickle file.

  Returns:
- pd.DataFrame: The loaded data as a Pandas DataFrame.
+ pandas.DataFrame: The loaded data as a Pandas DataFrame.
  """
  return pd.read_pickle(path)
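The new load_excel helper is a thin wrapper around pd.read_excel. A minimal usage sketch (module path and file names are assumptions, not taken from the diff); note that the docstring describes the pandas-style default of 0 for sheet_name while the signature actually defaults to an empty string, so passing the sheet explicitly is the safer call:

```python
from opsci_toolbox.helpers.common import load_excel  # module path assumed

# sheet_name is forwarded to pd.read_excel; with the "" default pandas would
# look for a sheet literally named "", so name the sheet (or pass 0) yourself.
df = load_excel("data/report.xlsx", sheet_name="Sheet1")
if df is not None:  # the helper returns None and prints the error on failure
    print(df.shape)
```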
@@ -91,14 +114,14 @@ def load_json(path: str) -> pd.DataFrame:
  """
  Load a JSON file into a DataFrame.

- Parameters:
- path (str): The file path to the JSON file.
+ Args:
+ path (str): The file path to the JSON file.

  Returns:
- pd.DataFrame: The loaded data as a Pandas DataFrame.
+ pd.DataFrame: The loaded data as a Pandas DataFrame.

  Raises:
- Exception: If there is an error reading the JSON file.
+ Exception: If there is an error reading the JSON file.
  """
  df = pd.DataFrame()
  try:
@@ -114,14 +137,14 @@ def load_jsonl(path: str) -> pd.DataFrame:
  """
  Load a JSON Lines (jsonl) file into a DataFrame.

- Parameters:
- path (str): The file path to the jsonl file.
+ Args:
+ path (str): The file path to the jsonl file.

  Returns:
- pd.DataFrame: The loaded data as a Pandas DataFrame.
+ pd.DataFrame: The loaded data as a Pandas DataFrame.

  Raises:
- Exception: If there is an error reading the jsonl file.
+ Exception: If there is an error reading the jsonl file.
  """
  df = pd.DataFrame()
  try:
@@ -144,16 +167,16 @@ def load_csv(path: str, delimiter: str = ";", decimal: str = ".") -> pd.DataFram
  """
  Load a CSV file into a DataFrame.

- Parameters:
- path (str): The file path to the CSV file.
- delimiter (str, optional): The delimiter used in the CSV file. Default is ";".
- decimal (str, optional): The character used for decimal points in the CSV file. Default is ".".
+ Args:
+ path (str): The file path to the CSV file.
+ delimiter (str, optional): The delimiter used in the CSV file. Default is ";".
+ decimal (str, optional): The character used for decimal points in the CSV file. Default is ".".

  Returns:
- pd.DataFrame: The loaded data as a Pandas DataFrame.
+ pd.DataFrame: The loaded data as a Pandas DataFrame.

  Raises:
- Exception: If there is an error reading the CSV file.
+ Exception: If there is an error reading the CSV file.
  """
  df = pd.DataFrame()
  try:
@@ -167,15 +190,15 @@ def read_txt_to_list(file_path: str) -> list[str]:
  """
  Read a text file line by line and append to a Python list.

- Parameters:
- file_path (str): The file path to the text file.
+ Args:
+ file_path (str): The file path to the text file.

  Returns:
- list[str]: A list of lines read from the text file.
+ list[str]: A list of lines read from the text file.

  Raises:
- FileNotFoundError: If the file does not exist.
- Exception: If any other error occurs during file reading.
+ FileNotFoundError: If the file does not exist.
+ Exception: If any other error occurs during file reading.
  """

  # Initialize an empty list to store the lines
@@ -197,15 +220,15 @@ def read_json(path: str) -> dict:
  """
  Read a JSON file and return a dictionary.

- Parameters:
- path (str): The file path to the JSON file.
+ Args:
+ path (str): The file path to the JSON file.

  Returns:
- dict: The data read from the JSON file as a dictionary.
+ dict: The data read from the JSON file as a dictionary.

  Raises:
- FileNotFoundError: If the file does not exist.
- Exception: If there is an error reading the JSON file.
+ FileNotFoundError: If the file does not exist.
+ Exception: If there is an error reading the JSON file.
  """
  with open(path, 'r') as json_file:
  data = json.load(json_file)
@@ -215,15 +238,15 @@ def read_txt_file(file_path: str) -> str:
  """
  Read the content of a text file and return it as a string.

- Parameters:
- file_path (str): The file path to the text file.
+ Args:
+ file_path (str): The file path to the text file.

  Returns:
- str: The content of the text file as a string.
+ str: The content of the text file as a string.

  Raises:
- FileNotFoundError: If the file does not exist.
- Exception: If there is an error reading the text file.
+ FileNotFoundError: If the file does not exist.
+ Exception: If there is an error reading the text file.
  """
  try:
  with open(file_path, 'r') as file:
@@ -240,15 +263,15 @@ def read_jsonl(path: str) -> list[dict]:
  """
  Load a JSON Lines (jsonl) file into a list of dictionaries.

- Parameters:
- path (str): The file path to the jsonl file.
+ Args:
+ path (str): The file path to the jsonl file.

  Returns:
- list[dict]: A list of dictionaries containing the data read from the JSON Lines file.
+ list[dict]: A list of dictionaries containing the data read from the JSON Lines file.

  Raises:
- FileNotFoundError: If the file does not exist.
- Exception: If there is an error reading the jsonl file.
+ FileNotFoundError: If the file does not exist.
+ Exception: If there is an error reading the jsonl file.
  """
  json_data = []
  try:
@@ -274,13 +297,13 @@ def write_pickle(data: pd.DataFrame, path: str, filename: str) -> str:
  """
  Write a DataFrame into a pickle file.

- Parameters:
- data (pd.DataFrame): The DataFrame to be written to the pickle file.
- path (str): The directory where the pickle file will be saved.
- filename (str): The name of the pickle file (without the extension).
+ Args:
+ data (pd.DataFrame): The DataFrame to be written to the pickle file.
+ path (str): The directory where the pickle file will be saved.
+ filename (str): The name of the pickle file (without the extension).

  Returns:
- str: The full path to the saved pickle file.
+ str: The full path to the saved pickle file.
  """
  file_path = os.path.join(path, filename + '.pickle')
  with open(file_path, 'wb') as f:
@@ -292,13 +315,13 @@ def write_list_to_txt(input_list: list, path: str, name: str) -> str:
  """
  Write a list to a text file, with each item on a new line.

- Parameters:
- - input_list (list): The list to be written to the text file.
- - path (str): The directory path where the text file will be saved.
- - name (str): The name of the text file (without the extension).
+ Args:
+ input_list (list): The list to be written to the text file.
+ path (str): The directory path where the text file will be saved.
+ name (str): The name of the text file (without the extension).

  Returns:
- str: The full path to the saved text file.
+ str: The full path to the saved text file.
  """
  file_path = os.path.join(path, name + '.txt')
  with open(file_path, 'w') as file:
@@ -310,13 +333,13 @@ def write_jsonl(data: list[dict], path: str, name: str) -> str:
  """
  Write data to a JSON Lines (jsonl) file. Each dictionary in the list represents a single JSON object.

- Parameters:
- - data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
- - path (str): The directory path where the JSON Lines file will be saved.
- - name (str): The name of the JSON Lines file (without the extension).
+ Args:
+ data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
+ path (str): The directory path where the JSON Lines file will be saved.
+ name (str): The name of the JSON Lines file (without the extension).

  Returns:
- str: The full path to the saved JSON Lines file.
+ str: The full path to the saved JSON Lines file.
  """
  file_path = os.path.join(path, name + '.jsonl')
  with open(file_path, 'w') as file:
@@ -330,13 +353,13 @@ def write_json(json_dict: dict, path: str, name: str) -> str:
  """
  Write a dictionary to a JSON file.

- Parameters:
- - json_dict (dict): The dictionary to be written to the JSON file.
- - path (str): The directory path where the JSON file will be saved.
- - name (str): The name of the JSON file (without the extension).
+ Args:
+ json_dict (dict): The dictionary to be written to the JSON file.
+ path (str): The directory path where the JSON file will be saved.
+ name (str): The name of the JSON file (without the extension).

  Returns:
- str: The full path to the saved JSON file.
+ str: The full path to the saved JSON file.
  """
  file_path = os.path.join(path, name + '.json')
  with open(file_path, 'w') as outfile:
@@ -348,14 +371,14 @@ def write_dataframe_to_json(df: pd.DataFrame, path: str, name: str, orient: str
  """
  Write a DataFrame to a JSON file.

- Parameters:
- - df (pd.DataFrame): The DataFrame to be written to the JSON file.
- - path (str): The directory path where the JSON file will be saved.
- - name (str): The name of the JSON file (without the extension).
- - orient (str, optional): The format of the JSON file. Default is 'records'.
+ Args:
+ df (pd.DataFrame): The DataFrame to be written to the JSON file.
+ path (str): The directory path where the JSON file will be saved.
+ name (str): The name of the JSON file (without the extension).
+ orient (str, optional): The format of the JSON file. Default is 'records'.

  Returns:
- str: The full path to the saved JSON file.
+ str: The full path to the saved JSON file.
  """
  file_path = os.path.join(path, name + ".json")
  df.to_json(file_path, orient=orient, lines=True)
@@ -366,14 +389,14 @@ def save_dataframe_excel(df: pd.DataFrame, path: str, name: str, sheet_name: str
  """
  Write a DataFrame to an Excel file.

- Parameters:
- - df (pd.DataFrame): The DataFrame to be written to the Excel file.
- - path (str): The directory path where the Excel file will be saved.
- - name (str): The name of the Excel file (without the extension).
- - sheet_name (str): The name of the Excel sheet.
+ Args:
+ df (pd.DataFrame): The DataFrame to be written to the Excel file.
+ path (str): The directory path where the Excel file will be saved.
+ name (str): The name of the Excel file (without the extension).
+ sheet_name (str): The name of the Excel sheet.

  Returns:
- str: The full path to the saved Excel file.
+ str: The full path to the saved Excel file.
  """
  file_path = os.path.join(path, f"{name}.xlsx")
  df.to_excel(file_path, sheet_name=sheet_name, index=False)
@@ -384,13 +407,13 @@ def add_dataframe_to_excel(df: pd.DataFrame, existing_file_path: str, new_sheet_
  """
  Adds a DataFrame to an existing Excel file as a new sheet.

- Parameters:
- - df (pd.DataFrame): The DataFrame to be added.
- - existing_file_path (str): Path to the existing Excel file.
- - new_sheet_name (str): Name of the new sheet in the Excel file.
+ Args:
+ df (pd.DataFrame): The DataFrame to be added.
+ existing_file_path (str): Path to the existing Excel file.
+ new_sheet_name (str): Name of the new sheet in the Excel file.

  Returns:
- - None
+ None
  """
  # Read existing Excel file into a dictionary of DataFrames
  excel_file = pd.read_excel(existing_file_path, sheet_name=None)
@@ -407,13 +430,13 @@ def save_dataframe_csv(df: pd.DataFrame, path: str, name: str) -> str:
  """
  Save a DataFrame to a CSV file within a specified directory.

- Parameters:
- - df (pd.DataFrame): The DataFrame to be saved.
- - path (str): The directory where the CSV file will be saved.
- - name (str): The desired name for the CSV file (without extension).
+ Args:
+ df (pd.DataFrame): The DataFrame to be saved.
+ path (str): The directory where the CSV file will be saved.
+ name (str): The desired name for the CSV file (without extension).

  Returns:
- str: The full path to the saved CSV file.
+ str: The full path to the saved CSV file.
  """
  file_path = os.path.join(path, f"{name}.csv")
  df.to_csv(
@@ -430,31 +453,31 @@ def write_txt_file(data: str, path: str, name: str) -> str:
  """
  Write a string to a text file.

- Parameters:
- - data (str): The string to be written to the text file.
- - path (str): The directory path where the text file will be saved.
- - name (str): The name of the text file (without the extension).
+ Args:
+ data (str): The string to be written to the text file.
+ path (str): The directory path where the text file will be saved.
+ name (str): The name of the text file (without the extension).

  Returns:
- str: The full path to the saved text file.
+ str: The full path to the saved text file.
  """
  file_path = os.path.join(path, name + '.txt')
  with open(file_path, "w") as file:
  file.write(data)
  return file_path

- def split_df_into_chunks(df: pd.DataFrame, path: str, name: str, chunk_size: int = 10000) -> list[str]:
+ def split_df_into_chunks(df: pd.DataFrame, path: str, name: str, chunk_size: int = 10000) -> list:
  """
  Split a DataFrame into multiple pickle files with a specified chunk size.

- Parameters:
- - df (pd.DataFrame): The DataFrame to be split.
- - path (str): The directory path where the pickle files will be saved.
- - name (str): The base name for the pickle files.
- - chunk_size (int, optional): The size of each chunk. Default is 10000.
+ Args:
+ df (pd.DataFrame): The DataFrame to be split.
+ path (str): The directory path where the pickle files will be saved.
+ name (str): The base name for the pickle files.
+ chunk_size (int, optional): The size of each chunk. Default is 10000.

  Returns:
- list[str]: A list of file paths to the saved pickle files.
+ list[str]: A list of file paths to the saved pickle files.
  """
  num_chunks = -(-len(df) // chunk_size) # Calculate the number of chunks using ceil division
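This hunk also relaxes the return annotation of split_df_into_chunks from list[str] to plain list. A hedged usage sketch (module path, directory and base name are assumptions, not values from the diff):

```python
import pandas as pd
from opsci_toolbox.helpers.common import split_df_into_chunks  # module path assumed

df = pd.DataFrame({"id": range(25_000)})
# ceil(25000 / 10000) = 3, so this should write three pickle chunks into an
# existing "chunks" directory and return their file paths.
paths = split_df_into_chunks(df, path="chunks", name="posts")
print(paths)
```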
@@ -479,11 +502,11 @@ def create_dir(path: str) -> str:
  """
  Create a local directory if it doesn't exist.

- Parameters:
- - path (str): The directory path to be created.
+ Args:
+ path (str): The directory path to be created.

  Returns:
- str: The path of the created directory.
+ str: The path of the created directory.
  """
  if not os.path.exists(path):
  os.makedirs(path)
@@ -491,31 +514,31 @@ def create_dir(path: str) -> str:
  return path


- def list_files_in_dir(path: str, filetype: str = '*.json') -> list[str]:
+ def list_files_in_dir(path: str, filetype: str = '*.json') -> list:
  """
  List files of a specific format in a directory.

- Parameters:
- - path (str): The directory path to search for files.
- - filetype (str, optional): The file type pattern to search for. Default is '*.json'.
+ Args:
+ path (str): The directory path to search for files.
+ filetype (str, optional): The file type pattern to search for.

  Returns:
- list[str]: A list of file paths matching the specified file type pattern.
+ list: A list of file paths matching the specified file type pattern.
  """
  pattern = os.path.join(path, filetype)
  files = glob.glob(pattern)
  return files


- def list_subdirectories(root_directory: str) -> list[str]:
+ def list_subdirectories(root_directory: str) -> list:
  """
  List subdirectories in a root directory.

- Parameters:
- - root_directory (str): The root directory path.
+ Args:
+ root_directory (str): The root directory path.

  Returns:
- list[str]: A list of subdirectory names.
+ list[str]: A list of subdirectory names.
  """
  subdirectories = []
  for entry in os.scandir(root_directory):
@@ -524,15 +547,15 @@ def list_subdirectories(root_directory: str) -> list[str]:
  return subdirectories


- def list_recursive_subdirectories(root_directory: str) -> list[str]:
+ def list_recursive_subdirectories(root_directory: str) -> list:
  """
  List recursively all subdirectories from a root directory.

- Parameters:
- - root_directory (str): The root directory path.
+ Args:
+ root_directory (str): The root directory path.

  Returns:
- list[str]: A list of subdirectory paths.
+ list[str]: A list of subdirectory paths.
  """
  subdirectories = []
  for root, dirs, files in os.walk(root_directory):
@@ -540,16 +563,16 @@ def list_recursive_subdirectories(root_directory: str) -> list[str]:
  return subdirectories


- def list_files_in_subdirectories(path: str, filetype: str = '*.json') -> list[str]:
+ def list_files_in_subdirectories(path: str, filetype: str = '*.json') -> list:
  """
  Walk through subdirectories of a root directory to list files of a specific format.

- Parameters:
- - path (str): The root directory path.
- - filetype (str, optional): The file type pattern to search for. Default is '*.json'.
+ Args:
+ path (str): The root directory path.
+ filetype (str, optional): The file type pattern to search for.

  Returns:
- list[str]: A list of file paths matching the specified file type pattern in subdirectories.
+ list[str]: A list of file paths matching the specified file type pattern in subdirectories.
  """
  files = []

@@ -568,13 +591,13 @@ def copy_file(source_path: str, destination_path: str, new_filename: str = '') -
  """
  Copy a file from a source path to a destination path.

- Parameters:
- - source_path (str): The path of the source file.
- - destination_path (str): The path of the destination directory.
- - new_filename (str, optional): The new filename. If not provided, the original filename is used.
+ Args:
+ source_path (str): The path of the source file.
+ destination_path (str): The path of the destination directory.
+ new_filename (str, optional): The new filename. If not provided, the original filename is used.

  Returns:
- str: The path of the copied file.
+ str: The path of the copied file.
  """
  if new_filename:
  file_path = os.path.join(destination_path, new_filename)
@@ -589,11 +612,11 @@ def remove_file(file_path: str) -> None:
  """
  Remove a single file.

- Parameters:
- - file_path (str): The path of the file to be removed.
+ Args:
+ file_path (str): The path of the file to be removed.

  Returns:
- None
+ None
  """
  try:
  os.remove(file_path)
@@ -605,11 +628,11 @@ def remove_folder(folder_path: str) -> None:
  """
  Remove a folder and all its contents.

- Parameters:
- - folder_path (str): The path of the folder to be removed.
+ Args:
+ folder_path (str): The path of the folder to be removed.

  Returns:
- None
+ None
  """
  try:
  shutil.rmtree(folder_path)
@@ -622,12 +645,11 @@ def get_file_size(file_path: str) -> tuple[int, str]:
  """
  Get the size of a single file in a readable format (KB, MB, GB).

- Parameters:
- - file_path (str): The path of the file.
+ Args:
+ file_path (str): The path of the file.

  Returns:
- tuple[int, str]: A tuple containing the size of the file in bytes and its formatted size.
- If the file is not found, returns None.
+ tuple[int, str]: A tuple containing the size of the file in bytes and its formatted size. If the file is not found, returns None.
  """
  try:
  size = os.path.getsize(file_path)
@@ -654,12 +676,12 @@ def get_folder_size(folder_path: str) -> tuple[int, str]:
  """
  Get the size of all files contained in a folder in a readable format (KB, MB, GB).

- Parameters:
- - folder_path (str): The path of the folder.
+ Args:
+ folder_path (str): The path of the folder.

  Returns:
- tuple[int, str]: A tuple containing the total size of all files in bytes and its formatted size.
- If the folder is not found, returns None.
+ tuple[int, str]: A tuple containing the total size of all files in bytes and its formatted size.
+ If the folder is not found, returns None.
  """
  total_size = 0

@@ -691,12 +713,12 @@ def file_creation_date(file_path: str) -> datetime:
  """
  Return the last update timestamp of a file.

- Parameters:
- - file_path (str): The path of the file.
+ Args:
+ file_path (str): The path of the file.

  Returns:
- datetime: The last update timestamp as a datetime object.
- If the file does not exist, returns None.
+ datetime: The last update timestamp as a datetime object.
+ If the file does not exist, returns None.
  """
  # Check if the file exists
  if os.path.exists(file_path):
@@ -717,12 +739,12 @@ def transform_to_n_items_list(lst: list, n: int) -> list[list]:
  """
  Transform a list into a list of n-items sublists.

- Parameters:
- - lst (list): The input list to be transformed.
- - n (int): The number of items in each sublist.
+ Args:
+ lst (list): The input list to be transformed.
+ n (int): The number of items in each sublist.

  Returns:
- list[list]: A list of n-items sublists.
+ list[list]: A list of n-items sublists.
  """
  return [lst[i:i + n] for i in range(0, len(lst), n)]

@@ -731,11 +753,11 @@ def unduplicate_list(lst: list) -> list:
  """
  Remove duplicate elements from a list.

- Parameters:
- - lst (list): The input list with possible duplicate elements.
+ Args:
+ lst (list): The input list with possible duplicate elements.

  Returns:
- list: A list with duplicate elements removed.
+ list: A list with duplicate elements removed.
  """
  return list(set(lst))

@@ -744,13 +766,13 @@ def sort_list(lst: list, reverse: bool = False) -> list:
  """
  Sort the list in ascending or descending order.

- Parameters:
- - lst (list): The input list.
- - reverse (bool): If True, sort the list in descending order.
+ Args:
+ lst (list): The input list.
+ reverse (bool): If True, sort the list in descending order.
  If False (default), sort the list in ascending order.

  Returns:
- list: A new list sorted based on the specified order.
+ list: A new list sorted based on the specified order.
  """
  return sorted(lst, reverse=reverse)

@@ -759,12 +781,12 @@ def map_list(lst: list, function: callable) -> list:
  """
  Apply a function to each element of the list.

- Parameters:
- - lst (list): The input list.
- - function (callable): The function to apply to each element.
+ Args:
+ lst (list): The input list.
+ function (callable): The function to apply to each element.

  Returns:
- list: A new list with the function applied to each element.
+ list: A new list with the function applied to each element.
  """
  return [function(element) for element in lst]

@@ -773,11 +795,11 @@ def flatten_list(lst: list) -> list:
  """
  Flatten a nested list into a single list.

- Parameters:
- - lst (list): The input nested list.
+ Args:
+ lst (list): The input nested list.

  Returns:
- list: A new list with all nested elements flattened.
+ list: A new list with all nested elements flattened.
  """
  flattened_list = []

@@ -796,12 +818,12 @@ def find_occurrences(lst: list, element) -> int:
  """
  Find the occurrences of a specific element in the list.

- Parameters:
- - lst (list): The input list.
- - element: The element to find occurrences of.
+ Args:
+ lst (list): The input list.
+ element: The element to find occurrences of.

  Returns:
- int: The number of occurrences of the specified element in the list.
+ int: The number of occurrences of the specified element in the list.
  """
  return lst.count(element)

@@ -810,12 +832,12 @@ def is_subset(subset: list, superset: list) -> bool:
  """
  Check if one list is a subset of another.

- Parameters:
- - subset (list): The potential subset list.
- - superset (list): The superset list.
+ Args:
+ subset (list): The potential subset list.
+ superset (list): The superset list.

  Returns:
- bool: True if the subset is a subset of the superset, False otherwise.
+ bool: True if the subset is a subset of the superset, False otherwise.
  """
  return all(element in superset for element in subset)

@@ -823,12 +845,12 @@ def common_elements(list1: list, list2: list) -> list:
  """
  Find the common elements between two lists.

- Parameters:
- - list1 (list): The first list.
- - list2 (list): The second list.
+ Args:
+ list1 (list): The first list.
+ list2 (list): The second list.

  Returns:
- list: A new list containing the common elements between list1 and list2.
+ list: A new list containing the common elements between list1 and list2.
  """
  return list(set(list1) & set(list2))

@@ -837,11 +859,11 @@ def shuffle_list(lst: list) -> list:
  """
  Shuffle the elements of the list randomly.

- Parameters:
- - lst (list): The input list.
+ Args:
+ lst (list): The input list.

  Returns:
- list: A new list with the elements shuffled randomly.
+ list: A new list with the elements shuffled randomly.
  """
  shuffled_list = lst.copy()
  random.shuffle(shuffled_list)
@@ -852,17 +874,17 @@ def sample_list(lst: list, sample_size) -> list:
  """
  Sample a list based on an integer or a float representing the sample size.

- Parameters:
- - lst (list): The input list.
- - sample_size (int or float): If an integer, the number of elements to keep.
+ Args:
+ lst (list): The input list.
+ sample_size (int or float): If an integer, the number of elements to keep.
  If a float, the percentage of elements to keep.

  Returns:
- list: A new list containing the sampled elements.
+ list: A new list containing the sampled elements.

  Raises:
- - ValueError: If the sample size is invalid (negative integer or float outside [0, 1]).
- - TypeError: If the sample size is neither an integer nor a float.
+ ValueError: If the sample size is invalid (negative integer or float outside [0, 1]).
+ TypeError: If the sample size is neither an integer nor a float.
  """
  if isinstance(sample_size, int):
  if sample_size < 0:
@@ -880,11 +902,11 @@ def count_elements(lst: list) -> dict:
  """
  Count the occurrences of each element in the list.

- Parameters:
- - lst (list): The input list.
+ Args:
+ lst (list): The input list.

  Returns:
- dict: A dictionary where keys are unique elements from the list, and values are their counts.
+ dict: A dictionary where keys are unique elements from the list, and values are their counts.
  """
  return dict(Counter(lst))

@@ -892,13 +914,13 @@ def scale_list(lst: list, min_val: float = 1, max_val: float = 5) -> list:
  """
  Scale the values of a list to a specified range.

- Parameters:
- - lst (list): The input list of values to be scaled.
- - min_val (float): The minimum value of the output range (default is 1).
- - max_val (float): The maximum value of the output range (default is 5).
+ Args:
+ lst (list): The input list of values to be scaled.
+ min_val (float): The minimum value of the output range (default is 1).
+ max_val (float): The maximum value of the output range (default is 5).

  Returns:
- - list: A new list with values scaled to the specified range.
+ list: A new list with values scaled to the specified range.
  """
  min_w = min(lst)
  max_w = max(lst)
@@ -916,15 +938,15 @@ def df_scale_column(df: pd.DataFrame, col_to_scale: str, col_out: str, min_val:
  """
  Scale values in a DataFrame column to a specified range.

- Parameters:
- - df (pd.DataFrame): The input DataFrame.
- - col_to_scale (str): The name of the column to be scaled.
- - col_out (str): The name of the new column to store scaled values.
- - min_val (float): The minimum value of the output range.
- - max_val (float): The maximum value of the output range.
+ Args:
+ df (pd.DataFrame): The input DataFrame.
+ col_to_scale (str): The name of the column to be scaled.
+ col_out (str): The name of the new column to store scaled values.
+ min_val (float): The minimum value of the output range.
+ max_val (float): The maximum value of the output range.

  Returns:
- - pd.DataFrame: The DataFrame with a new column containing scaled values.
+ pd.DataFrame: The DataFrame with a new column containing scaled values.
  """
  min_freq = df[col_to_scale].min()
  max_freq = df[col_to_scale].max()
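Both scale_list and df_scale_column describe standard min-max scaling: each value x is mapped to min_val + (x - min(x)) * (max_val - min_val) / (max(x) - min(x)). A small sketch of the documented behaviour (module path assumed; the expected values follow from that formula, not from running the package):

```python
import pandas as pd
from opsci_toolbox.helpers.common import df_scale_column  # module path assumed

df = pd.DataFrame({"freq": [2, 4, 6, 10]})
# Min-max scaling of "freq" into [1, 5]: 2 -> 1.0, 4 -> 2.0, 6 -> 3.0, 10 -> 5.0.
df = df_scale_column(df, col_to_scale="freq", col_out="freq_scaled", min_val=1, max_val=5)
print(df)
```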
@@ -939,13 +961,13 @@ def zip_file(source_file_path: str, zip_file_path: str, name: str) -> str:
  """
  Zip a single file.

- Parameters:
- - source_file_path (str): Path to the file to be zipped.
- - zip_file_path (str): Path for the resulting zip file.
- - name (str): Name for the resulting zip file (without extension).
+ Args:
+ source_file_path (str): Path to the file to be zipped.
+ zip_file_path (str): Path for the resulting zip file.
+ name (str): Name for the resulting zip file (without extension).

  Returns:
- str: Path to the resulting zip file.
+ str: Path to the resulting zip file.
  """
  file_path = os.path.join(zip_file_path, f"{name}.zip")

@@ -959,13 +981,13 @@ def zip_folder(source_folder_path: str, zip_file_path: str, name: str) -> str:
  """
  Zip an entire folder.

- Parameters:
- - source_folder_path (str): Path to the folder to be zipped.
- - zip_file_path (str): Path for the resulting zip file.
- - name (str): Name for the resulting zip file (without extension).
+ Args:
+ source_folder_path (str): Path to the folder to be zipped.
+ zip_file_path (str): Path for the resulting zip file.
+ name (str): Name for the resulting zip file (without extension).

  Returns:
- str: Path to the resulting zip file.
+ str: Path to the resulting zip file.
  """
  file_path = os.path.join(zip_file_path, f"{name}.zip")

@@ -982,12 +1004,12 @@ def unzip_file(zip_file_path: str, destination_path: str) -> None:
  """
  Unzip a zip file.

- Parameters:
- - zip_file_path (str): Path to the zip file to be unzipped.
- - destination_path (str): Path where the contents of the zip file will be extracted.
+ Args:
+ zip_file_path (str): Path to the zip file to be unzipped.
+ destination_path (str): Path where the contents of the zip file will be extracted.

  Returns:
- None
+ None
  """
  with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
  zip_ref.extractall(destination_path)
@@ -1002,11 +1024,11 @@ def create_google_spreadsheet_client(credentials: str):
  """
  Create a Gspread client to interact with Google Sheets.

- Parameters:
- - credentials (str): Path to the JSON file containing Google Service Account credentials.
+ Args:
+ credentials (str): Path to the JSON file containing Google Service Account credentials.

  Returns:
- gspread.Client: A client object for interacting with Google Sheets.
+ gspread.Client: A client object for interacting with Google Sheets.
  """
  return gspread.service_account(filename=credentials)

@@ -1014,13 +1036,13 @@ def read_google_spreadsheet(client: gspread.Client, sheet_id: str, worksheet_nam
  """
  Read data from a Google spreadsheet and return it as a DataFrame.

- Parameters:
- - client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
- - sheet_id (str): The ID of the Google spreadsheet.
- - worksheet_name (str): The name of the worksheet within the spreadsheet.
+ Args:
+ client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
+ sheet_id (str): The ID of the Google spreadsheet.
+ worksheet_name (str): The name of the worksheet within the spreadsheet.

  Returns:
- pd.DataFrame: A DataFrame containing the data from the specified worksheet.
+ pd.DataFrame: A DataFrame containing the data from the specified worksheet.
  """
  try:
  # Open the Google Spreadsheet by ID
@@ -1047,12 +1069,12 @@ def list_google_worksheets(client: gspread.Client, sheet_id: str) -> list:
  """
  Return a list of worksheet names for a spreadsheet ID.

- Parameters:
- - client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
- - sheet_id (str): The ID of the Google spreadsheet.
+ Args:
+ client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
+ sheet_id (str): The ID of the Google spreadsheet.

  Returns:
- list: A list of worksheet names.
+ list: A list of worksheet names.
  """
  sheet = client.open_by_key(sheet_id)
  worksheet_obj = sheet.worksheets()
@@ -1063,12 +1085,12 @@ def get_spreadsheet_permissions(client: gspread.Client, sheet_id: str) -> pd.Dat
  """
  Return a DataFrame with the list of user email and type that can access the document.

- Parameters:
- - client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
- - sheet_id (str): The ID of the Google spreadsheet.
+ Args:
+ client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
+ sheet_id (str): The ID of the Google spreadsheet.

  Returns:
- pd.DataFrame: A DataFrame containing the list of user email addresses and their access types.
+ pd.DataFrame: A DataFrame containing the list of user email addresses and their access types.
  """
  sheet = client.open_by_key(sheet_id)
  permissions = sheet.list_permissions()
@@ -1081,14 +1103,14 @@ def create_google_spreadsheet(client: gspread.Client, df: pd.DataFrame, filename
  """
  Create a new Google spreadsheet and load a DataFrame into it.

- Parameters:
- - client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
- - df (pd.DataFrame): The DataFrame to be loaded into the spreadsheet.
- - filename (str): The desired filename for the new spreadsheet.
- - worksheet_name (str, optional): The name of the worksheet within the spreadsheet. Defaults to "Sheet1".
+ Args:
+ client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
+ df (pd.DataFrame): The DataFrame to be loaded into the spreadsheet.
+ filename (str): The desired filename for the new spreadsheet.
+ worksheet_name (str, optional): The name of the worksheet within the spreadsheet. Defaults to "Sheet1".

  Returns:
- gspread.Spreadsheet: The created spreadsheet object.
+ gspread.Spreadsheet: The created spreadsheet object.
  """
  spreadsheet = client.create(filename)
  worksheet = spreadsheet.sheet1
@@ -1102,17 +1124,17 @@ def share_google_spreadsheet(spreadsheet: gspread.Spreadsheet, email: str, user_
  """
  Share a spreadsheet with a user.

- Parameters:
- - spreadsheet (gspread.Spreadsheet): The Google spreadsheet object to be shared.
- - email (str): The email address of the user with whom the spreadsheet will be shared.
- - user_type (str, optional): The permission type for the user. Defaults to "user".
- - user_role (str, optional): The role assigned to the user. Defaults to "writer".
- - notify (bool, optional): Whether to notify the user about the sharing. Defaults to False.
- - email_message (str, optional): The message to include in the notification email.
- - with_link (bool, optional): Whether to include a link to the shared document in the notification email. Defaults to False.
+ Args:
+ spreadsheet (gspread.Spreadsheet): The Google spreadsheet object to be shared.
+ email (str): The email address of the user with whom the spreadsheet will be shared.
+ user_type (str, optional): The permission type for the user. Defaults to "user".
+ user_role (str, optional): The role assigned to the user. Defaults to "writer".
+ notify (bool, optional): Whether to notify the user about the sharing. Defaults to False.
+ email_message (str, optional): The message to include in the notification email.
+ with_link (bool, optional): Whether to include a link to the shared document in the notification email. Defaults to False.

  Returns:
- gspread.Spreadsheet: The updated spreadsheet object.
+ gspread.Spreadsheet: The updated spreadsheet object.
  """
  spreadsheet.share(email, perm_type=user_type, role=user_role, notify=notify, email_message=email_message, with_link=with_link)
  return spreadsheet
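The Google Sheets helpers in the hunks above chain together naturally. A hedged end-to-end sketch (module path, credentials file, sheet ID and email address are placeholders, not values from the diff):

```python
from opsci_toolbox.helpers.common import (  # module path assumed
    create_google_spreadsheet_client,
    read_google_spreadsheet,
    create_google_spreadsheet,
    share_google_spreadsheet,
)

client = create_google_spreadsheet_client("service_account.json")  # hypothetical credentials file
df = read_google_spreadsheet(client, sheet_id="SPREADSHEET_ID", worksheet_name="Sheet1")

# Push the DataFrame into a new spreadsheet and grant read access to a colleague.
spreadsheet = create_google_spreadsheet(client, df, filename="export")
share_google_spreadsheet(spreadsheet, "colleague@example.com", user_role="reader")
```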
@@ -1121,11 +1143,11 @@ def generate_short_id(variables: dict) -> tuple[str, str]:
  """
  Generate an 8-character ID using a dictionary as input.

- Parameters:
- - variables (dict): A dictionary containing the variables to be serialized.
+ Args:
+ variables (dict): A dictionary containing the variables to be serialized.

  Returns:
- tuple: A tuple containing the generated short ID and the serialized variables.
+ tuple: A tuple containing the generated short ID and the serialized variables.
  """
  # Serialize variables into JSON string
  serialized_variables = json.dumps(variables, sort_keys=True)
@@ -1136,6 +1158,16 @@ def generate_short_id(variables: dict) -> tuple[str, str]:
  return short_id, serialized_variables

  def df_transform_column_as_list(column: pd.Series) -> pd.Series:
+ """
+ Transform a pandas Series where each cell is a string representation of a list,
+ a single value, or already a list into a pandas Series with each cell as a list.
+
+ Args:
+ column (pd.Series): The input pandas Series to transform.
+
+ Returns:
+ pd.Series: A pandas Series with each cell as a list.
+ """
  def transform(cell):
  if isinstance(cell, str):
  # Check if it's a list formatted as string, and convert to list
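The docstring added above is the only change to df_transform_column_as_list. A short sketch of the behaviour it describes (module path assumed; output not asserted, since the function body is only partially shown in this hunk):

```python
import pandas as pd
from opsci_toolbox.helpers.common import df_transform_column_as_list  # module path assumed

s = pd.Series(["['a', 'b']", "c", ["d", "e"]])
# Per the docstring, every cell comes back as a list: parsed from the string
# representation, wrapped for a single value, or kept as-is for real lists.
print(df_transform_column_as_list(s).tolist())
```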
@@ -1168,7 +1200,17 @@ def top_rows_per_category(df: pd.DataFrame,
  cols_to_keep: list[str],
  top_rows: int) -> pd.DataFrame:
  """
- Select top rows for each category in a dataframe
+ Select the top rows for each category in a dataframe.
+
+ Args:
+ df (pd.DataFrame): The input dataframe.
+ col_to_sort (str): The column name by which to sort the rows.
+ col_to_gb (str): The column name to group by.
+ cols_to_keep (List[str]): The list of columns to keep in the final output.
+ top_rows (int): The number of top rows to select for each group.
+
+ Returns:
+ pd.DataFrame: A dataframe containing the top rows for each category.
  """
  df_gb = (df.sort_values(by=col_to_sort, ascending=False)
  .groupby(col_to_gb)
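A hedged usage sketch for top_rows_per_category, following the argument names documented above (module path and data are assumptions):

```python
import pandas as pd
from opsci_toolbox.helpers.common import top_rows_per_category  # module path assumed

df = pd.DataFrame({
    "topic": ["a", "a", "a", "b", "b"],
    "score": [3, 9, 5, 2, 7],
    "text": ["t1", "t2", "t3", "t4", "t5"],
})
# Keep the two highest-scoring rows of each topic.
top = top_rows_per_category(df, col_to_sort="score", col_to_gb="topic",
                            cols_to_keep=["topic", "score", "text"], top_rows=2)
print(top)
```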
@@ -1179,7 +1221,13 @@ def top_rows_per_category(df: pd.DataFrame,

  def format_number(number: int) -> str:
  """
- Function to format a number in K, M or B
+ Format a number into a human-readable string with K, M, or B suffixes.
+
+ Args:
+ number (int): The number to format.
+
+ Returns:
+ str: The formatted number as a string with an appropriate suffix.
  """
  if number < 1000:
  return str(number)
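For illustration, the kind of output the rewritten docstring describes (module path assumed; exact rounding depends on the rest of the function body, which this hunk only shows the start of):

```python
from opsci_toolbox.helpers.common import format_number  # module path assumed

# Values below 1000 pass through unchanged; larger values are abbreviated
# with a K, M or B suffix.
for n in (950, 12_300, 4_500_000, 7_200_000_000):
    print(n, "->", format_number(n))
```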
@@ -1196,12 +1244,12 @@ def unrar_file(rar_file_path : str, output_dir : str) -> None:
  """
  Extracts a .rar file to the specified output directory using the unrar command.

- Parameters:
- rar_file_path (str): The path to the .rar file.
- output_dir (str): The directory where the contents should be extracted.
+ Args:
+ rar_file_path (str): The path to the .rar file.
+ output_dir (str): The directory where the contents should be extracted.

  Returns:
- None
+ None
  """
  try:
  # Ensure the output directory exists
@@ -1216,4 +1264,70 @@ def unrar_file(rar_file_path : str, output_dir : str) -> None:
  print(f"Extraction failed. Error: {result.stderr}")

  except Exception as e:
- print(f"An error occurred: {e}")
+ print(f"An error occurred: {e}")
+
+
+ def fill_nan(df: pd.DataFrame) -> pd.DataFrame:
+ """
+ Fill missing values in a DataFrame with appropriate defaults based on the column data type.
+
+ For string columns, missing values are replaced with an empty string.
+ For numeric columns, missing values are replaced with zero.
+ For datetime columns, missing values are replaced with the default date '1970-01-01'.
+ For other types, missing values are filled with NaN.
+
+ Args:
+ df (DataFrame): The DataFrame in which missing values will be filled.
+
+ Returns:
+ DataFrame: The DataFrame with missing values filled.
+ """
+ mixed_columns = df.columns[df.isna().any()]
+
+ for col in mixed_columns:
+ if df[col].dtype == 'object':
+ # For string columns, replace NaN with an empty string
+ df[col] = df[col].fillna('')
+ elif pd.api.types.is_numeric_dtype(df[col]):
+ # For numeric columns, replace NaN with the column mean
+ df[col] = df[col].fillna(0)
+ elif pd.api.types.is_datetime64_any_dtype(df[col]):
+ # For datetime columns, replace NaN with a default date
+ default_date = pd.Timestamp('1970-01-01')
+ df[col] = df[col].fillna(default_date)
+ else:
+ # For other types, we can use a general approach, such as fill with None or NaN
+ df[col] = df[col].fillna(None)
+
+ return df
+
+ def detect_encoding(file_path : str) -> str:
+ """
+ Detect the encoding of a file.
+
+ Args:
+ file_path (str): The path to the file whose encoding needs to be detected.
+
+ Returns:
+ str: The detected encoding of the file.
+ """
+ with open(file_path, 'rb') as file:
+ raw_data = file.read()
+ result = chardet.detect(raw_data)
+ return result['encoding']
+
+ def remove_empty_folders(path: str):
+ """
+ Recursively remove empty folders from the specified directory.
+
+ Parameters:
+ - path (str): Path to the directory to scan for empty folders.
+ """
+ # Iterate over the directory tree
+ for root, dirs, files in os.walk(path, topdown=False):
+ for dir_name in dirs:
+ dir_path = os.path.join(root, dir_name)
+ # If the directory is empty, remove it
+ if not os.listdir(dir_path):
+ os.rmdir(dir_path)
+ print(f"Removed empty folder: {dir_path}")