rgwfuncs 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rgwfuncs/df_lib.py CHANGED
@@ -18,22 +18,30 @@ import sqlite3
  from email.mime.multipart import MIMEMultipart
  from email.mime.text import MIMEText
  from email.mime.base import MIMEBase
+ from email import encoders
  from googleapiclient.discovery import build
  import base64
- from typing import Optional, Callable, Dict, List
+ import inspect
+ from typing import Optional, Callable, Dict, List, Tuple, Any


  def docs(method_type_filter: Optional[str] = None) -> None:
  """
- Print a list of function names in alphabetical order. If method_type_filter is specified,
- print the docstrings of the functions that match the filter.
+ Print a list of function names in alphabetical order. If
+ method_type_filter is specified, print the docstrings of the functions
+ that match the filter. Using '*' as a filter will print the docstrings for
+ all functions.

  Parameters:
- method_type_filter: Optional filter string, comma-separated, to select docstring types.
+ method_type_filter: Optional filter string, comma-separated to select
+ docstring types, or '*' for all.
  """
  # Get the current module's namespace
+ current_module = __name__
+
  local_functions: Dict[str, Callable] = {
- name: obj for name, obj in globals().items() if callable(obj)
+ name: obj for name, obj in globals().items()
+ if inspect.isfunction(obj) and obj.__module__ == current_module
  }

  # List of function names sorted alphabetically
@@ -44,23 +52,28 @@ def docs(method_type_filter: Optional[str] = None) -> None:
  for name in function_names:
  print(name)

- # If a filter is provided, print the docstrings of functions that match the filter
+ # If a filter is provided or '*', print the docstrings of functions
  if method_type_filter:
- function_type_list: List[str] = [mt.strip() for mt in method_type_filter.split(',')]
  print("\nFiltered function documentation:")
-
  for name, func in local_functions.items():
  docstring: Optional[str] = func.__doc__
  if docstring:
- # Extract only the first line of the docstring
- first_line: str = docstring.split('\n')[0]
- if "::" in first_line:
- # Find the first occurrence of "::" and split there
- split_index: int = first_line.find("::")
- function_type: str = first_line[:split_index].strip()
- if function_type in function_type_list:
- function_description: str = first_line[split_index + 2:].strip()
- print(f"{name}: {function_description}")
+ if method_type_filter == '*':
+ # Print the entire docstring for each function
+ print(f"\n{name}:\n{docstring}")
+ else:
+ # Extract only the first line of the docstring
+ first_line: str = docstring.split('\n')[0]
+ if "::" in first_line:
+ # Find the first occurrence of "::" and split there
+ split_index: int = first_line.find("::")
+ function_type: str = first_line[:split_index].strip()
+ function_type_list: List[str] = [
+ mt.strip() for mt in method_type_filter.split(',')]
+ if function_type in function_type_list:
+ # Print the entire docstring if the filter matches
+ print(f"\n{name}:\n{docstring}")
+

  def numeric_clean(
  df: pd.DataFrame,
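Note: a minimal usage sketch of the reworked docs() filter, following the logic in the hunk above (the import path follows rgwfuncs/df_lib.py; 'LOAD' is a hypothetical docstring type label):

    from rgwfuncs.df_lib import docs

    docs()                           # names only, alphabetical
    docs(method_type_filter='*')     # full docstring for every function
    docs(method_type_filter='LOAD')  # only functions whose first docstring
                                     # line starts with 'LOAD::'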
@@ -70,34 +83,45 @@ def numeric_clean(
  ) -> pd.DataFrame:
  """
  Cleans the numeric columns based on specified treatments.
-
+
  Parameters:
  df: The DataFrame to clean.
- column_names: A comma-separated string containing the names of the columns to clean.
- column_type: The type to convert the column to ('INTEGER' or 'FLOAT').
- irregular_value_treatment: How to treat irregular values ('NAN', 'TO_ZERO', 'MEAN').
+ column_names: A comma-separated string containing the names of the
+ columns to clean.
+ column_type: The type to convert the column to ('INTEGER' or
+ 'FLOAT').
+ irregular_value_treatment: How to treat irregular values ('NAN',
+ 'TO_ZERO', 'MEAN').

  Returns:
  A new DataFrame with cleaned numeric columns.
  """
  df_copy = df.copy() # Avoid mutating the original DataFrame
- columns_list: List[str] = [name.strip() for name in column_names.split(',')]
+ columns_list: List[str] = [name.strip()
+ for name in column_names.split(',')]

  for column_name in columns_list:
  if column_name not in df_copy.columns:
- raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")
+ raise ValueError(
+ f"Column '{column_name}' does not exist in the DataFrame.")

  if column_type not in ['INTEGER', 'FLOAT']:
  raise ValueError("column_type must be 'INTEGER' or 'FLOAT'.")

  if irregular_value_treatment not in ['NAN', 'TO_ZERO', 'MEAN']:
- raise ValueError("irregular_value_treatment must be 'NAN', 'TO_ZERO', or 'MEAN'.")
+ raise ValueError(
+ "irregular_value_treatment must be 'NAN', 'TO_ZERO', or"
+ + "'MEAN'.")

  # Convert column type
  if column_type == 'INTEGER':
- df_copy[column_name] = pd.to_numeric(df_copy[column_name], errors='coerce').astype(pd.Int64Dtype())
+ df_copy[column_name] = pd.to_numeric(
+ df_copy[column_name],
+ errors='coerce').astype(
+ pd.Int64Dtype())
  elif column_type == 'FLOAT':
- df_copy[column_name] = pd.to_numeric(df_copy[column_name], errors='coerce').astype(float)
+ df_copy[column_name] = pd.to_numeric(
+ df_copy[column_name], errors='coerce').astype(float)

  # Handle irregular values
  if irregular_value_treatment == 'NAN':
@@ -110,6 +134,7 @@ def numeric_clean(

  return df_copy

+
  def limit_dataframe(df: pd.DataFrame, num_rows: int) -> pd.DataFrame:
  """
  Limit the DataFrame to a specified number of rows.
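Note: a usage sketch of numeric_clean as documented above (the sample frame and values are illustrative):

    import pandas as pd
    from rgwfuncs.df_lib import numeric_clean

    df = pd.DataFrame({'price': ['10', 'oops', '30'], 'qty': ['1', '2', '3']})
    cleaned = numeric_clean(df, 'price, qty', 'INTEGER', 'TO_ZERO')
    # 'oops' is coerced to NaN by pd.to_numeric, then handled per 'TO_ZERO'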
@@ -120,15 +145,16 @@ def limit_dataframe(df: pd.DataFrame, num_rows: int) -> pd.DataFrame:

  Returns:
  A new DataFrame limited to the specified number of rows.
-
+
  Raises:
  ValueError: If num_rows is not an integer.
  """
  if not isinstance(num_rows, int):
  raise ValueError("The number of rows should be an integer.")
-
+
  return df.head(num_rows)

+
  def from_raw_data(headers: List[str], data: List[List[int]]) -> pd.DataFrame:
  """
  Create a DataFrame from raw data.
@@ -150,13 +176,15 @@ def from_raw_data(headers: List[str], data: List[List[int]]) -> pd.DataFrame:

  return df

+
  def append_rows(df: pd.DataFrame, rows: List[List]) -> pd.DataFrame:
  """
  Append rows to the DataFrame.

  Parameters:
  df: The original DataFrame.
- rows: A list of lists, where each inner list represents a row to be appended.
+ rows: A list of lists, where each inner list represents a row to be
+ appended.

  Returns:
  A new DataFrame with the appended rows.
@@ -164,7 +192,12 @@ def append_rows(df: pd.DataFrame, rows: List[List]) -> pd.DataFrame:
  Raises:
  ValueError: If rows are not in the correct format.
  """
- if not isinstance(rows, list) or not all(isinstance(row, list) for row in rows):
+ if not isinstance(
+ rows,
+ list) or not all(
+ isinstance(
+ row,
+ list) for row in rows):
  raise ValueError("Rows should be provided as a list of lists.")

  if df.empty:
@@ -175,6 +208,7 @@ def append_rows(df: pd.DataFrame, rows: List[List]) -> pd.DataFrame:

  return new_df

+
  def append_columns(df: pd.DataFrame, *col_names: str) -> pd.DataFrame:
  """
  Append columns to the DataFrame with None values.
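Note: a sketch of append_rows and append_columns together, per the docstrings above (sample values are illustrative):

    import pandas as pd
    from rgwfuncs.df_lib import append_rows, append_columns

    df = pd.DataFrame({'id': [1], 'name': ['a']})
    df = append_rows(df, [[2, 'b'], [3, 'c']])  # list of lists, one per row
    df = append_columns(df, 'email', 'phone')   # new columns filled with None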
@@ -198,6 +232,7 @@ def append_columns(df: pd.DataFrame, *col_names: str) -> pd.DataFrame:

  return new_df

+
  def update_rows(
  df: pd.DataFrame,
  condition: str,
@@ -209,7 +244,8 @@ def update_rows(
  Parameters:
  df: The original DataFrame.
  condition: A query condition to identify rows for updating.
- updates: A dictionary with column names as keys and new values as values.
+ updates: A dictionary with column names as keys and new values as
+ values.

  Returns:
  A new DataFrame with the updated rows.
@@ -227,7 +263,9 @@ def update_rows(

  invalid_cols = [col for col in updates if col not in df.columns]
  if invalid_cols:
- raise ValueError(f"Columns {', '.join(invalid_cols)} do not exist in the DataFrame.")
+ raise ValueError(
+ f"Columns {
+ ', '.join(invalid_cols)} do not exist in the DataFrame.")

  new_df = df.copy()
  for col_name, new_value in updates.items():
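Note: a sketch of update_rows with a pandas query condition (illustrative data):

    import pandas as pd
    from rgwfuncs.df_lib import update_rows

    df = pd.DataFrame({'name': ['a', 'b'], 'status': ['new', 'new']})
    df = update_rows(df, "name == 'a'", {'status': 'active'})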
@@ -235,6 +273,7 @@ def update_rows(

  return new_df

+
  def delete_rows(df: pd.DataFrame, condition: str) -> pd.DataFrame:
  """
  Delete rows from the DataFrame based on a condition.
@@ -258,6 +297,7 @@ def delete_rows(df: pd.DataFrame, condition: str) -> pd.DataFrame:

  return new_df

+
  def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
  """
  Drop duplicate rows in the DataFrame, retaining the first occurrence.
@@ -267,7 +307,7 @@ def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:

  Returns:
  A new DataFrame with duplicates removed.
-
+
  Raises:
  ValueError: If the DataFrame is None.
  """
@@ -275,54 +315,73 @@ def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
  raise ValueError("DataFrame is not initialized.")
  return df.drop_duplicates(keep='first')

- def drop_duplicates_retain_first(df: pd.DataFrame, columns: Optional[str] = None) -> pd.DataFrame:
+
+ def drop_duplicates_retain_first(
+ df: pd.DataFrame,
+ columns: Optional[str] = None) -> pd.DataFrame:
  """
- Drop duplicate rows in the DataFrame based on specified columns, retaining the first occurrence.
+ Drop duplicate rows in the DataFrame based on specified columns, retaining
+ the first occurrence.

  Parameters:
  df: The DataFrame from which duplicates will be dropped.
- columns: A comma-separated string with the column names used to identify duplicates.
+ columns: A comma-separated string with the column names used to
+ identify duplicates.

  Returns:
  A new DataFrame with duplicates removed.
-
+
  Raises:
  ValueError: If the DataFrame is None.
  """
  if df is None:
  raise ValueError("DataFrame is not initialized.")
-
- columns_list = [col.strip() for col in columns.split(',')] if columns else None
+
+ columns_list = [col.strip()
+ for col in columns.split(',')] if columns else None
  return df.drop_duplicates(subset=columns_list, keep='first')

- def drop_duplicates_retain_last(df: pd.DataFrame, columns: Optional[str] = None) -> pd.DataFrame:
+
+ def drop_duplicates_retain_last(
+ df: pd.DataFrame,
+ columns: Optional[str] = None) -> pd.DataFrame:
  """
- Drop duplicate rows in the DataFrame based on specified columns, retaining the last occurrence.
+ Drop duplicate rows in the DataFrame based on specified columns, retaining
+ the last occurrence.

  Parameters:
  df: The DataFrame from which duplicates will be dropped.
- columns: A comma-separated string with the column names used to identify duplicates.
+ columns: A comma-separated string with the column names used to
+ identify duplicates.

  Returns:
  A new DataFrame with duplicates removed.
-
+
  Raises:
  ValueError: If the DataFrame is None.
  """
  if df is None:
  raise ValueError("DataFrame is not initialized.")
-
- columns_list = [col.strip() for col in columns.split(',')] if columns else None
+
+ columns_list = [col.strip()
+ for col in columns.split(',')] if columns else None
  return df.drop_duplicates(subset=columns_list, keep='last')

- def load_data_from_query(db_preset_name: str, query: str, config_file_name: str = "rgwml.config") -> pd.DataFrame:
+
+ def load_data_from_query(
+ db_preset_name: str,
+ query: str,
+ config_file_name: str = "rgwml.config") -> pd.DataFrame:
  """
- Load data from a database query into a DataFrame based on a configuration preset.
+ Load data from a database query into a DataFrame based on a configuration
+ preset.

  Parameters:
- db_preset_name: The name of the database preset in the configuration file.
+ db_preset_name: The name of the database preset in the configuration
+ file.
  query: The SQL query to execute.
- config_file_name: Name of the configuration file (default: 'rgwml.config').
+ config_file_name: Name of the configuration file
+ (default: 'rgwml.config').

  Returns:
  A DataFrame containing the query result.
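Note: a sketch contrasting the two column-scoped de-duplication helpers (illustrative data):

    import pandas as pd
    from rgwfuncs.df_lib import (drop_duplicates_retain_first,
                                 drop_duplicates_retain_last)

    df = pd.DataFrame({'id': [1, 1, 2], 'v': ['x', 'y', 'z']})
    drop_duplicates_retain_first(df, 'id')  # keeps (1, 'x') and (2, 'z')
    drop_duplicates_retain_last(df, 'id')   # keeps (1, 'y') and (2, 'z')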
@@ -344,45 +403,56 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
  for root, dirs, files in os.walk(path):
  if filename in files:
  return os.path.join(root, filename)
- raise FileNotFoundError(f"{filename} not found in Desktop, Documents, or Downloads folders")
+ raise FileNotFoundError(
+ f"{filename} not found in Desktop, Documents, or Downloads"
+ + "folders")

  def query_mssql(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
- """Execute a query on an MSSQL database and return the result as a DataFrame."""
  server = db_preset['host']
  user = db_preset['username']
  password = db_preset['password']
  database = db_preset.get('database', '')

- with pymssql.connect(server=server, user=user, password=password, database=database) as conn:
+ with pymssql.connect(server=server, user=user, password=password,
+ database=database) as conn:
  with conn.cursor() as cursor:
  cursor.execute(query)
  rows = cursor.fetchall()
  columns = [desc[0] for desc in cursor.description]
-
+
  return pd.DataFrame(rows, columns=columns)

  def query_mysql(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
- """Execute a query on a MySQL database and return the result as a DataFrame."""
  host = db_preset['host']
  user = db_preset['username']
  password = db_preset['password']
  database = db_preset.get('database', '')

- with mysql.connector.connect(host=host, user=user, password=password, database=database) as conn:
+ with mysql.connector.connect(
+ host=host,
+ user=user,
+ password=password,
+ database=database
+ ) as conn:
  with conn.cursor() as cursor:
  cursor.execute(query)
  rows = cursor.fetchall()
- columns = [desc[0] for desc in cursor.description] if cursor.description else []
-
+ columns = (
+ [desc[0] for desc in cursor.description]
+ if cursor.description
+ else []
+ )
+
  return pd.DataFrame(rows, columns=columns)

- def query_clickhouse(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
- """Query a ClickHouse database and return the result as a DataFrame."""
+ def query_clickhouse(
+ db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
+
  host = db_preset['host']
  user = db_preset['username']
  password = db_preset['password']
  database = db_preset['database']
-
+
  max_retries = 5
  retry_delay = 5

@@ -405,21 +475,23 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
  print(f"Retrying in {retry_delay} seconds...")
  time.sleep(retry_delay)
  else:
- raise ConnectionError("All attempts to connect to ClickHouse failed.")
+ raise ConnectionError(
+ "All attempts to connect to ClickHouse failed.")

- def query_google_big_query(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
- """Query a Google BigQuery database and return the result as a DataFrame."""
+ def query_google_big_query(
+ db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
  json_file_path = db_preset['json_file_path']
  project_id = db_preset['project_id']

- credentials = service_account.Credentials.from_service_account_file(json_file_path)
+ credentials = service_account.Credentials.from_service_account_file(
+ json_file_path)
  client = bigquery.Client(credentials=credentials, project=project_id)

  query_job = client.query(query)
  results = query_job.result()
  rows = [list(row.values()) for row in results]
  columns = [field.name for field in results.schema]
-
+
  return pd.DataFrame(rows, columns=columns)

  # Read the configuration file to get the database preset
@@ -428,7 +500,9 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
  config = json.load(f)

  db_presets = config.get('db_presets', [])
- db_preset = next((preset for preset in db_presets if preset['name'] == db_preset_name), None)
+ db_preset = next(
+ (preset for preset in db_presets if preset['name'] == db_preset_name),
+ None)
  if not db_preset:
  raise ValueError(f"No matching db_preset found for {db_preset_name}")

@@ -446,7 +520,6 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
  raise ValueError(f"Unsupported db_type: {db_type}")


-
  def load_data_from_path(file_path: str) -> pd.DataFrame:
  """
  Load data from a file into a DataFrame based on the file extension.
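Note: a sketch of the preset-driven loader. The db_presets key names (name, host, username, password, database) appear in the code above; the exact 'db_type' string values are an assumption inferred from the query_* helpers:

    # rgwml.config, discovered under ~/Desktop, ~/Documents, or ~/Downloads:
    # {"db_presets": [{"name": "sales_db",
    #                  "db_type": "mysql",            # assumed value
    #                  "host": "...", "username": "...",
    #                  "password": "...", "database": "sales"}]}
    from rgwfuncs.df_lib import load_data_from_query

    df = load_data_from_query('sales_db', 'SELECT * FROM orders LIMIT 10')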
@@ -460,7 +533,7 @@ def load_data_from_path(file_path: str) -> pd.DataFrame:

  Raises:
  ValueError: If the file extension is unsupported.
-
+
  def load_hdf5(file_path: str) -> pd.DataFrame:
  """Helper function to load HDF5 files and select a key if necessary."""
  with pd.HDFStore(file_path, mode='r') as store:
@@ -476,7 +549,8 @@ def load_data_from_path(file_path: str) -> pd.DataFrame:
  df = pd.read_hdf(file_path, key=key)
  break
  else:
- print(f"Key '{key}' is not in the available keys. Please try again.")
+ print(
+ f"Key '{key}' is not in the available keys.")
  return df

  # Ensure the file path is absolute
@@ -510,7 +584,8 @@ def load_data_from_path(file_path: str) -> pd.DataFrame:

  def load_data_from_sqlite_path(sqlite_path: str, query: str) -> pd.DataFrame:
  """
- Execute a query on a SQLite database specified by its path and return the results as a DataFrame.
+ Execute a query on a SQLite database specified by its path and return the
+ results as a DataFrame.

  Parameters:
  sqlite_path: The absolute path to the SQLite database file.
@@ -522,7 +597,7 @@ def load_data_from_sqlite_path(sqlite_path: str, query: str) -> pd.DataFrame:
  Raises:
  ValueError: If there is a problem executing the query.
  """
-
+
  # Ensure the file path is absolute
  sqlite_path = os.path.abspath(sqlite_path)

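Note: a one-line sketch of the SQLite loader (the path and query are illustrative):

    from rgwfuncs.df_lib import load_data_from_sqlite_path

    df = load_data_from_sqlite_path('/absolute/path/app.db',
                                    'SELECT * FROM users')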
@@ -535,68 +610,168 @@ def load_data_from_sqlite_path(sqlite_path: str, query: str) -> pd.DataFrame:
  gc.collect()
  return df

+
  def first_n_rows(df: pd.DataFrame, n: int) -> None:
- """Print the first n rows of the DataFrame."""
+ """
+ Display the first n rows of the DataFrame.
+
+ This function prints out the first `n` rows of a given DataFrame. Each row
+ is formatted for clarity and
+ printed as a dictionary. If the DataFrame is empty or `None`, it raises a
+ ValueError.
+
+ Parameters:
+ - df (pd.DataFrame): The DataFrame to display rows from.
+ - n (int): The number of rows to display from the start of the DataFrame.
+
+ Raises:
+ - ValueError: If the DataFrame is `None`.
+ """
  if df is not None:
  first_n_rows = df.head(n).to_dict(orient="records")
  for row in first_n_rows:
  pprint(row, indent=4)
  print()
  else:
- raise ValueError("No DataFrame to display. Please provide a DataFrame.")
+ raise ValueError(
+ "No DataFrame to display. Please provide a DataFrame.")

  gc.collect()

+
  def last_n_rows(df: pd.DataFrame, n: int) -> None:
- """Print the last n rows of the DataFrame."""
+ """
+ Display the last n rows of the DataFrame.
+
+ Prints the last `n` rows of a given DataFrame, formatted as dictionaries.
+ Useful for end-segment analysis and verifying data continuity.
+
+ Parameters:
+ - df (pd.DataFrame): The DataFrame from which to display rows.
+ - n (int): The number of rows to display from the end of the DataFrame.
+
+ Raises:
+ - ValueError: If the DataFrame is `None`.
+ """
  if df is not None:
  last_n_rows = df.tail(n).to_dict(orient="records")
  for row in last_n_rows:
  pprint(row, indent=4)
  print()
  else:
- raise ValueError("No DataFrame to display. Please provide a DataFrame.")
+ raise ValueError(
+ "No DataFrame to display. Please provide a DataFrame.")

  gc.collect()

+
  def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
- """Print top n unique values for specified columns in the DataFrame."""
+ """
+ Print the top `n` unique values for specified columns in the DataFrame.
+
+ This method calculates and prints the top `n` unique frequency values for
+ specified columns in a DataFrame.
+
+ Parameters:
+ - df (pd.DataFrame): The DataFrame from which to calculate top unique
+ values.
+ - n (int): Number of top values to display.
+ - columns (List[str]): List of column names for which to display top
+ unique values.
+
+ Raises:
+ - ValueError: If the DataFrame is `None`.
+ """
  if df is not None:
  report = {}
  for column in columns:
  if column in df.columns:
  frequency = df[column].astype(str).value_counts(dropna=False)
- frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+ frequency = frequency.rename(
+ index={
+ 'nan': 'NaN',
+ 'NaT': 'NaT',
+ 'None': 'None',
+ '': 'Empty'})
  top_n_values = frequency.nlargest(n)
- report[column] = {str(value): str(count) for value, count in top_n_values.items()}
- print(f"Top {n} unique values for column '{column}':\n{json.dumps(report[column], indent=2)}\n")
+ report[column] = {str(value): str(count)
+ for value, count in top_n_values.items()}
+ print(
+ f"Top {n} unique values for column '{column}':\n{
+ json.dumps(
+ report[column],
+ indent=2)}\n")
  else:
  print(f"Column '{column}' does not exist in the DataFrame.")
  else:
- raise ValueError("No DataFrame to display. Please provide a DataFrame.")
+ raise ValueError(
+ "No DataFrame to display. Please provide a DataFrame.")

  gc.collect()

- def bottom_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
- """Print bottom n unique values for specified columns in the DataFrame."""
+
+ def bottom_n_unique_values(
+ df: pd.DataFrame,
+ n: int,
+ columns: List[str]) -> None:
+ """
+ Print the bottom `n` unique values for specified columns in the DataFrame.
+
+ This method calculates and prints the bottom `n` unique frequency values
+ for specified columns in a DataFrame.
+
+ Parameters:
+ - df (pd.DataFrame): The DataFrame from which to calculate bottom unique
+ values.
+ - n (int): Number of bottom unique frequency values to display.
+ - columns (List[str]): List of column names for which to display bottom
+ unique values.
+
+ Raises:
+ - ValueError: If the DataFrame is `None`.
+ """
  if df is not None:
  report = {}
  for column in columns:
  if column in df.columns:
  frequency = df[column].astype(str).value_counts(dropna=False)
- frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+ frequency = frequency.rename(
+ index={
+ 'nan': 'NaN',
+ 'NaT': 'NaT',
+ 'None': 'None',
+ '': 'Empty'})
  bottom_n_values = frequency.nsmallest(n)
- report[column] = {str(value): str(count) for value, count in bottom_n_values.items()}
- print(f"Bottom {n} unique values for column '{column}':\n{json.dumps(report[column], indent=2)}\n")
+ report[column] = {
+ str(value): str(count) for value,
+ count in bottom_n_values.items()}
+ print(
+ f"Bottom {n} unique values for column '{column}':\n{
+ json.dumps(
+ report[column],
+ indent=2)}\n")
  else:
  print(f"Column '{column}' does not exist in the DataFrame.")
  else:
- raise ValueError("No DataFrame to display. Please provide a DataFrame.")
+ raise ValueError(
+ "No DataFrame to display. Please provide a DataFrame.")

  gc.collect()

- def print_correlation(df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) -> None:
- """Print correlation for multiple pairs of columns in the DataFrame."""
+
+ def print_correlation(
+ df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) -> None:
+ """
+ Print correlation for multiple pairs of columns in the DataFrame.
+
+ This function computes and displays the correlation coefficients for
+ specified pairs of columns.
+
+ Parameters:
+ - df (pd.DataFrame): The DataFrame containing the columns to analyze.
+ - column_pairs (List[Tuple[str, str]]): List of column pairs for which to
+ compute correlations.
+ """
  if df is not None:
  for col1, col2 in column_pairs:
  if col1 in df.columns and col2 in df.columns:
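Note: a sketch of the frequency helpers documented above (illustrative data; note the NaN/None/Empty relabeling performed by the code):

    import pandas as pd
    from rgwfuncs.df_lib import top_n_unique_values, bottom_n_unique_values

    df = pd.DataFrame({'city': ['NY', 'NY', 'LA', None]})
    top_n_unique_values(df, 2, ['city'])     # prints counts, e.g. {'NY': '2', ...}
    bottom_n_unique_values(df, 1, ['city'])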
@@ -606,30 +781,68 @@ def print_correlation(df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) ->

  correlation = numeric_col1.corr(numeric_col2)
  if pd.notnull(correlation):
- print(f"The correlation between '{col1}' and '{col2}' is {correlation}.")
+ print(
+ f"The correlation between '{col1}' and '{col2}'"
+ + f" is {correlation}.")
  else:
- print(f"Cannot calculate correlation between '{col1}' and '{col2}' due to insufficient numeric data.")
+ print(
+ f"Cannot calculate correlation between '{col1}'"
+ + f" and '{col2}' due to insufficient numeric"
+ + " data.")
  except Exception as e:
- print(f"Error processing columns '{col1}' and '{col2}': {e}")
+ print(
+ f"Error processing cols '{col1}' and '{col2}': {e}")
  else:
- print(f"One or both of the specified columns ('{col1}', '{col2}') do not exist in the DataFrame.")
+ print(
+ f"One or both of the specified cols ('{col1}', '{col2}')"
+ + " do not exist in the DataFrame.")
  else:
  print("The DataFrame is empty.")

  gc.collect()

+
  def print_memory_usage(df: pd.DataFrame) -> None:
- """Print memory usage of the DataFrame."""
+ """
+ Prints the memory usage of the DataFrame.
+
+ This function computes the memory footprint of a DataFrame in megabytes
+ and displays it, rounding to two decimal places for clarity.
+
+ Parameters:
+ - df (pd.DataFrame): The DataFrame for which the memory usage is computed.
+
+ Raises:
+ - ValueError: If the DataFrame is `None`.
+ """
  if df is not None:
- memory_usage = df.memory_usage(deep=True).sum() / (1024 * 1024) # Convert bytes to MB
+ memory_usage = df.memory_usage(deep=True).sum(
+ ) / (1024 * 1024) # Convert bytes to MB
  print(f"Memory usage of DataFrame: {memory_usage:.2f} MB")
  else:
  raise ValueError("No DataFrame to print. Please provide a DataFrame.")

  gc.collect()

+
  def filter_dataframe(df: pd.DataFrame, filter_expr: str) -> pd.DataFrame:
- """Filter DataFrame with a given expression."""
+ """
+ Return a filtered DataFrame according to the given expression.
+
+ This function filters rows of a DataFrame using a specified query
+ expression, returning a new DataFrame containing only the rows that
+ match the criteria.
+
+ Parameters:
+ - df (pd.DataFrame): The original DataFrame to be filtered.
+ - filter_expr (str): A query string to be evaluated against the DataFrame.
+
+ Returns:
+ - pd.DataFrame: A new DataFrame containing the filtered rows.
+
+ Raises:
+ - ValueError: If the DataFrame is `None`.
+ """
  if df is not None:
  try:
  filtered_df = df.query(filter_expr)
@@ -642,14 +855,34 @@ def filter_dataframe(df: pd.DataFrame, filter_expr: str) -> pd.DataFrame:

  return filtered_df

+
  def filter_indian_mobiles(df: pd.DataFrame, mobile_col: str) -> pd.DataFrame:
- """Filter DataFrame for Indian mobile numbers."""
+ """
+ Filter and return DataFrame rows containing valid Indian mobile numbers.
+
+ This function processes a DataFrame to extract and retain rows where the
+ specified column matches the typical format for Indian mobile numbers.
+ An Indian mobile number is expected to be a digit-only string starting
+ with 6, 7, 8, or 9, and should have at least 4 distinct digits.
+
+ Parameters:
+ - df (pd.DataFrame): The DataFrame to filter.
+ - mobile_col (str): The name of the column in the DataFrame that contains
+ mobile number data.
+
+ Returns:
+ - pd.DataFrame: A new DataFrame containing only rows with valid Indian
+ mobile numbers.
+
+ Raises:
+ - ValueError: If the DataFrame is `None`.
+ """
  if df is not None:
  filtered_df = df[
  df[mobile_col].apply(
  lambda x: (
- str(x).isdigit() and
- str(x).startswith(('6', '7', '8', '9')) and
+ str(x).isdigit() and
+ str(x).startswith(('6', '7', '8', '9')) and
  len(set(str(x))) >= 4
  )
  )
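Note: a sketch of filter_dataframe and filter_indian_mobiles, per the docstrings above (illustrative data):

    import pandas as pd
    from rgwfuncs.df_lib import filter_dataframe, filter_indian_mobiles

    df = pd.DataFrame({'mobile': ['9876543210', '1234567890', 'abc'],
                       'age': [25, 40, 31]})
    adults = filter_dataframe(df, 'age > 30')
    mobiles = filter_indian_mobiles(df, 'mobile')  # keeps '9876543210' only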
@@ -661,17 +894,21 @@ def filter_indian_mobiles(df: pd.DataFrame, mobile_col: str) -> pd.DataFrame:

  return filtered_df

+
  def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:
  """
- Print the DataFrame and its column types. If a source path is provided, print it as well.
+ Print the DataFrame and its column types. If a source path is provided,
+ print it as well.

  Parameters:
  df: The DataFrame to print.
- source: Optional; The source path of the DataFrame for logging purposes.
+ source: Optional; The source path of the DataFrame for logging
+ purposes.
  """
  if df is not None:
  print(df)
- columns_with_types = [f"{col} ({df[col].dtypes})" for col in df.columns]
+ columns_with_types = [
+ f"{col} ({df[col].dtypes})" for col in df.columns]
  print("Columns:", columns_with_types)
  if source:
  print(f"Source: {source}")
@@ -680,28 +917,43 @@ def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:

  gc.collect()

- def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
+
+ def send_dataframe_via_telegram(
+ df: pd.DataFrame,
+ bot_name: str,
+ message: Optional[str] = None,
+ as_file: bool = True,
+ remove_after_send: bool = True) -> None:
  """
  Send a DataFrame via Telegram using a specified bot configuration.

  Parameters:
  df: The DataFrame to send.
- bot_name: The name of the Telegram bot as specified in the configuration.
+ bot_name: The name of the Telegram bot as specified in the
+ configuration.
  message: Custom message to send along with the DataFrame or file.
- as_file: Boolean flag to decide whether to send the DataFrame as a file or as text.
+ as_file: Boolean flag to decide whether to send the DataFrame as a
+ file or as text.
  remove_after_send: If True, removes the file after sending.
  """

  def locate_config_file(filename: str = "rgwml.config") -> str:
  """Retrieve the configuration file path."""
  home_dir = os.path.expanduser("~")
- search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
+ search_paths = [
+ os.path.join(
+ home_dir,
+ folder) for folder in [
+ "Desktop",
+ "Documents",
+ "Downloads"]]

  for path in search_paths:
  for root, _, files in os.walk(path):
  if filename in files:
  return os.path.join(root, filename)
- raise FileNotFoundError(f"{filename} not found in Desktop, Documents, or Downloads folders")
+ raise FileNotFoundError(
+ f"{filename} not found in Desktop, Documents, or Downloads")

  def get_config(config_path: str) -> dict:
  """Load configuration from a json file."""
@@ -710,8 +962,14 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option

  config_path = locate_config_file()
  config = get_config(config_path)
+ bot_config = next(
+ (
+ bot for bot in config['telegram_bot_presets']
+ if bot['name'] == bot_name
+ ),
+ None
+ )

- bot_config = next((bot for bot in config['telegram_bot_presets'] if bot['name'] == bot_name), None)
  if not bot_config:
  raise ValueError(f"No bot found with the name {bot_name}")

@@ -724,9 +982,15 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
  df.to_csv(file_name, index=False)
  try:
  with open(file_name, 'rb') as file:
- payload = {'chat_id': bot_config['chat_id'], 'caption': message or ''}
+ payload = {
+ 'chat_id': bot_config['chat_id'],
+ 'caption': message or ''}
  files = {'document': file}
- response = requests.post(f"https://api.telegram.org/bot{bot_config['bot_token']}/sendDocument", data=payload, files=files)
+ response = requests.post(
+ f"https://api.telegram.org/bot{
+ bot_config['bot_token']}/sendDocument",
+ data=payload,
+ files=files)
  if remove_after_send and os.path.exists(file_name):
  os.remove(file_name)
  except Exception as e:
@@ -734,14 +998,20 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
  raise
  else:
  df_str = df.to_string()
- payload = {'chat_id': bot_config['chat_id'], 'text': message + "\n\n" + df_str if message else df_str, 'parse_mode': 'HTML'}
- response = requests.post(f"https://api.telegram.org/bot{bot_config['bot_token']}/sendMessage", data=payload)
+ payload = {
+ 'chat_id': bot_config['chat_id'],
+ 'text': message + "\n\n" + df_str if message else df_str,
+ 'parse_mode': 'HTML'}
+ response = requests.post(
+ f"https://api.telegram.org/bot{bot_config['bot_token']}"
+ + "/sendMessage", data=payload)

  if not response.ok:
  raise Exception(f"Error sending message: {response.text}")

  print("Message sent successfully.")

+
  def send_data_to_email(
  df: pd.DataFrame,
  preset_name: str,
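Note: a sketch of the Telegram sender above; the preset key names (name, bot_token, chat_id) come from the code, while the JSON shape and values are placeholders:

    # rgwml.config:
    # {"telegram_bot_presets": [{"name": "alerts_bot",
    #                            "bot_token": "YOUR_BOT_TOKEN",
    #                            "chat_id": "YOUR_CHAT_ID"}]}
    from rgwfuncs.df_lib import send_dataframe_via_telegram

    # df is any DataFrame prepared earlier
    send_dataframe_via_telegram(df, 'alerts_bot',
                                message='Daily extract', as_file=True)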
@@ -752,39 +1022,50 @@ def send_data_to_email(
  remove_after_send: bool = True
  ) -> None:
  """
- Send an email with optional DataFrame attachment using Gmail API via a specified preset.
+ Send an email with optional DataFrame attachment using Gmail API via a
+ specified preset.

  Parameters:
  df: The DataFrame to send.
- preset_name: The configuration preset name to use for sending the email.
+ preset_name: The configuration preset name to use for sending the
+ email.
  to_email: The recipient email address.
  subject: Optional subject of the email.
  body: Optional message body of the email.
- as_file: Boolean flag to decide whether to send the DataFrame as a file.
+ as_file: Boolean flag to decide whether to send the DataFrame as a
+ file.
  remove_after_send: If True, removes the CSV file after sending.
  """

  def locate_config_file(filename: str = "rgwml.config") -> str:
  """Locate config file in common user directories."""
  home_dir = os.path.expanduser("~")
- search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
+ search_paths = [
+ os.path.join(
+ home_dir,
+ folder) for folder in [
+ "Desktop",
+ "Documents",
+ "Downloads"]]

  for path in search_paths:
  for root, _, files in os.walk(path):
  if filename in files:
  return os.path.join(root, filename)
- raise FileNotFoundError(f"{filename} not found in Desktop, Documents, or Downloads folders")
+ raise FileNotFoundError(
+ f"{filename} not found in Desktop, Documents, or Downloads"
+ + " folders")

  def get_config(config_path: str) -> dict:
- """Load configuration from a json file."""
  with open(config_path, 'r') as file:
  try:
  return json.load(file)
  except json.JSONDecodeError as e:
  raise ValueError(f"Invalid JSON format in config file: {e}")

- def authenticate_service_account(service_account_credentials_path: str, sender_email_id: str) -> Any:
- """Authenticate the service account and return a Gmail API service instance."""
+ def authenticate_service_account(
+ service_account_credentials_path: str,
+ sender_email_id: str) -> Any:
  credentials = service_account.Credentials.from_service_account_file(
  service_account_credentials_path,
  scopes=['https://mail.google.com/'],
@@ -797,7 +1078,14 @@ def send_data_to_email(
  config = get_config(config_path)

  # Retrieve Gmail preset configuration
- gmail_config = next((preset for preset in config['gmail_bot_presets'] if preset['name'] == preset_name), None)
+ gmail_config = next(
+ (
+ preset for preset in config['gmail_bot_presets']
+ if preset['name'] == preset_name
+ ),
+ None
+ )
+
  if not gmail_config:
  raise ValueError(f"No preset found with the name {preset_name}")

@@ -809,7 +1097,9 @@ def send_data_to_email(

  if as_file:
  # Create a temporary file for the DataFrame as CSV
- with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
+ with tempfile.NamedTemporaryFile(
+ delete=False, suffix=".csv"
+ ) as tmp_file:
  tmp_file_name = tmp_file.name
  df.to_csv(tmp_file_name, index=False)

@@ -819,13 +1109,18 @@ def send_data_to_email(
  message['to'] = to_email
  message['from'] = sender_email
  message['subject'] = subject if subject else 'DataFrame CSV File'
- message.attach(MIMEText(body if body else 'Please find the CSV file attached.'))
+ message.attach(
+ MIMEText(
+ body if body else 'Please find the CSV file attached.'))

  with open(tmp_file_name, 'rb') as file:
  part = MIMEBase('application', 'octet-stream')
  part.set_payload(file.read())
  encoders.encode_base64(part)
- part.add_header('Content-Disposition', f'attachment; filename={os.path.basename(tmp_file_name)}')
+ part.add_header(
+ 'Content-Disposition',
+ f'attachment; filename={
+ os.path.basename(tmp_file_name)}')
  message.attach(part)

  if remove_after_send and os.path.exists(tmp_file_name):
@@ -847,11 +1142,13 @@ def send_data_to_email(
  try:
  raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
  email_body = {'raw': raw}
- sent_message = service.users().messages().send(userId="me", body=email_body).execute()
+ sent_message = service.users().messages().send(
+ userId="me", body=email_body).execute()
  print(f"Email with Message Id {sent_message['id']} successfully sent.")
  except Exception as error:
  raise Exception(f"Error sending email: {error}")

+
  def send_data_to_slack(
  df: pd.DataFrame,
  bot_name: str,
@@ -866,20 +1163,29 @@ def send_data_to_slack(
  df: The DataFrame to send.
  bot_name: The Slack bot configuration preset name.
  message: Custom message to send along with the DataFrame or file.
- as_file: Boolean flag to decide whether to send the DataFrame as a file.
+ as_file: Boolean flag to decide whether to send the DataFrame as a
+ file.
  remove_after_send: If True, removes the CSV file after sending.
  """

  def locate_config_file(filename: str = "rgwml.config") -> str:
  """Locate config file in common user directories."""
  home_dir = os.path.expanduser("~")
- search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
+ search_paths = [
+ os.path.join(
+ home_dir,
+ folder) for folder in [
+ "Desktop",
+ "Documents",
+ "Downloads"]]

  for path in search_paths:
  for root, _, files in os.walk(path):
  if filename in files:
  return os.path.join(root, filename)
- raise FileNotFoundError(f"{filename} not found in Desktop, Documents, or Downloads folders")
+ raise FileNotFoundError(
+ f"{filename} not found in Desktop, Documents, or Downloads"
+ + " folders")

  def get_config(config_path: str) -> dict:
  """Load configuration from a JSON file."""
@@ -890,7 +1196,14 @@ def send_data_to_slack(
  config_path = locate_config_file()
  config = get_config(config_path)

- bot_config = next((bot for bot in config['slack_bot_presets'] if bot['name'] == bot_name), None)
+ bot_config = next(
+ (
+ bot for bot in config['slack_bot_presets']
+ if bot['name'] == bot_name
+ ),
+ None
+ )
+
  if not bot_config:
  raise ValueError(f"No bot found with the name {bot_name}")

@@ -898,7 +1211,9 @@ def send_data_to_slack(

  if as_file:
  # Create a temporary file for the DataFrame as CSV
- with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
+ with tempfile.NamedTemporaryFile(
+ delete=False, suffix=".csv"
+ ) as tmp_file:
  file_name = tmp_file.name
  df.to_csv(file_name, index=False)

@@ -927,13 +1242,15 @@ def send_data_to_slack(

  print("Message sent successfully.")

+
  def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:
  """
  Reorder the columns of the DataFrame based on a string input.

  Parameters:
  df: The DataFrame whose columns will be reordered.
- column_order_str: A string specifying the desired order of columns, using ',' to separate columns.
+ column_order_str: A string specifying the desired order of columns,
+ using ',' to separate columns.

  Returns:
  A new DataFrame with reordered columns.
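Note: a sketch of order_columns with a comma-separated order string (illustrative frame):

    import pandas as pd
    from rgwfuncs.df_lib import order_columns

    df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
    df = order_columns(df, 'c, a, b')  # columns become ['c', 'a', 'b']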
@@ -942,7 +1259,8 @@ def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:
  ValueError: If a specified column does not exist in the DataFrame.
  """
  if df is None:
- raise ValueError("No DataFrame to reorder. Please provide a valid DataFrame.")
+ raise ValueError(
+ "No DataFrame to reorder. Please provide a valid DataFrame.")

  columns = df.columns.tolist()
  parts = [part.strip() for part in column_order_str.split(',')]
@@ -972,10 +1290,11 @@ def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:

  return df[new_order]

+
  def append_ranged_classification_column(
- df: pd.DataFrame,
- ranges: str,
- target_col: str,
+ df: pd.DataFrame,
+ ranges: str,
+ target_col: str,
  new_col_name: str
  ) -> pd.DataFrame:
  """
@@ -992,7 +1311,6 @@ def append_ranged_classification_column(
  """

  def pad_number(number, integer_length, decimal_length=0, decimal=False):
- """Pad number to have a consistent length for integer and decimal parts."""
  if decimal:
  str_number = f"{number:.{decimal_length}f}"
  integer_part, decimal_part = str_number.split('.')
@@ -1006,25 +1324,70 @@ def append_ranged_classification_column(

  if has_decimals:
  range_list = [float(r) for r in range_list]
- max_decimal_length = max(len(str(r).split('.')[1]) for r in range_list if '.' in str(r))
- max_integer_length = max(len(str(int(float(r)))) for r in range_list)
- labels = [f"{pad_number(range_list[i], max_integer_length, max_decimal_length, decimal=True)} to {pad_number(range_list[i + 1], max_integer_length, max_decimal_length, decimal=True)}" for i in range(len(range_list) - 1)]
+
+ max_decimal_length = max(
+ len(str(r).split('.')[1])
+ for r in range_list
+ if '.' in str(r)
+ )
+
+ max_integer_length = max(
+ len(str(int(float(r))))
+ for r in range_list
+ )
+
+ labels = []
+
+ for i in range(len(range_list) - 1):
+ start = pad_number(
+ range_list[i],
+ max_integer_length,
+ max_decimal_length,
+ decimal=True
+ )
+
+ end = pad_number(
+ range_list[i + 1],
+ max_integer_length,
+ max_decimal_length,
+ decimal=True
+ )
+
+ label = f"{start} to {end}"
+ labels.append(label)
+
  else:
  range_list = [int(r) for r in range_list]
- max_integer_length = max(len(str(r)) for r in range_list)
- labels = [f"{pad_number(range_list[i], max_integer_length)} to {pad_number(range_list[i + 1], max_integer_length)}" for i in range(len(range_list) - 1)]
+
+ max_integer_length = max(
+ len(str(r))
+ for r in range_list
+ )
+
+ labels = [
+ f"{pad_number(range_list[i], max_integer_length)}"
+ f" to "
+ f"{pad_number(range_list[i + 1], max_integer_length)}"
+ for i in range(len(range_list) - 1)
+ ]

  # Ensure the target column is numeric
  df[target_col] = pd.to_numeric(df[target_col], errors='coerce')

- df[new_col_name] = pd.cut(df[target_col], bins=range_list, labels=labels, right=False, include_lowest=True)
+ df[new_col_name] = pd.cut(
+ df[target_col],
+ bins=range_list,
+ labels=labels,
+ right=False,
+ include_lowest=True)

  return df

+
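Note: a sketch of append_ranged_classification_column; label padding widths follow the pad_number helper above, so exact spacing may differ:

    import pandas as pd
    from rgwfuncs.df_lib import append_ranged_classification_column

    df = pd.DataFrame({'score': [5, 12, 37, 88]})
    df = append_ranged_classification_column(df, '0,10,50,100',
                                             'score', 'score_band')
    # score_band holds zero-padded interval labels, e.g. ' 10 to  50'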
  def append_percentile_classification_column(
- df: pd.DataFrame,
- percentiles: str,
- target_col: str,
+ df: pd.DataFrame,
+ percentiles: str,
+ target_col: str,
  new_col_name: str
  ) -> pd.DataFrame:
  """
@@ -1032,7 +1395,8 @@ def append_percentile_classification_column(

  Parameters:
  df: The DataFrame to modify.
- percentiles: A string representation of percentile values separated by commas.
+ percentiles: A string representation of percentile values separated
+ by commas.
  target_col: The column to analyze.
  new_col_name: The name of the new classification column.

@@ -1041,7 +1405,6 @@ def append_percentile_classification_column(
  """

  def pad_number(number, integer_length, decimal_length=0, decimal=False):
- """Pad number to have a consistent length for integer and decimal parts."""
  if decimal:
  str_number = f"{number:.{decimal_length}f}"
  integer_part, decimal_part = str_number.split('.')
@@ -1055,26 +1418,78 @@ def append_percentile_classification_column(

  if has_decimals:
  percentiles_list = [float(p) for p in percentiles_list]
- max_decimal_length = max(len(str(p).split('.')[1]) for p in percentiles_list if '.' in str(p))
- max_integer_length = max(len(str(int(float(p)))) for p in percentiles_list)
- labels = [f"{pad_number(percentiles_list[i], max_integer_length, max_decimal_length, decimal=True)} to {pad_number(percentiles_list[i + 1], max_integer_length, max_decimal_length, decimal=True)}" for i in range(len(percentiles_list) - 1)]
+
+ max_decimal_length = max(
+ len(str(p).split('.')[1])
+ for p in percentiles_list
+ if '.' in str(p)
+ )
+
+ max_integer_length = max(
+ len(str(int(float(p))))
+ for p in percentiles_list
+ )
+
+ labels = []
+
+ for i in range(len(percentiles_list) - 1):
+ start = pad_number(
+ percentiles_list[i],
+ max_integer_length,
+ max_decimal_length,
+ decimal=True
+ )
+
+ end = pad_number(
+ percentiles_list[i + 1],
+ max_integer_length,
+ max_decimal_length,
+ decimal=True
+ )
+
+ label = f"{start} to {end}"
+ labels.append(label)
  else:
  percentiles_list = [int(p) for p in percentiles_list]
- max_integer_length = max(len(str(p)) for p in percentiles_list)
- labels = [f"{pad_number(percentiles_list[i], max_integer_length)} to {pad_number(percentiles_list[i + 1], max_integer_length)}" for i in range(len(percentiles_list) - 1)]
+
+ max_integer_length = max(
+ len(str(p))
+ for p in percentiles_list
+ )
+
+ labels = []
+
+ for i in range(len(percentiles_list) - 1):
+ start = pad_number(
+ percentiles_list[i],
+ max_integer_length
+ )
+
+ end = pad_number(
+ percentiles_list[i + 1],
+ max_integer_length
+ )
+
+ label = f"{start} to {end}"
+ labels.append(label)

  # Ensure the target column is numeric
  df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
  quantiles = [df[target_col].quantile(p / 100) for p in percentiles_list]
-
- df[new_col_name] = pd.cut(df[target_col], bins=quantiles, labels=labels, include_lowest=True)
+
+ df[new_col_name] = pd.cut(
+ df[target_col],
+ bins=quantiles,
+ labels=labels,
+ include_lowest=True)

  return df

+
  def append_ranged_date_classification_column(
- df: pd.DataFrame,
- date_ranges: str,
- target_col: str,
+ df: pd.DataFrame,
+ date_ranges: str,
+ target_col: str,
  new_col_name: str
  ) -> pd.DataFrame:
  """
@@ -1082,7 +1497,8 @@ def append_ranged_date_classification_column(

  Parameters:
  df: The DataFrame to modify.
- date_ranges: A string representation of date ranges separated by commas.
+ date_ranges: A string representation of date ranges separated by
+ commas.
  target_col: The date column to analyze.
  new_col_name: The name of the new date classification column.

@@ -1091,41 +1507,61 @@ def append_ranged_date_classification_column(
  """

  date_list = [pd.to_datetime(date) for date in date_ranges.split(',')]
- labels = [f"{date_list[i].strftime('%Y-%m-%d')} to {date_list[i + 1].strftime('%Y-%m-%d')}" for i in range(len(date_list) - 1)]

- df[new_col_name] = pd.cut(pd.to_datetime(df[target_col]), bins=date_list, labels=labels, right=False)
+ labels = []
+
+ for i in range(len(date_list) - 1):
+ start_date = date_list[i].strftime('%Y-%m-%d')
+ end_date = date_list[i + 1].strftime('%Y-%m-%d')
+ label = f"{start_date} to {end_date}"
+ labels.append(label)
+
+ df[new_col_name] = pd.cut(
+ pd.to_datetime(df[target_col]),
+ bins=date_list,
+ labels=labels,
+ right=False)

  return df

- def rename_columns(df: pd.DataFrame, rename_pairs: Dict[str, str]) -> pd.DataFrame:
+
+ def rename_columns(df: pd.DataFrame,
+ rename_pairs: Dict[str,
+ str]) -> pd.DataFrame:
  """
  Rename columns in the DataFrame.

  Parameters:
  df: The DataFrame to modify.
- rename_pairs: A dictionary mapping old column names to new column names.
+ rename_pairs: A dictionary mapping old column names to new column
+ names.

  Returns:
  A new DataFrame with columns renamed.
  """
  if df is None:
- raise ValueError("No DataFrame to rename columns. Please provide a valid DataFrame.")
+ raise ValueError(
+ "No DataFrame to rename columns. Please provide a valid"
+ + " DataFrame.")

  return df.rename(columns=rename_pairs)

+
  def cascade_sort(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
  """
  Cascade sort the DataFrame by specified columns and order.

  Parameters:
  df: The DataFrame to sort.
- columns: A list of column names with sorting order, e.g., ['Column1::ASC', 'Column2::DESC'].
+ columns: A list of column names with sorting order, e.g.,
+ ['Column1::ASC', 'Column2::DESC'].

  Returns:
  A new DataFrame sorted by specified columns.
  """
  if df is None:
- raise ValueError("No DataFrame to sort. Please provide a valid DataFrame.")
+ raise ValueError(
+ "No DataFrame to sort. Please provide a valid DataFrame.")

  col_names = []
  asc_order = []
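Note: a sketch of cascade_sort using the 'Column::ORDER' syntax from the docstring above (illustrative data):

    import pandas as pd
    from rgwfuncs.df_lib import cascade_sort

    df = pd.DataFrame({'city': ['b', 'a', 'a'], 'age': [3, 2, 9]})
    df = cascade_sort(df, ['city::ASC', 'age::DESC'])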
@@ -1147,19 +1583,22 @@ def cascade_sort(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:

  return df.sort_values(by=col_names, ascending=asc_order)

+
  def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
  """
  Append XGB training labels based on a ratio string.

  Parameters:
  df: The DataFrame to modify.
- ratio_str: A string specifying the ratio of TRAIN:TEST or TRAIN:VALIDATE:TEST.
+ ratio_str: A string specifying the ratio of TRAIN:TEST or
+ TRAIN:VALIDATE:TEST.

  Returns:
  A new DataFrame with XGB_TYPE labels appended.
  """
  if df is None:
- raise ValueError("No DataFrame to add labels. Please provide a valid DataFrame.")
+ raise ValueError(
+ "No DataFrame to add labels. Please provide a valid DataFrame.")

  ratios = list(map(int, ratio_str.split(':')))
  total_ratio = sum(ratios)
@@ -1173,25 +1612,30 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
1173
1612
  train_rows = (ratios[0] * total_rows) // total_ratio
1174
1613
  validate_rows = (ratios[1] * total_rows) // total_ratio
1175
1614
  test_rows = total_rows - train_rows - validate_rows
1176
- labels = ['TRAIN'] * train_rows + ['VALIDATE'] * validate_rows + ['TEST'] * test_rows
1615
+ labels = ['TRAIN'] * train_rows + ['VALIDATE'] * \
1616
+ validate_rows + ['TEST'] * test_rows
1177
1617
  else:
- raise ValueError("Invalid ratio string format. Use 'TRAIN:TEST' or 'TRAIN:VALIDATE:TEST'.")
+ raise ValueError(
+ "Invalid ratio string format. Use 'TRAIN:TEST' or"
+ + " 'TRAIN:VALIDATE:TEST'.")

  df_with_labels = df.copy()
  df_with_labels['XGB_TYPE'] = labels

  return df_with_labels

+
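A sketch of the labelling helper; '7:2:1' is an arbitrary example ratio:

    import pandas as pd
    from rgwfuncs.df_lib import append_xgb_labels

    df = pd.DataFrame({'x': range(10)})
    labelled_df = append_xgb_labels(df, ratio_str='7:2:1')
    # First 7 rows -> TRAIN, next 2 -> VALIDATE, last 1 -> TEST; a two-part
    # ratio such as '8:2' assigns only TRAIN and TEST labels.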
  def append_xgb_regression_predictions(
- df: pd.DataFrame,
- target_col: str,
- feature_cols: str,
- pred_col: str,
- boosting_rounds: int = 100,
+ df: pd.DataFrame,
+ target_col: str,
+ feature_cols: str,
+ pred_col: str,
+ boosting_rounds: int = 100,
  model_path: Optional[str] = None
  ) -> pd.DataFrame:
  """
- Append XGB regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.
+ Append XGB regression predictions to DataFrame. Assumes data is labeled
+ by an 'XGB_TYPE' column.

  Parameters:
  df: DataFrame to modify.
@@ -1205,7 +1649,8 @@ def append_xgb_regression_predictions(
  DataFrame with predictions appended.
  """
  if df is None or 'XGB_TYPE' not in df.columns:
- raise ValueError("DataFrame is not initialized or 'XGB_TYPE' column is missing.")
+ raise ValueError(
+ "DataFrame is not initialized or 'XGB_TYPE' column is missing.")

  features = feature_cols.replace(' ', '').split(',')

@@ -1215,13 +1660,23 @@ def append_xgb_regression_predictions(
  df[col] = df[col].astype('category')

  train_data = df[df['XGB_TYPE'] == 'TRAIN']
- validate_data = df[df['XGB_TYPE'] == 'VALIDATE'] if 'VALIDATE' in df['XGB_TYPE'].values else None

- dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
+ if 'VALIDATE' in df['XGB_TYPE'].values:
+ validate_data = df[df['XGB_TYPE'] == 'VALIDATE']
+ else:
+ validate_data = None
+
+ dtrain = xgb.DMatrix(
+ train_data[features],
+ label=train_data[target_col],
+ enable_categorical=True)
  evals = [(dtrain, 'train')]

  if validate_data is not None:
- dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
+ dvalidate = xgb.DMatrix(
+ validate_data[features],
+ label=validate_data[target_col],
+ enable_categorical=True)
  evals.append((dvalidate, 'validate'))

  params = {
@@ -1229,7 +1684,12 @@ def append_xgb_regression_predictions(
  'eval_metric': 'rmse'
  }

- model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
+ model = xgb.train(
+ params,
+ dtrain,
+ num_boost_round=boosting_rounds,
+ evals=evals,
+ early_stopping_rounds=10 if validate_data is not None else None)

  # Make predictions for all data
  dall = xgb.DMatrix(df[features], enable_categorical=True)
@@ -1238,21 +1698,24 @@ def append_xgb_regression_predictions(
  if model_path:
  model.save_model(model_path)

- columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
+ columns_order = [col for col in df.columns if col not in [
+ 'XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
  df = df[columns_order]

  return df

+
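A sketch of an end-to-end call, assuming the DataFrame was first labelled by append_xgb_labels (all column names are hypothetical):

    from rgwfuncs.df_lib import append_xgb_labels, append_xgb_regression_predictions

    # df is assumed to hold the hypothetical feature columns 'area',
    # 'bedrooms', 'age' and a numeric target 'price'.
    df = append_xgb_labels(df, ratio_str='7:2:1')
    df = append_xgb_regression_predictions(
        df,
        target_col='price',
        feature_cols='area, bedrooms, age',
        pred_col='price_pred',
        boosting_rounds=200,
        model_path='price_model.json')
    # Predictions are appended for every row; the 10-round early stopping
    # only applies when VALIDATE rows are present.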
  def append_xgb_logistic_regression_predictions(
- df: pd.DataFrame,
- target_col: str,
- feature_cols: str,
- pred_col: str,
- boosting_rounds: int = 100,
+ df: pd.DataFrame,
+ target_col: str,
+ feature_cols: str,
+ pred_col: str,
+ boosting_rounds: int = 100,
  model_path: Optional[str] = None
  ) -> pd.DataFrame:
  """
- Append XGB logistic regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.
+ Append XGB logistic regression predictions to DataFrame. Assumes data is
+ labeled by an 'XGB_TYPE' column.

  Parameters:
  df: DataFrame to modify.
@@ -1266,7 +1729,8 @@ def append_xgb_logistic_regression_predictions(
  DataFrame with predictions appended.
  """
  if df is None or 'XGB_TYPE' not in df.columns:
- raise ValueError("DataFrame is not initialized or 'XGB_TYPE' column is missing.")
+ raise ValueError(
+ "DataFrame is not initialized or 'XGB_TYPE' column is missing.")

  features = feature_cols.replace(' ', '').split(',')

@@ -1276,13 +1740,22 @@ def append_xgb_logistic_regression_predictions(
  df[col] = df[col].astype('category')

  train_data = df[df['XGB_TYPE'] == 'TRAIN']
- validate_data = df[df['XGB_TYPE'] == 'VALIDATE'] if 'VALIDATE' in df['XGB_TYPE'].values else None

- dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
+ validate_data = None
+ if 'VALIDATE' in df['XGB_TYPE'].values:
+ validate_data = df[df['XGB_TYPE'] == 'VALIDATE']
+
+ dtrain = xgb.DMatrix(
+ train_data[features],
+ label=train_data[target_col],
+ enable_categorical=True)
  evals = [(dtrain, 'train')]

  if validate_data is not None:
- dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
+ dvalidate = xgb.DMatrix(
+ validate_data[features],
+ label=validate_data[target_col],
+ enable_categorical=True)
  evals.append((dvalidate, 'validate'))

  params = {
@@ -1290,7 +1763,12 @@ def append_xgb_logistic_regression_predictions(
  'eval_metric': 'auc'
  }

- model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
+ model = xgb.train(
+ params,
+ dtrain,
+ num_boost_round=boosting_rounds,
+ evals=evals,
+ early_stopping_rounds=10 if validate_data is not None else None)

  # Make predictions for all data
  dall = xgb.DMatrix(df[features], enable_categorical=True)
@@ -1299,15 +1777,17 @@ def append_xgb_logistic_regression_predictions(
  if model_path:
  model.save_model(model_path)

- columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
+ columns_order = [col for col in df.columns if col not in [
+ 'XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
  df = df[columns_order]

  return df

+
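The logistic twin follows the same call shape; given the AUC eval metric, the target is presumably a binary 0/1 column (all names below are illustrative):

    from rgwfuncs.df_lib import append_xgb_logistic_regression_predictions

    # 'churned' is assumed to be a 0/1 column; feature names are hypothetical.
    df = append_xgb_logistic_regression_predictions(
        df,
        target_col='churned',
        feature_cols='tenure, plan_type, monthly_spend',
        pred_col='churn_score')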
  def print_n_frequency_cascading(
- df: pd.DataFrame,
- n: int,
- columns: str,
+ df: pd.DataFrame,
+ n: int,
+ columns: str,
  order_by: str = "FREQ_DESC"
  ) -> None:
  """
@@ -1332,7 +1812,12 @@ def print_n_frequency_cascading(
  # Convert the column to string representation
  df[current_col] = df[current_col].astype(str)
  frequency = df[current_col].value_counts(dropna=False)
- frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+ frequency = frequency.rename(
+ index={
+ 'nan': 'NaN',
+ 'NaT': 'NaT',
+ 'None': 'None',
+ '': 'Empty'})

  if limit is not None:
  frequency = frequency.nlargest(limit)
@@ -1347,11 +1832,11 @@ def print_n_frequency_cascading(
  filtered_df = df[df[current_col] == value]

  if len(columns) > 1:
- sub_report = generate_cascade_report(filtered_df, columns[1:], limit, order_by)
+ sub_report = generate_cascade_report(
+ filtered_df, columns[1:], limit, order_by)
  report[value] = {
- "count": str(count),
- f"sub_distribution({columns[1]})": sub_report if sub_report else {}
- }
+ "count": str(count),
+ f"sub_distribution({columns[1]})": sub_report if sub_report else {}}
  else:
  report[value] = {
  "count": str(count)
@@ -1363,19 +1848,28 @@ def print_n_frequency_cascading(
  if order_by == "ASC":
  return dict(sorted(frequency.items(), key=lambda item: item[0]))
  elif order_by == "DESC":
- return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
+ return dict(
+ sorted(
+ frequency.items(),
+ key=lambda item: item[0],
+ reverse=True))
  elif order_by == "FREQ_ASC":
  return dict(sorted(frequency.items(), key=lambda item: item[1]))
  else: # Default to "FREQ_DESC"
- return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
+ return dict(
+ sorted(
+ frequency.items(),
+ key=lambda item: item[1],
+ reverse=True))

  report = generate_cascade_report(df, columns, n, order_by)
  print(json.dumps(report, indent=2))

+
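A sketch of the cascading frequency report; columns is a single comma-separated string, and the separator handling is assumed from the visible columns[1:] recursion (names are illustrative):

    import pandas as pd
    from rgwfuncs.df_lib import print_n_frequency_cascading

    df = pd.DataFrame({'country': ['IN', 'IN', 'US'], 'city': ['Delhi', 'Pune', 'NYC']})
    print_n_frequency_cascading(df, n=5, columns='country,city', order_by='FREQ_DESC')
    # Prints nested JSON: the top-5 countries, each carrying a
    # sub_distribution(city) of its own top-5 cities.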
  def print_n_frequency_linear(
- df: pd.DataFrame,
- n: int,
- columns: str,
+ df: pd.DataFrame,
+ n: int,
+ columns: str,
  order_by: str = "FREQ_DESC"
  ) -> None:
  """
@@ -1397,13 +1891,19 @@ def print_n_frequency_linear(
  continue

  frequency = df[current_col].astype(str).value_counts(dropna=False)
- frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+ frequency = frequency.rename(
+ index={
+ 'nan': 'NaN',
+ 'NaT': 'NaT',
+ 'None': 'None',
+ '': 'Empty'})

  if limit is not None:
  frequency = frequency.nlargest(limit)

  sorted_frequency = sort_frequency(frequency, order_by)
- col_report = {str(value): str(count) for value, count in sorted_frequency.items()}
+ col_report = {str(value): str(count)
+ for value, count in sorted_frequency.items()}
  report[current_col] = col_report

  return report
@@ -1412,16 +1912,27 @@ def print_n_frequency_linear(
  if order_by == "ASC":
  return dict(sorted(frequency.items(), key=lambda item: item[0]))
  elif order_by == "DESC":
- return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
+ return dict(
+ sorted(
+ frequency.items(),
+ key=lambda item: item[0],
+ reverse=True))
  elif order_by == "FREQ_ASC":
  return dict(sorted(frequency.items(), key=lambda item: item[1]))
  else: # Default to "FREQ_DESC"
- return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
+ return dict(
+ sorted(
+ frequency.items(),
+ key=lambda item: item[1],
+ reverse=True))

  report = generate_linear_report(df, columns, n, order_by)
  print(json.dumps(report, indent=2))

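The linear variant reports each column independently (illustrative names again):

    import pandas as pd
    from rgwfuncs.df_lib import print_n_frequency_linear

    df = pd.DataFrame({'status': ['open', 'open', 'closed'], 'channel': ['web', 'app', 'web']})
    print_n_frequency_linear(df, n=10, columns='status,channel', order_by='FREQ_ASC')
    # Prints one flat frequency table per listed column as indented JSON.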
- def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFrame:
+
+ def retain_columns(
+ df: pd.DataFrame,
+ columns_to_retain: List[str]) -> pd.DataFrame:
  """
  Retain specified columns in the DataFrame and drop the others.

@@ -1436,9 +1947,10 @@ def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFra
  raise ValueError("columns_to_retain should be a list of column names.")
  return df[columns_to_retain]

+
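A sketch of retain_columns (the retained names are hypothetical):

    import pandas as pd
    from rgwfuncs.df_lib import retain_columns

    df = pd.DataFrame({'id': [1], 'name': ['a'], 'total': [3], 'tmp': [0]})
    slim_df = retain_columns(df, columns_to_retain=['id', 'name', 'total'])
    # 'tmp' is dropped; only the listed columns survive.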
  def mask_against_dataframe(
- df: pd.DataFrame,
- other_df: pd.DataFrame,
+ df: pd.DataFrame,
+ other_df: pd.DataFrame,
  column_name: str
  ) -> pd.DataFrame:
  """
@@ -1456,9 +1968,10 @@ def mask_against_dataframe(
  raise ValueError("The specified column must exist in both DataFrames.")
  return df[df[column_name].isin(other_df[column_name])]

+
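A sketch of the inclusion mask (reference_df and the key column are hypothetical):

    import pandas as pd
    from rgwfuncs.df_lib import mask_against_dataframe

    df = pd.DataFrame({'id': [1, 2, 3]})
    reference_df = pd.DataFrame({'id': [2, 3, 4]})
    matched_df = mask_against_dataframe(df, other_df=reference_df, column_name='id')
    # Keeps rows with id 2 and 3: only ids also present in reference_df survive.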
  def mask_against_dataframe_converse(
- df: pd.DataFrame,
- other_df: pd.DataFrame,
+ df: pd.DataFrame,
+ other_df: pd.DataFrame,
  column_name: str
  ) -> pd.DataFrame:
  """
@@ -1470,10 +1983,10 @@ def mask_against_dataframe_converse(
  column_name: The column name to use for comparison.

  Returns:
- A new DataFrame with rows whose column values do not exist in 'other_df'.
+ A new DataFrame with rows whose column values do not exist in
+ 'other_df'.
  """
  if column_name not in df.columns or column_name not in other_df.columns:
  raise ValueError("The specified column must exist in both DataFrames.")
-
- return df[~df[column_name].isin(other_df[column_name])]

+ return df[~df[column_name].isin(other_df[column_name])]
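And its converse, which keeps the non-matching rows (same hypothetical frames as in the previous sketch):

    from rgwfuncs.df_lib import mask_against_dataframe_converse

    # With df holding ids 1, 2, 3 and reference_df holding 2, 3, 4:
    unmatched_df = mask_against_dataframe_converse(df, other_df=reference_df, column_name='id')
    # Keeps only the row with id 1, i.e. ids absent from reference_df.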