rgwfuncs-0.0.2-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in those registries.
rgwfuncs/df_lib.py ADDED
@@ -0,0 +1,1479 @@
1
+ import pandas as pd
2
+ import pymssql
3
+ import os
4
+ import json
5
+ from datetime import datetime
6
+ import time
7
+ import gc
8
+ import mysql.connector
9
+ import tempfile
10
+ import clickhouse_connect
11
+ from google.cloud import bigquery
12
+ from google.oauth2 import service_account
13
+ import xgboost as xgb
14
+ from pprint import pprint
15
+ import requests
16
+ from slack_sdk import WebClient
17
+ import sqlite3
18
+ from email.mime.multipart import MIMEMultipart
19
+ from email.mime.text import MIMEText
20
+ from email.mime.base import MIMEBase
+ from email import encoders
21
+ from googleapiclient.discovery import build
22
+ import base64
23
+ from typing import Optional, Callable, Dict, List, Tuple, Any
24
+
25
+
26
+ def docs(method_type_filter: Optional[str] = None) -> None:
27
+ """
28
+ Print a list of function names in alphabetical order. If method_type_filter is specified,
29
+ print the docstrings of the functions that match the filter.
30
+
31
+ Parameters:
32
+ method_type_filter: Optional filter string, comma-separated, to select docstring types.
33
+ """
34
+ # Get the current module's namespace
35
+ local_functions: Dict[str, Callable] = {
36
+ name: obj for name, obj in globals().items() if callable(obj)
37
+ }
38
+
39
+ # List of function names sorted alphabetically
40
+ function_names: List[str] = sorted(local_functions.keys())
41
+
42
+ # Print function names
43
+ print("Functions in alphabetical order:")
44
+ for name in function_names:
45
+ print(name)
46
+
47
+ # If a filter is provided, print the docstrings of functions that match the filter
48
+ if method_type_filter:
49
+ function_type_list: List[str] = [mt.strip() for mt in method_type_filter.split(',')]
50
+ print("\nFiltered function documentation:")
51
+
52
+ for name, func in local_functions.items():
53
+ docstring: Optional[str] = func.__doc__
54
+ if docstring:
55
+ # Extract only the first line of the docstring
56
+ first_line: str = docstring.split('\n')[0]
57
+ if "::" in first_line:
58
+ # Find the first occurrence of "::" and split there
59
+ split_index: int = first_line.find("::")
60
+ function_type: str = first_line[:split_index].strip()
61
+ if function_type in function_type_list:
62
+ function_description: str = first_line[split_index + 2:].strip()
63
+ print(f"{name}: {function_description}")
64
+
65
+ def numeric_clean(
66
+ df: pd.DataFrame,
67
+ column_names: str,
68
+ column_type: str,
69
+ irregular_value_treatment: str
70
+ ) -> pd.DataFrame:
71
+ """
72
+ Cleans the numeric columns based on specified treatments.
73
+
74
+ Parameters:
75
+ df: The DataFrame to clean.
76
+ column_names: A comma-separated string containing the names of the columns to clean.
77
+ column_type: The type to convert the column to ('INTEGER' or 'FLOAT').
78
+ irregular_value_treatment: How to treat irregular values ('NAN', 'TO_ZERO', 'MEAN').
79
+
80
+ Returns:
81
+ A new DataFrame with cleaned numeric columns.
82
+ """
83
+ df_copy = df.copy() # Avoid mutating the original DataFrame
84
+ columns_list: List[str] = [name.strip() for name in column_names.split(',')]
85
+
86
+ for column_name in columns_list:
87
+ if column_name not in df_copy.columns:
88
+ raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")
89
+
90
+ if column_type not in ['INTEGER', 'FLOAT']:
91
+ raise ValueError("column_type must be 'INTEGER' or 'FLOAT'.")
92
+
93
+ if irregular_value_treatment not in ['NAN', 'TO_ZERO', 'MEAN']:
94
+ raise ValueError("irregular_value_treatment must be 'NAN', 'TO_ZERO', or 'MEAN'.")
95
+
96
+ # Convert column type
97
+ if column_type == 'INTEGER':
98
+ df_copy[column_name] = pd.to_numeric(df_copy[column_name], errors='coerce').astype(pd.Int64Dtype())
99
+ elif column_type == 'FLOAT':
100
+ df_copy[column_name] = pd.to_numeric(df_copy[column_name], errors='coerce').astype(float)
101
+
102
+ # Handle irregular values
103
+ if irregular_value_treatment == 'NAN':
104
+ pass # Already converted to NaN
105
+ elif irregular_value_treatment == 'TO_ZERO':
106
+ df_copy[column_name] = df_copy[column_name].fillna(0)
107
+ elif irregular_value_treatment == 'MEAN':
108
+ mean_value = df_copy[column_name].mean()
109
+ df_copy[column_name] = df_copy[column_name].fillna(mean_value)
110
+
111
+ return df_copy
112
+
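A minimal usage sketch for numeric_clean, assuming the module is importable as rgwfuncs.df_lib (per the file path above); the DataFrame, column names and values below are hypothetical, not part of the packaged module:

import pandas as pd
from rgwfuncs.df_lib import numeric_clean

df = pd.DataFrame({'age': ['25', 'unknown', '40'], 'score': ['1.5', '', '3.25']})
# Coerce 'age' to nullable integers; irregular values become 0
cleaned = numeric_clean(df, 'age', 'INTEGER', 'TO_ZERO')
# Coerce 'score' to floats; irregular values are replaced with the column mean
cleaned = numeric_clean(cleaned, 'score', 'FLOAT', 'MEAN')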
113
+ def limit_dataframe(df: pd.DataFrame, num_rows: int) -> pd.DataFrame:
114
+ """
115
+ Limit the DataFrame to a specified number of rows.
116
+
117
+ Parameters:
118
+ df: The DataFrame to limit.
119
+ num_rows: The number of rows to retain.
120
+
121
+ Returns:
122
+ A new DataFrame limited to the specified number of rows.
123
+
124
+ Raises:
125
+ ValueError: If num_rows is not an integer.
126
+ """
127
+ if not isinstance(num_rows, int):
128
+ raise ValueError("The number of rows should be an integer.")
129
+
130
+ return df.head(num_rows)
131
+
132
+ def from_raw_data(headers: List[str], data: List[List]) -> pd.DataFrame:
133
+ """
134
+ Create a DataFrame from raw data.
135
+
136
+ Parameters:
137
+ headers: A list of column headers.
138
+ data: A two-dimensional list of data.
139
+
140
+ Returns:
141
+ A DataFrame created from the raw data.
142
+
143
+ Raises:
144
+ ValueError: If data is not in the correct format.
145
+ """
146
+ if isinstance(data, list) and all(isinstance(row, list) for row in data):
147
+ df = pd.DataFrame(data, columns=headers)
148
+ else:
149
+ raise ValueError("Data should be an array of arrays.")
150
+
151
+ return df
152
+
153
+ def append_rows(df: pd.DataFrame, rows: List[List]) -> pd.DataFrame:
154
+ """
155
+ Append rows to the DataFrame.
156
+
157
+ Parameters:
158
+ df: The original DataFrame.
159
+ rows: A list of lists, where each inner list represents a row to be appended.
160
+
161
+ Returns:
162
+ A new DataFrame with the appended rows.
163
+
164
+ Raises:
165
+ ValueError: If rows are not in the correct format.
166
+ """
167
+ if not isinstance(rows, list) or not all(isinstance(row, list) for row in rows):
168
+ raise ValueError("Rows should be provided as a list of lists.")
169
+
170
+ if df.empty:
171
+ new_df = pd.DataFrame(rows)
172
+ else:
173
+ new_rows_df = pd.DataFrame(rows, columns=df.columns)
174
+ new_df = pd.concat([df, new_rows_df], ignore_index=True)
175
+
176
+ return new_df
177
+
178
+ def append_columns(df: pd.DataFrame, *col_names: str) -> pd.DataFrame:
179
+ """
180
+ Append columns to the DataFrame with None values.
181
+
182
+ Parameters:
183
+ df: The original DataFrame.
184
+ col_names: The names of the columns to add.
185
+
186
+ Returns:
187
+ A new DataFrame with the appended columns.
188
+
189
+ Raises:
190
+ ValueError: If column names are not provided correctly.
191
+ """
192
+ if not all(isinstance(col_name, str) for col_name in col_names):
193
+ raise ValueError("Column names should be provided as strings.")
194
+
195
+ new_df = df.copy()
196
+ for col_name in col_names:
197
+ new_df[col_name] = pd.Series([None] * len(df), dtype='object')
198
+
199
+ return new_df
200
+
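A short sketch chaining from_raw_data, append_rows and append_columns (illustrative only; the headers and values are made up):

from rgwfuncs.df_lib import from_raw_data, append_rows, append_columns

df = from_raw_data(['id', 'name'], [[1, 'alice'], [2, 'bob']])
df = append_rows(df, [[3, 'carol']])        # add one more row
df = append_columns(df, 'email', 'phone')   # new columns initialised with None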
201
+ def update_rows(
202
+ df: pd.DataFrame,
203
+ condition: str,
204
+ updates: Dict[str, Any]
205
+ ) -> pd.DataFrame:
206
+ """
207
+ Update specific rows in the DataFrame based on a condition.
208
+
209
+ Parameters:
210
+ df: The original DataFrame.
211
+ condition: A query condition to identify rows for updating.
212
+ updates: A dictionary with column names as keys and new values as values.
213
+
214
+ Returns:
215
+ A new DataFrame with the updated rows.
216
+
217
+ Raises:
218
+ ValueError: If no rows match the condition or updates are invalid.
219
+ """
220
+ mask = df.query(condition)
221
+
222
+ if mask.empty:
223
+ raise ValueError("No rows match the given condition.")
224
+
225
+ if not isinstance(updates, dict):
226
+ raise ValueError("Updates should be provided as a dictionary.")
227
+
228
+ invalid_cols = [col for col in updates if col not in df.columns]
229
+ if invalid_cols:
230
+ raise ValueError(f"Columns {', '.join(invalid_cols)} do not exist in the DataFrame.")
231
+
232
+ new_df = df.copy()
233
+ for col_name, new_value in updates.items():
234
+ new_df.loc[mask.index, col_name] = new_value
235
+
236
+ return new_df
237
+
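A usage sketch for update_rows with hypothetical data; the condition string uses pandas DataFrame.query syntax, as does delete_rows below:

import pandas as pd
from rgwfuncs.df_lib import update_rows

df = pd.DataFrame({'city': ['Pune', 'Delhi'], 'status': ['open', 'open']})
updated = update_rows(df, "city == 'Pune'", {'status': 'closed'})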
238
+ def delete_rows(df: pd.DataFrame, condition: str) -> pd.DataFrame:
239
+ """
240
+ Delete rows from the DataFrame based on a condition.
241
+
242
+ Parameters:
243
+ df: The original DataFrame.
244
+ condition: A query condition to identify rows for deletion.
245
+
246
+ Returns:
247
+ A new DataFrame with the specified rows deleted.
248
+
249
+ Raises:
250
+ ValueError: If no rows match the condition.
251
+ """
252
+ mask = df.query(condition)
253
+
254
+ if mask.empty:
255
+ raise ValueError("No rows match the given condition.")
256
+
257
+ new_df = df.drop(mask.index).reset_index(drop=True)
258
+
259
+ return new_df
260
+
261
+ def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
262
+ """
263
+ Drop duplicate rows in the DataFrame, retaining the first occurrence.
264
+
265
+ Parameters:
266
+ df: The DataFrame from which duplicates will be dropped.
267
+
268
+ Returns:
269
+ A new DataFrame with duplicates removed.
270
+
271
+ Raises:
272
+ ValueError: If the DataFrame is None.
273
+ """
274
+ if df is None:
275
+ raise ValueError("DataFrame is not initialized.")
276
+ return df.drop_duplicates(keep='first')
277
+
278
+ def drop_duplicates_retain_first(df: pd.DataFrame, columns: Optional[str] = None) -> pd.DataFrame:
279
+ """
280
+ Drop duplicate rows in the DataFrame based on specified columns, retaining the first occurrence.
281
+
282
+ Parameters:
283
+ df: The DataFrame from which duplicates will be dropped.
284
+ columns: A comma-separated string with the column names used to identify duplicates.
285
+
286
+ Returns:
287
+ A new DataFrame with duplicates removed.
288
+
289
+ Raises:
290
+ ValueError: If the DataFrame is None.
291
+ """
292
+ if df is None:
293
+ raise ValueError("DataFrame is not initialized.")
294
+
295
+ columns_list = [col.strip() for col in columns.split(',')] if columns else None
296
+ return df.drop_duplicates(subset=columns_list, keep='first')
297
+
298
+ def drop_duplicates_retain_last(df: pd.DataFrame, columns: Optional[str] = None) -> pd.DataFrame:
299
+ """
300
+ Drop duplicate rows in the DataFrame based on specified columns, retaining the last occurrence.
301
+
302
+ Parameters:
303
+ df: The DataFrame from which duplicates will be dropped.
304
+ columns: A comma-separated string with the column names used to identify duplicates.
305
+
306
+ Returns:
307
+ A new DataFrame with duplicates removed.
308
+
309
+ Raises:
310
+ ValueError: If the DataFrame is None.
311
+ """
312
+ if df is None:
313
+ raise ValueError("DataFrame is not initialized.")
314
+
315
+ columns_list = [col.strip() for col in columns.split(',')] if columns else None
316
+ return df.drop_duplicates(subset=columns_list, keep='last')
317
+
318
+ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str = "rgwml.config") -> pd.DataFrame:
319
+ """
320
+ Load data from a database query into a DataFrame based on a configuration preset.
321
+
322
+ Parameters:
323
+ db_preset_name: The name of the database preset in the configuration file.
324
+ query: The SQL query to execute.
325
+ config_file_name: Name of the configuration file (default: 'rgwml.config').
326
+
327
+ Returns:
328
+ A DataFrame containing the query result.
329
+
330
+ Raises:
331
+ FileNotFoundError: If the configuration file is not found.
332
+ ValueError: If the database preset or db_type is invalid.
333
+ """
334
+
335
+ def locate_config_file(filename: str = config_file_name) -> str:
336
+ home_dir = os.path.expanduser("~")
337
+ search_paths = [
338
+ os.path.join(home_dir, "Desktop"),
339
+ os.path.join(home_dir, "Documents"),
340
+ os.path.join(home_dir, "Downloads"),
341
+ ]
342
+
343
+ for path in search_paths:
344
+ for root, dirs, files in os.walk(path):
345
+ if filename in files:
346
+ return os.path.join(root, filename)
347
+ raise FileNotFoundError(f"{filename} not found in Desktop, Documents, or Downloads folders")
348
+
349
+ def query_mssql(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
350
+ """Execute a query on an MSSQL database and return the result as a DataFrame."""
351
+ server = db_preset['host']
352
+ user = db_preset['username']
353
+ password = db_preset['password']
354
+ database = db_preset.get('database', '')
355
+
356
+ with pymssql.connect(server=server, user=user, password=password, database=database) as conn:
357
+ with conn.cursor() as cursor:
358
+ cursor.execute(query)
359
+ rows = cursor.fetchall()
360
+ columns = [desc[0] for desc in cursor.description]
361
+
362
+ return pd.DataFrame(rows, columns=columns)
363
+
364
+ def query_mysql(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
365
+ """Execute a query on a MySQL database and return the result as a DataFrame."""
366
+ host = db_preset['host']
367
+ user = db_preset['username']
368
+ password = db_preset['password']
369
+ database = db_preset.get('database', '')
370
+
371
+ with mysql.connector.connect(host=host, user=user, password=password, database=database) as conn:
372
+ with conn.cursor() as cursor:
373
+ cursor.execute(query)
374
+ rows = cursor.fetchall()
375
+ columns = [desc[0] for desc in cursor.description] if cursor.description else []
376
+
377
+ return pd.DataFrame(rows, columns=columns)
378
+
379
+ def query_clickhouse(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
380
+ """Query a ClickHouse database and return the result as a DataFrame."""
381
+ host = db_preset['host']
382
+ user = db_preset['username']
383
+ password = db_preset['password']
384
+ database = db_preset['database']
385
+
386
+ max_retries = 5
387
+ retry_delay = 5
388
+
389
+ for attempt in range(max_retries):
390
+ try:
391
+ client = clickhouse_connect.get_client(
392
+ host=host,
393
+ port='8123',
394
+ username=user,
395
+ password=password,
396
+ database=database
397
+ )
398
+ data = client.query(query)
399
+ rows = data.result_rows
400
+ columns = data.column_names
401
+ return pd.DataFrame(rows, columns=columns)
402
+ except Exception as e:
403
+ print(f"Attempt {attempt + 1} failed: {e}")
404
+ if attempt < max_retries - 1:
405
+ print(f"Retrying in {retry_delay} seconds...")
406
+ time.sleep(retry_delay)
407
+ else:
408
+ raise ConnectionError("All attempts to connect to ClickHouse failed.")
409
+
410
+ def query_google_big_query(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
411
+ """Query a Google BigQuery database and return the result as a DataFrame."""
412
+ json_file_path = db_preset['json_file_path']
413
+ project_id = db_preset['project_id']
414
+
415
+ credentials = service_account.Credentials.from_service_account_file(json_file_path)
416
+ client = bigquery.Client(credentials=credentials, project=project_id)
417
+
418
+ query_job = client.query(query)
419
+ results = query_job.result()
420
+ rows = [list(row.values()) for row in results]
421
+ columns = [field.name for field in results.schema]
422
+
423
+ return pd.DataFrame(rows, columns=columns)
424
+
425
+ # Read the configuration file to get the database preset
426
+ config_path = locate_config_file()
427
+ with open(config_path, 'r') as f:
428
+ config = json.load(f)
429
+
430
+ db_presets = config.get('db_presets', [])
431
+ db_preset = next((preset for preset in db_presets if preset['name'] == db_preset_name), None)
432
+ if not db_preset:
433
+ raise ValueError(f"No matching db_preset found for {db_preset_name}")
434
+
435
+ db_type = db_preset['db_type']
436
+
437
+ if db_type == 'mssql':
438
+ return query_mssql(db_preset, query)
439
+ elif db_type == 'mysql':
440
+ return query_mysql(db_preset, query)
441
+ elif db_type == 'clickhouse':
442
+ return query_clickhouse(db_preset, query)
443
+ elif db_type == 'google_big_query':
444
+ return query_google_big_query(db_preset, query)
445
+ else:
446
+ raise ValueError(f"Unsupported db_type: {db_type}")
447
+
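A hedged sketch of how load_data_from_query is driven by rgwml.config, which is searched for under ~/Desktop, ~/Documents and ~/Downloads; the preset name, credentials and query below are placeholders:

# rgwml.config (JSON)
# {
#   "db_presets": [
#     {
#       "name": "analytics_mysql",
#       "db_type": "mysql",
#       "host": "db.example.com",
#       "username": "reader",
#       "password": "secret",
#       "database": "analytics"
#     }
#   ]
# }

from rgwfuncs.df_lib import load_data_from_query

df = load_data_from_query('analytics_mysql', 'SELECT id, amount FROM orders LIMIT 10')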
448
+
449
+
450
+ def load_data_from_path(file_path: str) -> pd.DataFrame:
451
+ """
452
+ Load data from a file into a DataFrame based on the file extension.
453
+
454
+ Parameters:
455
+ file_path: The absolute path to the data file.
456
+
457
+ Returns:
458
+ A DataFrame containing the data loaded from the file.
459
+
460
+ Raises:
461
+ ValueError: If the file extension is unsupported.
462
+ """
463
+
464
+ def load_hdf5(file_path: str) -> pd.DataFrame:
465
+ """Helper function to load HDF5 files and select a key if necessary."""
466
+ with pd.HDFStore(file_path, mode='r') as store:
467
+ available_keys = store.keys()
468
+ if len(available_keys) == 1:
469
+ df = pd.read_hdf(file_path, key=available_keys[0])
470
+ print(f"Loaded key: {available_keys[0]}")
471
+ else:
472
+ while True:
473
+ print("Available keys:", available_keys)
474
+ key = input("Enter the key for the HDF5 dataset: ").strip()
475
+ if key in available_keys:
476
+ df = pd.read_hdf(file_path, key=key)
477
+ break
478
+ else:
479
+ print(f"Key '{key}' is not in the available keys. Please try again.")
480
+ return df
481
+
482
+ # Ensure the file path is absolute
483
+ file_path = os.path.abspath(file_path)
484
+
485
+ # Determine file type by extension
486
+ file_extension = file_path.split('.')[-1].lower()
487
+
488
+ # Load data based on file type
489
+ if file_extension == 'csv':
490
+ df = pd.read_csv(file_path, dtype=str)
491
+ df.replace('', None, inplace=True)
492
+ elif file_extension in ['xls', 'xlsx']:
493
+ df = pd.read_excel(file_path)
494
+ elif file_extension == 'json':
495
+ df = pd.read_json(file_path)
496
+ elif file_extension == 'parquet':
497
+ df = pd.read_parquet(file_path)
498
+ elif file_extension in ['h5', 'hdf5']:
499
+ df = load_hdf5(file_path)
500
+ elif file_extension == 'feather':
501
+ df = pd.read_feather(file_path)
502
+ elif file_extension == 'pkl':
503
+ df = pd.read_pickle(file_path)
504
+ else:
505
+ raise ValueError(f"Unsupported file extension: {file_extension}")
506
+
507
+ gc.collect()
508
+ return df
509
+
510
+
511
+ def load_data_from_sqlite_path(sqlite_path: str, query: str) -> pd.DataFrame:
512
+ """
513
+ Execute a query on a SQLite database specified by its path and return the results as a DataFrame.
514
+
515
+ Parameters:
516
+ sqlite_path: The absolute path to the SQLite database file.
517
+ query: The SQL query to execute.
518
+
519
+ Returns:
520
+ A DataFrame containing the query results.
521
+
522
+ Raises:
523
+ ValueError: If there is a problem executing the query.
524
+ """
525
+
526
+ # Ensure the file path is absolute
527
+ sqlite_path = os.path.abspath(sqlite_path)
528
+
529
+ try:
530
+ with sqlite3.connect(sqlite_path) as conn:
531
+ df = pd.read_sql_query(query, conn)
532
+ except sqlite3.Error as e:
533
+ raise ValueError(f"SQLite error: {e}")
534
+
535
+ gc.collect()
536
+ return df
537
+
538
+ def first_n_rows(df: pd.DataFrame, n: int) -> None:
539
+ """Print the first n rows of the DataFrame."""
540
+ if df is not None:
541
+ first_n_rows = df.head(n).to_dict(orient="records")
542
+ for row in first_n_rows:
543
+ pprint(row, indent=4)
544
+ print()
545
+ else:
546
+ raise ValueError("No DataFrame to display. Please provide a DataFrame.")
547
+
548
+ gc.collect()
549
+
550
+ def last_n_rows(df: pd.DataFrame, n: int) -> None:
551
+ """Print the last n rows of the DataFrame."""
552
+ if df is not None:
553
+ last_n_rows = df.tail(n).to_dict(orient="records")
554
+ for row in last_n_rows:
555
+ pprint(row, indent=4)
556
+ print()
557
+ else:
558
+ raise ValueError("No DataFrame to display. Please provide a DataFrame.")
559
+
560
+ gc.collect()
561
+
562
+ def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
563
+ """Print top n unique values for specified columns in the DataFrame."""
564
+ if df is not None:
565
+ report = {}
566
+ for column in columns:
567
+ if column in df.columns:
568
+ frequency = df[column].astype(str).value_counts(dropna=False)
569
+ frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
570
+ top_n_values = frequency.nlargest(n)
571
+ report[column] = {str(value): str(count) for value, count in top_n_values.items()}
572
+ print(f"Top {n} unique values for column '{column}':\n{json.dumps(report[column], indent=2)}\n")
573
+ else:
574
+ print(f"Column '{column}' does not exist in the DataFrame.")
575
+ else:
576
+ raise ValueError("No DataFrame to display. Please provide a DataFrame.")
577
+
578
+ gc.collect()
579
+
580
+ def bottom_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
581
+ """Print bottom n unique values for specified columns in the DataFrame."""
582
+ if df is not None:
583
+ report = {}
584
+ for column in columns:
585
+ if column in df.columns:
586
+ frequency = df[column].astype(str).value_counts(dropna=False)
587
+ frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
588
+ bottom_n_values = frequency.nsmallest(n)
589
+ report[column] = {str(value): str(count) for value, count in bottom_n_values.items()}
590
+ print(f"Bottom {n} unique values for column '{column}':\n{json.dumps(report[column], indent=2)}\n")
591
+ else:
592
+ print(f"Column '{column}' does not exist in the DataFrame.")
593
+ else:
594
+ raise ValueError("No DataFrame to display. Please provide a DataFrame.")
595
+
596
+ gc.collect()
597
+
598
+ def print_correlation(df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) -> None:
599
+ """Print correlation for multiple pairs of columns in the DataFrame."""
600
+ if df is not None:
601
+ for col1, col2 in column_pairs:
602
+ if col1 in df.columns and col2 in df.columns:
603
+ try:
604
+ numeric_col1 = pd.to_numeric(df[col1], errors='coerce')
605
+ numeric_col2 = pd.to_numeric(df[col2], errors='coerce')
606
+
607
+ correlation = numeric_col1.corr(numeric_col2)
608
+ if pd.notnull(correlation):
609
+ print(f"The correlation between '{col1}' and '{col2}' is {correlation}.")
610
+ else:
611
+ print(f"Cannot calculate correlation between '{col1}' and '{col2}' due to insufficient numeric data.")
612
+ except Exception as e:
613
+ print(f"Error processing columns '{col1}' and '{col2}': {e}")
614
+ else:
615
+ print(f"One or both of the specified columns ('{col1}', '{col2}') do not exist in the DataFrame.")
616
+ else:
617
+ print("The DataFrame is empty.")
618
+
619
+ gc.collect()
620
+
621
+ def print_memory_usage(df: pd.DataFrame) -> None:
622
+ """Print memory usage of the DataFrame."""
623
+ if df is not None:
624
+ memory_usage = df.memory_usage(deep=True).sum() / (1024 * 1024) # Convert bytes to MB
625
+ print(f"Memory usage of DataFrame: {memory_usage:.2f} MB")
626
+ else:
627
+ raise ValueError("No DataFrame to print. Please provide a DataFrame.")
628
+
629
+ gc.collect()
630
+
631
+ def filter_dataframe(df: pd.DataFrame, filter_expr: str) -> pd.DataFrame:
632
+ """Filter DataFrame with a given expression."""
633
+ if df is not None:
634
+ try:
635
+ filtered_df = df.query(filter_expr)
636
+ except Exception:
637
+ filtered_df = df[df.eval(filter_expr)]
638
+ else:
639
+ raise ValueError("No DataFrame to filter. Please provide a DataFrame.")
640
+
641
+ gc.collect()
642
+
643
+ return filtered_df
644
+
645
+ def filter_indian_mobiles(df: pd.DataFrame, mobile_col: str) -> pd.DataFrame:
646
+ """Filter DataFrame for Indian mobile numbers."""
647
+ if df is not None:
648
+ filtered_df = df[
649
+ df[mobile_col].apply(
650
+ lambda x: (
651
+ str(x).isdigit() and
652
+ str(x).startswith(('6', '7', '8', '9')) and
653
+ len(set(str(x))) >= 4
654
+ )
655
+ )
656
+ ]
657
+ else:
658
+ raise ValueError("No DataFrame to filter. Please provide a DataFrame.")
659
+
660
+ gc.collect()
661
+
662
+ return filtered_df
663
+
664
+ def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:
665
+ """
666
+ Print the DataFrame and its column types. If a source path is provided, print it as well.
667
+
668
+ Parameters:
669
+ df: The DataFrame to print.
670
+ source: Optional; The source path of the DataFrame for logging purposes.
671
+ """
672
+ if df is not None:
673
+ print(df)
674
+ columns_with_types = [f"{col} ({df[col].dtypes})" for col in df.columns]
675
+ print("Columns:", columns_with_types)
676
+ if source:
677
+ print(f"Source: {source}")
678
+ else:
679
+ raise ValueError("No DataFrame to print. Please provide a DataFrame.")
680
+
681
+ gc.collect()
682
+
683
+ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
684
+ """
685
+ Send a DataFrame via Telegram using a specified bot configuration.
686
+
687
+ Parameters:
688
+ df: The DataFrame to send.
689
+ bot_name: The name of the Telegram bot as specified in the configuration.
690
+ message: Custom message to send along with the DataFrame or file.
691
+ as_file: Boolean flag to decide whether to send the DataFrame as a file or as text.
692
+ remove_after_send: If True, removes the file after sending.
693
+ """
694
+
695
+ def locate_config_file(filename: str = "rgwml.config") -> str:
696
+ """Retrieve the configuration file path."""
697
+ home_dir = os.path.expanduser("~")
698
+ search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
699
+
700
+ for path in search_paths:
701
+ for root, _, files in os.walk(path):
702
+ if filename in files:
703
+ return os.path.join(root, filename)
704
+ raise FileNotFoundError(f"{filename} not found in Desktop, Documents, or Downloads folders")
705
+
706
+ def get_config(config_path: str) -> dict:
707
+ """Load configuration from a json file."""
708
+ with open(config_path, 'r') as file:
709
+ return json.load(file)
710
+
711
+ config_path = locate_config_file()
712
+ config = get_config(config_path)
713
+
714
+ bot_config = next((bot for bot in config['telegram_bot_presets'] if bot['name'] == bot_name), None)
715
+ if not bot_config:
716
+ raise ValueError(f"No bot found with the name {bot_name}")
717
+
718
+ if df is None:
719
+ raise ValueError("No DataFrame to send. Please provide a DataFrame.")
720
+
721
+ if as_file:
722
+ timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
723
+ file_name = f"df_{timestamp}.csv"
724
+ df.to_csv(file_name, index=False)
725
+ try:
726
+ with open(file_name, 'rb') as file:
727
+ payload = {'chat_id': bot_config['chat_id'], 'caption': message or ''}
728
+ files = {'document': file}
729
+ response = requests.post(f"https://api.telegram.org/bot{bot_config['bot_token']}/sendDocument", data=payload, files=files)
730
+ if remove_after_send and os.path.exists(file_name):
731
+ os.remove(file_name)
732
+ except Exception as e:
733
+ print(f"Failed to send document: {e}")
734
+ raise
735
+ else:
736
+ df_str = df.to_string()
737
+ payload = {'chat_id': bot_config['chat_id'], 'text': message + "\n\n" + df_str if message else df_str, 'parse_mode': 'HTML'}
738
+ response = requests.post(f"https://api.telegram.org/bot{bot_config['bot_token']}/sendMessage", data=payload)
739
+
740
+ if not response.ok:
741
+ raise Exception(f"Error sending message: {response.text}")
742
+
743
+ print("Message sent successfully.")
744
+
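A sketch of the telegram_bot_presets entry this function reads from rgwml.config, plus a call; the preset name, token and chat id are placeholders, and a real bot token is required for the call to succeed:

# {
#   "telegram_bot_presets": [
#     {"name": "alerts_bot", "bot_token": "<token>", "chat_id": "<chat_id>"}
#   ]
# }

import pandas as pd
from rgwfuncs.df_lib import send_dataframe_via_telegram

df = pd.DataFrame({'kpi': ['signups'], 'value': [42]})
send_dataframe_via_telegram(df, 'alerts_bot', message='Daily snapshot', as_file=True)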
745
+ def send_data_to_email(
746
+ df: pd.DataFrame,
747
+ preset_name: str,
748
+ to_email: str,
749
+ subject: Optional[str] = None,
750
+ body: Optional[str] = None,
751
+ as_file: bool = True,
752
+ remove_after_send: bool = True
753
+ ) -> None:
754
+ """
755
+ Send an email with optional DataFrame attachment using Gmail API via a specified preset.
756
+
757
+ Parameters:
758
+ df: The DataFrame to send.
759
+ preset_name: The configuration preset name to use for sending the email.
760
+ to_email: The recipient email address.
761
+ subject: Optional subject of the email.
762
+ body: Optional message body of the email.
763
+ as_file: Boolean flag to decide whether to send the DataFrame as a file.
764
+ remove_after_send: If True, removes the CSV file after sending.
765
+ """
766
+
767
+ def locate_config_file(filename: str = "rgwml.config") -> str:
768
+ """Locate config file in common user directories."""
769
+ home_dir = os.path.expanduser("~")
770
+ search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
771
+
772
+ for path in search_paths:
773
+ for root, _, files in os.walk(path):
774
+ if filename in files:
775
+ return os.path.join(root, filename)
776
+ raise FileNotFoundError(f"{filename} not found in Desktop, Documents, or Downloads folders")
777
+
778
+ def get_config(config_path: str) -> dict:
779
+ """Load configuration from a json file."""
780
+ with open(config_path, 'r') as file:
781
+ try:
782
+ return json.load(file)
783
+ except json.JSONDecodeError as e:
784
+ raise ValueError(f"Invalid JSON format in config file: {e}")
785
+
786
+ def authenticate_service_account(service_account_credentials_path: str, sender_email_id: str) -> Any:
787
+ """Authenticate the service account and return a Gmail API service instance."""
788
+ credentials = service_account.Credentials.from_service_account_file(
789
+ service_account_credentials_path,
790
+ scopes=['https://mail.google.com/'],
791
+ subject=sender_email_id
792
+ )
793
+ return build('gmail', 'v1', credentials=credentials)
794
+
795
+ # Load configuration
796
+ config_path = locate_config_file()
797
+ config = get_config(config_path)
798
+
799
+ # Retrieve Gmail preset configuration
800
+ gmail_config = next((preset for preset in config['gmail_bot_presets'] if preset['name'] == preset_name), None)
801
+ if not gmail_config:
802
+ raise ValueError(f"No preset found with the name {preset_name}")
803
+
804
+ sender_email = gmail_config['name']
805
+ credentials_path = gmail_config['service_account_credentials_path']
806
+
807
+ # Authenticate and get the Gmail service
808
+ service = authenticate_service_account(credentials_path, sender_email)
809
+
810
+ if as_file:
811
+ # Create a temporary file for the DataFrame as CSV
812
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
813
+ tmp_file_name = tmp_file.name
814
+ df.to_csv(tmp_file_name, index=False)
815
+
816
+ # Create email with attachment
817
+ try:
818
+ message = MIMEMultipart()
819
+ message['to'] = to_email
820
+ message['from'] = sender_email
821
+ message['subject'] = subject if subject else 'DataFrame CSV File'
822
+ message.attach(MIMEText(body if body else 'Please find the CSV file attached.'))
823
+
824
+ with open(tmp_file_name, 'rb') as file:
825
+ part = MIMEBase('application', 'octet-stream')
826
+ part.set_payload(file.read())
827
+ encoders.encode_base64(part)
828
+ part.add_header('Content-Disposition', f'attachment; filename={os.path.basename(tmp_file_name)}')
829
+ message.attach(part)
830
+
831
+ if remove_after_send and os.path.exists(tmp_file_name):
832
+ os.remove(tmp_file_name)
833
+
834
+ except Exception as e:
835
+ raise Exception(f"Failed to prepare the document: {e}")
836
+
837
+ else:
838
+ # Create email body as plain text
839
+ df_str = df.to_string()
840
+ full_body = body + "\n\n" + df_str if body else df_str
841
+ message = MIMEText(full_body)
842
+ message['to'] = to_email
843
+ message['from'] = sender_email
844
+ message['subject'] = subject or 'DataFrame Content'
845
+
846
+ # Sending the email
847
+ try:
848
+ raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
849
+ email_body = {'raw': raw}
850
+ sent_message = service.users().messages().send(userId="me", body=email_body).execute()
851
+ print(f"Email with Message Id {sent_message['id']} successfully sent.")
852
+ except Exception as error:
853
+ raise Exception(f"Error sending email: {error}")
854
+
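A sketch of the gmail_bot_presets entry used by send_data_to_email; note that the preset 'name' doubles as the sender address, and the paths and addresses below are placeholders:

# {
#   "gmail_bot_presets": [
#     {"name": "reports@example.com",
#      "service_account_credentials_path": "/path/to/service_account.json"}
#   ]
# }

import pandas as pd
from rgwfuncs.df_lib import send_data_to_email

df = pd.DataFrame({'kpi': ['signups'], 'value': [42]})
send_data_to_email(df, 'reports@example.com', 'analyst@example.com',
                   subject='Weekly export', body='See attached CSV.', as_file=True)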
855
+ def send_data_to_slack(
856
+ df: pd.DataFrame,
857
+ bot_name: str,
858
+ message: Optional[str] = None,
859
+ as_file: bool = True,
860
+ remove_after_send: bool = True
861
+ ) -> None:
862
+ """
863
+ Send a DataFrame or message to Slack using a specified bot configuration.
864
+
865
+ Parameters:
866
+ df: The DataFrame to send.
867
+ bot_name: The Slack bot configuration preset name.
868
+ message: Custom message to send along with the DataFrame or file.
869
+ as_file: Boolean flag to decide whether to send the DataFrame as a file.
870
+ remove_after_send: If True, removes the CSV file after sending.
871
+ """
872
+
873
+ def locate_config_file(filename: str = "rgwml.config") -> str:
874
+ """Locate config file in common user directories."""
875
+ home_dir = os.path.expanduser("~")
876
+ search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
877
+
878
+ for path in search_paths:
879
+ for root, _, files in os.walk(path):
880
+ if filename in files:
881
+ return os.path.join(root, filename)
882
+ raise FileNotFoundError(f"{filename} not found in Desktop, Documents, or Downloads folders")
883
+
884
+ def get_config(config_path: str) -> dict:
885
+ """Load configuration from a JSON file."""
886
+ with open(config_path, 'r') as file:
887
+ return json.load(file)
888
+
889
+ # Load the Slack configuration
890
+ config_path = locate_config_file()
891
+ config = get_config(config_path)
892
+
893
+ bot_config = next((bot for bot in config['slack_bot_presets'] if bot['name'] == bot_name), None)
894
+ if not bot_config:
895
+ raise ValueError(f"No bot found with the name {bot_name}")
896
+
897
+ client = WebClient(token=bot_config['bot_token'])
898
+
899
+ if as_file:
900
+ # Create a temporary file for the DataFrame as CSV
901
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
902
+ file_name = tmp_file.name
903
+ df.to_csv(file_name, index=False)
904
+
905
+ try:
906
+ with open(file_name, 'rb') as file:
907
+ response = client.files_upload(
908
+ channels=bot_config['channel_id'],
909
+ file=file,
910
+ filename=os.path.basename(file_name),
911
+ title="DataFrame Upload",
912
+ initial_comment=message or ''
913
+ )
914
+ finally:
915
+ if remove_after_send and os.path.exists(file_name):
916
+ os.remove(file_name)
917
+ else:
918
+ df_str = df.to_string()
919
+ response = client.chat_postMessage(
920
+ channel=bot_config['channel_id'],
921
+ text=(message + "\n\n" + df_str) if message else df_str
922
+ )
923
+
924
+ # Check if the message was sent successfully
925
+ if not response["ok"]:
926
+ raise Exception(f"Error sending message: {response['error']}")
927
+
928
+ print("Message sent successfully.")
929
+
930
+ def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:
931
+ """
932
+ Reorder the columns of the DataFrame based on a string input.
933
+
934
+ Parameters:
935
+ df: The DataFrame whose columns will be reordered.
936
+ column_order_str: A comma-separated string specifying the desired column order; '...' acts as a placeholder for all columns not named explicitly.
937
+
938
+ Returns:
939
+ A new DataFrame with reordered columns.
940
+
941
+ Raises:
942
+ ValueError: If a specified column does not exist in the DataFrame.
943
+ """
944
+ if df is None:
945
+ raise ValueError("No DataFrame to reorder. Please provide a valid DataFrame.")
946
+
947
+ columns = df.columns.tolist()
948
+ parts = [part.strip() for part in column_order_str.split(',')]
949
+
950
+ new_order = []
951
+ seen = set()
952
+
953
+ for part in parts:
954
+ if part == '...':
955
+ continue
956
+ elif part in columns:
957
+ new_order.append(part)
958
+ seen.add(part)
959
+ else:
960
+ raise ValueError(f"Column '{part}' not found in DataFrame.")
961
+
962
+ remaining = [col for col in columns if col not in seen]
963
+
964
+ # Determine the position of '...' and arrange the columns
965
+ if parts[0] == '...':
966
+ new_order = remaining + new_order
967
+ elif parts[-1] == '...':
968
+ new_order = new_order + remaining
969
+ else:
970
+ pos = parts.index('...')
971
+ new_order = new_order[:pos] + remaining + new_order[pos:]
972
+
973
+ return df[new_order]
974
+
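A usage sketch for order_columns showing the '...' placeholder (hypothetical columns):

import pandas as pd
from rgwfuncs.df_lib import order_columns

df = pd.DataFrame(columns=['id', 'name', 'city', 'amount'])
# Put 'amount' first and 'id' last; '...' expands to every column not named explicitly
reordered = order_columns(df, 'amount, ..., id')   # amount, name, city, id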
975
+ def append_ranged_classification_column(
976
+ df: pd.DataFrame,
977
+ ranges: str,
978
+ target_col: str,
979
+ new_col_name: str
980
+ ) -> pd.DataFrame:
981
+ """
982
+ Append a ranged classification column to the DataFrame.
983
+
984
+ Parameters:
985
+ df: The DataFrame to modify.
986
+ ranges: A string representation of numeric ranges separated by commas.
987
+ target_col: The column to analyze.
988
+ new_col_name: The name of the new classification column.
989
+
990
+ Returns:
991
+ A new DataFrame with the classification column appended.
992
+ """
993
+
994
+ def pad_number(number, integer_length, decimal_length=0, decimal=False):
995
+ """Pad number to have a consistent length for integer and decimal parts."""
996
+ if decimal:
997
+ str_number = f"{number:.{decimal_length}f}"
998
+ integer_part, decimal_part = str_number.split('.')
999
+ padded_integer_part = integer_part.zfill(integer_length)
1000
+ return f"{padded_integer_part}.{decimal_part}"
1001
+ else:
1002
+ return str(int(number)).zfill(integer_length)
1003
+
1004
+ range_list = ranges.split(',')
1005
+ has_decimals = any('.' in r for r in range_list)
1006
+
1007
+ if has_decimals:
1008
+ range_list = [float(r) for r in range_list]
1009
+ max_decimal_length = max(len(str(r).split('.')[1]) for r in range_list if '.' in str(r))
1010
+ max_integer_length = max(len(str(int(float(r)))) for r in range_list)
1011
+ labels = [f"{pad_number(range_list[i], max_integer_length, max_decimal_length, decimal=True)} to {pad_number(range_list[i + 1], max_integer_length, max_decimal_length, decimal=True)}" for i in range(len(range_list) - 1)]
1012
+ else:
1013
+ range_list = [int(r) for r in range_list]
1014
+ max_integer_length = max(len(str(r)) for r in range_list)
1015
+ labels = [f"{pad_number(range_list[i], max_integer_length)} to {pad_number(range_list[i + 1], max_integer_length)}" for i in range(len(range_list) - 1)]
1016
+
1017
+ # Ensure the target column is numeric
1018
+ df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
1019
+
1020
+ df[new_col_name] = pd.cut(df[target_col], bins=range_list, labels=labels, right=False, include_lowest=True)
1021
+
1022
+ return df
1023
+
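A usage sketch for append_ranged_classification_column (hypothetical data); the ranges string lists the bin edges:

import pandas as pd
from rgwfuncs.df_lib import append_ranged_classification_column

df = pd.DataFrame({'age': [5, 17, 34, 60]})
# Buckets [0, 18), [18, 40) and [40, 100), labelled with zero-padded range strings
df = append_ranged_classification_column(df, '0,18,40,100', 'age', 'age_band')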
1024
+ def append_percentile_classification_column(
1025
+ df: pd.DataFrame,
1026
+ percentiles: str,
1027
+ target_col: str,
1028
+ new_col_name: str
1029
+ ) -> pd.DataFrame:
1030
+ """
1031
+ Append a percentile classification column to the DataFrame.
1032
+
1033
+ Parameters:
1034
+ df: The DataFrame to modify.
1035
+ percentiles: A string representation of percentile values separated by commas.
1036
+ target_col: The column to analyze.
1037
+ new_col_name: The name of the new classification column.
1038
+
1039
+ Returns:
1040
+ A new DataFrame with the classification column appended.
1041
+ """
1042
+
1043
+ def pad_number(number, integer_length, decimal_length=0, decimal=False):
1044
+ """Pad number to have a consistent length for integer and decimal parts."""
1045
+ if decimal:
1046
+ str_number = f"{number:.{decimal_length}f}"
1047
+ integer_part, decimal_part = str_number.split('.')
1048
+ padded_integer_part = integer_part.zfill(integer_length)
1049
+ return f"{padded_integer_part}.{decimal_part}"
1050
+ else:
1051
+ return str(int(number)).zfill(integer_length)
1052
+
1053
+ percentiles_list = percentiles.split(',')
1054
+ has_decimals = any('.' in p for p in percentiles_list)
1055
+
1056
+ if has_decimals:
1057
+ percentiles_list = [float(p) for p in percentiles_list]
1058
+ max_decimal_length = max(len(str(p).split('.')[1]) for p in percentiles_list if '.' in str(p))
1059
+ max_integer_length = max(len(str(int(float(p)))) for p in percentiles_list)
1060
+ labels = [f"{pad_number(percentiles_list[i], max_integer_length, max_decimal_length, decimal=True)} to {pad_number(percentiles_list[i + 1], max_integer_length, max_decimal_length, decimal=True)}" for i in range(len(percentiles_list) - 1)]
1061
+ else:
1062
+ percentiles_list = [int(p) for p in percentiles_list]
1063
+ max_integer_length = max(len(str(p)) for p in percentiles_list)
1064
+ labels = [f"{pad_number(percentiles_list[i], max_integer_length)} to {pad_number(percentiles_list[i + 1], max_integer_length)}" for i in range(len(percentiles_list) - 1)]
1065
+
1066
+ # Ensure the target column is numeric
1067
+ df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
1068
+ quantiles = [df[target_col].quantile(p / 100) for p in percentiles_list]
1069
+
1070
+ df[new_col_name] = pd.cut(df[target_col], bins=quantiles, labels=labels, include_lowest=True)
1071
+
1072
+ return df
1073
+
1074
+ def append_ranged_date_classification_column(
1075
+ df: pd.DataFrame,
1076
+ date_ranges: str,
1077
+ target_col: str,
1078
+ new_col_name: str
1079
+ ) -> pd.DataFrame:
1080
+ """
1081
+ Append a ranged date classification column to the DataFrame.
1082
+
1083
+ Parameters:
1084
+ df: The DataFrame to modify.
1085
+ date_ranges: A string representation of date ranges separated by commas.
1086
+ target_col: The date column to analyze.
1087
+ new_col_name: The name of the new date classification column.
1088
+
1089
+ Returns:
1090
+ A new DataFrame with the date classification column appended.
1091
+ """
1092
+
1093
+ date_list = [pd.to_datetime(date) for date in date_ranges.split(',')]
1094
+ labels = [f"{date_list[i].strftime('%Y-%m-%d')} to {date_list[i + 1].strftime('%Y-%m-%d')}" for i in range(len(date_list) - 1)]
1095
+
1096
+ df[new_col_name] = pd.cut(pd.to_datetime(df[target_col]), bins=date_list, labels=labels, right=False)
1097
+
1098
+ return df
1099
+
1100
+ def rename_columns(df: pd.DataFrame, rename_pairs: Dict[str, str]) -> pd.DataFrame:
1101
+ """
1102
+ Rename columns in the DataFrame.
1103
+
1104
+ Parameters:
1105
+ df: The DataFrame to modify.
1106
+ rename_pairs: A dictionary mapping old column names to new column names.
1107
+
1108
+ Returns:
1109
+ A new DataFrame with columns renamed.
1110
+ """
1111
+ if df is None:
1112
+ raise ValueError("No DataFrame to rename columns. Please provide a valid DataFrame.")
1113
+
1114
+ return df.rename(columns=rename_pairs)
1115
+
1116
+ def cascade_sort(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
1117
+ """
1118
+ Cascade sort the DataFrame by specified columns and order.
1119
+
1120
+ Parameters:
1121
+ df: The DataFrame to sort.
1122
+ columns: A list of column names with sorting order, e.g., ['Column1::ASC', 'Column2::DESC'].
1123
+
1124
+ Returns:
1125
+ A new DataFrame sorted by specified columns.
1126
+ """
1127
+ if df is None:
1128
+ raise ValueError("No DataFrame to sort. Please provide a valid DataFrame.")
1129
+
1130
+ col_names = []
1131
+ asc_order = []
1132
+
1133
+ # Parse the columns and sorting order
1134
+ for col in columns:
1135
+ if "::" in col:
1136
+ name, order = col.split("::")
1137
+ col_names.append(name)
1138
+ asc_order.append(order.upper() == "ASC")
1139
+ else:
1140
+ col_names.append(col)
1141
+ asc_order.append(True)
1142
+
1143
+ # Ensure all specified columns exist
1144
+ for name in col_names:
1145
+ if name not in df.columns:
1146
+ raise ValueError(f"Column {name} not found in DataFrame")
1147
+
1148
+ return df.sort_values(by=col_names, ascending=asc_order)
1149
+
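A usage sketch for cascade_sort using the 'Column::ORDER' convention (hypothetical data):

import pandas as pd
from rgwfuncs.df_lib import cascade_sort

df = pd.DataFrame({'city': ['Pune', 'Delhi', 'Pune'], 'amount': [120, 80, 300]})
# Sort by city ascending, then amount descending
sorted_df = cascade_sort(df, ['city::ASC', 'amount::DESC'])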
1150
+ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
1151
+ """
1152
+ Append XGB training labels based on a ratio string.
1153
+
1154
+ Parameters:
1155
+ df: The DataFrame to modify.
1156
+ ratio_str: A string specifying the ratio of TRAIN:TEST or TRAIN:VALIDATE:TEST.
1157
+
1158
+ Returns:
1159
+ A new DataFrame with XGB_TYPE labels appended.
1160
+ """
1161
+ if df is None:
1162
+ raise ValueError("No DataFrame to add labels. Please provide a valid DataFrame.")
1163
+
1164
+ ratios = list(map(int, ratio_str.split(':')))
1165
+ total_ratio = sum(ratios)
1166
+ total_rows = len(df)
1167
+
1168
+ if len(ratios) == 2:
1169
+ train_rows = (ratios[0] * total_rows) // total_ratio
1170
+ test_rows = total_rows - train_rows
1171
+ labels = ['TRAIN'] * train_rows + ['TEST'] * test_rows
1172
+ elif len(ratios) == 3:
1173
+ train_rows = (ratios[0] * total_rows) // total_ratio
1174
+ validate_rows = (ratios[1] * total_rows) // total_ratio
1175
+ test_rows = total_rows - train_rows - validate_rows
1176
+ labels = ['TRAIN'] * train_rows + ['VALIDATE'] * validate_rows + ['TEST'] * test_rows
1177
+ else:
1178
+ raise ValueError("Invalid ratio string format. Use 'TRAIN:TEST' or 'TRAIN:VALIDATE:TEST'.")
1179
+
1180
+ df_with_labels = df.copy()
1181
+ df_with_labels['XGB_TYPE'] = labels
1182
+
1183
+ return df_with_labels
1184
+
1185
+ def append_xgb_regression_predictions(
1186
+ df: pd.DataFrame,
1187
+ target_col: str,
1188
+ feature_cols: str,
1189
+ pred_col: str,
1190
+ boosting_rounds: int = 100,
1191
+ model_path: Optional[str] = None
1192
+ ) -> pd.DataFrame:
1193
+ """
1194
+ Append XGB regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.
1195
+
1196
+ Parameters:
1197
+ df: DataFrame to modify.
1198
+ target_col: The target column for regression.
1199
+ feature_cols: Comma-separated string of feature columns.
1200
+ pred_col: Name of the prediction column.
1201
+ boosting_rounds: (Optional) Number of boosting rounds for training.
1202
+ model_path: (Optional) Path to save the trained model.
1203
+
1204
+ Returns:
1205
+ DataFrame with predictions appended.
1206
+ """
1207
+ if df is None or 'XGB_TYPE' not in df.columns:
1208
+ raise ValueError("DataFrame is not initialized or 'XGB_TYPE' column is missing.")
1209
+
1210
+ features = feature_cols.replace(' ', '').split(',')
1211
+
1212
+ # Convert categorical columns to 'category' dtype
1213
+ for col in features:
1214
+ if df[col].dtype == 'object':
1215
+ df[col] = df[col].astype('category')
1216
+
1217
+ train_data = df[df['XGB_TYPE'] == 'TRAIN']
1218
+ validate_data = df[df['XGB_TYPE'] == 'VALIDATE'] if 'VALIDATE' in df['XGB_TYPE'].values else None
1219
+
1220
+ dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
1221
+ evals = [(dtrain, 'train')]
1222
+
1223
+ if validate_data is not None:
1224
+ dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
1225
+ evals.append((dvalidate, 'validate'))
1226
+
1227
+ params = {
1228
+ 'objective': 'reg:squarederror',
1229
+ 'eval_metric': 'rmse'
1230
+ }
1231
+
1232
+ model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
1233
+
1234
+ # Make predictions for all data
1235
+ dall = xgb.DMatrix(df[features], enable_categorical=True)
1236
+ df[pred_col] = model.predict(dall)
1237
+
1238
+ if model_path:
1239
+ model.save_model(model_path)
1240
+
1241
+ columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
1242
+ df = df[columns_order]
1243
+
1244
+ return df
1245
+
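A hedged end-to-end sketch of the XGB workflow: label rows with append_xgb_labels, then append regression predictions; the tiny dataset and column names are illustrative only:

import pandas as pd
from rgwfuncs.df_lib import append_xgb_labels, append_xgb_regression_predictions

df = pd.DataFrame({'sq_ft': [650, 900, 1200, 1500, 2000, 2400],
                   'bedrooms': [1, 2, 2, 3, 3, 4],
                   'price': [65, 90, 118, 150, 195, 240]})
df = append_xgb_labels(df, '4:1:1')   # adds the XGB_TYPE column (TRAIN/VALIDATE/TEST)
df = append_xgb_regression_predictions(df, 'price', 'sq_ft,bedrooms', 'price_pred',
                                        boosting_rounds=20)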
1246
+ def append_xgb_logistic_regression_predictions(
1247
+ df: pd.DataFrame,
1248
+ target_col: str,
1249
+ feature_cols: str,
1250
+ pred_col: str,
1251
+ boosting_rounds: int = 100,
1252
+ model_path: Optional[str] = None
1253
+ ) -> pd.DataFrame:
1254
+ """
1255
+ Append XGB logistic regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.
1256
+
1257
+ Parameters:
1258
+ df: DataFrame to modify.
1259
+ target_col: The target column for logistic regression.
1260
+ feature_cols: Comma-separated string of feature columns.
1261
+ pred_col: Name of the prediction column.
1262
+ boosting_rounds: (Optional) Number of boosting rounds for training.
1263
+ model_path: (Optional) Path to save the trained model.
1264
+
1265
+ Returns:
1266
+ DataFrame with predictions appended.
1267
+ """
1268
+ if df is None or 'XGB_TYPE' not in df.columns:
1269
+ raise ValueError("DataFrame is not initialized or 'XGB_TYPE' column is missing.")
1270
+
1271
+ features = feature_cols.replace(' ', '').split(',')
1272
+
1273
+ # Convert categorical columns to 'category' dtype
1274
+ for col in features:
1275
+ if df[col].dtype == 'object':
1276
+ df[col] = df[col].astype('category')
1277
+
1278
+ train_data = df[df['XGB_TYPE'] == 'TRAIN']
1279
+ validate_data = df[df['XGB_TYPE'] == 'VALIDATE'] if 'VALIDATE' in df['XGB_TYPE'].values else None
1280
+
1281
+ dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
1282
+ evals = [(dtrain, 'train')]
1283
+
1284
+ if validate_data is not None:
1285
+ dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
1286
+ evals.append((dvalidate, 'validate'))
1287
+
1288
+ params = {
1289
+ 'objective': 'binary:logistic',
1290
+ 'eval_metric': 'auc'
1291
+ }
1292
+
1293
+ model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
1294
+
1295
+ # Make predictions for all data
1296
+ dall = xgb.DMatrix(df[features], enable_categorical=True)
1297
+ df[pred_col] = model.predict(dall)
1298
+
1299
+ if model_path:
1300
+ model.save_model(model_path)
1301
+
1302
+ columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
1303
+ df = df[columns_order]
1304
+
1305
+ return df
1306
+
1307
+ def print_n_frequency_cascading(
1308
+ df: pd.DataFrame,
1309
+ n: int,
1310
+ columns: str,
1311
+ order_by: str = "FREQ_DESC"
1312
+ ) -> None:
1313
+ """
1314
+ Print the cascading frequency of top n values for specified columns.
1315
+
1316
+ Parameters:
1317
+ df: DataFrame to analyze.
1318
+ n: Number of top values to print.
1319
+ columns: Comma-separated column names to analyze.
1320
+ order_by: Order of frequency: ASC, DESC, FREQ_ASC, FREQ_DESC.
1321
+ """
1322
+ columns = [col.strip() for col in columns.split(",")]
1323
+
1324
+ def generate_cascade_report(df, columns, limit, order_by):
1325
+ if not columns:
1326
+ return None
1327
+
1328
+ current_col = columns[0]
1329
+ if current_col not in df.columns:
1330
+ return None
1331
+
1332
+ # Convert the column to string representation
1333
+ df[current_col] = df[current_col].astype(str)
1334
+ frequency = df[current_col].value_counts(dropna=False)
1335
+ frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
1336
+
1337
+ if limit is not None:
1338
+ frequency = frequency.nlargest(limit)
1339
+
1340
+ sorted_frequency = sort_frequency(frequency, order_by)
1341
+
1342
+ report = {}
1343
+ for value, count in sorted_frequency.items():
1344
+ if value in ['NaN', 'NaT', 'None', 'Empty']:
1345
+ filtered_df = df[df[current_col].isna()]
1346
+ else:
1347
+ filtered_df = df[df[current_col] == value]
1348
+
1349
+ if len(columns) > 1:
1350
+ sub_report = generate_cascade_report(filtered_df, columns[1:], limit, order_by)
1351
+ report[value] = {
1352
+ "count": str(count),
1353
+ f"sub_distribution({columns[1]})": sub_report if sub_report else {}
1354
+ }
1355
+ else:
1356
+ report[value] = {
1357
+ "count": str(count)
1358
+ }
1359
+
1360
+ return report
1361
+
1362
+ def sort_frequency(frequency, order_by):
1363
+ if order_by == "ASC":
1364
+ return dict(sorted(frequency.items(), key=lambda item: item[0]))
1365
+ elif order_by == "DESC":
1366
+ return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
1367
+ elif order_by == "FREQ_ASC":
1368
+ return dict(sorted(frequency.items(), key=lambda item: item[1]))
1369
+ else: # Default to "FREQ_DESC"
1370
+ return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
1371
+
1372
+ report = generate_cascade_report(df, columns, n, order_by)
1373
+ print(json.dumps(report, indent=2))
1374
+
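A usage sketch for print_n_frequency_cascading (hypothetical data); print_n_frequency_linear, defined next, takes the same arguments but reports each column independently:

import pandas as pd
from rgwfuncs.df_lib import print_n_frequency_cascading

df = pd.DataFrame({'state': ['MH', 'MH', 'KA', 'KA', 'KA'],
                   'city': ['Pune', 'Mumbai', 'Bengaluru', 'Bengaluru', 'Mysuru']})
# Top 2 states and, within each, the top 2 cities
print_n_frequency_cascading(df, 2, 'state, city', order_by='FREQ_DESC')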
1375
+ def print_n_frequency_linear(
1376
+ df: pd.DataFrame,
1377
+ n: int,
1378
+ columns: str,
1379
+ order_by: str = "FREQ_DESC"
1380
+ ) -> None:
1381
+ """
1382
+ Print the linear frequency of top n values for specified columns.
1383
+
1384
+ Parameters:
1385
+ df: DataFrame to analyze.
1386
+ n: Number of top values to print.
1387
+ columns: Comma-separated column names to analyze.
1388
+ order_by: Order of frequency: ASC, DESC, FREQ_ASC, FREQ_DESC.
1389
+ """
1390
+ columns = [col.strip() for col in columns.split(",")]
1391
+
1392
+ def generate_linear_report(df, columns, limit, order_by):
1393
+ report = {}
1394
+
1395
+ for current_col in columns:
1396
+ if current_col not in df.columns:
1397
+ continue
1398
+
1399
+ frequency = df[current_col].astype(str).value_counts(dropna=False)
1400
+ frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
1401
+
1402
+ if limit is not None:
1403
+ frequency = frequency.nlargest(limit)
1404
+
1405
+ sorted_frequency = sort_frequency(frequency, order_by)
1406
+ col_report = {str(value): str(count) for value, count in sorted_frequency.items()}
1407
+ report[current_col] = col_report
1408
+
1409
+ return report
1410
+
1411
+ def sort_frequency(frequency, order_by):
1412
+ if order_by == "ASC":
1413
+ return dict(sorted(frequency.items(), key=lambda item: item[0]))
1414
+ elif order_by == "DESC":
1415
+ return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
1416
+ elif order_by == "FREQ_ASC":
1417
+ return dict(sorted(frequency.items(), key=lambda item: item[1]))
1418
+ else: # Default to "FREQ_DESC"
1419
+ return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
1420
+
1421
+ report = generate_linear_report(df, columns, n, order_by)
1422
+ print(json.dumps(report, indent=2))
1423
+
1424
+ def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFrame:
1425
+ """
1426
+ Retain specified columns in the DataFrame and drop the others.
1427
+
1428
+ Parameters:
1429
+ df: DataFrame to modify.
1430
+ columns_to_retain: List of column names to retain.
1431
+
1432
+ Returns:
1433
+ A new DataFrame with only the retained columns.
1434
+ """
1435
+ if not isinstance(columns_to_retain, list):
1436
+ raise ValueError("columns_to_retain should be a list of column names.")
1437
+ return df[columns_to_retain]
1438
+
1439
+ def mask_against_dataframe(
1440
+ df: pd.DataFrame,
1441
+ other_df: pd.DataFrame,
1442
+ column_name: str
1443
+ ) -> pd.DataFrame:
1444
+ """
1445
+ Retain only rows with common column values between two DataFrames.
1446
+
1447
+ Parameters:
1448
+ df: DataFrame to modify.
1449
+ other_df: DataFrame to compare against.
1450
+ column_name: Column name to compare.
1451
+
1452
+ Returns:
1453
+ A new DataFrame with rows whose column values exist in both DataFrames.
1454
+ """
1455
+ if column_name not in df.columns or column_name not in other_df.columns:
1456
+ raise ValueError("The specified column must exist in both DataFrames.")
1457
+ return df[df[column_name].isin(other_df[column_name])]
1458
+
1459
+ def mask_against_dataframe_converse(
1460
+ df: pd.DataFrame,
1461
+ other_df: pd.DataFrame,
1462
+ column_name: str
1463
+ ) -> pd.DataFrame:
1464
+ """
1465
+ Retain only rows with uncommon column values between two DataFrames.
1466
+
1467
+ Parameters:
1468
+ df: The primary DataFrame to modify.
1469
+ other_df: The DataFrame to compare against.
1470
+ column_name: The column name to use for comparison.
1471
+
1472
+ Returns:
1473
+ A new DataFrame with rows whose column values do not exist in 'other_df'.
1474
+ """
1475
+ if column_name not in df.columns or column_name not in other_df.columns:
1476
+ raise ValueError("The specified column must exist in both DataFrames.")
1477
+
1478
+ return df[~df[column_name].isin(other_df[column_name])]
1479
+