rgwfuncs 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rgwfuncs/df_lib.py +730 -217
- rgwfuncs-0.0.4.dist-info/METADATA +999 -0
- rgwfuncs-0.0.4.dist-info/RECORD +8 -0
- rgwfuncs-0.0.2.dist-info/METADATA +0 -325
- rgwfuncs-0.0.2.dist-info/RECORD +0 -8
- {rgwfuncs-0.0.2.dist-info → rgwfuncs-0.0.4.dist-info}/LICENSE +0 -0
- {rgwfuncs-0.0.2.dist-info → rgwfuncs-0.0.4.dist-info}/WHEEL +0 -0
- {rgwfuncs-0.0.2.dist-info → rgwfuncs-0.0.4.dist-info}/entry_points.txt +0 -0
- {rgwfuncs-0.0.2.dist-info → rgwfuncs-0.0.4.dist-info}/top_level.txt +0 -0
rgwfuncs/df_lib.py
CHANGED
@@ -18,22 +18,30 @@ import sqlite3
 from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
 from email.mime.base import MIMEBase
+from email import encoders
 from googleapiclient.discovery import build
 import base64
-
+import inspect
+from typing import Optional, Callable, Dict, List, Tuple, Any


 def docs(method_type_filter: Optional[str] = None) -> None:
     """
-    Print a list of function names in alphabetical order. If
-    print the docstrings of the functions
+    Print a list of function names in alphabetical order. If
+    method_type_filter is specified, print the docstrings of the functions
+    that match the filter. Using '*' as a filter will print the docstrings for
+    all functions.

     Parameters:
-        method_type_filter: Optional filter string, comma-separated
+        method_type_filter: Optional filter string, comma-separated to select
+            docstring types, or '*' for all.
     """
     # Get the current module's namespace
+    current_module = __name__
+
     local_functions: Dict[str, Callable] = {
-        name: obj for name, obj in globals().items()
+        name: obj for name, obj in globals().items()
+        if inspect.isfunction(obj) and obj.__module__ == current_module
     }

     # List of function names sorted alphabetically
@@ -44,23 +52,28 @@ def docs(method_type_filter: Optional[str] = None) -> None:
     for name in function_names:
         print(name)

-    # If a filter is provided, print the docstrings of functions
+    # If a filter is provided or '*', print the docstrings of functions
     if method_type_filter:
-        function_type_list: List[str] = [mt.strip() for mt in method_type_filter.split(',')]
         print("\nFiltered function documentation:")
-
        for name, func in local_functions.items():
             docstring: Optional[str] = func.__doc__
             if docstring:
-
-
-
-
-
-
-                if
-
-
+                if method_type_filter == '*':
+                    # Print the entire docstring for each function
+                    print(f"\n{name}:\n{docstring}")
+                else:
+                    # Extract only the first line of the docstring
+                    first_line: str = docstring.split('\n')[0]
+                    if "::" in first_line:
+                        # Find the first occurrence of "::" and split there
+                        split_index: int = first_line.find("::")
+                        function_type: str = first_line[:split_index].strip()
+                        function_type_list: List[str] = [
+                            mt.strip() for mt in method_type_filter.split(',')]
+                        if function_type in function_type_list:
+                            # Print the entire docstring if the filter matches
+                            print(f"\n{name}:\n{docstring}")
+

 def numeric_clean(
     df: pd.DataFrame,
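For reference, a minimal usage sketch of the reworked docs() filter (illustrative only; it assumes the module is importable as rgwfuncs.df_lib, and the 'LOAD, CLEAN' tags are hypothetical docstring type prefixes of the form "TYPE:: ..."):

    from rgwfuncs import df_lib

    # Print every function name plus its full docstring.
    df_lib.docs(method_type_filter='*')

    # Print docstrings only for functions whose first docstring line starts
    # with one of these (made-up) type tags before "::".
    df_lib.docs(method_type_filter='LOAD, CLEAN')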
@@ -70,34 +83,45 @@ def numeric_clean(
 ) -> pd.DataFrame:
     """
     Cleans the numeric columns based on specified treatments.
-
+
     Parameters:
         df: The DataFrame to clean.
-        column_names: A comma-separated string containing the names of the
-
-
+        column_names: A comma-separated string containing the names of the
+            columns to clean.
+        column_type: The type to convert the column to ('INTEGER' or
+            'FLOAT').
+        irregular_value_treatment: How to treat irregular values ('NAN',
+            'TO_ZERO', 'MEAN').

     Returns:
         A new DataFrame with cleaned numeric columns.
     """
     df_copy = df.copy()  # Avoid mutating the original DataFrame
-    columns_list: List[str] = [name.strip()
+    columns_list: List[str] = [name.strip()
+                               for name in column_names.split(',')]

     for column_name in columns_list:
         if column_name not in df_copy.columns:
-            raise ValueError(
+            raise ValueError(
+                f"Column '{column_name}' does not exist in the DataFrame.")

         if column_type not in ['INTEGER', 'FLOAT']:
             raise ValueError("column_type must be 'INTEGER' or 'FLOAT'.")

         if irregular_value_treatment not in ['NAN', 'TO_ZERO', 'MEAN']:
-            raise ValueError(
+            raise ValueError(
+                "irregular_value_treatment must be 'NAN', 'TO_ZERO', or"
+                + "'MEAN'.")

         # Convert column type
         if column_type == 'INTEGER':
-            df_copy[column_name] = pd.to_numeric(
+            df_copy[column_name] = pd.to_numeric(
+                df_copy[column_name],
+                errors='coerce').astype(
+                pd.Int64Dtype())
         elif column_type == 'FLOAT':
-            df_copy[column_name] = pd.to_numeric(
+            df_copy[column_name] = pd.to_numeric(
+                df_copy[column_name], errors='coerce').astype(float)

         # Handle irregular values
         if irregular_value_treatment == 'NAN':
@@ -110,6 +134,7 @@ def numeric_clean(

     return df_copy

+
 def limit_dataframe(df: pd.DataFrame, num_rows: int) -> pd.DataFrame:
     """
     Limit the DataFrame to a specified number of rows.
@@ -120,15 +145,16 @@ def limit_dataframe(df: pd.DataFrame, num_rows: int) -> pd.DataFrame:

     Returns:
         A new DataFrame limited to the specified number of rows.
-
+
     Raises:
         ValueError: If num_rows is not an integer.
     """
     if not isinstance(num_rows, int):
         raise ValueError("The number of rows should be an integer.")
-
+
     return df.head(num_rows)

+
 def from_raw_data(headers: List[str], data: List[List[int]]) -> pd.DataFrame:
     """
     Create a DataFrame from raw data.
@@ -150,13 +176,15 @@ def from_raw_data(headers: List[str], data: List[List[int]]) -> pd.DataFrame:

     return df

+
 def append_rows(df: pd.DataFrame, rows: List[List]) -> pd.DataFrame:
     """
     Append rows to the DataFrame.

     Parameters:
         df: The original DataFrame.
-        rows: A list of lists, where each inner list represents a row to be
+        rows: A list of lists, where each inner list represents a row to be
+            appended.

     Returns:
         A new DataFrame with the appended rows.
@@ -164,7 +192,12 @@ def append_rows(df: pd.DataFrame, rows: List[List]) -> pd.DataFrame:
     Raises:
         ValueError: If rows are not in the correct format.
     """
-    if not isinstance(
+    if not isinstance(
+            rows,
+            list) or not all(
+            isinstance(
+                row,
+                list) for row in rows):
         raise ValueError("Rows should be provided as a list of lists.")

     if df.empty:
@@ -175,6 +208,7 @@ def append_rows(df: pd.DataFrame, rows: List[List]) -> pd.DataFrame:

     return new_df

+
 def append_columns(df: pd.DataFrame, *col_names: str) -> pd.DataFrame:
     """
     Append columns to the DataFrame with None values.
@@ -198,6 +232,7 @@ def append_columns(df: pd.DataFrame, *col_names: str) -> pd.DataFrame:

     return new_df

+
 def update_rows(
     df: pd.DataFrame,
     condition: str,
@@ -209,7 +244,8 @@ def update_rows(
     Parameters:
         df: The original DataFrame.
         condition: A query condition to identify rows for updating.
-        updates: A dictionary with column names as keys and new values as
+        updates: A dictionary with column names as keys and new values as
+            values.

     Returns:
         A new DataFrame with the updated rows.
@@ -227,7 +263,9 @@ def update_rows(

     invalid_cols = [col for col in updates if col not in df.columns]
     if invalid_cols:
-        raise ValueError(
+        raise ValueError(
+            f"Columns {
+                ', '.join(invalid_cols)} do not exist in the DataFrame.")

     new_df = df.copy()
     for col_name, new_value in updates.items():
@@ -235,6 +273,7 @@ def update_rows(

     return new_df

+
 def delete_rows(df: pd.DataFrame, condition: str) -> pd.DataFrame:
     """
     Delete rows from the DataFrame based on a condition.
@@ -258,6 +297,7 @@ def delete_rows(df: pd.DataFrame, condition: str) -> pd.DataFrame:

     return new_df

+
 def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
     """
     Drop duplicate rows in the DataFrame, retaining the first occurrence.
@@ -267,7 +307,7 @@ def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:

     Returns:
         A new DataFrame with duplicates removed.
-
+
     Raises:
         ValueError: If the DataFrame is None.
     """
@@ -275,54 +315,73 @@ def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
         raise ValueError("DataFrame is not initialized.")
     return df.drop_duplicates(keep='first')

-
+
+def drop_duplicates_retain_first(
+        df: pd.DataFrame,
+        columns: Optional[str] = None) -> pd.DataFrame:
     """
-    Drop duplicate rows in the DataFrame based on specified columns, retaining
+    Drop duplicate rows in the DataFrame based on specified columns, retaining
+    the first occurrence.

     Parameters:
         df: The DataFrame from which duplicates will be dropped.
-        columns: A comma-separated string with the column names used to
+        columns: A comma-separated string with the column names used to
+            identify duplicates.

     Returns:
         A new DataFrame with duplicates removed.
-
+
     Raises:
         ValueError: If the DataFrame is None.
     """
     if df is None:
         raise ValueError("DataFrame is not initialized.")
-
-    columns_list = [col.strip()
+
+    columns_list = [col.strip()
+                    for col in columns.split(',')] if columns else None
     return df.drop_duplicates(subset=columns_list, keep='first')

-
+
+def drop_duplicates_retain_last(
+        df: pd.DataFrame,
+        columns: Optional[str] = None) -> pd.DataFrame:
     """
-    Drop duplicate rows in the DataFrame based on specified columns, retaining
+    Drop duplicate rows in the DataFrame based on specified columns, retaining
+    the last occurrence.

     Parameters:
         df: The DataFrame from which duplicates will be dropped.
-        columns: A comma-separated string with the column names used to
+        columns: A comma-separated string with the column names used to
+            identify duplicates.

     Returns:
         A new DataFrame with duplicates removed.
-
+
     Raises:
         ValueError: If the DataFrame is None.
     """
     if df is None:
         raise ValueError("DataFrame is not initialized.")
-
-    columns_list = [col.strip()
+
+    columns_list = [col.strip()
+                    for col in columns.split(',')] if columns else None
     return df.drop_duplicates(subset=columns_list, keep='last')

-
+
+def load_data_from_query(
+        db_preset_name: str,
+        query: str,
+        config_file_name: str = "rgwml.config") -> pd.DataFrame:
     """
-    Load data from a database query into a DataFrame based on a configuration
+    Load data from a database query into a DataFrame based on a configuration
+    preset.

     Parameters:
-        db_preset_name: The name of the database preset in the configuration
+        db_preset_name: The name of the database preset in the configuration
+            file.
         query: The SQL query to execute.
-        config_file_name: Name of the configuration file
+        config_file_name: Name of the configuration file
+            (default: 'rgwml.config').

     Returns:
         A DataFrame containing the query result.
@@ -344,45 +403,56 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
         for root, dirs, files in os.walk(path):
             if filename in files:
                 return os.path.join(root, filename)
-        raise FileNotFoundError(
+        raise FileNotFoundError(
+            f"{filename} not found in Desktop, Documents, or Downloads"
+            + "folders")

     def query_mssql(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
-        """Execute a query on an MSSQL database and return the result as a DataFrame."""
         server = db_preset['host']
         user = db_preset['username']
         password = db_preset['password']
         database = db_preset.get('database', '')

-        with pymssql.connect(server=server, user=user, password=password,
+        with pymssql.connect(server=server, user=user, password=password,
+                             database=database) as conn:
             with conn.cursor() as cursor:
                 cursor.execute(query)
                 rows = cursor.fetchall()
                 columns = [desc[0] for desc in cursor.description]
-
+
         return pd.DataFrame(rows, columns=columns)

     def query_mysql(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
-        """Execute a query on a MySQL database and return the result as a DataFrame."""
         host = db_preset['host']
         user = db_preset['username']
         password = db_preset['password']
         database = db_preset.get('database', '')

-        with mysql.connector.connect(
+        with mysql.connector.connect(
+                host=host,
+                user=user,
+                password=password,
+                database=database
+        ) as conn:
             with conn.cursor() as cursor:
                 cursor.execute(query)
                 rows = cursor.fetchall()
-                columns =
-
+                columns = (
+                    [desc[0] for desc in cursor.description]
+                    if cursor.description
+                    else []
+                )
+
         return pd.DataFrame(rows, columns=columns)

-    def query_clickhouse(
-
+    def query_clickhouse(
+            db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
+
         host = db_preset['host']
         user = db_preset['username']
         password = db_preset['password']
         database = db_preset['database']
-
+
         max_retries = 5
         retry_delay = 5

@@ -405,21 +475,23 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
                 print(f"Retrying in {retry_delay} seconds...")
                 time.sleep(retry_delay)
             else:
-                raise ConnectionError(
+                raise ConnectionError(
+                    "All attempts to connect to ClickHouse failed.")

-    def query_google_big_query(
-
+    def query_google_big_query(
+            db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
         json_file_path = db_preset['json_file_path']
         project_id = db_preset['project_id']

-        credentials = service_account.Credentials.from_service_account_file(
+        credentials = service_account.Credentials.from_service_account_file(
+            json_file_path)
         client = bigquery.Client(credentials=credentials, project=project_id)

         query_job = client.query(query)
         results = query_job.result()
         rows = [list(row.values()) for row in results]
         columns = [field.name for field in results.schema]
-
+
         return pd.DataFrame(rows, columns=columns)

     # Read the configuration file to get the database preset
@@ -428,7 +500,9 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
         config = json.load(f)

     db_presets = config.get('db_presets', [])
-    db_preset = next(
+    db_preset = next(
+        (preset for preset in db_presets if preset['name'] == db_preset_name),
+        None)
    if not db_preset:
         raise ValueError(f"No matching db_preset found for {db_preset_name}")

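For context, load_data_from_query resolves db_preset_name against the 'db_presets' list in rgwml.config. A minimal sketch of writing one such preset follows; the key names mirror those read by the function above, while the db_type value and the credentials are placeholders, not values documented by the package:

    import json

    preset = {
        "name": "analytics_mysql",   # looked up via preset['name']
        "db_type": "mysql",          # assumed value; dispatch key shown in the diff
        "host": "localhost",
        "username": "user",
        "password": "secret",
        "database": "analytics",
    }
    with open("rgwml.config", "w") as f:
        json.dump({"db_presets": [preset]}, f, indent=2)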
@@ -446,7 +520,6 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
         raise ValueError(f"Unsupported db_type: {db_type}")


-
 def load_data_from_path(file_path: str) -> pd.DataFrame:
     """
     Load data from a file into a DataFrame based on the file extension.
@@ -460,7 +533,7 @@ def load_data_from_path(file_path: str) -> pd.DataFrame:
     Raises:
         ValueError: If the file extension is unsupported.
     """
-
+
     def load_hdf5(file_path: str) -> pd.DataFrame:
         """Helper function to load HDF5 files and select a key if necessary."""
         with pd.HDFStore(file_path, mode='r') as store:
@@ -476,7 +549,8 @@ def load_data_from_path(file_path: str) -> pd.DataFrame:
                 df = pd.read_hdf(file_path, key=key)
                 break
             else:
-                print(
+                print(
+                    f"Key '{key}' is not in the available keys.")
         return df

     # Ensure the file path is absolute
@@ -510,7 +584,8 @@ def load_data_from_path(file_path: str) -> pd.DataFrame:

 def load_data_from_sqlite_path(sqlite_path: str, query: str) -> pd.DataFrame:
     """
-    Execute a query on a SQLite database specified by its path and return the
+    Execute a query on a SQLite database specified by its path and return the
+    results as a DataFrame.

     Parameters:
         sqlite_path: The absolute path to the SQLite database file.
@@ -522,7 +597,7 @@ def load_data_from_sqlite_path(sqlite_path: str, query: str) -> pd.DataFrame:
     Raises:
         ValueError: If there is a problem executing the query.
     """
-
+
     # Ensure the file path is absolute
     sqlite_path = os.path.abspath(sqlite_path)

@@ -535,68 +610,168 @@ def load_data_from_sqlite_path(sqlite_path: str, query: str) -> pd.DataFrame:
     gc.collect()
     return df

+
 def first_n_rows(df: pd.DataFrame, n: int) -> None:
-    """
+    """
+    Display the first n rows of the DataFrame.
+
+    This function prints out the first `n` rows of a given DataFrame. Each row
+    is formatted for clarity and
+    printed as a dictionary. If the DataFrame is empty or `None`, it raises a
+    ValueError.
+
+    Parameters:
+    - df (pd.DataFrame): The DataFrame to display rows from.
+    - n (int): The number of rows to display from the start of the DataFrame.
+
+    Raises:
+    - ValueError: If the DataFrame is `None`.
+    """
     if df is not None:
         first_n_rows = df.head(n).to_dict(orient="records")
         for row in first_n_rows:
             pprint(row, indent=4)
             print()
     else:
-        raise ValueError(
+        raise ValueError(
+            "No DataFrame to display. Please provide a DataFrame.")

     gc.collect()

+
 def last_n_rows(df: pd.DataFrame, n: int) -> None:
-    """
+    """
+    Display the last n rows of the DataFrame.
+
+    Prints the last `n` rows of a given DataFrame, formatted as dictionaries.
+    Useful for end-segment analysis and verifying data continuity.
+
+    Parameters:
+    - df (pd.DataFrame): The DataFrame from which to display rows.
+    - n (int): The number of rows to display from the end of the DataFrame.
+
+    Raises:
+    - ValueError: If the DataFrame is `None`.
+    """
     if df is not None:
         last_n_rows = df.tail(n).to_dict(orient="records")
         for row in last_n_rows:
             pprint(row, indent=4)
             print()
     else:
-        raise ValueError(
+        raise ValueError(
+            "No DataFrame to display. Please provide a DataFrame.")

     gc.collect()

+
 def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
-    """
+    """
+    Print the top `n` unique values for specified columns in the DataFrame.
+
+    This method calculates and prints the top `n` unique frequency values for
+    specified columns in a DataFrame.
+
+    Parameters:
+    - df (pd.DataFrame): The DataFrame from which to calculate top unique
+      values.
+    - n (int): Number of top values to display.
+    - columns (List[str]): List of column names for which to display top
+      unique values.
+
+    Raises:
+    - ValueError: If the DataFrame is `None`.
+    """
     if df is not None:
         report = {}
         for column in columns:
             if column in df.columns:
                 frequency = df[column].astype(str).value_counts(dropna=False)
-                frequency = frequency.rename(
+                frequency = frequency.rename(
+                    index={
+                        'nan': 'NaN',
+                        'NaT': 'NaT',
+                        'None': 'None',
+                        '': 'Empty'})
                 top_n_values = frequency.nlargest(n)
-                report[column] = {str(value): str(count)
-
+                report[column] = {str(value): str(count)
+                                  for value, count in top_n_values.items()}
+                print(
+                    f"Top {n} unique values for column '{column}':\n{
+                        json.dumps(
+                            report[column],
+                            indent=2)}\n")
             else:
                 print(f"Column '{column}' does not exist in the DataFrame.")
     else:
-        raise ValueError(
+        raise ValueError(
+            "No DataFrame to display. Please provide a DataFrame.")

     gc.collect()

-
-
+
+def bottom_n_unique_values(
+        df: pd.DataFrame,
+        n: int,
+        columns: List[str]) -> None:
+    """
+    Print the bottom `n` unique values for specified columns in the DataFrame.
+
+    This method calculates and prints the bottom `n` unique frequency values
+    for specified columns in a DataFrame.
+
+    Parameters:
+    - df (pd.DataFrame): The DataFrame from which to calculate bottom unique
+      values.
+    - n (int): Number of bottom unique frequency values to display.
+    - columns (List[str]): List of column names for which to display bottom
+      unique values.
+
+    Raises:
+    - ValueError: If the DataFrame is `None`.
+    """
     if df is not None:
         report = {}
         for column in columns:
             if column in df.columns:
                 frequency = df[column].astype(str).value_counts(dropna=False)
-                frequency = frequency.rename(
+                frequency = frequency.rename(
+                    index={
+                        'nan': 'NaN',
+                        'NaT': 'NaT',
+                        'None': 'None',
+                        '': 'Empty'})
                 bottom_n_values = frequency.nsmallest(n)
-                report[column] = {
-
+                report[column] = {
+                    str(value): str(count) for value,
+                    count in bottom_n_values.items()}
+                print(
+                    f"Bottom {n} unique values for column '{column}':\n{
+                        json.dumps(
+                            report[column],
+                            indent=2)}\n")
             else:
                 print(f"Column '{column}' does not exist in the DataFrame.")
     else:
-        raise ValueError(
+        raise ValueError(
+            "No DataFrame to display. Please provide a DataFrame.")

     gc.collect()

-
-
+
+def print_correlation(
+        df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) -> None:
+    """
+    Print correlation for multiple pairs of columns in the DataFrame.
+
+    This function computes and displays the correlation coefficients for
+    specified pairs of columns.
+
+    Parameters:
+    - df (pd.DataFrame): The DataFrame containing the columns to analyze.
+    - column_pairs (List[Tuple[str, str]]): List of column pairs for which to
+      compute correlations.
+    """
     if df is not None:
         for col1, col2 in column_pairs:
             if col1 in df.columns and col2 in df.columns:
@@ -606,30 +781,68 @@ def print_correlation(df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) ->

                     correlation = numeric_col1.corr(numeric_col2)
                     if pd.notnull(correlation):
-                        print(
+                        print(
+                            f"The correlation between '{col1}' and '{col2}'"
+                            + f" is {correlation}.")
                     else:
-                        print(
+                        print(
+                            f"Cannot calculate correlation between '{col1}'"
+                            + f" and '{col2}' due to insufficient numeric"
+                            + " data.")
                 except Exception as e:
-                    print(
+                    print(
+                        f"Error processing cols '{col1}' and '{col2}': {e}")
             else:
-                print(
+                print(
+                    f"One or both of the specified cols ('{col1}', '{col2}')"
+                    + " do not exist in the DataFrame.")
     else:
         print("The DataFrame is empty.")

     gc.collect()

+
 def print_memory_usage(df: pd.DataFrame) -> None:
-    """
+    """
+    Prints the memory usage of the DataFrame.
+
+    This function computes the memory footprint of a DataFrame in megabytes
+    and displays it, rounding to two decimal places for clarity.
+
+    Parameters:
+    - df (pd.DataFrame): The DataFrame for which the memory usage is computed.
+
+    Raises:
+    - ValueError: If the DataFrame is `None`.
+    """
     if df is not None:
-        memory_usage = df.memory_usage(deep=True).sum(
+        memory_usage = df.memory_usage(deep=True).sum(
+        ) / (1024 * 1024)  # Convert bytes to MB
         print(f"Memory usage of DataFrame: {memory_usage:.2f} MB")
     else:
         raise ValueError("No DataFrame to print. Please provide a DataFrame.")

     gc.collect()

+
 def filter_dataframe(df: pd.DataFrame, filter_expr: str) -> pd.DataFrame:
-    """
+    """
+    Return a filtered DataFrame according to the given expression.
+
+    This function filters rows of a DataFrame using a specified query
+    expression, returning a new DataFrame containing only the rows that
+    match the criteria.
+
+    Parameters:
+    - df (pd.DataFrame): The original DataFrame to be filtered.
+    - filter_expr (str): A query string to be evaluated against the DataFrame.
+
+    Returns:
+    - pd.DataFrame: A new DataFrame containing the filtered rows.
+
+    Raises:
+    - ValueError: If the DataFrame is `None`.
+    """
     if df is not None:
         try:
             filtered_df = df.query(filter_expr)
@@ -642,14 +855,34 @@ def filter_dataframe(df: pd.DataFrame, filter_expr: str) -> pd.DataFrame:

     return filtered_df

+
 def filter_indian_mobiles(df: pd.DataFrame, mobile_col: str) -> pd.DataFrame:
-    """
+    """
+    Filter and return DataFrame rows containing valid Indian mobile numbers.
+
+    This function processes a DataFrame to extract and retain rows where the
+    specified column matches the typical format for Indian mobile numbers.
+    An Indian mobile number is expected to be a digit-only string starting
+    with 6, 7, 8, or 9, and should have at least 4 distinct digits.
+
+    Parameters:
+    - df (pd.DataFrame): The DataFrame to filter.
+    - mobile_col (str): The name of the column in the DataFrame that contains
+      mobile number data.
+
+    Returns:
+    - pd.DataFrame: A new DataFrame containing only rows with valid Indian
+      mobile numbers.
+
+    Raises:
+    - ValueError: If the DataFrame is `None`.
+    """
     if df is not None:
         filtered_df = df[
             df[mobile_col].apply(
                 lambda x: (
-                    str(x).isdigit() and
-                    str(x).startswith(('6', '7', '8', '9')) and
+                    str(x).isdigit() and
+                    str(x).startswith(('6', '7', '8', '9')) and
                     len(set(str(x))) >= 4
                 )
             )
@@ -661,17 +894,21 @@ def filter_indian_mobiles(df: pd.DataFrame, mobile_col: str) -> pd.DataFrame:

     return filtered_df

+
 def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:
     """
-    Print the DataFrame and its column types. If a source path is provided,
+    Print the DataFrame and its column types. If a source path is provided,
+    print it as well.

     Parameters:
         df: The DataFrame to print.
-        source: Optional; The source path of the DataFrame for logging
+        source: Optional; The source path of the DataFrame for logging
+            purposes.
     """
     if df is not None:
         print(df)
-        columns_with_types = [
+        columns_with_types = [
+            f"{col} ({df[col].dtypes})" for col in df.columns]
         print("Columns:", columns_with_types)
         if source:
             print(f"Source: {source}")
@@ -680,28 +917,43 @@ def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:

     gc.collect()

-
+
+def send_dataframe_via_telegram(
+        df: pd.DataFrame,
+        bot_name: str,
+        message: Optional[str] = None,
+        as_file: bool = True,
+        remove_after_send: bool = True) -> None:
     """
     Send a DataFrame via Telegram using a specified bot configuration.

     Parameters:
         df: The DataFrame to send.
-        bot_name: The name of the Telegram bot as specified in the
+        bot_name: The name of the Telegram bot as specified in the
+            configuration.
         message: Custom message to send along with the DataFrame or file.
-        as_file: Boolean flag to decide whether to send the DataFrame as a
+        as_file: Boolean flag to decide whether to send the DataFrame as a
+            file or as text.
         remove_after_send: If True, removes the file after sending.
     """

     def locate_config_file(filename: str = "rgwml.config") -> str:
         """Retrieve the configuration file path."""
         home_dir = os.path.expanduser("~")
-        search_paths = [
+        search_paths = [
+            os.path.join(
+                home_dir,
+                folder) for folder in [
+                "Desktop",
+                "Documents",
+                "Downloads"]]

         for path in search_paths:
             for root, _, files in os.walk(path):
                 if filename in files:
                     return os.path.join(root, filename)
-        raise FileNotFoundError(
+        raise FileNotFoundError(
+            f"{filename} not found in Desktop, Documents, or Downloads")

     def get_config(config_path: str) -> dict:
         """Load configuration from a json file."""
@@ -710,8 +962,14 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option

     config_path = locate_config_file()
     config = get_config(config_path)
+    bot_config = next(
+        (
+            bot for bot in config['telegram_bot_presets']
+            if bot['name'] == bot_name
+        ),
+        None
+    )

-    bot_config = next((bot for bot in config['telegram_bot_presets'] if bot['name'] == bot_name), None)
     if not bot_config:
         raise ValueError(f"No bot found with the name {bot_name}")

@@ -724,9 +982,15 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
         df.to_csv(file_name, index=False)
         try:
             with open(file_name, 'rb') as file:
-                payload = {
+                payload = {
+                    'chat_id': bot_config['chat_id'],
+                    'caption': message or ''}
                 files = {'document': file}
-                response = requests.post(
+                response = requests.post(
+                    f"https://api.telegram.org/bot{
+                        bot_config['bot_token']}/sendDocument",
+                    data=payload,
+                    files=files)
             if remove_after_send and os.path.exists(file_name):
                 os.remove(file_name)
         except Exception as e:
@@ -734,14 +998,20 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
             raise
     else:
         df_str = df.to_string()
-        payload = {
-
+        payload = {
+            'chat_id': bot_config['chat_id'],
+            'text': message + "\n\n" + df_str if message else df_str,
+            'parse_mode': 'HTML'}
+        response = requests.post(
+            f"https://api.telegram.org/bot{bot_config['bot_token']}"
+            + "/sendMessage", data=payload)

     if not response.ok:
         raise Exception(f"Error sending message: {response.text}")

     print("Message sent successfully.")

+
 def send_data_to_email(
     df: pd.DataFrame,
     preset_name: str,
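For reference, the Telegram sender looks the bot up in the 'telegram_bot_presets' list of rgwml.config and reads its 'chat_id' and 'bot_token'. A minimal, illustrative sketch follows; the preset name, chat id, and token are placeholders, and importing the module as rgwfuncs.df_lib is an assumption:

    import json
    import pandas as pd
    from rgwfuncs import df_lib

    preset = {"name": "alerts_bot", "chat_id": "-1000000000000", "bot_token": "123456:ABC..."}
    with open("rgwml.config", "w") as f:
        json.dump({"telegram_bot_presets": [preset]}, f, indent=2)

    df = pd.DataFrame({"a": [1, 2, 3]})
    df_lib.send_dataframe_via_telegram(df, bot_name="alerts_bot",
                                       message="Daily extract", as_file=True)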
@@ -752,39 +1022,50 @@ def send_data_to_email(
     remove_after_send: bool = True
 ) -> None:
     """
-    Send an email with optional DataFrame attachment using Gmail API via a
+    Send an email with optional DataFrame attachment using Gmail API via a
+    specified preset.

     Parameters:
         df: The DataFrame to send.
-        preset_name: The configuration preset name to use for sending the
+        preset_name: The configuration preset name to use for sending the
+            email.
         to_email: The recipient email address.
         subject: Optional subject of the email.
         body: Optional message body of the email.
-        as_file: Boolean flag to decide whether to send the DataFrame as a
+        as_file: Boolean flag to decide whether to send the DataFrame as a
+            file.
        remove_after_send: If True, removes the CSV file after sending.
     """

     def locate_config_file(filename: str = "rgwml.config") -> str:
         """Locate config file in common user directories."""
         home_dir = os.path.expanduser("~")
-        search_paths = [
+        search_paths = [
+            os.path.join(
+                home_dir,
+                folder) for folder in [
+                "Desktop",
+                "Documents",
+                "Downloads"]]

         for path in search_paths:
             for root, _, files in os.walk(path):
                 if filename in files:
                     return os.path.join(root, filename)
-        raise FileNotFoundError(
+        raise FileNotFoundError(
+            f"{filename} not found in Desktop, Documents, or Downloads"
+            + " folders")

     def get_config(config_path: str) -> dict:
-        """Load configuration from a json file."""
         with open(config_path, 'r') as file:
             try:
                 return json.load(file)
             except json.JSONDecodeError as e:
                 raise ValueError(f"Invalid JSON format in config file: {e}")

-    def authenticate_service_account(
-
+    def authenticate_service_account(
+            service_account_credentials_path: str,
+            sender_email_id: str) -> Any:
         credentials = service_account.Credentials.from_service_account_file(
             service_account_credentials_path,
             scopes=['https://mail.google.com/'],
@@ -797,7 +1078,14 @@ def send_data_to_email(
     config = get_config(config_path)

     # Retrieve Gmail preset configuration
-    gmail_config = next(
+    gmail_config = next(
+        (
+            preset for preset in config['gmail_bot_presets']
+            if preset['name'] == preset_name
+        ),
+        None
+    )
+
     if not gmail_config:
         raise ValueError(f"No preset found with the name {preset_name}")

@@ -809,7 +1097,9 @@ def send_data_to_email(

     if as_file:
         # Create a temporary file for the DataFrame as CSV
-        with tempfile.NamedTemporaryFile(
+        with tempfile.NamedTemporaryFile(
+                delete=False, suffix=".csv"
+        ) as tmp_file:
             tmp_file_name = tmp_file.name
             df.to_csv(tmp_file_name, index=False)

@@ -819,13 +1109,18 @@ def send_data_to_email(
         message['to'] = to_email
         message['from'] = sender_email
         message['subject'] = subject if subject else 'DataFrame CSV File'
-        message.attach(
+        message.attach(
+            MIMEText(
+                body if body else 'Please find the CSV file attached.'))

         with open(tmp_file_name, 'rb') as file:
             part = MIMEBase('application', 'octet-stream')
             part.set_payload(file.read())
             encoders.encode_base64(part)
-            part.add_header(
+            part.add_header(
+                'Content-Disposition',
+                f'attachment; filename={
+                    os.path.basename(tmp_file_name)}')
             message.attach(part)

         if remove_after_send and os.path.exists(tmp_file_name):
@@ -847,11 +1142,13 @@ def send_data_to_email(
     try:
         raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
         email_body = {'raw': raw}
-        sent_message = service.users().messages().send(
+        sent_message = service.users().messages().send(
+            userId="me", body=email_body).execute()
         print(f"Email with Message Id {sent_message['id']} successfully sent.")
     except Exception as error:
         raise Exception(f"Error sending email: {error}")

+
 def send_data_to_slack(
     df: pd.DataFrame,
     bot_name: str,
@@ -866,20 +1163,29 @@ def send_data_to_slack(
         df: The DataFrame to send.
         bot_name: The Slack bot configuration preset name.
         message: Custom message to send along with the DataFrame or file.
-        as_file: Boolean flag to decide whether to send the DataFrame as a
+        as_file: Boolean flag to decide whether to send the DataFrame as a
+            file.
         remove_after_send: If True, removes the CSV file after sending.
     """

     def locate_config_file(filename: str = "rgwml.config") -> str:
         """Locate config file in common user directories."""
         home_dir = os.path.expanduser("~")
-        search_paths = [
+        search_paths = [
+            os.path.join(
+                home_dir,
+                folder) for folder in [
+                "Desktop",
+                "Documents",
+                "Downloads"]]

         for path in search_paths:
             for root, _, files in os.walk(path):
                 if filename in files:
                     return os.path.join(root, filename)
-        raise FileNotFoundError(
+        raise FileNotFoundError(
+            f"{filename} not found in Desktop, Documents, or Downloads"
+            + " folders")

     def get_config(config_path: str) -> dict:
         """Load configuration from a JSON file."""
@@ -890,7 +1196,14 @@ def send_data_to_slack(
     config_path = locate_config_file()
     config = get_config(config_path)

-    bot_config = next(
+    bot_config = next(
+        (
+            bot for bot in config['slack_bot_presets']
+            if bot['name'] == bot_name
+        ),
+        None
+    )
+
     if not bot_config:
         raise ValueError(f"No bot found with the name {bot_name}")

@@ -898,7 +1211,9 @@ def send_data_to_slack(

     if as_file:
         # Create a temporary file for the DataFrame as CSV
-        with tempfile.NamedTemporaryFile(
+        with tempfile.NamedTemporaryFile(
+                delete=False, suffix=".csv"
+        ) as tmp_file:
             file_name = tmp_file.name
             df.to_csv(file_name, index=False)

@@ -927,13 +1242,15 @@ def send_data_to_slack(

     print("Message sent successfully.")

+
 def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:
     """
     Reorder the columns of the DataFrame based on a string input.

     Parameters:
         df: The DataFrame whose columns will be reordered.
-        column_order_str: A string specifying the desired order of columns,
+        column_order_str: A string specifying the desired order of columns,
+            using ',' to separate columns.

     Returns:
         A new DataFrame with reordered columns.
@@ -942,7 +1259,8 @@ def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:
         ValueError: If a specified column does not exist in the DataFrame.
     """
     if df is None:
-        raise ValueError(
+        raise ValueError(
+            "No DataFrame to reorder. Please provide a valid DataFrame.")

     columns = df.columns.tolist()
     parts = [part.strip() for part in column_order_str.split(',')]
@@ -972,10 +1290,11 @@ def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:

     return df[new_order]

+
 def append_ranged_classification_column(
-    df: pd.DataFrame,
-    ranges: str,
-    target_col: str,
+        df: pd.DataFrame,
+        ranges: str,
+        target_col: str,
     new_col_name: str
 ) -> pd.DataFrame:
     """
@@ -992,7 +1311,6 @@ def append_ranged_classification_column(
     """

     def pad_number(number, integer_length, decimal_length=0, decimal=False):
-        """Pad number to have a consistent length for integer and decimal parts."""
         if decimal:
             str_number = f"{number:.{decimal_length}f}"
             integer_part, decimal_part = str_number.split('.')
@@ -1006,25 +1324,70 @@ def append_ranged_classification_column(

     if has_decimals:
         range_list = [float(r) for r in range_list]
-
-
-
+
+        max_decimal_length = max(
+            len(str(r).split('.')[1])
+            for r in range_list
+            if '.' in str(r)
+        )
+
+        max_integer_length = max(
+            len(str(int(float(r))))
+            for r in range_list
+        )
+
+        labels = []
+
+        for i in range(len(range_list) - 1):
+            start = pad_number(
+                range_list[i],
+                max_integer_length,
+                max_decimal_length,
+                decimal=True
+            )
+
+            end = pad_number(
+                range_list[i + 1],
+                max_integer_length,
+                max_decimal_length,
+                decimal=True
+            )
+
+            label = f"{start} to {end}"
+            labels.append(label)
+
     else:
         range_list = [int(r) for r in range_list]
-
-
+
+        max_integer_length = max(
+            len(str(r))
+            for r in range_list
+        )
+
+        labels = [
+            f"{pad_number(range_list[i], max_integer_length)}"
+            f" to "
+            f"{pad_number(range_list[i + 1], max_integer_length)}"
+            for i in range(len(range_list) - 1)
+        ]

     # Ensure the target column is numeric
     df[target_col] = pd.to_numeric(df[target_col], errors='coerce')

-    df[new_col_name] = pd.cut(
+    df[new_col_name] = pd.cut(
+        df[target_col],
+        bins=range_list,
+        labels=labels,
+        right=False,
+        include_lowest=True)

     return df

+
 def append_percentile_classification_column(
-    df: pd.DataFrame,
-    percentiles: str,
-    target_col: str,
+        df: pd.DataFrame,
+        percentiles: str,
+        target_col: str,
     new_col_name: str
 ) -> pd.DataFrame:
     """
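For reference, a minimal usage sketch of the ranged classification helper (illustrative only; it assumes the module is importable as rgwfuncs.df_lib, and the column names are placeholders):

    import pandas as pd
    from rgwfuncs import df_lib

    df = pd.DataFrame({'score': [3, 12, 25, 47]})
    # Bin edges come from the comma-separated ranges string; the new column
    # holds zero-padded "start to end" labels built as in the diff above.
    df = df_lib.append_ranged_classification_column(df, '0,10,20,50', 'score', 'score_band')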
@@ -1032,7 +1395,8 @@ def append_percentile_classification_column(

     Parameters:
         df: The DataFrame to modify.
-        percentiles: A string representation of percentile values separated
+        percentiles: A string representation of percentile values separated
+            by commas.
         target_col: The column to analyze.
         new_col_name: The name of the new classification column.

@@ -1041,7 +1405,6 @@ def append_percentile_classification_column(
     """

     def pad_number(number, integer_length, decimal_length=0, decimal=False):
-        """Pad number to have a consistent length for integer and decimal parts."""
         if decimal:
             str_number = f"{number:.{decimal_length}f}"
             integer_part, decimal_part = str_number.split('.')
@@ -1055,26 +1418,78 @@ def append_percentile_classification_column(

     if has_decimals:
         percentiles_list = [float(p) for p in percentiles_list]
-
-
-
+
+        max_decimal_length = max(
+            len(str(p).split('.')[1])
+            for p in percentiles_list
+            if '.' in str(p)
+        )
+
+        max_integer_length = max(
+            len(str(int(float(p))))
+            for p in percentiles_list
+        )
+
+        labels = []
+
+        for i in range(len(percentiles_list) - 1):
+            start = pad_number(
+                percentiles_list[i],
+                max_integer_length,
+                max_decimal_length,
+                decimal=True
+            )
+
+            end = pad_number(
+                percentiles_list[i + 1],
+                max_integer_length,
+                max_decimal_length,
+                decimal=True
+            )
+
+            label = f"{start} to {end}"
+            labels.append(label)
     else:
         percentiles_list = [int(p) for p in percentiles_list]
-
-
+
+        max_integer_length = max(
+            len(str(p))
+            for p in percentiles_list
+        )
+
+        labels = []
+
+        for i in range(len(percentiles_list) - 1):
+            start = pad_number(
+                percentiles_list[i],
+                max_integer_length
+            )
+
+            end = pad_number(
+                percentiles_list[i + 1],
+                max_integer_length
+            )
+
+            label = f"{start} to {end}"
+            labels.append(label)

     # Ensure the target column is numeric
     df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
     quantiles = [df[target_col].quantile(p / 100) for p in percentiles_list]
-
-    df[new_col_name] = pd.cut(
+
+    df[new_col_name] = pd.cut(
+        df[target_col],
+        bins=quantiles,
+        labels=labels,
+        include_lowest=True)

     return df

+
 def append_ranged_date_classification_column(
-    df: pd.DataFrame,
-    date_ranges: str,
-    target_col: str,
+        df: pd.DataFrame,
+        date_ranges: str,
+        target_col: str,
     new_col_name: str
 ) -> pd.DataFrame:
     """
@@ -1082,7 +1497,8 @@ def append_ranged_date_classification_column(

     Parameters:
         df: The DataFrame to modify.
-        date_ranges: A string representation of date ranges separated by
+        date_ranges: A string representation of date ranges separated by
+            commas.
         target_col: The date column to analyze.
         new_col_name: The name of the new date classification column.

@@ -1091,41 +1507,61 @@ def append_ranged_date_classification_column(
     """

     date_list = [pd.to_datetime(date) for date in date_ranges.split(',')]
-    labels = [f"{date_list[i].strftime('%Y-%m-%d')} to {date_list[i + 1].strftime('%Y-%m-%d')}" for i in range(len(date_list) - 1)]

-
+    labels = []
+
+    for i in range(len(date_list) - 1):
+        start_date = date_list[i].strftime('%Y-%m-%d')
+        end_date = date_list[i + 1].strftime('%Y-%m-%d')
+        label = f"{start_date} to {end_date}"
+        labels.append(label)
+
+    df[new_col_name] = pd.cut(
+        pd.to_datetime(df[target_col]),
+        bins=date_list,
+        labels=labels,
+        right=False)

     return df

-
+
+def rename_columns(df: pd.DataFrame,
+                   rename_pairs: Dict[str,
+                                      str]) -> pd.DataFrame:
     """
     Rename columns in the DataFrame.

     Parameters:
         df: The DataFrame to modify.
-        rename_pairs: A dictionary mapping old column names to new column
+        rename_pairs: A dictionary mapping old column names to new column
+            names.

     Returns:
         A new DataFrame with columns renamed.
     """
     if df is None:
-        raise ValueError(
+        raise ValueError(
+            "No DataFrame to rename columns. Please provide a valid"
+            + " DataFrame.")

     return df.rename(columns=rename_pairs)

+
 def cascade_sort(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
     """
     Cascade sort the DataFrame by specified columns and order.

     Parameters:
         df: The DataFrame to sort.
-        columns: A list of column names with sorting order, e.g.,
+        columns: A list of column names with sorting order, e.g.,
+            ['Column1::ASC', 'Column2::DESC'].

     Returns:
         A new DataFrame sorted by specified columns.
     """
     if df is None:
-        raise ValueError(
+        raise ValueError(
+            "No DataFrame to sort. Please provide a valid DataFrame.")

     col_names = []
     asc_order = []
@@ -1147,19 +1583,22 @@ def cascade_sort(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:

     return df.sort_values(by=col_names, ascending=asc_order)

+
 def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
     """
     Append XGB training labels based on a ratio string.

     Parameters:
         df: The DataFrame to modify.
-        ratio_str: A string specifying the ratio of TRAIN:TEST or
+        ratio_str: A string specifying the ratio of TRAIN:TEST or
+            TRAIN:VALIDATE:TEST.

     Returns:
         A new DataFrame with XGB_TYPE labels appended.
     """
     if df is None:
-        raise ValueError(
+        raise ValueError(
+            "No DataFrame to add labels. Please provide a valid DataFrame.")

     ratios = list(map(int, ratio_str.split(':')))
     total_ratio = sum(ratios)
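For reference, a minimal cascade_sort usage sketch based on the 'Column::ORDER' convention documented in the docstring above (illustrative only; column names are placeholders and the rgwfuncs.df_lib import is assumed):

    import pandas as pd
    from rgwfuncs import df_lib

    df = pd.DataFrame({'city': ['B', 'A', 'A'], 'sales': [5, 9, 7]})
    # Sort by city ascending, then sales descending within each city.
    sorted_df = df_lib.cascade_sort(df, ['city::ASC', 'sales::DESC'])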
@@ -1173,25 +1612,30 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
         train_rows = (ratios[0] * total_rows) // total_ratio
         validate_rows = (ratios[1] * total_rows) // total_ratio
         test_rows = total_rows - train_rows - validate_rows
-        labels = ['TRAIN'] * train_rows + ['VALIDATE'] *
+        labels = ['TRAIN'] * train_rows + ['VALIDATE'] * \
+            validate_rows + ['TEST'] * test_rows
     else:
-        raise ValueError(
+        raise ValueError(
+            "Invalid ratio string format. Use 'TRAIN:TEST' or"
+            + "'TRAIN:VALIDATE:TEST'.")

     df_with_labels = df.copy()
     df_with_labels['XGB_TYPE'] = labels

     return df_with_labels

+
 def append_xgb_regression_predictions(
-    df: pd.DataFrame,
-    target_col: str,
-    feature_cols: str,
-    pred_col: str,
-    boosting_rounds: int = 100,
+        df: pd.DataFrame,
+        target_col: str,
+        feature_cols: str,
+        pred_col: str,
+        boosting_rounds: int = 100,
     model_path: Optional[str] = None
 ) -> pd.DataFrame:
     """
-    Append XGB regression predictions to DataFrame. Assumes data is labeled
+    Append XGB regression predictions to DataFrame. Assumes data is labeled
+    by an 'XGB_TYPE' column.

     Parameters:
         df: DataFrame to modify.
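For reference, a minimal sketch of labelling rows before training (illustrative only; the 70:20:10 split and column names are placeholders, and the rgwfuncs.df_lib import is assumed):

    import pandas as pd
    from rgwfuncs import df_lib

    df = pd.DataFrame({'feature': range(10), 'target': range(10)})
    # Rows are tagged TRAIN, VALIDATE, TEST in order, per the ratio string.
    labeled = df_lib.append_xgb_labels(df, '70:20:10')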
@@ -1205,7 +1649,8 @@ def append_xgb_regression_predictions(
|
|
1205
1649
|
DataFrame with predictions appended.
|
1206
1650
|
"""
|
1207
1651
|
if df is None or 'XGB_TYPE' not in df.columns:
|
1208
|
-
raise ValueError(
|
1652
|
+
raise ValueError(
|
1653
|
+
"DataFrame is not initialized or 'XGB_TYPE' column is missing.")
|
1209
1654
|
|
1210
1655
|
features = feature_cols.replace(' ', '').split(',')
|
1211
1656
|
|
@@ -1215,13 +1660,23 @@ def append_xgb_regression_predictions(
             df[col] = df[col].astype('category')
 
     train_data = df[df['XGB_TYPE'] == 'TRAIN']
-    validate_data = df[df['XGB_TYPE'] == 'VALIDATE'] if 'VALIDATE' in df['XGB_TYPE'].values else None
 
-    dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
+    if 'VALIDATE' in df['XGB_TYPE'].values:
+        validate_data = df[df['XGB_TYPE'] == 'VALIDATE']
+    else:
+        validate_data = None
+
+    dtrain = xgb.DMatrix(
+        train_data[features],
+        label=train_data[target_col],
+        enable_categorical=True)
     evals = [(dtrain, 'train')]
 
     if validate_data is not None:
-        dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
+        dvalidate = xgb.DMatrix(
+            validate_data[features],
+            label=validate_data[target_col],
+            enable_categorical=True)
         evals.append((dvalidate, 'validate'))
 
     params = {
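The new branch in the hunk above only builds a validation DMatrix when a VALIDATE partition actually exists. A rough sketch of that pattern with toy data; the column names are hypothetical and xgboost must be installed:

import pandas as pd
import xgboost as xgb

df = pd.DataFrame({
    "feat": [1.0, 2.0, 3.0, 4.0],
    "target": [1.0, 2.0, 3.0, 4.0],
    "XGB_TYPE": ["TRAIN", "TRAIN", "TRAIN", "VALIDATE"],
})

train_data = df[df["XGB_TYPE"] == "TRAIN"]
if "VALIDATE" in df["XGB_TYPE"].values:
    validate_data = df[df["XGB_TYPE"] == "VALIDATE"]
else:
    validate_data = None

# One eval set is always present; the validation set is appended only when it exists.
dtrain = xgb.DMatrix(train_data[["feat"]], label=train_data["target"])
evals = [(dtrain, "train")]
if validate_data is not None:
    dvalidate = xgb.DMatrix(validate_data[["feat"]], label=validate_data["target"])
    evals.append((dvalidate, "validate"))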
@@ -1229,7 +1684,12 @@ def append_xgb_regression_predictions(
         'eval_metric': 'rmse'
     }
 
-    model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
+    model = xgb.train(
+        params,
+        dtrain,
+        num_boost_round=boosting_rounds,
+        evals=evals,
+        early_stopping_rounds=10 if validate_data is not None else None)
 
     # Make predictions for all data
     dall = xgb.DMatrix(df[features], enable_categorical=True)
@@ -1238,21 +1698,24 @@ def append_xgb_regression_predictions(
     if model_path:
         model.save_model(model_path)
 
-    columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
+    columns_order = [col for col in df.columns if col not in [
+        'XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
     df = df[columns_order]
 
     return df
 
+
 def append_xgb_logistic_regression_predictions(
-    df: pd.DataFrame,
-    target_col: str,
-    feature_cols: str,
-    pred_col: str,
-    boosting_rounds: int = 100,
+        df: pd.DataFrame,
+        target_col: str,
+        feature_cols: str,
+        pred_col: str,
+        boosting_rounds: int = 100,
         model_path: Optional[str] = None
 ) -> pd.DataFrame:
     """
-    Append XGB logistic regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.
+    Append XGB logistic regression predictions to DataFrame. Assumes data is
+    labeled by an 'XGB_TYPE' column.
 
     Parameters:
         df: DataFrame to modify.
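The columns_order expression in the hunk above moves 'XGB_TYPE', the target column, and the prediction column to the end of the frame. A small pandas-only illustration with made-up column names:

import pandas as pd

# Hypothetical frame; 'y' stands in for the target and 'y_pred' for the prediction column.
df = pd.DataFrame({"y_pred": [0.1], "a": [1], "XGB_TYPE": ["TRAIN"], "y": [0.0], "b": [2]})
tail = ["XGB_TYPE", "y", "y_pred"]
columns_order = [c for c in df.columns if c not in tail] + tail
df = df[columns_order]
print(list(df.columns))   # ['a', 'b', 'XGB_TYPE', 'y', 'y_pred']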
@@ -1266,7 +1729,8 @@ def append_xgb_logistic_regression_predictions(
         DataFrame with predictions appended.
     """
     if df is None or 'XGB_TYPE' not in df.columns:
-        raise ValueError("DataFrame is not initialized or 'XGB_TYPE' column is missing.")
+        raise ValueError(
+            "DataFrame is not initialized or 'XGB_TYPE' column is missing.")
 
     features = feature_cols.replace(' ', '').split(',')
 
@@ -1276,13 +1740,22 @@ def append_xgb_logistic_regression_predictions(
             df[col] = df[col].astype('category')
 
     train_data = df[df['XGB_TYPE'] == 'TRAIN']
-    validate_data = df[df['XGB_TYPE'] == 'VALIDATE'] if 'VALIDATE' in df['XGB_TYPE'].values else None
 
-    dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
+    validate_data = None
+    if 'VALIDATE' in df['XGB_TYPE'].values:
+        validate_data = df[df['XGB_TYPE'] == 'VALIDATE']
+
+    dtrain = xgb.DMatrix(
+        train_data[features],
+        label=train_data[target_col],
+        enable_categorical=True)
     evals = [(dtrain, 'train')]
 
     if validate_data is not None:
-        dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
+        dvalidate = xgb.DMatrix(
+            validate_data[features],
+            label=validate_data[target_col],
+            enable_categorical=True)
         evals.append((dvalidate, 'validate'))
 
     params = {
@@ -1290,7 +1763,12 @@ def append_xgb_logistic_regression_predictions(
         'eval_metric': 'auc'
     }
 
-    model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
+    model = xgb.train(
+        params,
+        dtrain,
+        num_boost_round=boosting_rounds,
+        evals=evals,
+        early_stopping_rounds=10 if validate_data is not None else None)
 
     # Make predictions for all data
     dall = xgb.DMatrix(df[features], enable_categorical=True)
@@ -1299,15 +1777,17 @@ def append_xgb_logistic_regression_predictions(
     if model_path:
         model.save_model(model_path)
 
-    columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
+    columns_order = [col for col in df.columns if col not in [
+        'XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
     df = df[columns_order]
 
     return df
 
+
 def print_n_frequency_cascading(
-    df: pd.DataFrame,
-    n: int,
-    columns: str,
+        df: pd.DataFrame,
+        n: int,
+        columns: str,
         order_by: str = "FREQ_DESC"
 ) -> None:
     """
@@ -1332,7 +1812,12 @@ def print_n_frequency_cascading(
        # Convert the column to string representation
        df[current_col] = df[current_col].astype(str)
        frequency = df[current_col].value_counts(dropna=False)
-        frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+        frequency = frequency.rename(
+            index={
+                'nan': 'NaN',
+                'NaT': 'NaT',
+                'None': 'None',
+                '': 'Empty'})
 
        if limit is not None:
            frequency = frequency.nlargest(limit)
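The rename call in the hunk above only changes how missing and empty values are labelled in the printed report. A pandas-only sketch with made-up data:

import pandas as pd
import numpy as np

s = pd.Series(["a", "a", np.nan, ""]).astype(str)    # NaN becomes the string "nan"
freq = s.value_counts(dropna=False)
freq = freq.rename(index={"nan": "NaN", "NaT": "NaT", "None": "None", "": "Empty"})
print(freq.to_dict())                                # e.g. {'a': 2, 'NaN': 1, 'Empty': 1}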
@@ -1347,11 +1832,11 @@ def print_n_frequency_cascading(
            filtered_df = df[df[current_col] == value]
 
            if len(columns) > 1:
-                sub_report = generate_cascade_report(filtered_df, columns[1:], limit, order_by)
+                sub_report = generate_cascade_report(
+                    filtered_df, columns[1:], limit, order_by)
                report[value] = {
-                    "count": str(count),
-                    f"sub_distribution({columns[1]})": sub_report if sub_report else {}
-                }
+                    "count": str(count), f"sub_distribution({
+                        columns[1]})": sub_report if sub_report else {}}
            else:
                report[value] = {
                    "count": str(count)
@@ -1363,19 +1848,28 @@ def print_n_frequency_cascading(
        if order_by == "ASC":
            return dict(sorted(frequency.items(), key=lambda item: item[0]))
        elif order_by == "DESC":
-            return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
+            return dict(
+                sorted(
+                    frequency.items(),
+                    key=lambda item: item[0],
+                    reverse=True))
        elif order_by == "FREQ_ASC":
            return dict(sorted(frequency.items(), key=lambda item: item[1]))
        else:  # Default to "FREQ_DESC"
-            return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
+            return dict(
+                sorted(
+                    frequency.items(),
+                    key=lambda item: item[1],
+                    reverse=True))
 
    report = generate_cascade_report(df, columns, n, order_by)
    print(json.dumps(report, indent=2))
 
+
 def print_n_frequency_linear(
-    df: pd.DataFrame,
-    n: int,
-    columns: str,
+        df: pd.DataFrame,
+        n: int,
+        columns: str,
        order_by: str = "FREQ_DESC"
 ) -> None:
    """
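A hedged usage sketch for the two frequency printers. It assumes the module is importable as rgwfuncs.df_lib and that columns is the comma-separated string used elsewhere in this file; both are assumptions, not confirmed by this diff:

import pandas as pd
from rgwfuncs.df_lib import print_n_frequency_cascading, print_n_frequency_linear  # assumed import path

df = pd.DataFrame({"city": ["NY", "NY", "LA"], "tier": ["A", "B", "A"]})

# Cascading: per-'city' counts, each with a nested 'tier' sub-distribution.
print_n_frequency_cascading(df, 5, "city,tier", order_by="FREQ_DESC")

# Linear: independent top-n counts for each listed column.
print_n_frequency_linear(df, 5, "city,tier", order_by="FREQ_DESC")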
@@ -1397,13 +1891,19 @@ def print_n_frequency_linear(
                continue
 
            frequency = df[current_col].astype(str).value_counts(dropna=False)
-            frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+            frequency = frequency.rename(
+                index={
+                    'nan': 'NaN',
+                    'NaT': 'NaT',
+                    'None': 'None',
+                    '': 'Empty'})
 
            if limit is not None:
                frequency = frequency.nlargest(limit)
 
            sorted_frequency = sort_frequency(frequency, order_by)
-            col_report = {str(value): str(count) for value, count in sorted_frequency.items()}
+            col_report = {str(value): str(count)
+                          for value, count in sorted_frequency.items()}
            report[current_col] = col_report
 
        return report
@@ -1412,16 +1912,27 @@ def print_n_frequency_linear(
        if order_by == "ASC":
            return dict(sorted(frequency.items(), key=lambda item: item[0]))
        elif order_by == "DESC":
-            return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
+            return dict(
+                sorted(
+                    frequency.items(),
+                    key=lambda item: item[0],
+                    reverse=True))
        elif order_by == "FREQ_ASC":
            return dict(sorted(frequency.items(), key=lambda item: item[1]))
        else:  # Default to "FREQ_DESC"
-            return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
+            return dict(
+                sorted(
+                    frequency.items(),
+                    key=lambda item: item[1],
+                    reverse=True))
 
    report = generate_linear_report(df, columns, n, order_by)
    print(json.dumps(report, indent=2))
 
-def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFrame:
+
+def retain_columns(
+        df: pd.DataFrame,
+        columns_to_retain: List[str]) -> pd.DataFrame:
    """
    Retain specified columns in the DataFrame and drop the others.
 
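A possible call to retain_columns, assuming the rgwfuncs.df_lib import path and illustrative data:

import pandas as pd
from rgwfuncs.df_lib import retain_columns  # assumed import path

df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
slim = retain_columns(df, ["a", "c"])   # keeps only 'a' and 'c'; a non-list second argument raises ValueError
print(list(slim.columns))               # ['a', 'c']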
@@ -1436,9 +1947,10 @@ def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFra
        raise ValueError("columns_to_retain should be a list of column names.")
    return df[columns_to_retain]
 
+
 def mask_against_dataframe(
-    df: pd.DataFrame,
-    other_df: pd.DataFrame,
+        df: pd.DataFrame,
+        other_df: pd.DataFrame,
        column_name: str
 ) -> pd.DataFrame:
    """
@@ -1456,9 +1968,10 @@ def mask_against_dataframe(
        raise ValueError("The specified column must exist in both DataFrames.")
    return df[df[column_name].isin(other_df[column_name])]
 
+
 def mask_against_dataframe_converse(
-    df: pd.DataFrame,
-    other_df: pd.DataFrame,
+        df: pd.DataFrame,
+        other_df: pd.DataFrame,
        column_name: str
 ) -> pd.DataFrame:
    """
@@ -1470,10 +1983,10 @@ def mask_against_dataframe_converse(
        column_name: The column name to use for comparison.
 
    Returns:
-        A new DataFrame with rows whose column values do not exist in 'other_df'.
+        A new DataFrame with rows whose column values do not exist in
+        'other_df'.
    """
    if column_name not in df.columns or column_name not in other_df.columns:
        raise ValueError("The specified column must exist in both DataFrames.")
-
-    return df[~df[column_name].isin(other_df[column_name])]
 
+    return df[~df[column_name].isin(other_df[column_name])]
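Both mask helpers reduce to pandas isin filtering on the shared column. A pandas-only sketch of the two behaviours with made-up data:

import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3, 4]})
other_df = pd.DataFrame({"id": [2, 4]})

kept = df[df["id"].isin(other_df["id"])]        # mask_against_dataframe: rows with ids 2, 4
dropped = df[~df["id"].isin(other_df["id"])]    # mask_against_dataframe_converse: rows with ids 1, 3
print(kept["id"].tolist(), dropped["id"].tolist())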