rgwfuncs 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rgwfuncs/__init__.py +3 -0
- rgwfuncs/df_lib.py +1479 -0
- rgwfuncs-0.0.2.dist-info/LICENSE +19 -0
- rgwfuncs-0.0.2.dist-info/METADATA +325 -0
- rgwfuncs-0.0.2.dist-info/RECORD +8 -0
- rgwfuncs-0.0.2.dist-info/WHEEL +5 -0
- rgwfuncs-0.0.2.dist-info/entry_points.txt +2 -0
- rgwfuncs-0.0.2.dist-info/top_level.txt +1 -0
rgwfuncs/df_lib.py
ADDED
@@ -0,0 +1,1479 @@
|
|
1
|
+
import base64
import gc
import json
import os
import sqlite3
import tempfile
import time
from datetime import datetime
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from pprint import pprint
from typing import Any, Callable, Dict, List, Optional, Tuple

import clickhouse_connect
import mysql.connector
import pandas as pd
import pymssql
import requests
import xgboost as xgb
from google.cloud import bigquery
from google.oauth2 import service_account
from googleapiclient.discovery import build
from slack_sdk import WebClient
|
24
|
+
|
25
|
+
|
26
|
+
def docs(method_type_filter: Optional[str] = None) -> None:
    """
    Print all callable names in this module alphabetically, and optionally the
    one-line descriptions of those whose docstring carries a matching type tag.

    Parameters:
        method_type_filter: Optional comma-separated list of type tags. When
            given, each function's first docstring line is checked for a
            "TAG:: description" pattern and matching descriptions are printed.
    """
    callables_here: Dict[str, Callable] = {
        key: value for key, value in globals().items() if callable(value)
    }

    print("Functions in alphabetical order:")
    for fn_name in sorted(callables_here):
        print(fn_name)

    if not method_type_filter:
        return

    wanted_tags: List[str] = [part.strip() for part in method_type_filter.split(',')]
    print("\nFiltered function documentation:")

    for fn_name, fn in callables_here.items():
        doc: Optional[str] = fn.__doc__
        if not doc:
            continue
        summary = doc.split('\n')[0]
        if "::" not in summary:
            continue
        # Docstrings use the convention "TYPE_TAG:: short description".
        marker = summary.find("::")
        tag = summary[:marker].strip()
        if tag in wanted_tags:
            print(f"{fn_name}: {summary[marker + 2:].strip()}")
|
64
|
+
|
65
|
+
def numeric_clean(
    df: pd.DataFrame,
    column_names: str,
    column_type: str,
    irregular_value_treatment: str
) -> pd.DataFrame:
    """
    Clean numeric columns by coercing them to a numeric dtype and handling
    values that fail conversion.

    Parameters:
        df: The DataFrame to clean (not mutated; a copy is returned).
        column_names: Comma-separated names of the columns to clean.
        column_type: Target type, 'INTEGER' (nullable Int64) or 'FLOAT'.
        irregular_value_treatment: How to treat non-numeric values:
            'NAN' (leave as NaN), 'TO_ZERO' (fill with 0), or 'MEAN'
            (fill with the column mean).

    Returns:
        A new DataFrame with cleaned numeric columns.

    Raises:
        ValueError: If a column is missing or an option string is invalid.
    """
    # Validate option arguments once, up front, instead of re-checking them on
    # every loop iteration (the original repeated these checks per column).
    if column_type not in ('INTEGER', 'FLOAT'):
        raise ValueError("column_type must be 'INTEGER' or 'FLOAT'.")
    if irregular_value_treatment not in ('NAN', 'TO_ZERO', 'MEAN'):
        raise ValueError("irregular_value_treatment must be 'NAN', 'TO_ZERO', or 'MEAN'.")

    df_copy = df.copy()  # Avoid mutating the original DataFrame
    columns_list: List[str] = [name.strip() for name in column_names.split(',')]

    for column_name in columns_list:
        if column_name not in df_copy.columns:
            raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")

        # errors='coerce' turns irregular values into NaN.
        numeric = pd.to_numeric(df_copy[column_name], errors='coerce')
        if column_type == 'INTEGER':
            # Nullable Int64 so NaN can coexist with integers.
            df_copy[column_name] = numeric.astype(pd.Int64Dtype())
        else:
            df_copy[column_name] = numeric.astype(float)

        if irregular_value_treatment == 'TO_ZERO':
            df_copy[column_name] = df_copy[column_name].fillna(0)
        elif irregular_value_treatment == 'MEAN':
            df_copy[column_name] = df_copy[column_name].fillna(df_copy[column_name].mean())
        # 'NAN' needs no action: coercion above already produced NaN.

    return df_copy
|
112
|
+
|
113
|
+
def limit_dataframe(df: pd.DataFrame, num_rows: int) -> pd.DataFrame:
    """
    Return a DataFrame truncated to at most ``num_rows`` leading rows.

    Parameters:
        df: The DataFrame to limit.
        num_rows: Maximum number of rows to retain.

    Returns:
        A new DataFrame containing the first ``num_rows`` rows.

    Raises:
        ValueError: If ``num_rows`` is not an integer.
    """
    if not isinstance(num_rows, int):
        raise ValueError("The number of rows should be an integer.")
    return df.head(num_rows)
|
131
|
+
|
132
|
+
def from_raw_data(headers: List[str], data: List[List[int]]) -> pd.DataFrame:
    """
    Build a DataFrame from a list of column headers and a 2-D list of rows.

    Parameters:
        headers: Column names, in order.
        data: Row data as a list of lists.

    Returns:
        A DataFrame assembled from ``headers`` and ``data``.

    Raises:
        ValueError: If ``data`` is not a list of lists.
    """
    is_table = isinstance(data, list) and all(isinstance(entry, list) for entry in data)
    if not is_table:
        raise ValueError("Data should be an array of arrays.")
    return pd.DataFrame(data, columns=headers)
|
152
|
+
|
153
|
+
def append_rows(df: pd.DataFrame, rows: List[List]) -> pd.DataFrame:
    """
    Return a new DataFrame with the given rows appended.

    Parameters:
        df: The original DataFrame (unchanged).
        rows: Rows to append, each an inner list aligned to ``df``'s columns.

    Returns:
        A new DataFrame containing the original plus the appended rows.

    Raises:
        ValueError: If ``rows`` is not a list of lists.
    """
    well_formed = isinstance(rows, list) and all(isinstance(entry, list) for entry in rows)
    if not well_formed:
        raise ValueError("Rows should be provided as a list of lists.")

    # An empty frame has no columns to align against, so build from scratch.
    if df.empty:
        return pd.DataFrame(rows)

    addition = pd.DataFrame(rows, columns=df.columns)
    return pd.concat([df, addition], ignore_index=True)
|
177
|
+
|
178
|
+
def append_columns(df: pd.DataFrame, *col_names: str) -> pd.DataFrame:
    """
    Return a new DataFrame with additional object-dtype columns filled with None.

    Parameters:
        df: The original DataFrame (unchanged).
        col_names: Names of the columns to add.

    Returns:
        A new DataFrame containing the extra columns.

    Raises:
        ValueError: If any column name is not a string.
    """
    for candidate in col_names:
        if not isinstance(candidate, str):
            raise ValueError("Column names should be provided as strings.")

    result = df.copy()
    blanks = [None] * len(df)
    for candidate in col_names:
        # object dtype keeps the None values as-is instead of coercing to NaN floats.
        result[candidate] = pd.Series(blanks, dtype='object')
    return result
|
200
|
+
|
201
|
+
def update_rows(
    df: pd.DataFrame,
    condition: str,
    updates: Dict[str, Any]
) -> pd.DataFrame:
    """
    Return a new DataFrame with values updated on the rows matching a condition.

    Parameters:
        df: The original DataFrame (unchanged).
        condition: A ``DataFrame.query`` expression selecting rows to update.
        updates: Mapping of column name -> new value to assign on matched rows.

    Returns:
        A new DataFrame with the updates applied.

    Raises:
        ValueError: If ``updates`` is not a dict, names unknown columns, or no
            rows match the condition.
    """
    # BUG FIX: the annotation previously used the builtin `any` instead of
    # typing.Any. Also validate `updates` *before* running the query so bad
    # arguments fail fast.
    if not isinstance(updates, dict):
        raise ValueError("Updates should be provided as a dictionary.")

    invalid_cols = [col for col in updates if col not in df.columns]
    if invalid_cols:
        raise ValueError(f"Columns {', '.join(invalid_cols)} do not exist in the DataFrame.")

    matched = df.query(condition)
    if matched.empty:
        raise ValueError("No rows match the given condition.")

    new_df = df.copy()
    for col_name, new_value in updates.items():
        new_df.loc[matched.index, col_name] = new_value

    return new_df
|
237
|
+
|
238
|
+
def delete_rows(df: pd.DataFrame, condition: str) -> pd.DataFrame:
    """
    Return a new DataFrame with the rows matching a condition removed.

    Parameters:
        df: The original DataFrame (unchanged).
        condition: A ``DataFrame.query`` expression selecting rows to delete.

    Returns:
        A new DataFrame, re-indexed, without the matched rows.

    Raises:
        ValueError: If no rows match the condition.
    """
    matched = df.query(condition)
    if matched.empty:
        raise ValueError("No rows match the given condition.")
    return df.drop(matched.index).reset_index(drop=True)
|
260
|
+
|
261
|
+
def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a new DataFrame with fully duplicated rows removed, keeping the
    first occurrence of each.

    Parameters:
        df: The DataFrame to deduplicate.

    Returns:
        A new DataFrame without duplicate rows.

    Raises:
        ValueError: If the DataFrame is None.
    """
    if df is None:
        raise ValueError("DataFrame is not initialized.")
    return df.drop_duplicates(keep='first')
|
277
|
+
|
278
|
+
def drop_duplicates_retain_first(df: pd.DataFrame, columns: Optional[str] = None) -> pd.DataFrame:
    """
    Remove duplicate rows, keeping the first occurrence of each duplicate set.

    Parameters:
        df: The DataFrame to deduplicate.
        columns: Optional comma-separated column names that define duplicates;
            when omitted, all columns are compared.

    Returns:
        A new DataFrame with duplicates removed.

    Raises:
        ValueError: If the DataFrame is None.
    """
    if df is None:
        raise ValueError("DataFrame is not initialized.")

    subset = None
    if columns:
        subset = [part.strip() for part in columns.split(',')]
    return df.drop_duplicates(subset=subset, keep='first')
|
297
|
+
|
298
|
+
def drop_duplicates_retain_last(df: pd.DataFrame, columns: Optional[str] = None) -> pd.DataFrame:
    """
    Remove duplicate rows, keeping the last occurrence of each duplicate set.

    Parameters:
        df: The DataFrame to deduplicate.
        columns: Optional comma-separated column names that define duplicates;
            when omitted, all columns are compared.

    Returns:
        A new DataFrame with duplicates removed.

    Raises:
        ValueError: If the DataFrame is None.
    """
    if df is None:
        raise ValueError("DataFrame is not initialized.")

    subset = None
    if columns:
        subset = [part.strip() for part in columns.split(',')]
    return df.drop_duplicates(subset=subset, keep='last')
|
317
|
+
|
318
|
+
def load_data_from_query(db_preset_name: str, query: str, config_file_name: str = "rgwml.config") -> pd.DataFrame:
    """
    Load data from a database query into a DataFrame based on a configuration preset.

    The preset is read from a JSON config file located by recursively searching
    the user's Desktop, Documents and Downloads folders.

    Parameters:
        db_preset_name: The name of the database preset in the configuration file.
        query: The SQL query to execute.
        config_file_name: Name of the configuration file (default: 'rgwml.config').

    Returns:
        A DataFrame containing the query result.

    Raises:
        FileNotFoundError: If the configuration file is not found.
        ValueError: If the database preset or db_type is invalid.
    """

    def locate_config_file(filename: str = config_file_name) -> str:
        """Recursively search common user folders for the config file."""
        home_dir = os.path.expanduser("~")
        search_paths = [
            os.path.join(home_dir, "Desktop"),
            os.path.join(home_dir, "Documents"),
            os.path.join(home_dir, "Downloads"),
        ]
        for path in search_paths:
            for root, _dirs, files in os.walk(path):
                if filename in files:
                    return os.path.join(root, filename)
        # BUG FIX: the f-string previously contained no placeholder, so the
        # error never said which file was missing.
        raise FileNotFoundError(f"{filename} not found in Desktop, Documents, or Downloads folders")

    def query_mssql(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
        """Execute a query on an MSSQL database and return the result as a DataFrame."""
        with pymssql.connect(
            server=db_preset['host'],
            user=db_preset['username'],
            password=db_preset['password'],
            database=db_preset.get('database', ''),
        ) as conn:
            with conn.cursor() as cursor:
                cursor.execute(query)
                rows = cursor.fetchall()
                columns = [desc[0] for desc in cursor.description]
        return pd.DataFrame(rows, columns=columns)

    def query_mysql(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
        """Execute a query on a MySQL database and return the result as a DataFrame."""
        with mysql.connector.connect(
            host=db_preset['host'],
            user=db_preset['username'],
            password=db_preset['password'],
            database=db_preset.get('database', ''),
        ) as conn:
            with conn.cursor() as cursor:
                cursor.execute(query)
                rows = cursor.fetchall()
                # cursor.description can be None for statements with no result set.
                columns = [desc[0] for desc in cursor.description] if cursor.description else []
        return pd.DataFrame(rows, columns=columns)

    def query_clickhouse(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
        """Query ClickHouse with simple retries, returning the result as a DataFrame."""
        max_retries = 5
        retry_delay = 5  # seconds between attempts
        for attempt in range(max_retries):
            try:
                client = clickhouse_connect.get_client(
                    host=db_preset['host'],
                    port='8123',
                    username=db_preset['username'],
                    password=db_preset['password'],
                    database=db_preset['database'],
                )
                data = client.query(query)
                return pd.DataFrame(data.result_rows, columns=data.column_names)
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt < max_retries - 1:
                    print(f"Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                else:
                    raise ConnectionError("All attempts to connect to ClickHouse failed.")

    def query_google_big_query(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
        """Query Google BigQuery via a service-account key, returning a DataFrame."""
        credentials = service_account.Credentials.from_service_account_file(db_preset['json_file_path'])
        client = bigquery.Client(credentials=credentials, project=db_preset['project_id'])
        results = client.query(query).result()
        rows = [list(row.values()) for row in results]
        columns = [field.name for field in results.schema]
        return pd.DataFrame(rows, columns=columns)

    # Read the configuration file and resolve the requested preset.
    with open(locate_config_file(), 'r') as f:
        config = json.load(f)

    db_presets = config.get('db_presets', [])
    db_preset = next((preset for preset in db_presets if preset['name'] == db_preset_name), None)
    if not db_preset:
        raise ValueError(f"No matching db_preset found for {db_preset_name}")

    # Dispatch on the preset's declared database type.
    dispatch = {
        'mssql': query_mssql,
        'mysql': query_mysql,
        'clickhouse': query_clickhouse,
        'google_big_query': query_google_big_query,
    }
    db_type = db_preset['db_type']
    if db_type not in dispatch:
        raise ValueError(f"Unsupported db_type: {db_type}")
    return dispatch[db_type](db_preset, query)
|
447
|
+
|
448
|
+
|
449
|
+
|
450
|
+
def load_data_from_path(file_path: str) -> pd.DataFrame:
    """
    Load a data file into a DataFrame, choosing the reader by file extension.

    Supported extensions: csv, xls, xlsx, json, parquet, h5, hdf5, feather, pkl.
    CSV files are read as strings with empty strings normalized to None.

    Parameters:
        file_path: Path to the data file (made absolute before loading).

    Returns:
        A DataFrame containing the file's data.

    Raises:
        ValueError: If the file extension is unsupported.
    """

    def read_hdf5(path: str) -> pd.DataFrame:
        """Load an HDF5 file, prompting interactively when it has several keys."""
        with pd.HDFStore(path, mode='r') as store:
            keys = store.keys()
        if len(keys) == 1:
            frame = pd.read_hdf(path, key=keys[0])
            print(f"Loaded key: {keys[0]}")
            return frame
        while True:
            print("Available keys:", keys)
            chosen = input("Enter the key for the HDF5 dataset: ").strip()
            if chosen in keys:
                return pd.read_hdf(path, key=chosen)
            print(f"Key '{chosen}' is not in the available keys. Please try again.")

    file_path = os.path.abspath(file_path)
    extension = file_path.rsplit('.', 1)[-1].lower()

    # Readers that need no extra post-processing, keyed by extension.
    plain_readers = {
        'xls': pd.read_excel,
        'xlsx': pd.read_excel,
        'json': pd.read_json,
        'parquet': pd.read_parquet,
        'h5': read_hdf5,
        'hdf5': read_hdf5,
        'feather': pd.read_feather,
        'pkl': pd.read_pickle,
    }

    if extension == 'csv':
        # Read everything as strings, then normalize empties to None.
        df = pd.read_csv(file_path, dtype=str)
        df.replace('', None, inplace=True)
    elif extension in plain_readers:
        df = plain_readers[extension](file_path)
    else:
        raise ValueError(f"Unsupported file extension: {extension}")

    gc.collect()
    return df
|
509
|
+
|
510
|
+
|
511
|
+
def load_data_from_sqlite_path(sqlite_path: str, query: str) -> pd.DataFrame:
    """
    Run a SQL query against a SQLite database file and return the result.

    Parameters:
        sqlite_path: Path to the SQLite database file (made absolute first).
        query: The SQL query to execute.

    Returns:
        A DataFrame containing the query results.

    Raises:
        ValueError: Wrapping any underlying sqlite3 error.
    """
    sqlite_path = os.path.abspath(sqlite_path)

    try:
        with sqlite3.connect(sqlite_path) as connection:
            result = pd.read_sql_query(query, connection)
    except sqlite3.Error as e:
        # Surface driver errors as ValueError to match the module's convention.
        raise ValueError(f"SQLite error: {e}")

    gc.collect()
    return result
|
537
|
+
|
538
|
+
def first_n_rows(df: pd.DataFrame, n: int) -> None:
    """Pretty-print each of the first ``n`` rows of the DataFrame as a dict."""
    if df is None:
        raise ValueError("No DataFrame to display. Please provide a DataFrame.")

    for record in df.head(n).to_dict(orient="records"):
        pprint(record, indent=4)
        print()

    gc.collect()
|
549
|
+
|
550
|
+
def last_n_rows(df: pd.DataFrame, n: int) -> None:
    """Pretty-print each of the last ``n`` rows of the DataFrame as a dict."""
    if df is None:
        raise ValueError("No DataFrame to display. Please provide a DataFrame.")

    for record in df.tail(n).to_dict(orient="records"):
        pprint(record, indent=4)
        print()

    gc.collect()
|
561
|
+
|
562
|
+
def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
    """Print the ``n`` most frequent values for each requested column as JSON."""
    if df is None:
        raise ValueError("No DataFrame to display. Please provide a DataFrame.")

    summary = {}
    for column in columns:
        if column not in df.columns:
            print(f"Column '{column}' does not exist in the DataFrame.")
            continue
        # Count on string-cast values so NaN/NaT/None/empty get readable labels.
        counts = df[column].astype(str).value_counts(dropna=False)
        counts = counts.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
        leaders = counts.nlargest(n)
        summary[column] = {str(value): str(count) for value, count in leaders.items()}
        print(f"Top {n} unique values for column '{column}':\n{json.dumps(summary[column], indent=2)}\n")

    gc.collect()
|
579
|
+
|
580
|
+
def bottom_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
    """Print the ``n`` least frequent values for each requested column as JSON."""
    if df is None:
        raise ValueError("No DataFrame to display. Please provide a DataFrame.")

    summary = {}
    for column in columns:
        if column not in df.columns:
            print(f"Column '{column}' does not exist in the DataFrame.")
            continue
        # Count on string-cast values so NaN/NaT/None/empty get readable labels.
        counts = df[column].astype(str).value_counts(dropna=False)
        counts = counts.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
        rarest = counts.nsmallest(n)
        summary[column] = {str(value): str(count) for value, count in rarest.items()}
        print(f"Bottom {n} unique values for column '{column}':\n{json.dumps(summary[column], indent=2)}\n")

    gc.collect()
|
597
|
+
|
598
|
+
def print_correlation(df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) -> None:
    """
    Print the correlation for each requested pair of columns.

    Values are coerced to numeric (non-numeric entries become NaN) before the
    pairwise correlation is computed.

    NOTE: the ``Tuple`` annotation requires ``Tuple`` to be imported from
    ``typing`` — the original module omitted it, which made this annotation
    fail at import time (fixed in the module's import block).

    Parameters:
        df: The DataFrame holding the columns.
        column_pairs: Pairs of column names to correlate.
    """
    if df is not None:
        for col1, col2 in column_pairs:
            if col1 in df.columns and col2 in df.columns:
                try:
                    numeric_col1 = pd.to_numeric(df[col1], errors='coerce')
                    numeric_col2 = pd.to_numeric(df[col2], errors='coerce')

                    correlation = numeric_col1.corr(numeric_col2)
                    if pd.notnull(correlation):
                        print(f"The correlation between '{col1}' and '{col2}' is {correlation}.")
                    else:
                        print(f"Cannot calculate correlation between '{col1}' and '{col2}' due to insufficient numeric data.")
                except Exception as e:
                    print(f"Error processing columns '{col1}' and '{col2}': {e}")
            else:
                print(f"One or both of the specified columns ('{col1}', '{col2}') do not exist in the DataFrame.")
    else:
        print("The DataFrame is empty.")

    gc.collect()
|
620
|
+
|
621
|
+
def print_memory_usage(df: pd.DataFrame) -> None:
    """Print the DataFrame's deep memory footprint in megabytes."""
    if df is None:
        raise ValueError("No DataFrame to print. Please provide a DataFrame.")

    total_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)  # bytes -> MB
    print(f"Memory usage of DataFrame: {total_mb:.2f} MB")

    gc.collect()
|
630
|
+
|
631
|
+
def filter_dataframe(df: pd.DataFrame, filter_expr: str) -> pd.DataFrame:
    """
    Return the rows of ``df`` that satisfy ``filter_expr``.

    The expression is first run through ``DataFrame.query``; if that rejects
    it, a boolean mask from ``DataFrame.eval`` is used instead.
    """
    if df is None:
        raise ValueError("No DataFrame to filter. Please provide a DataFrame.")

    try:
        result = df.query(filter_expr)
    except Exception:
        # Fallback for expressions query() cannot parse but eval() can.
        result = df[df.eval(filter_expr)]

    gc.collect()
    return result
|
644
|
+
|
645
|
+
def filter_indian_mobiles(df: pd.DataFrame, mobile_col: str) -> pd.DataFrame:
    """
    Filter rows whose ``mobile_col`` value looks like a valid Indian mobile number.

    A value qualifies when it is all digits, exactly 10 digits long, starts
    with 6/7/8/9, and contains at least 4 distinct digits (rejects obviously
    fake repeated-digit entries).

    BUG FIX: the original accepted digit strings of ANY length (e.g. '98765'),
    but Indian mobile numbers are exactly 10 digits.

    Parameters:
        df: The DataFrame to filter.
        mobile_col: Name of the column holding the mobile numbers.

    Returns:
        A new DataFrame containing only the qualifying rows.

    Raises:
        ValueError: If the DataFrame is None.
    """
    if df is None:
        raise ValueError("No DataFrame to filter. Please provide a DataFrame.")

    def _is_indian_mobile(value) -> bool:
        text = str(value)
        return (
            text.isdigit()
            and len(text) == 10                       # Indian mobiles are 10 digits
            and text.startswith(('6', '7', '8', '9'))
            and len(set(text)) >= 4                   # reject low-variety junk
        )

    result = df[df[mobile_col].apply(_is_indian_mobile)]

    gc.collect()
    return result
|
663
|
+
|
664
|
+
def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:
    """
    Print the DataFrame and its column dtypes; optionally print a source path.

    Parameters:
        df: The DataFrame to print.
        source: Optional source path of the DataFrame, for logging purposes.
    """
    if df is None:
        raise ValueError("No DataFrame to print. Please provide a DataFrame.")

    print(df)
    print("Columns:", [f"{col} ({df[col].dtypes})" for col in df.columns])
    if source:
        print(f"Source: {source}")

    gc.collect()
|
682
|
+
|
683
|
+
def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
    """
    Send a DataFrame to a Telegram chat using a bot preset from rgwml.config.

    Parameters:
        df: The DataFrame to send.
        bot_name: Name of the Telegram bot preset in the configuration file.
        message: Optional caption (file mode) or message prefix (text mode).
        as_file: When True, send the DataFrame as a CSV attachment; otherwise
            send its string rendering as a text message.
        remove_after_send: When True, delete the temporary CSV after sending.
    """

    def locate_config_file(filename: str = "rgwml.config") -> str:
        """Retrieve the configuration file path."""
        home_dir = os.path.expanduser("~")
        candidates = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
        for base in candidates:
            for root, _, files in os.walk(base):
                if filename in files:
                    return os.path.join(root, filename)
        raise FileNotFoundError(f"(unknown) not found in Desktop, Documents, or Downloads folders")

    def get_config(config_path: str) -> dict:
        """Load configuration from a json file."""
        with open(config_path, 'r') as file:
            return json.load(file)

    config = get_config(locate_config_file())

    bot_config = next((bot for bot in config['telegram_bot_presets'] if bot['name'] == bot_name), None)
    if not bot_config:
        raise ValueError(f"No bot found with the name {bot_name}")

    if df is None:
        raise ValueError("No DataFrame to send. Please provide a DataFrame.")

    if as_file:
        stamp = datetime.now().strftime("%Y%m%d%H%M%S")
        csv_name = f"df_{stamp}.csv"
        df.to_csv(csv_name, index=False)
        try:
            with open(csv_name, 'rb') as handle:
                response = requests.post(
                    f"https://api.telegram.org/bot{bot_config['bot_token']}/sendDocument",
                    data={'chat_id': bot_config['chat_id'], 'caption': message or ''},
                    files={'document': handle},
                )
            if remove_after_send and os.path.exists(csv_name):
                os.remove(csv_name)
        except Exception as e:
            print(f"Failed to send document: {e}")
            raise
    else:
        rendered = df.to_string()
        response = requests.post(
            f"https://api.telegram.org/bot{bot_config['bot_token']}/sendMessage",
            data={
                'chat_id': bot_config['chat_id'],
                'text': message + "\n\n" + rendered if message else rendered,
                'parse_mode': 'HTML',
            },
        )

    if not response.ok:
        raise Exception(f"Error sending message: {response.text}")

    print("Message sent successfully.")
|
744
|
+
|
745
|
+
def send_data_to_email(
    df: pd.DataFrame,
    preset_name: str,
    to_email: str,
    subject: Optional[str] = None,
    body: Optional[str] = None,
    as_file: bool = True,
    remove_after_send: bool = True
) -> None:
    """
    Send an email with an optional DataFrame attachment using the Gmail API via a specified preset.

    Parameters:
        df: The DataFrame to send.
        preset_name: The configuration preset name to use for sending the email.
        to_email: The recipient email address.
        subject: Optional subject of the email.
        body: Optional message body of the email.
        as_file: If True, attach the DataFrame as a CSV file; otherwise inline its text.
        remove_after_send: If True, removes the temporary CSV file after attaching it.

    Raises:
        FileNotFoundError: If no rgwml.config file is found.
        ValueError: If the config file is invalid JSON or the preset name is unknown.
        Exception: If preparing the attachment or sending the email fails.
    """
    # BUGFIX: 'encoders' and 'Any' were referenced below but never imported at
    # module level, causing a NameError the first time this function ran.
    from email import encoders
    from typing import Any

    def locate_config_file(filename: str = "rgwml.config") -> str:
        """Locate config file in common user directories."""
        home_dir = os.path.expanduser("~")
        search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]

        for path in search_paths:
            for root, _, files in os.walk(path):
                if filename in files:
                    return os.path.join(root, filename)
        # BUGFIX: the message previously printed the literal "(unknown)"
        # instead of the missing file's name.
        raise FileNotFoundError(f"{filename} not found in Desktop, Documents, or Downloads folders")

    def get_config(config_path: str) -> dict:
        """Load configuration from a JSON file."""
        with open(config_path, 'r') as file:
            try:
                return json.load(file)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON format in config file: {e}")

    def authenticate_service_account(service_account_credentials_path: str, sender_email_id: str) -> Any:
        """Authenticate the service account and return a Gmail API service instance."""
        credentials = service_account.Credentials.from_service_account_file(
            service_account_credentials_path,
            scopes=['https://mail.google.com/'],
            subject=sender_email_id
        )
        return build('gmail', 'v1', credentials=credentials)

    # Load configuration
    config_path = locate_config_file()
    config = get_config(config_path)

    # Retrieve Gmail preset configuration
    gmail_config = next((preset for preset in config['gmail_bot_presets'] if preset['name'] == preset_name), None)
    if not gmail_config:
        raise ValueError(f"No preset found with the name {preset_name}")

    # NOTE(review): the preset's 'name' field is used as the sender address —
    # confirm the config actually stores an email address there.
    sender_email = gmail_config['name']
    credentials_path = gmail_config['service_account_credentials_path']

    # Authenticate and get the Gmail service
    service = authenticate_service_account(credentials_path, sender_email)

    if as_file:
        # Create a temporary file for the DataFrame as CSV
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
            tmp_file_name = tmp_file.name
            df.to_csv(tmp_file_name, index=False)

        # Create email with attachment
        try:
            message = MIMEMultipart()
            message['to'] = to_email
            message['from'] = sender_email
            message['subject'] = subject if subject else 'DataFrame CSV File'
            message.attach(MIMEText(body if body else 'Please find the CSV file attached.'))

            with open(tmp_file_name, 'rb') as file:
                part = MIMEBase('application', 'octet-stream')
                part.set_payload(file.read())
                encoders.encode_base64(part)
                part.add_header('Content-Disposition', f'attachment; filename={os.path.basename(tmp_file_name)}')
                message.attach(part)

            # The payload is already embedded in the message, so the file on
            # disk can be removed before sending.
            if remove_after_send and os.path.exists(tmp_file_name):
                os.remove(tmp_file_name)

        except Exception as e:
            raise Exception(f"Failed to prepare the document: {e}")

    else:
        # Create email body as plain text
        df_str = df.to_string()
        full_body = body + "\n\n" + df_str if body else df_str
        message = MIMEText(full_body)
        message['to'] = to_email
        message['from'] = sender_email
        message['subject'] = subject or 'DataFrame Content'

    # Sending the email
    try:
        raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
        email_body = {'raw': raw}
        sent_message = service.users().messages().send(userId="me", body=email_body).execute()
        print(f"Email with Message Id {sent_message['id']} successfully sent.")
    except Exception as error:
        raise Exception(f"Error sending email: {error}")
def send_data_to_slack(
    df: pd.DataFrame,
    bot_name: str,
    message: Optional[str] = None,
    as_file: bool = True,
    remove_after_send: bool = True
) -> None:
    """
    Send a DataFrame or message to Slack using a specified bot configuration.

    Parameters:
        df: The DataFrame to send.
        bot_name: The Slack bot configuration preset name.
        message: Custom message to send along with the DataFrame or file.
        as_file: If True, upload the DataFrame as a CSV file; otherwise post its text.
        remove_after_send: If True, removes the CSV file after sending.

    Raises:
        FileNotFoundError: If no rgwml.config file is found.
        ValueError: If no bot preset matches bot_name.
        Exception: If Slack reports the message was not sent.
    """

    def locate_config_file(filename: str = "rgwml.config") -> str:
        """Locate config file in common user directories."""
        home_dir = os.path.expanduser("~")
        search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]

        for path in search_paths:
            for root, _, files in os.walk(path):
                if filename in files:
                    return os.path.join(root, filename)
        # BUGFIX: the message previously printed the literal "(unknown)"
        # instead of the missing file's name.
        raise FileNotFoundError(f"{filename} not found in Desktop, Documents, or Downloads folders")

    def get_config(config_path: str) -> dict:
        """Load configuration from a JSON file."""
        with open(config_path, 'r') as file:
            return json.load(file)

    # Load the Slack configuration
    config_path = locate_config_file()
    config = get_config(config_path)

    bot_config = next((bot for bot in config['slack_bot_presets'] if bot['name'] == bot_name), None)
    if not bot_config:
        raise ValueError(f"No bot found with the name {bot_name}")

    client = WebClient(token=bot_config['bot_token'])

    if as_file:
        # Create a temporary file for the DataFrame as CSV
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
            file_name = tmp_file.name
            df.to_csv(file_name, index=False)

        # NOTE(review): files_upload is deprecated in newer slack_sdk releases
        # in favour of files_upload_v2 — confirm against the pinned version.
        try:
            with open(file_name, 'rb') as file:
                response = client.files_upload(
                    channels=bot_config['channel_id'],
                    file=file,
                    filename=os.path.basename(file_name),
                    title="DataFrame Upload",
                    initial_comment=message or ''
                )
        finally:
            # Clean up the temporary CSV even if the upload raised.
            if remove_after_send and os.path.exists(file_name):
                os.remove(file_name)
    else:
        df_str = df.to_string()
        response = client.chat_postMessage(
            channel=bot_config['channel_id'],
            text=(message + "\n\n" + df_str) if message else df_str
        )

    # Check if the message was sent successfully
    if not response["ok"]:
        raise Exception(f"Error sending message: {response['error']}")

    print("Message sent successfully.")
def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:
    """
    Reorder the columns of the DataFrame based on a string input.

    Parameters:
        df: The DataFrame whose columns will be reordered.
        column_order_str: A string specifying the desired order of columns, using ',' to
            separate names. The token '...' stands for "all remaining columns".

    Returns:
        A new DataFrame with reordered columns.

    Raises:
        ValueError: If df is None or a specified column does not exist in the DataFrame.
    """
    if df is None:
        raise ValueError("No DataFrame to reorder. Please provide a valid DataFrame.")

    columns = df.columns.tolist()
    parts = [part.strip() for part in column_order_str.split(',')]

    new_order = []
    seen = set()

    for part in parts:
        if part == '...':
            continue
        elif part in columns:
            new_order.append(part)
            seen.add(part)
        else:
            raise ValueError(f"Column '{part}' not found in DataFrame.")

    remaining = [col for col in columns if col not in seen]

    # Determine the position of '...' and arrange the columns.
    # BUGFIX: a spec without '...' previously crashed on parts.index('...');
    # unspecified columns are now simply appended at the end.
    if '...' not in parts:
        new_order = new_order + remaining
    elif parts[0] == '...':
        new_order = remaining + new_order
    elif parts[-1] == '...':
        new_order = new_order + remaining
    else:
        pos = parts.index('...')
        new_order = new_order[:pos] + remaining + new_order[pos:]

    return df[new_order]
def append_ranged_classification_column(
    df: pd.DataFrame,
    ranges: str,
    target_col: str,
    new_col_name: str
) -> pd.DataFrame:
    """
    Append a ranged classification column to the DataFrame.

    Parameters:
        df: The DataFrame to modify.
        ranges: A string representation of numeric bin edges separated by commas.
        target_col: The column to analyze.
        new_col_name: The name of the new classification column.

    Returns:
        The DataFrame with the classification column appended.
    """

    def _pad(value, int_width, dec_width=0, decimal=False):
        """Zero-pad a number so all bin labels line up and sort lexically."""
        if not decimal:
            return str(int(value)).zfill(int_width)
        whole, frac = f"{value:.{dec_width}f}".split('.')
        return f"{whole.zfill(int_width)}.{frac}"

    tokens = ranges.split(',')
    if any('.' in token for token in tokens):
        bins = [float(token) for token in tokens]
        dec_width = max(len(str(b).split('.')[1]) for b in bins if '.' in str(b))
        int_width = max(len(str(int(float(b)))) for b in bins)
        labels = [
            f"{_pad(lo, int_width, dec_width, decimal=True)} to {_pad(hi, int_width, dec_width, decimal=True)}"
            for lo, hi in zip(bins, bins[1:])
        ]
    else:
        bins = [int(token) for token in tokens]
        int_width = max(len(str(b)) for b in bins)
        labels = [
            f"{_pad(lo, int_width)} to {_pad(hi, int_width)}"
            for lo, hi in zip(bins, bins[1:])
        ]

    # Coerce the target column to numeric so pd.cut can bin it.
    df[target_col] = pd.to_numeric(df[target_col], errors='coerce')

    df[new_col_name] = pd.cut(df[target_col], bins=bins, labels=labels, right=False, include_lowest=True)

    return df
def append_percentile_classification_column(
    df: pd.DataFrame,
    percentiles: str,
    target_col: str,
    new_col_name: str
) -> pd.DataFrame:
    """
    Append a percentile classification column to the DataFrame.

    Parameters:
        df: The DataFrame to modify.
        percentiles: A string representation of percentile values separated by commas.
        target_col: The column to analyze.
        new_col_name: The name of the new classification column.

    Returns:
        The DataFrame with the classification column appended.
    """

    def _pad(value, int_width, dec_width=0, decimal=False):
        """Zero-pad a number so all percentile labels line up."""
        if not decimal:
            return str(int(value)).zfill(int_width)
        whole, frac = f"{value:.{dec_width}f}".split('.')
        return f"{whole.zfill(int_width)}.{frac}"

    tokens = percentiles.split(',')
    if any('.' in token for token in tokens):
        cut_points = [float(token) for token in tokens]
        dec_width = max(len(str(p).split('.')[1]) for p in cut_points if '.' in str(p))
        int_width = max(len(str(int(float(p)))) for p in cut_points)
        labels = [
            f"{_pad(lo, int_width, dec_width, decimal=True)} to {_pad(hi, int_width, dec_width, decimal=True)}"
            for lo, hi in zip(cut_points, cut_points[1:])
        ]
    else:
        cut_points = [int(token) for token in tokens]
        int_width = max(len(str(p)) for p in cut_points)
        labels = [
            f"{_pad(lo, int_width)} to {_pad(hi, int_width)}"
            for lo, hi in zip(cut_points, cut_points[1:])
        ]

    # Coerce to numeric so quantile computation and binning work on messy input.
    df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
    quantile_edges = [df[target_col].quantile(p / 100) for p in cut_points]

    df[new_col_name] = pd.cut(df[target_col], bins=quantile_edges, labels=labels, include_lowest=True)

    return df
def append_ranged_date_classification_column(
    df: pd.DataFrame,
    date_ranges: str,
    target_col: str,
    new_col_name: str
) -> pd.DataFrame:
    """
    Append a ranged date classification column to the DataFrame.

    Parameters:
        df: The DataFrame to modify.
        date_ranges: A string representation of date boundaries separated by commas.
        target_col: The date column to analyze.
        new_col_name: The name of the new date classification column.

    Returns:
        The DataFrame with the date classification column appended.
    """
    boundaries = [pd.to_datetime(token) for token in date_ranges.split(',')]

    # One "start to end" label per consecutive pair of boundaries.
    labels = [
        f"{start.strftime('%Y-%m-%d')} to {end.strftime('%Y-%m-%d')}"
        for start, end in zip(boundaries, boundaries[1:])
    ]

    df[new_col_name] = pd.cut(pd.to_datetime(df[target_col]), bins=boundaries, labels=labels, right=False)

    return df
def rename_columns(df: pd.DataFrame, rename_pairs: Dict[str, str]) -> pd.DataFrame:
    """
    Rename columns in the DataFrame.

    Parameters:
        df: The DataFrame to modify.
        rename_pairs: A dictionary mapping old column names to new column names.

    Returns:
        A new DataFrame with columns renamed.

    Raises:
        ValueError: If df is None.
    """
    if df is not None:
        return df.rename(columns=rename_pairs)
    raise ValueError("No DataFrame to rename columns. Please provide a valid DataFrame.")
def cascade_sort(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    Cascade sort the DataFrame by specified columns and order.

    Parameters:
        df: The DataFrame to sort.
        columns: A list of column names with sorting order, e.g., ['Column1::ASC', 'Column2::DESC'].
            A bare column name sorts ascending by default.

    Returns:
        A new DataFrame sorted by the specified columns.

    Raises:
        ValueError: If df is None or a referenced column is missing.
    """
    if df is None:
        raise ValueError("No DataFrame to sort. Please provide a valid DataFrame.")

    sort_names: List[str] = []
    sort_ascending: List[bool] = []

    # Parse each "name::ORDER" spec; anything other than ASC means descending.
    for spec in columns:
        if "::" in spec:
            name, order = spec.split("::")
            sort_names.append(name)
            sort_ascending.append(order.upper() == "ASC")
        else:
            sort_names.append(spec)
            sort_ascending.append(True)

    # Validate every referenced column before sorting.
    for name in sort_names:
        if name not in df.columns:
            raise ValueError(f"Column {name} not found in DataFrame")

    return df.sort_values(by=sort_names, ascending=sort_ascending)
def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
    """
    Append XGB training labels based on a ratio string.

    Parameters:
        df: The DataFrame to modify.
        ratio_str: A string specifying the ratio of TRAIN:TEST or TRAIN:VALIDATE:TEST.

    Returns:
        A copy of the DataFrame with an 'XGB_TYPE' label column appended.

    Raises:
        ValueError: If df is None or the ratio string has the wrong number of parts.
    """
    if df is None:
        raise ValueError("No DataFrame to add labels. Please provide a valid DataFrame.")

    ratios = [int(part) for part in ratio_str.split(':')]
    total_ratio = sum(ratios)
    n_rows = len(df)

    if len(ratios) == 2:
        # TRAIN:TEST — the remainder after integer division goes to TEST.
        n_train = (ratios[0] * n_rows) // total_ratio
        labels = ['TRAIN'] * n_train + ['TEST'] * (n_rows - n_train)
    elif len(ratios) == 3:
        # TRAIN:VALIDATE:TEST — the remainder goes to TEST.
        n_train = (ratios[0] * n_rows) // total_ratio
        n_validate = (ratios[1] * n_rows) // total_ratio
        labels = (['TRAIN'] * n_train
                  + ['VALIDATE'] * n_validate
                  + ['TEST'] * (n_rows - n_train - n_validate))
    else:
        raise ValueError("Invalid ratio string format. Use 'TRAIN:TEST' or 'TRAIN:VALIDATE:TEST'.")

    labeled = df.copy()
    labeled['XGB_TYPE'] = labels

    return labeled
def append_xgb_regression_predictions(
    df: pd.DataFrame,
    target_col: str,
    feature_cols: str,
    pred_col: str,
    boosting_rounds: int = 100,
    model_path: Optional[str] = None
) -> pd.DataFrame:
    """
    Append XGB regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.

    Parameters:
        df: DataFrame to modify.
        target_col: The target column for regression.
        feature_cols: Comma-separated string of feature columns.
        pred_col: Name of the prediction column.
        boosting_rounds: (Optional) Number of boosting rounds for training.
        model_path: (Optional) Path to save the trained model.

    Returns:
        DataFrame with predictions appended and bookkeeping columns moved to the end.

    Raises:
        ValueError: If df is None or the 'XGB_TYPE' column is missing.
    """
    if df is None or 'XGB_TYPE' not in df.columns:
        raise ValueError("DataFrame is not initialized or 'XGB_TYPE' column is missing.")

    features = feature_cols.replace(' ', '').split(',')

    # XGBoost's enable_categorical requires object columns to be converted
    # to the 'category' dtype first.
    for feature in features:
        if df[feature].dtype == 'object':
            df[feature] = df[feature].astype('category')

    train_rows = df[df['XGB_TYPE'] == 'TRAIN']
    has_validation = 'VALIDATE' in df['XGB_TYPE'].values

    dtrain = xgb.DMatrix(train_rows[features], label=train_rows[target_col], enable_categorical=True)
    evals = [(dtrain, 'train')]
    if has_validation:
        validate_rows = df[df['XGB_TYPE'] == 'VALIDATE']
        dvalidate = xgb.DMatrix(validate_rows[features], label=validate_rows[target_col], enable_categorical=True)
        evals.append((dvalidate, 'validate'))

    # Early stopping is only meaningful when a validation split exists.
    model = xgb.train(
        {'objective': 'reg:squarederror', 'eval_metric': 'rmse'},
        dtrain,
        num_boost_round=boosting_rounds,
        evals=evals,
        early_stopping_rounds=10 if has_validation else None
    )

    # Predict for every row, not just the training split.
    dall = xgb.DMatrix(df[features], enable_categorical=True)
    df[pred_col] = model.predict(dall)

    if model_path:
        model.save_model(model_path)

    # Move the bookkeeping columns to the end for readability.
    trailing = ['XGB_TYPE', target_col, pred_col]
    df = df[[col for col in df.columns if col not in trailing] + trailing]

    return df
def append_xgb_logistic_regression_predictions(
    df: pd.DataFrame,
    target_col: str,
    feature_cols: str,
    pred_col: str,
    boosting_rounds: int = 100,
    model_path: Optional[str] = None
) -> pd.DataFrame:
    """
    Append XGB logistic regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.

    Parameters:
        df: DataFrame to modify.
        target_col: The target column for logistic regression.
        feature_cols: Comma-separated string of feature columns.
        pred_col: Name of the prediction column.
        boosting_rounds: (Optional) Number of boosting rounds for training.
        model_path: (Optional) Path to save the trained model.

    Returns:
        DataFrame with predictions appended and bookkeeping columns moved to the end.

    Raises:
        ValueError: If df is None or the 'XGB_TYPE' column is missing.
    """
    if df is None or 'XGB_TYPE' not in df.columns:
        raise ValueError("DataFrame is not initialized or 'XGB_TYPE' column is missing.")

    features = feature_cols.replace(' ', '').split(',')

    # XGBoost's enable_categorical requires object columns to be converted
    # to the 'category' dtype first.
    for feature in features:
        if df[feature].dtype == 'object':
            df[feature] = df[feature].astype('category')

    train_rows = df[df['XGB_TYPE'] == 'TRAIN']
    has_validation = 'VALIDATE' in df['XGB_TYPE'].values

    dtrain = xgb.DMatrix(train_rows[features], label=train_rows[target_col], enable_categorical=True)
    evals = [(dtrain, 'train')]
    if has_validation:
        validate_rows = df[df['XGB_TYPE'] == 'VALIDATE']
        dvalidate = xgb.DMatrix(validate_rows[features], label=validate_rows[target_col], enable_categorical=True)
        evals.append((dvalidate, 'validate'))

    # Early stopping is only meaningful when a validation split exists.
    model = xgb.train(
        {'objective': 'binary:logistic', 'eval_metric': 'auc'},
        dtrain,
        num_boost_round=boosting_rounds,
        evals=evals,
        early_stopping_rounds=10 if has_validation else None
    )

    # Predict probabilities for every row, not just the training split.
    dall = xgb.DMatrix(df[features], enable_categorical=True)
    df[pred_col] = model.predict(dall)

    if model_path:
        model.save_model(model_path)

    # Move the bookkeeping columns to the end for readability.
    trailing = ['XGB_TYPE', target_col, pred_col]
    df = df[[col for col in df.columns if col not in trailing] + trailing]

    return df
def print_n_frequency_cascading(
    df: pd.DataFrame,
    n: int,
    columns: str,
    order_by: str = "FREQ_DESC"
) -> None:
    """
    Print the cascading frequency of top n values for specified columns.

    Parameters:
        df: DataFrame to analyze.
        n: Number of top values to print per level.
        columns: Comma-separated column names to analyze; each subsequent column
            is broken down within each value of the previous one.
        order_by: Order of frequency: ASC, DESC, FREQ_ASC, FREQ_DESC.
    """
    columns = [col.strip() for col in columns.split(",")]

    # Display labels for the raw string values produced by astype(str).
    placeholder_map = {'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'}
    # Reverse lookup: display label -> raw string value stored in the column.
    reverse_map = {display: raw for raw, display in placeholder_map.items()}

    def generate_cascade_report(df, columns, limit, order_by):
        if not columns:
            return None

        current_col = columns[0]
        if current_col not in df.columns:
            return None

        # Convert the column to string representation so NaN/None become
        # comparable string values ('nan', 'None', ...).
        df[current_col] = df[current_col].astype(str)
        frequency = df[current_col].value_counts(dropna=False)
        frequency = frequency.rename(index=placeholder_map)

        if limit is not None:
            frequency = frequency.nlargest(limit)

        sorted_frequency = sort_frequency(frequency, order_by)

        report = {}
        for value, count in sorted_frequency.items():
            # BUGFIX: the column was cast to str above, so missing values are
            # the *strings* 'nan'/'NaT'/'None'/'' — the old code filtered with
            # isna(), which matched nothing and produced empty sub-distributions.
            raw_value = reverse_map.get(value, value)
            filtered_df = df[df[current_col] == raw_value]

            if len(columns) > 1:
                sub_report = generate_cascade_report(filtered_df, columns[1:], limit, order_by)
                report[value] = {
                    "count": str(count),
                    f"sub_distribution({columns[1]})": sub_report if sub_report else {}
                }
            else:
                report[value] = {
                    "count": str(count)
                }

        return report

    def sort_frequency(frequency, order_by):
        if order_by == "ASC":
            return dict(sorted(frequency.items(), key=lambda item: item[0]))
        elif order_by == "DESC":
            return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
        elif order_by == "FREQ_ASC":
            return dict(sorted(frequency.items(), key=lambda item: item[1]))
        else:  # Default to "FREQ_DESC"
            return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))

    report = generate_cascade_report(df, columns, n, order_by)
    print(json.dumps(report, indent=2))
def print_n_frequency_linear(
    df: pd.DataFrame,
    n: int,
    columns: str,
    order_by: str = "FREQ_DESC"
) -> None:
    """
    Print the linear frequency of top n values for specified columns.

    Parameters:
        df: DataFrame to analyze.
        n: Number of top values to print.
        columns: Comma-separated column names to analyze (each independently).
        order_by: Order of frequency: ASC, DESC, FREQ_ASC, FREQ_DESC.
    """
    column_list = [col.strip() for col in columns.split(",")]

    def sort_frequency(frequency, order_by):
        """Order a value->count mapping per the requested mode."""
        if order_by == "ASC":
            key, reverse = (lambda item: item[0]), False
        elif order_by == "DESC":
            key, reverse = (lambda item: item[0]), True
        elif order_by == "FREQ_ASC":
            key, reverse = (lambda item: item[1]), False
        else:  # Default to "FREQ_DESC"
            key, reverse = (lambda item: item[1]), True
        return dict(sorted(frequency.items(), key=key, reverse=reverse))

    report = {}
    for col in column_list:
        if col not in df.columns:
            continue

        # Stringify first so NaN/None become countable values, then give the
        # placeholders friendlier display names.
        counts = df[col].astype(str).value_counts(dropna=False)
        counts = counts.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})

        if n is not None:
            counts = counts.nlargest(n)

        report[col] = {str(value): str(count) for value, count in sort_frequency(counts, order_by).items()}

    print(json.dumps(report, indent=2))
def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFrame:
    """
    Retain specified columns in the DataFrame and drop the others.

    Parameters:
        df: DataFrame to modify.
        columns_to_retain: List of column names to retain.

    Returns:
        A new DataFrame with only the retained columns.

    Raises:
        ValueError: If columns_to_retain is not a list.
    """
    if isinstance(columns_to_retain, list):
        return df[columns_to_retain]
    raise ValueError("columns_to_retain should be a list of column names.")
def mask_against_dataframe(
    df: pd.DataFrame,
    other_df: pd.DataFrame,
    column_name: str
) -> pd.DataFrame:
    """
    Retain only rows with common column values between two DataFrames.

    Parameters:
        df: DataFrame to modify.
        other_df: DataFrame to compare against.
        column_name: Column name to compare.

    Returns:
        A new DataFrame with rows whose column value exists in both DataFrames.

    Raises:
        ValueError: If the column is missing from either DataFrame.
    """
    if column_name in df.columns and column_name in other_df.columns:
        shared = df[column_name].isin(other_df[column_name])
        return df[shared]
    raise ValueError("The specified column must exist in both DataFrames.")
def mask_against_dataframe_converse(
    df: pd.DataFrame,
    other_df: pd.DataFrame,
    column_name: str
) -> pd.DataFrame:
    """
    Retain only rows with uncommon column values between two DataFrames.

    Parameters:
        df: The primary DataFrame to modify.
        other_df: The DataFrame to compare against.
        column_name: The column name to use for comparison.

    Returns:
        A new DataFrame with rows whose column values do not exist in 'other_df'.

    Raises:
        ValueError: If the column is missing from either DataFrame.
    """
    if column_name not in df.columns or column_name not in other_df.columns:
        raise ValueError("The specified column must exist in both DataFrames.")

    present_elsewhere = df[column_name].isin(other_df[column_name])
    return df[~present_elsewhere]