rgwfuncs 0.0.5-py3-none-any.whl → 0.0.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rgwfuncs/__init__.py +3 -2
- rgwfuncs/df_lib.py +209 -503
- {rgwfuncs-0.0.5.dist-info → rgwfuncs-0.0.6.dist-info}/METADATA +98 -1
- rgwfuncs-0.0.6.dist-info/RECORD +8 -0
- rgwfuncs-0.0.5.dist-info/RECORD +0 -8
- {rgwfuncs-0.0.5.dist-info → rgwfuncs-0.0.6.dist-info}/LICENSE +0 -0
- {rgwfuncs-0.0.5.dist-info → rgwfuncs-0.0.6.dist-info}/WHEEL +0 -0
- {rgwfuncs-0.0.5.dist-info → rgwfuncs-0.0.6.dist-info}/entry_points.txt +0 -0
- {rgwfuncs-0.0.5.dist-info → rgwfuncs-0.0.6.dist-info}/top_level.txt +0 -0
rgwfuncs/df_lib.py
CHANGED
Every hunk listed below appears to be a formatting-only change: function signatures, docstrings and multi-line call expressions that were wrapped across several lines in 0.0.5 are collapsed onto single lines in 0.0.6 (for example, the wrapped signature of numeric_clean becomes def numeric_clean(df: pd.DataFrame, column_names: str, column_type: str, irregular_value_treatment: str) -> pd.DataFrame:), with the wording and logic otherwise unchanged. The affected regions are:

@@ -27,14 +27,10 @@ from typing import Optional, Callable, Dict, List, Tuple, Any
@@ -75,23 +71,15 @@ def docs(method_type_filter: Optional[str] = None) -> None:
@@ -183,8 +171,7 @@ def append_rows(df: pd.DataFrame, rows: List[List]) -> pd.DataFrame:
@@ -244,8 +231,7 @@ def update_rows(
@@ -316,17 +302,13 @@ def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
@@ -342,17 +324,13 @@ def drop_duplicates_retain_first(
@@ -363,22 +341,17 @@ def drop_duplicates_retain_last(
@@ -393,19 +366,14 @@ def load_data_from_query(
@@ -413,8 +381,7 @@ def load_data_from_query(
@@ -428,25 +395,15 @@ def load_data_from_query(
@@ -458,13 +415,7 @@ def load_data_from_query(
@@ -478,13 +429,11 @@ def load_data_from_query(
@@ -500,9 +449,7 @@ def load_data_from_query(
@@ -584,8 +531,7 @@ def load_data_from_path(file_path: str) -> pd.DataFrame:
@@ -615,10 +561,7 @@ def first_n_rows(df: pd.DataFrame, n: int) -> None:
@@ -643,8 +586,7 @@ def last_n_rows(df: pd.DataFrame, n: int) -> None:
@@ -669,15 +611,13 @@ def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
@@ -687,20 +627,10 @@ def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
@@ -710,22 +640,17 @@ def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
@@ -735,21 +660,12 @@ def bottom_n_unique_values(
@@ -759,18 +675,15 @@ def bottom_n_unique_values(
@@ -781,21 +694,13 @@ def print_correlation(
@@ -806,8 +711,7 @@ def print_memory_usage(df: pd.DataFrame) -> None:
@@ -816,8 +720,7 @@ def print_memory_usage(df: pd.DataFrame) -> None:
@@ -829,9 +732,7 @@ def filter_dataframe(df: pd.DataFrame, filter_expr: str) -> pd.DataFrame:
@@ -860,19 +761,14 @@ def filter_indian_mobiles(df: pd.DataFrame, mobile_col: str) -> pd.DataFrame:
@@ -897,18 +793,15 @@ def filter_indian_mobiles(df: pd.DataFrame, mobile_col: str) -> pd.DataFrame:
@@ -918,35 +811,22 @@ def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:
@@ -962,13 +842,7 @@ def send_dataframe_via_telegram(
@@ -982,15 +856,9 @@ def send_dataframe_via_telegram(
@@ -1002,9 +870,7 @@ def send_dataframe_via_telegram(
@@ -1012,49 +878,30 @@ def send_dataframe_via_telegram(
@@ -1078,13 +925,7 @@ def send_data_to_email(
@@ -1097,9 +938,7 @@ def send_data_to_email(
@@ -1109,18 +948,13 @@ def send_data_to_email(
@@ -1142,20 +976,13 @@ def send_data_to_email(
@@ -1163,29 +990,21 @@ def send_data_to_slack(
@@ -1196,13 +1015,7 @@ def send_data_to_slack(
@@ -1211,30 +1024,19 @@ def send_data_to_slack(
@@ -1291,12 +1093,7 @@ def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:
@@ -1364,39 +1161,22 @@ def append_ranged_classification_column(
@@ -1418,57 +1198,28 @@ def append_percentile_classification_column(
@@ -1477,28 +1228,18 @@ def append_percentile_classification_column(
@@ -1525,24 +1266,19 @@ def append_ranged_date_classification_column(
@@ -1560,8 +1296,7 @@ def cascade_sort(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
@@ -1590,15 +1325,13 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
@@ -1615,9 +1348,7 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
@@ -1625,17 +1356,9 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
@@ -1649,8 +1372,7 @@ def append_xgb_regression_predictions(
@@ -1666,30 +1388,16 @@ def append_xgb_regression_predictions(
@@ -1698,24 +1406,15 @@ def append_xgb_regression_predictions(
@@ -1745,30 +1444,16 @@ def append_xgb_logistic_regression_predictions(
@@ -1777,19 +1462,13 @@ def append_xgb_logistic_regression_predictions(
@@ -1812,12 +1491,7 @@ def print_n_frequency_cascading(
@@ -1832,15 +1506,10 @@ def print_n_frequency_cascading(
@@ -1848,30 +1517,17 @@ def print_n_frequency_cascading(
@@ -1891,19 +1547,13 @@ def print_n_frequency_linear(
@@ -1912,27 +1562,17 @@ def print_n_frequency_linear(
@@ -1948,11 +1588,7 @@ def retain_columns(
@@ -1969,11 +1605,7 @@ def mask_against_dataframe(

The final hunk appends four new join helpers to the end of the module:
@@ -1990,3 +1622,77 @@ def mask_against_dataframe_converse(
         raise ValueError("The specified column must exist in both DataFrames.")
 
     return df[~df[column_name].isin(other_df[column_name])]
+
+
+def union_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
+    """
+    Perform a union join, concatenating the two DataFrames and dropping duplicates.
+
+    Parameters:
+    df1: First DataFrame.
+    df2: Second DataFrame.
+
+    Returns:
+    A new DataFrame with the union of df1 and df2, without duplicates.
+
+    Raises:
+    ValueError: If the DataFrames do not have the same columns.
+    """
+    if set(df1.columns) != set(df2.columns):
+        raise ValueError("Both DataFrames must have the same columns for a union join")
+
+    result_df = pd.concat([df1, df2], ignore_index=True).drop_duplicates()
+    return result_df
+
+
+def bag_union_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
+    """
+    Perform a bag union join, concatenating the two DataFrames without dropping duplicates.
+
+    Parameters:
+    df1: First DataFrame.
+    df2: Second DataFrame.
+
+    Returns:
+    A new DataFrame with the concatenated data of df1 and df2.
+
+    Raises:
+    ValueError: If the DataFrames do not have the same columns.
+    """
+    if set(df1.columns) != set(df2.columns):
+        raise ValueError("Both DataFrames must have the same columns for a bag union join")
+
+    result_df = pd.concat([df1, df2], ignore_index=True)
+    return result_df
+
+
+def left_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str) -> pd.DataFrame:
+    """
+    Perform a left join on two DataFrames.
+
+    Parameters:
+    df1: The left DataFrame.
+    df2: The right DataFrame.
+    left_on: Column name in df1 to join on.
+    right_on: Column name in df2 to join on.
+
+    Returns:
+    A new DataFrame as the result of a left join.
+    """
+    return df1.merge(df2, how='left', left_on=left_on, right_on=right_on)
+
+
+def right_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str) -> pd.DataFrame:
+    """
+    Perform a right join on two DataFrames.
+
+    Parameters:
+    df1: The left DataFrame.
+    df2: The right DataFrame.
+    left_on: Column name in df1 to join on.
+    right_on: Column name in df2 to join on.
+
+    Returns:
+    A new DataFrame as the result of a right join.
+    """
+    return df1.merge(df2, how='right', left_on=left_on, right_on=right_on)