imsciences 0.6.2.0__tar.gz → 0.6.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {imsciences-0.6.2.0 → imsciences-0.6.2.2}/PKG-INFO +1 -1
- {imsciences-0.6.2.0 → imsciences-0.6.2.2}/imsciences/datafunctions.py +38 -32
- {imsciences-0.6.2.0 → imsciences-0.6.2.2}/imsciences.egg-info/PKG-INFO +1 -1
- {imsciences-0.6.2.0 → imsciences-0.6.2.2}/setup.py +1 -1
- {imsciences-0.6.2.0 → imsciences-0.6.2.2}/README.md +0 -0
- {imsciences-0.6.2.0 → imsciences-0.6.2.2}/imsciences/__init__.py +0 -0
- {imsciences-0.6.2.0 → imsciences-0.6.2.2}/imsciences.egg-info/SOURCES.txt +0 -0
- {imsciences-0.6.2.0 → imsciences-0.6.2.2}/imsciences.egg-info/dependency_links.txt +0 -0
- {imsciences-0.6.2.0 → imsciences-0.6.2.2}/imsciences.egg-info/requires.txt +0 -0
- {imsciences-0.6.2.0 → imsciences-0.6.2.2}/imsciences.egg-info/top_level.txt +0 -0
- {imsciences-0.6.2.0 → imsciences-0.6.2.2}/setup.cfg +0 -0
|
@@ -109,8 +109,8 @@ class dataprocessing:
|
|
|
109
109
|
|
|
110
110
|
print("\n17. pivot_table")
|
|
111
111
|
print(" - Description: Dynamically pivots a DataFrame based on specified columns.")
|
|
112
|
-
print(" - Usage: pivot_table(df,
|
|
113
|
-
print(" - Example: pivot_table(df, {'Master Include':' == 1','OBS':' >= datetime(2019,9,9)','Metric Short Names':' == 'spd''},
|
|
112
|
+
print(" - Usage: pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True,reverse_header_order = 'False')")
|
|
113
|
+
print(" - Example: pivot_table(df, 'OBS', 'Channel Short Names', 'Value',filters_dict={'Master Include':' == 1','OBS':' >= datetime(2019,9,9)','Metric Short Names':' == 'spd''}, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True,reverse_header_order = 'True')")
|
|
114
114
|
|
|
115
115
|
print("\n18. apply_lookup_table_for_columns")
|
|
116
116
|
print(" - Description: Equivalent of xlookup in excel. Allows you to map a dictionary of substrings within a column. If multiple columns are need for the LUT then a | seperator is needed.")
|
|
@@ -657,59 +657,68 @@ class dataprocessing:
|
|
|
657
657
|
|
|
658
658
|
return combined_df
|
|
659
659
|
|
|
660
|
-
def pivot_table(self, df,
|
|
660
|
+
def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name="Total", datetime_trans_needed=True, reverse_header_order=False):
|
|
661
661
|
"""
|
|
662
662
|
Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns
|
|
663
663
|
|
|
664
664
|
Args:
|
|
665
665
|
df (pandas.DataFrame): The DataFrame containing the data.
|
|
666
|
-
filters_dict (dict): Dictionary of conditions for the boolean mask i.e. what to filter your df on to get to your chosen cell
|
|
667
666
|
index_col (str): Name of Column for your pivot table to index on
|
|
668
667
|
columns (str): Name of Columns for your pivot table.
|
|
669
668
|
values_col (str): Name of Values Columns for your pivot table.
|
|
669
|
+
filters_dict (dict, optional): Dictionary of conditions for the boolean mask i.e. what to filter your df on to get to your chosen cell. Defaults to None
|
|
670
670
|
fill_value (int, optional): The value to replace nan with. Defaults to 0.
|
|
671
671
|
aggfunc (str, optional): The method on which to aggregate the values column. Defaults to sum.
|
|
672
672
|
margins (bool, optional): Whether the pivot table needs a total row and column. Defaults to False.
|
|
673
673
|
margins_name (str, optional): The name of the Totals columns. Defaults to "Total".
|
|
674
674
|
datetime_trans_needed (bool, optional): Whether the index column needs to be transformed into datetime format. Defaults to True.
|
|
675
|
+
reverse_header_order (bool, optional): Reverses the order of the column headers. Defaults to False.
|
|
675
676
|
|
|
676
677
|
Returns:
|
|
677
678
|
pandas.DataFrame: The pivot table specified
|
|
678
679
|
"""
|
|
679
680
|
|
|
680
681
|
# Create the filtered df by applying the conditions
|
|
681
|
-
|
|
682
|
+
if filters_dict is None:
|
|
683
|
+
df_filtered = df
|
|
684
|
+
else:
|
|
685
|
+
df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
|
|
682
686
|
|
|
683
|
-
# Ensure
|
|
687
|
+
# Ensure index column is in datetime format for proper sorting
|
|
684
688
|
df_filtered = df_filtered.copy()
|
|
685
689
|
|
|
686
690
|
# If datetime transformation is needed
|
|
687
|
-
if datetime_trans_needed
|
|
688
|
-
df_filtered
|
|
691
|
+
if datetime_trans_needed:
|
|
692
|
+
df_filtered[index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
|
|
689
693
|
|
|
690
694
|
# Create the pivot table
|
|
691
|
-
pivoted_df = df_filtered.pivot_table(index=index_col, columns=columns, values=values_col, aggfunc=aggfunc,margins=margins,margins_name=margins_name)
|
|
695
|
+
pivoted_df = df_filtered.pivot_table(index=index_col, columns=columns, values=values_col, aggfunc=aggfunc, margins=margins, margins_name=margins_name)
|
|
692
696
|
|
|
693
697
|
# Handling MultiIndex columns if present, making them a flat structure
|
|
694
|
-
if
|
|
695
|
-
pivoted_df.columns
|
|
698
|
+
if not reverse_header_order:
|
|
699
|
+
if isinstance(pivoted_df.columns, pd.MultiIndex):
|
|
700
|
+
pivoted_df.columns = ['_'.join(map(str, col)).strip() for col in pivoted_df.columns.values]
|
|
701
|
+
else:
|
|
702
|
+
pivoted_df.columns = pivoted_df.columns.map(str)
|
|
696
703
|
else:
|
|
697
|
-
pivoted_df.columns
|
|
704
|
+
if isinstance(pivoted_df.columns, pd.MultiIndex):
|
|
705
|
+
# Reorder the MultiIndex columns
|
|
706
|
+
pivoted_df.columns = ['_'.join(reversed(list(map(str, col)))).strip() for col in pivoted_df.columns.values]
|
|
707
|
+
else:
|
|
708
|
+
pivoted_df.columns = pivoted_df.columns.map(str)
|
|
709
|
+
# Reverse the order for single index columns
|
|
710
|
+
pivoted_df.columns = ['_'.join(reversed(col.split('_'))).strip() for col in pivoted_df.columns]
|
|
698
711
|
|
|
699
712
|
# Reset the pivot before returning
|
|
700
713
|
pivoted_df = pivoted_df.reset_index()
|
|
701
714
|
|
|
702
|
-
# Sort by
|
|
703
|
-
if datetime_trans_needed
|
|
704
|
-
# pivoted_df = pivoted_df.reset_index()
|
|
715
|
+
# Sort by index column from oldest to newest
|
|
716
|
+
if datetime_trans_needed:
|
|
705
717
|
pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col]) # Ensure sorting works correctly
|
|
706
718
|
pivoted_df = pivoted_df.sort_values(by=index_col)
|
|
707
|
-
|
|
708
|
-
# Convert OBS back to a string in YYYY-MM-DD format for display purposes
|
|
709
|
-
pivoted_df[index_col] = pivoted_df[index_col].dt.strftime('%Y-%m-%d')
|
|
710
719
|
|
|
711
|
-
#
|
|
712
|
-
|
|
720
|
+
# Convert index column back to a string in YYYY-MM-DD format for display purposes
|
|
721
|
+
pivoted_df[index_col] = pivoted_df[index_col].dt.strftime('%Y-%m-%d')
|
|
713
722
|
|
|
714
723
|
# Fill in any NaNs
|
|
715
724
|
pivoted_df = pivoted_df.fillna(fill_value)
|
|
@@ -1436,15 +1445,14 @@ class dataprocessing:
|
|
|
1436
1445
|
Categorizes text in a specified DataFrame column by applying a lookup table based on substrings.
|
|
1437
1446
|
|
|
1438
1447
|
Args:
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
resulting categories. Default is 'Category'.
|
|
1448
|
+
df (pd.DataFrame): The DataFrame containing the column to categorize.
|
|
1449
|
+
column_name (str): The name of the column in the DataFrame that contains the text data to categorize.
|
|
1450
|
+
category_dict (dict): A dictionary where keys are substrings to search for in the text and values are the categories to assign when a substring is found.
|
|
1451
|
+
new_col_name (str, optional): The name of the new column to be created in the DataFrame, which will hold the resulting categories. Default is 'Category'.
|
|
1452
|
+
other_label (str, optional): The label assigned as the category when no substring from the dictionary is found in the cell.
|
|
1445
1453
|
|
|
1446
1454
|
Returns:
|
|
1447
|
-
|
|
1455
|
+
pd.DataFrame: The original DataFrame with an additional column containing the assigned categories.
|
|
1448
1456
|
"""
|
|
1449
1457
|
|
|
1450
1458
|
def categorize_text(text):
|
|
@@ -1452,13 +1460,11 @@ class dataprocessing:
|
|
|
1452
1460
|
Assigns a category to a single text string based on the presence of substrings from a dictionary.
|
|
1453
1461
|
|
|
1454
1462
|
Args:
|
|
1455
|
-
|
|
1456
|
-
- category_dict (dict): A dictionary where keys are substrings to search for in the text and
|
|
1457
|
-
values are the categories to assign if a substring is found.
|
|
1463
|
+
text (str): The text string to categorize.
|
|
1458
1464
|
|
|
1459
1465
|
Returns:
|
|
1460
|
-
|
|
1461
|
-
matching substring is found, returns
|
|
1466
|
+
str: The category assigned based on the first matching substring found in the text. If no
|
|
1467
|
+
matching substring is found, returns other_label.
|
|
1462
1468
|
"""
|
|
1463
1469
|
for key, category in category_dict.items():
|
|
1464
1470
|
if key.lower() in text.lower(): # Check if the substring is in the text (case-insensitive)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|