imsciences 0.6.2.1__py3-none-any.whl → 0.6.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imsciences/datafunctions.py +96 -28
- {imsciences-0.6.2.1.dist-info → imsciences-0.6.2.3.dist-info}/METADATA +1 -1
- {imsciences-0.6.2.1.dist-info → imsciences-0.6.2.3.dist-info}/RECORD +5 -5
- {imsciences-0.6.2.1.dist-info → imsciences-0.6.2.3.dist-info}/WHEEL +0 -0
- {imsciences-0.6.2.1.dist-info → imsciences-0.6.2.3.dist-info}/top_level.txt +0 -0
imsciences/datafunctions.py
CHANGED
```diff
@@ -109,8 +109,8 @@ class dataprocessing:
 
         print("\n17. pivot_table")
         print(" - Description: Dynamically pivots a DataFrame based on specified columns.")
-        print(" - Usage: pivot_table(df,
-        print(" - Example: pivot_table(df, {'Master Include':' == 1','OBS':' >= datetime(2019,9,9)','Metric Short Names':' == 'spd''},
+        print(" - Usage: pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True,reverse_header_order = 'False')")
+        print(" - Example: pivot_table(df, 'OBS', 'Channel Short Names', 'Value',filters_dict={'Master Include':' == 1','OBS':' >= datetime(2019,9,9)','Metric Short Names':' == 'spd''}, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True,reverse_header_order = 'True')")
 
         print("\n18. apply_lookup_table_for_columns")
         print(" - Description: Equivalent of xlookup in excel. Allows you to map a dictionary of substrings within a column. If multiple columns are need for the LUT then a | seperator is needed.")
```
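The change above makes pivot_table's filtering optional: filters_dict drops out of the leading positional slots and becomes a keyword defaulting to None. A minimal sketch of a call matching the new usage string — the import path and the `dp` instance are assumptions, and the condition-string format is taken from the printed example:

```python
import pandas as pd

from imsciences.datafunctions import dataprocessing  # import path assumed from the wheel layout

dp = dataprocessing()

df = pd.DataFrame({
    'OBS': ['2024-01-01', '2024-01-08'],
    'Channel Short Names': ['tv', 'radio'],
    'Value': [100.0, 50.0],
    'Master Include': [1, 1],
})

# filters_dict maps column name -> condition string, as in the printed example;
# passing None (the new default) skips the filtering step entirely.
wide = dp.pivot_table(df, 'OBS', 'Channel Short Names', 'Value',
                      filters_dict={'Master Include': ' == 1'})
```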
```diff
@@ -177,42 +177,42 @@ class dataprocessing:
         print(" - Usage: read_and_concatenate_files(folder_path, file_type='csv')")
         print(" - Example: read_and_concatenate_files(folder_path, file_type='csv')")
 
-        print("\n31.
+        print("\n31. remove_zero_values")
         print(" - Description: Remove zero values in a specified column.")
         print(" - Usage: remove_zero_values(self, data_frame, column_to_filter)")
         print(" - Example: remove_zero_values(None, df, 'Funeral_Delivery')")
 
-        print("\n32.
+        print("\n32. upgrade_outdated_packages")
         print(" - Description: Upgrades all packages.")
         print(" - Usage: upgrade_outdated_packages()")
         print(" - Example: upgrade_outdated_packages()")
 
-        print("\n33.
+        print("\n33. convert_mixed_formats_dates")
         print(" - Description: Convert a mix of US and UK dates to datetime.")
         print(" - Usage: convert_mixed_formats_dates(df, datecol)")
         print(" - Example: convert_mixed_formats_dates(df, 'OBS')")
 
-        print("\n34.
+        print("\n34. fill_weekly_date_range")
         print(" - Description: Fill in any missing weeks with 0.")
         print(" - Usage: fill_weekly_date_range(df, date_column, freq)")
         print(" - Example: fill_weekly_date_range(df, 'OBS', 'W-MON')")
 
-        print("\n35.
+        print("\n35. add_prefix_and_suffix")
         print(" - Description: Add Prefix and/or Suffix to Column Headers.")
         print(" - Usage: add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)")
         print(" - Example: add_prefix_and_suffix(df, prefix='media_', suffix='_spd', date_col='obs')")
 
-        print("\n36.
+        print("\n36. create_dummies")
         print(" - Description: Changes time series to 0s and 1s based off threshold")
         print(" - Usage: create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')")
         print(" - Example: create_dummies(df, date_col='obs', dummy_threshold=100, add_total_dummy_col='Yes', total_col_name='med_total_dum')")
 
-        print("\n37.
+        print("\n37. replace_substrings")
         print(" - Description: Replace substrings in column of strings based off dictionary, can also change column to lower")
         print(" - Usage: replace_substrings(df, column, replacements, to_lower=False, new_column=None)")
         print(" - Example: replace_substrings(df, 'Influencer Handle', replacement_dict, to_lower=True, new_column='Short Version')")
 
-        print("\n38.
+        print("\n38. add_total_column")
         print(" - Description: Sums all columns with the option to exclude an date column to create a total column")
         print(" - Usage: add_total_column(df, exclude_col=None, total_col_name='Total')")
         print(" - Example: add_total_column(df, exclude_col='obs', total_col_name='total_media_spd')")
```
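This hunk fills in the method names for help entries 31–38. A hedged sketch chaining a few of them according to their printed usage strings; the behavior is inferred only from the one-line descriptions, and the import path and `dp` instance are assumptions:

```python
import pandas as pd

from imsciences.datafunctions import dataprocessing  # import path assumed

dp = dataprocessing()

df = pd.DataFrame({'OBS': ['01/01/2024', '15/01/2024'],  # two Mondays, one missing week between
                   'spend': [120.0, 80.0]})

df = dp.convert_mixed_formats_dates(df, 'OBS')      # 33: mixed US/UK date strings -> datetime
df = dp.fill_weekly_date_range(df, 'OBS', 'W-MON')  # 34: insert the missing week as 0
df = dp.add_prefix_and_suffix(df, prefix='media_', suffix='_spd', date_col='OBS')  # 35
df = dp.add_total_column(df, exclude_col='OBS', total_col_name='total_media_spd')  # 38
```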
```diff
@@ -221,6 +221,13 @@ class dataprocessing:
         print(" - Description: Equivalent of xlookup in excel, but only based on substrings. If a substring is found in a cell, than look it up in the dictionary. Otherwise use the other label")
         print(" - Usage: apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')")
         print(" - Example: apply_lookup_table_based_on_substring(df, 'Campaign Name', campaign_dict, new_col_name='Campaign Name Short', other_label='Full Funnel')")
+
+        print("\n40. compare_overlap")
+        print(" - Description: With two matching dataset, it takes the common columns and rows and takes the difference between them, outputing a differences and total differences table")
+        print(" - Usage: compare_overlap(df1, df2, date_col)")
+        print(" - Example: compare_overlap(df_1, df_2, 'obs')")
+
+
 
     def get_wd_levels(self, levels):
         """
```
```diff
@@ -657,59 +664,68 @@ class dataprocessing:
 
         return combined_df
 
-    def pivot_table(self, df,
+    def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name="Total", datetime_trans_needed=True, reverse_header_order=False):
         """
         Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns
 
         Args:
             df (pandas.DataFrame): The DataFrame containing the data.
-            filters_dict (dict): Dictionary of conditions for the boolean mask i.e. what to filter your df on to get to your chosen cell
             index_col (str): Name of Column for your pivot table to index on
             columns (str): Name of Columns for your pivot table.
             values_col (str): Name of Values Columns for your pivot table.
+            filters_dict (dict, optional): Dictionary of conditions for the boolean mask i.e. what to filter your df on to get to your chosen cell. Defaults to None
             fill_value (int, optional): The value to replace nan with. Defaults to 0.
             aggfunc (str, optional): The method on which to aggregate the values column. Defaults to sum.
             margins (bool, optional): Whether the pivot table needs a total rows and column. Defaults to False.
             margins_name (str, optional): The name of the Totals columns. Defaults to "Total".
             datetime_trans_needed (bool, optional): Whether the index column needs to be transformed into datetime format. Defaults to False.
+            reverse_header_order (bool, optional): Reverses the order of the column headers. Defaults to False.
 
         Returns:
             pandas.DataFrame: The pivot table specified
         """
 
         # Create the filtered df by applying the conditions
-
+        if filters_dict is None:
+            df_filtered = df
+        else:
+            df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
 
-        # Ensure
+        # Ensure index column is in datetime format for proper sorting
         df_filtered = df_filtered.copy()
 
         # If datetime transformation is needed
-        if datetime_trans_needed
-            df_filtered
+        if datetime_trans_needed:
+            df_filtered[index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
 
         # Create the pivot table
-        pivoted_df = df_filtered.pivot_table(index=index_col, columns=columns, values=values_col, aggfunc=aggfunc,margins=margins,margins_name=margins_name)
+        pivoted_df = df_filtered.pivot_table(index=index_col, columns=columns, values=values_col, aggfunc=aggfunc, margins=margins, margins_name=margins_name)
 
         # Handling MultiIndex columns if present, making them a flat structure
-        if
-            pivoted_df.columns
+        if not reverse_header_order:
+            if isinstance(pivoted_df.columns, pd.MultiIndex):
+                pivoted_df.columns = ['_'.join(map(str, col)).strip() for col in pivoted_df.columns.values]
+            else:
+                pivoted_df.columns = pivoted_df.columns.map(str)
         else:
-            pivoted_df.columns
+            if isinstance(pivoted_df.columns, pd.MultiIndex):
+                # Reorder the MultiIndex columns
+                pivoted_df.columns = ['_'.join(reversed(list(map(str, col)))).strip() for col in pivoted_df.columns.values]
+            else:
+                pivoted_df.columns = pivoted_df.columns.map(str)
+                # Reverse the order for single index columns
+                pivoted_df.columns = ['_'.join(reversed(col.split('_'))).strip() for col in pivoted_df.columns]
 
         # Reset the pivot before returning
         pivoted_df = pivoted_df.reset_index()
 
-        # Sort by
-        if datetime_trans_needed
-        # pivoted_df = pivoted_df.reset_index()
+        # Sort by index column from oldest to newest
+        if datetime_trans_needed:
             pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col]) # Ensure sorting works correctly
             pivoted_df = pivoted_df.sort_values(by=index_col)
-
-            # Convert OBS back to a string in YYYY-MM-DD format for display purposes
-            pivoted_df[index_col] = pivoted_df[index_col].dt.strftime('%Y-%m-%d')
 
-            #
-
+            # Convert index column back to a string in YYYY-MM-DD format for display purposes
+            pivoted_df[index_col] = pivoted_df[index_col].dt.strftime('%Y-%m-%d')
 
         # Fill in any NaNs
         pivoted_df = pivoted_df.fillna(fill_value)
```
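The rewritten body flattens MultiIndex pivot headers by joining the levels with '_', and reverse_header_order=True joins them in reverse order. A sketch of the observable effect, assuming the wrapper passes a list of column names straight through to pandas (the docstring types columns as str, but the call is a direct pass-through) and the same import assumptions as the earlier sketches:

```python
import pandas as pd

from imsciences.datafunctions import dataprocessing  # import path assumed

dp = dataprocessing()

df = pd.DataFrame({
    'OBS': ['2024-01-01', '2024-01-01', '2024-01-08'],
    'Channel': ['tv', 'radio', 'tv'],
    'Region': ['north', 'south', 'north'],
    'Value': [100.0, 50.0, 75.0],
})

# Two column levels yield MultiIndex headers such as ('tv', 'north'),
# which the new code joins to 'tv_north' -- or 'north_tv' when reversed.
wide = dp.pivot_table(df, 'OBS', ['Channel', 'Region'], 'Value',
                      reverse_header_order=True)
print(wide.columns.tolist())  # e.g. ['OBS', 'south_radio', 'north_tv']
```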
```diff
@@ -1466,7 +1482,59 @@ class dataprocessing:
         df[new_col_name] = df[column_name].apply(categorize_text)
         return df
 
+    def compare_overlap(df1, df2, date_col):
+        """
+        Compare overlapping periods between two DataFrames and provide a summary of total differences.
+
+        Args:
+            df1 (pandas.DataFrame): First DataFrame containing date-based data.
+            df2 (pandas.DataFrame): Second DataFrame containing date-based data.
+            date_col (str): The name of the date column used for aligning data.
+
+        Returns:
+            tuple: A tuple containing the DataFrame of differences and a summary DataFrame with total differences by column.
+        """
+        # Ensure date columns are in datetime format
+        df1[date_col] = pd.to_datetime(df1[date_col])
+        df2[date_col] = pd.to_datetime(df2[date_col])
+
+        # Determine the overlap period
+        start_date = max(df1[date_col].min(), df2[date_col].min())
+        end_date = min(df1[date_col].max(), df2[date_col].max())
+
+        # Filter dataframes to the overlapping period
+        df1_overlap = df1[(df1[date_col] >= start_date) & (df1[date_col] <= end_date)]
+        df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]
+
+        # Merge the dataframes on the date column to align data for comparison
+        merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))
+
+        # Initialize a list to collect total differences for each column
+        total_diff_list = []
+
+        # Compare the values in each column (excluding the date column)
+        diff_df = pd.DataFrame({date_col: merged_df[date_col]}) # Initialize diff_df with the date column
+
+        for col in df1.columns:
+            if col != date_col:
+                # Calculate the difference for each row
+                diff_col = f'diff_{col}'
+                diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2']
+
+                # Calculate the total difference for the column and add it to the list
+                total_diff = diff_df[diff_col].sum()
+                total_diff_list.append({'Column': col, 'Total Difference': total_diff})
+
+        # Create a DataFrame for the summary of total differences
+        total_diff_df = pd.DataFrame(total_diff_list)
+
+        # Apply formatting to the numerical columns
+        float_format = "{:,.2f}".format # Format to 2 decimal places with comma as thousand separator
+        diff_df.iloc[:, 1:] = diff_df.iloc[:, 1:].applymap(float_format)
+        total_diff_df['Total Difference'] = total_diff_df['Total Difference'].apply(float_format)
+
+        return diff_df, total_diff_df
+
     ########################################################################################################################################
     ########################################################################################################################################
 
```
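Note that compare_overlap is added without a self parameter, so an instance call would bind the first DataFrame to self; the sketch below therefore calls it on the class. The frame contents are made up for illustration, and the import path is an assumption:

```python
import pandas as pd

from imsciences.datafunctions import dataprocessing  # import path assumed

df_1 = pd.DataFrame({'obs': pd.date_range('2024-01-01', periods=4, freq='W-MON'),
                     'spend': [100.0, 110.0, 120.0, 130.0]})
df_2 = pd.DataFrame({'obs': pd.date_range('2024-01-08', periods=4, freq='W-MON'),
                     'spend': [100.0, 115.0, 120.0, 140.0]})

# Only the overlapping weeks (2024-01-08 .. 2024-01-22) are compared.
diff_df, total_diff_df = dataprocessing.compare_overlap(df_1, df_2, 'obs')

print(diff_df)        # per-row 'diff_spend' over the overlap, formatted to 2 d.p.
print(total_diff_df)  # one 'spend' row with a total difference of 25.00
```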
{imsciences-0.6.2.1.dist-info → imsciences-0.6.2.3.dist-info}/RECORD

CHANGED

```diff
@@ -2,13 +2,13 @@ dataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
 dataprocessing/data-processing-functions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
 dataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
 imsciences/__init__.py,sha256=GIPbLmWc06sVcOySWwNvMNUr6XGOHqPLryFIWgtpHh8,78
-imsciences/datafunctions.py,sha256=
+imsciences/datafunctions.py,sha256=IrcIfw80MQnnRc2gD6QfuKIlDgVQxkZX-bTj7LKOiEU,143441
 imsciences/datapull.py,sha256=TPY0LDgOkcKTBk8OekbD0Grg5x0SomAK2dZ7MuT6X1E,19000
 imsciencesdataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
 imsciencesdataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
 imsdataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
 imsdataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
-imsciences-0.6.2.
-imsciences-0.6.2.
-imsciences-0.6.2.
-imsciences-0.6.2.
+imsciences-0.6.2.3.dist-info/METADATA,sha256=diBYqgQ-3WJ9pcVQfeAmJkUyLzpy5tqMX1VWjD6zT7k,854
+imsciences-0.6.2.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+imsciences-0.6.2.3.dist-info/top_level.txt,sha256=hsENS-AlDVRh8tQJ6-426iUQlla9bPcGc0-UlFF0_iU,11
+imsciences-0.6.2.3.dist-info/RECORD,,
```
{imsciences-0.6.2.1.dist-info → imsciences-0.6.2.3.dist-info}/WHEEL

File without changes

{imsciences-0.6.2.1.dist-info → imsciences-0.6.2.3.dist-info}/top_level.txt

File without changes