imsciences 0.6.2.1__tar.gz → 0.6.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: imsciences
- Version: 0.6.2.1
+ Version: 0.6.2.3
  Summary: IMS Data Processing Package
  Author: IMS
  Author-email: cam@im-sciences.com
@@ -109,8 +109,8 @@ class dataprocessing:

  print("\n17. pivot_table")
  print(" - Description: Dynamically pivots a DataFrame based on specified columns.")
- print(" - Usage: pivot_table(df, filters_dict, index_col, columns, values_col, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True)")
- print(" - Example: pivot_table(df, {'Master Include':' == 1','OBS':' >= datetime(2019,9,9)','Metric Short Names':' == 'spd''}, 'OBS', 'Channel Short Names', 'Value', fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True)")
+ print(" - Usage: pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True,reverse_header_order = 'False')")
+ print(" - Example: pivot_table(df, 'OBS', 'Channel Short Names', 'Value',filters_dict={'Master Include':' == 1','OBS':' >= datetime(2019,9,9)','Metric Short Names':' == 'spd''}, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True,reverse_header_order = 'True')")

  print("\n18. apply_lookup_table_for_columns")
  print(" - Description: Equivalent of xlookup in excel. Allows you to map a dictionary of substrings within a column. If multiple columns are need for the LUT then a | seperator is needed.")
@@ -177,42 +177,42 @@ class dataprocessing:
  print(" - Usage: read_and_concatenate_files(folder_path, file_type='csv')")
  print(" - Example: read_and_concatenate_files(folder_path, file_type='csv')")

- print("\n31. remove zero values")
+ print("\n31. remove_zero_values")
  print(" - Description: Remove zero values in a specified column.")
  print(" - Usage: remove_zero_values(self, data_frame, column_to_filter)")
  print(" - Example: remove_zero_values(None, df, 'Funeral_Delivery')")

- print("\n32. upgrade all packages")
+ print("\n32. upgrade_outdated_packages")
  print(" - Description: Upgrades all packages.")
  print(" - Usage: upgrade_outdated_packages()")
  print(" - Example: upgrade_outdated_packages()")

- print("\n33. Convert Mixed Formats Dates")
+ print("\n33. convert_mixed_formats_dates")
  print(" - Description: Convert a mix of US and UK dates to datetime.")
  print(" - Usage: convert_mixed_formats_dates(df, datecol)")
  print(" - Example: convert_mixed_formats_dates(df, 'OBS')")

- print("\n34. Fill Weekly Missing Dates")
+ print("\n34. fill_weekly_date_range")
  print(" - Description: Fill in any missing weeks with 0.")
  print(" - Usage: fill_weekly_date_range(df, date_column, freq)")
  print(" - Example: fill_weekly_date_range(df, 'OBS', 'W-MON')")

- print("\n35. Add Prefix and/or Suffix to Column Headers")
+ print("\n35. add_prefix_and_suffix")
  print(" - Description: Add Prefix and/or Suffix to Column Headers.")
  print(" - Usage: add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)")
  print(" - Example: add_prefix_and_suffix(df, prefix='media_', suffix='_spd', date_col='obs')")

- print("\n36. Change all data to dummies")
+ print("\n36. create_dummies")
  print(" - Description: Changes time series to 0s and 1s based off threshold")
  print(" - Usage: create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')")
  print(" - Example: create_dummies(df, date_col='obs', dummy_threshold=100, add_total_dummy_col='Yes', total_col_name='med_total_dum')")

- print("\n37. Replace substrings in column of strings")
+ print("\n37. replace_substrings")
  print(" - Description: Replace substrings in column of strings based off dictionary, can also change column to lower")
  print(" - Usage: replace_substrings(df, column, replacements, to_lower=False, new_column=None)")
  print(" - Example: replace_substrings(df, 'Influencer Handle', replacement_dict, to_lower=True, new_column='Short Version')")

- print("\n38. Add totals column")
+ print("\n38. add_total_column")
  print(" - Description: Sums all columns with the option to exclude an date column to create a total column")
  print(" - Usage: add_total_column(df, exclude_col=None, total_col_name='Total')")
  print(" - Example: add_total_column(df, exclude_col='obs', total_col_name='total_media_spd')")
@@ -221,6 +221,13 @@ class dataprocessing:
  print(" - Description: Equivalent of xlookup in excel, but only based on substrings. If a substring is found in a cell, than look it up in the dictionary. Otherwise use the other label")
  print(" - Usage: apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')")
  print(" - Example: apply_lookup_table_based_on_substring(df, 'Campaign Name', campaign_dict, new_col_name='Campaign Name Short', other_label='Full Funnel')")
+
+ print("\n40. compare_overlap")
+ print(" - Description: With two matching dataset, it takes the common columns and rows and takes the difference between them, outputing a differences and total differences table")
+ print(" - Usage: compare_overlap(df1, df2, date_col)")
+ print(" - Example: compare_overlap(df_1, df_2, 'obs')")
+
+

  def get_wd_levels(self, levels):
  """
@@ -657,59 +664,68 @@ class dataprocessing:

  return combined_df

- def pivot_table(self, df, filters_dict, index_col, columns, values_col, fill_value=0,aggfunc='sum',margins=False,margins_name="Total",datetime_trans_needed=True):
+ def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name="Total", datetime_trans_needed=True, reverse_header_order=False):
  """
  Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns

  Args:
  df (pandas.DataFrame): The DataFrame containing the data.
- filters_dict (dict): Dictionary of conditions for the boolean mask i.e. what to filter your df on to get to your chosen cell
  index_col (str): Name of Column for your pivot table to index on
  columns (str): Name of Columns for your pivot table.
  values_col (str): Name of Values Columns for your pivot table.
+ filters_dict (dict, optional): Dictionary of conditions for the boolean mask i.e. what to filter your df on to get to your chosen cell. Defaults to None
  fill_value (int, optional): The value to replace nan with. Defaults to 0.
  aggfunc (str, optional): The method on which to aggregate the values column. Defaults to sum.
  margins (bool, optional): Whether the pivot table needs a total rows and column. Defaults to False.
  margins_name (str, optional): The name of the Totals columns. Defaults to "Total".
  datetime_trans_needed (bool, optional): Whether the index column needs to be transformed into datetime format. Defaults to False.
+ reverse_header_order (bool, optional): Reverses the order of the column headers. Defaults to False.

  Returns:
  pandas.DataFrame: The pivot table specified
  """

  # Create the filtered df by applying the conditions
- df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
+ if filters_dict is None:
+ df_filtered = df
+ else:
+ df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)

- # Ensure OBS is in datetime format for proper sorting
+ # Ensure index column is in datetime format for proper sorting
  df_filtered = df_filtered.copy()

  # If datetime transformation is needed
- if datetime_trans_needed is True:
- df_filtered.loc[:,index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
+ if datetime_trans_needed:
+ df_filtered[index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)

  # Create the pivot table
- pivoted_df = df_filtered.pivot_table(index=index_col, columns=columns, values=values_col, aggfunc=aggfunc,margins=margins,margins_name=margins_name)
+ pivoted_df = df_filtered.pivot_table(index=index_col, columns=columns, values=values_col, aggfunc=aggfunc, margins=margins, margins_name=margins_name)

  # Handling MultiIndex columns if present, making them a flat structure
- if isinstance(pivoted_df.columns, pd.MultiIndex):
- pivoted_df.columns = ['_'.join(map(str, col)).strip() for col in pivoted_df.columns.values]
+ if not reverse_header_order:
+ if isinstance(pivoted_df.columns, pd.MultiIndex):
+ pivoted_df.columns = ['_'.join(map(str, col)).strip() for col in pivoted_df.columns.values]
+ else:
+ pivoted_df.columns = pivoted_df.columns.map(str)
  else:
- pivoted_df.columns = pivoted_df.columns.map(str)
+ if isinstance(pivoted_df.columns, pd.MultiIndex):
+ # Reorder the MultiIndex columns
+ pivoted_df.columns = ['_'.join(reversed(list(map(str, col)))).strip() for col in pivoted_df.columns.values]
+ else:
+ pivoted_df.columns = pivoted_df.columns.map(str)
+ # Reverse the order for single index columns
+ pivoted_df.columns = ['_'.join(reversed(col.split('_'))).strip() for col in pivoted_df.columns]

  # Reset the pivot before returning
  pivoted_df = pivoted_df.reset_index()

- # Sort by OBS from oldest to newest
- if datetime_trans_needed is True:
- # pivoted_df = pivoted_df.reset_index()
+ # Sort by index column from oldest to newest
+ if datetime_trans_needed:
  pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col]) # Ensure sorting works correctly
  pivoted_df = pivoted_df.sort_values(by=index_col)
-
- # Convert OBS back to a string in YYYY-MM-DD format for display purposes
- pivoted_df[index_col] = pivoted_df[index_col].dt.strftime('%Y-%m-%d')

- # Set index back to date column
- # pivoted_df.set_index(index_col,inplace=True)
+ # Convert index column back to a string in YYYY-MM-DD format for display purposes
+ pivoted_df[index_col] = pivoted_df[index_col].dt.strftime('%Y-%m-%d')

  # Fill in any NaNs
  pivoted_df = pivoted_df.fillna(fill_value)
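
With the new signature, filters_dict is optional and the method can be driven entirely by keyword arguments. A hedged usage sketch follows; the import path and the df variable are assumptions for illustration, not taken from the package's own documentation.

    from imsciences import dataprocessing  # assumed import path

    ims = dataprocessing()

    # Pivot weekly observations into one column per channel. filters_dict is now
    # optional, and reverse_header_order=True flips the order of flattened headers.
    pivot = ims.pivot_table(
        df,                                     # a DataFrame loaded elsewhere
        index_col='OBS',
        columns='Channel Short Names',
        values_col='Value',
        filters_dict={'Master Include': ' == 1'},
        fill_value=0,
        aggfunc='sum',
        reverse_header_order=True,
    )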
@@ -1466,7 +1482,59 @@ class dataprocessing:
  df[new_col_name] = df[column_name].apply(categorize_text)
  return df

+ def compare_overlap(df1, df2, date_col):
+ """
+ Compare overlapping periods between two DataFrames and provide a summary of total differences.
+
+ Args:
+ df1 (pandas.DataFrame): First DataFrame containing date-based data.
+ df2 (pandas.DataFrame): Second DataFrame containing date-based data.
+ date_col (str): The name of the date column used for aligning data.

+ Returns:
+ tuple: A tuple containing the DataFrame of differences and a summary DataFrame with total differences by column.
+ """
+ # Ensure date columns are in datetime format
+ df1[date_col] = pd.to_datetime(df1[date_col])
+ df2[date_col] = pd.to_datetime(df2[date_col])
+
+ # Determine the overlap period
+ start_date = max(df1[date_col].min(), df2[date_col].min())
+ end_date = min(df1[date_col].max(), df2[date_col].max())
+
+ # Filter dataframes to the overlapping period
+ df1_overlap = df1[(df1[date_col] >= start_date) & (df1[date_col] <= end_date)]
+ df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]
+
+ # Merge the dataframes on the date column to align data for comparison
+ merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))
+
+ # Initialize a list to collect total differences for each column
+ total_diff_list = []
+
+ # Compare the values in each column (excluding the date column)
+ diff_df = pd.DataFrame({date_col: merged_df[date_col]}) # Initialize diff_df with the date column
+
+ for col in df1.columns:
+ if col != date_col:
+ # Calculate the difference for each row
+ diff_col = f'diff_{col}'
+ diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2']
+
+ # Calculate the total difference for the column and add it to the list
+ total_diff = diff_df[diff_col].sum()
+ total_diff_list.append({'Column': col, 'Total Difference': total_diff})
+
+ # Create a DataFrame for the summary of total differences
+ total_diff_df = pd.DataFrame(total_diff_list)
+
+ # Apply formatting to the numerical columns
+ float_format = "{:,.2f}".format # Format to 2 decimal places with comma as thousand separator
+ diff_df.iloc[:, 1:] = diff_df.iloc[:, 1:].applymap(float_format)
+ total_diff_df['Total Difference'] = total_diff_df['Total Difference'].apply(float_format)
+
+ return diff_df, total_diff_df
+
  ########################################################################################################################################
  ########################################################################################################################################

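A small end-to-end sketch of the new compare_overlap follows. The import path and the sample frames are assumptions for illustration; note also that the method is added without a self parameter, so the sketch calls it through the class rather than an instance.

    import pandas as pd
    from imsciences import dataprocessing  # assumed import path

    df_1 = pd.DataFrame({'obs': pd.date_range('2024-01-01', periods=4, freq='W-MON'),
                         'spend': [100.0, 200.0, 300.0, 400.0]})
    df_2 = pd.DataFrame({'obs': pd.date_range('2024-01-08', periods=4, freq='W-MON'),
                         'spend': [150.0, 250.0, 350.0, 450.0]})

    # Only the overlapping weeks (2024-01-08 to 2024-01-22) are compared.
    diff_df, total_diff_df = dataprocessing.compare_overlap(df_1, df_2, 'obs')
    print(total_diff_df)  # per-column totals of df_1 minus df_2 over the overlap
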
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: imsciences
- Version: 0.6.2.1
+ Version: 0.6.2.3
  Summary: IMS Data Processing Package
  Author: IMS
  Author-email: cam@im-sciences.com
@@ -8,7 +8,7 @@ def read_md(file_name):
  return f.read()
  return ''

- VERSION = '0.6.2.1'
+ VERSION = '0.6.2.3'
  DESCRIPTION = 'IMS Data Processing Package'
  LONG_DESCRIPTION = read_md('README.md') # Reading from README.md
