PyPI - imsciences - Versions diffs - 0.6.2.2__tar.gz → 0.6.2.3__tar.gz - Mend

@@ -177,42 +177,42 @@ class dataprocessing:
         print("   - Usage: read_and_concatenate_files(folder_path, file_type='csv')")
         print("   - Example: read_and_concatenate_files(folder_path, file_type='csv')")
-        print("\n31. remove zero values")
+        print("\n31. remove_zero_values")
         print("   - Description: Remove zero values in a specified column.")
         print("   - Usage: remove_zero_values(self, data_frame, column_to_filter)")
         print("   - Example: remove_zero_values(None, df, 'Funeral_Delivery')")
-        print("\n32. upgrade all packages")
+        print("\n32. upgrade_outdated_packages")
         print("   - Description: Upgrades all packages.")
         print("   - Usage: upgrade_outdated_packages()")
         print("   - Example: upgrade_outdated_packages()")
-        print("\n33. Convert Mixed Formats Dates")
+        print("\n33. convert_mixed_formats_dates")
         print("   - Description: Convert a mix of US and UK dates to datetime.")
         print("   - Usage: convert_mixed_formats_dates(df, datecol)")
         print("   - Example: convert_mixed_formats_dates(df, 'OBS')")
-        print("\n34. Fill Weekly Missing Dates")
+        print("\n34. fill_weekly_date_range")
         print("   - Description: Fill in any missing weeks with 0.")
         print("   - Usage: fill_weekly_date_range(df, date_column, freq)")
         print("   - Example: fill_weekly_date_range(df, 'OBS', 'W-MON')")
-        print("\n35. Add Prefix and/or Suffix to Column Headers")
+        print("\n35. add_prefix_and_suffix")
         print("   - Description: Add Prefix and/or Suffix to Column Headers.")
         print("   - Usage: add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)")
         print("   - Example: add_prefix_and_suffix(df, prefix='media_', suffix='_spd', date_col='obs')")
-        print("\n36. Change all data to dummies")
+        print("\n36. create_dummies")
         print("   - Description: Changes time series to 0s and 1s based off threshold")
         print("   - Usage: create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')")
         print("   - Example: create_dummies(df, date_col='obs', dummy_threshold=100, add_total_dummy_col='Yes', total_col_name='med_total_dum')")
-        print("\n37. Replace substrings in column of strings")
+        print("\n37. replace_substrings")
         print("   - Description: Replace substrings in column of strings based off dictionary, can also change column to lower")
         print("   - Usage: replace_substrings(df, column, replacements, to_lower=False, new_column=None)")
         print("   - Example: replace_substrings(df, 'Influencer Handle', replacement_dict, to_lower=True, new_column='Short Version')")
-        print("\n38. Add totals column")
+        print("\n38. add_total_column")
         print("   - Description: Sums all columns with the option to exclude an date column to create a total column")
         print("   - Usage: add_total_column(df, exclude_col=None, total_col_name='Total')")
         print("   - Example: add_total_column(df, exclude_col='obs', total_col_name='total_media_spd')")
@@ -221,6 +221,13 @@ class dataprocessing:
         print("    - Description: Equivalent of xlookup in excel, but only based on substrings. If a substring is found in a cell, than look it up in the dictionary. Otherwise use the other label")
         print("    - Usage: apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')")
         print("    - Example: apply_lookup_table_based_on_substring(df, 'Campaign Name', campaign_dict, new_col_name='Campaign Name Short', other_label='Full Funnel')")
+        print("\n40. compare_overlap")
+        print("    - Description: With two matching dataset, it takes the common columns and rows and takes the difference between them, outputing a differences and total differences table")
+        print("    - Usage: compare_overlap(df1, df2, date_col)")
+        print("    - Example: compare_overlap(df_1, df_2, 'obs')")
     def get_wd_levels(self, levels):
         """
@@ -1475,7 +1482,59 @@ class dataprocessing:
         df[new_col_name] = df[column_name].apply(categorize_text)
         return df
+    @staticmethod
+    def compare_overlap(df1, df2, date_col):
+        """
+        Compare two DataFrames over their overlapping date range.
+        Both frames are restricted to the dates between the later of the two start dates and the
+        earlier of the two end dates, inner-joined on the date column, and every value column
+        present in BOTH frames is differenced row by row as df1 minus df2.
+        Declared as a staticmethod because the signature takes no ``self``: without it, calling
+        the method on an instance would bind the instance to ``df1`` and fail on argument count.
+        Calling it on the class itself keeps working exactly as before.
+        Args:
+            df1 (pandas.DataFrame): First DataFrame containing date-based data. Not modified.
+            df2 (pandas.DataFrame): Second DataFrame containing date-based data. Not modified.
+            date_col (str): The name of the date column used for aligning data.
+        Returns:
+            tuple: ``(diff_df, total_diff_df)``. ``diff_df`` holds the date column plus one
+            ``diff_<column>`` column per shared value column. ``total_diff_df`` has the columns
+            ``'Column'`` and ``'Total Difference'`` with one row per shared value column. All
+            differences are returned as strings formatted to 2 decimal places with a comma as
+            the thousands separator.
+        """
+        # Work on copies so the callers' frames keep their original date column
+        df1 = df1.copy()
+        df2 = df2.copy()
+        # Ensure date columns are in datetime format
+        df1[date_col] = pd.to_datetime(df1[date_col])
+        df2[date_col] = pd.to_datetime(df2[date_col])
+        # Determine the overlap period
+        start_date = max(df1[date_col].min(), df2[date_col].min())
+        end_date = min(df1[date_col].max(), df2[date_col].max())
+        # Filter dataframes to the overlapping period (both ends inclusive)
+        df1_overlap = df1[df1[date_col].between(start_date, end_date)]
+        df2_overlap = df2[df2[date_col].between(start_date, end_date)]
+        # Merge the dataframes on the date column to align data for comparison
+        merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))
+        # Only columns present in both frames receive the merge suffixes, so only those
+        # can be compared; columns unique to one frame are ignored rather than raising KeyError
+        shared_cols = [col for col in df1.columns if col != date_col and col in df2.columns]
+        # Compare the values in each shared column, collecting the per-column totals as we go
+        diff_df = pd.DataFrame({date_col: merged_df[date_col]})
+        diff_col_names = []
+        total_diff_list = []
+        for col in shared_cols:
+            diff_col = f'diff_{col}'
+            diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2']
+            diff_col_names.append(diff_col)
+            total_diff_list.append({'Column': col, 'Total Difference': diff_df[diff_col].sum()})
+        # Explicit columns keep the summary well-formed even when there is nothing to compare
+        total_diff_df = pd.DataFrame(total_diff_list, columns=['Column', 'Total Difference'])
+        # Apply formatting to the numerical columns
+        float_format = "{:,.2f}".format  # Format to 2 decimal places with comma as thousand separator
+        # Replace each column wholesale instead of writing strings into numeric columns via iloc
+        for diff_col in diff_col_names:
+            diff_df[diff_col] = diff_df[diff_col].map(float_format)
+        total_diff_df['Total Difference'] = total_diff_df['Total Difference'].map(float_format)
+        return diff_df, total_diff_df
 ########################################################################################################################################
 ########################################################################################################################################

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 0.6.2.2
+Version: 0.6.2.3
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 0.6.2.2
+Version: 0.6.2.3
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com

@@ -8,7 +8,7 @@ def read_md(file_name):
             return f.read()
     return ''
-VERSION = '0.6.2.2'
+VERSION = '0.6.2.3'
 DESCRIPTION = 'IMS Data Processing Package'
 LONG_DESCRIPTION = read_md('README.md')  # Reading from README.md

imsciences 0.6.2.2__tar.gz → 0.6.2.3__tar.gz

imsciences 0.6.2.2tar.gz → 0.6.2.3tar.gz