imsciences 0.6.2.2__tar.gz → 0.6.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {imsciences-0.6.2.2 → imsciences-0.6.2.3}/PKG-INFO +1 -1
- {imsciences-0.6.2.2 → imsciences-0.6.2.3}/imsciences/datafunctions.py +67 -8
- {imsciences-0.6.2.2 → imsciences-0.6.2.3}/imsciences.egg-info/PKG-INFO +1 -1
- {imsciences-0.6.2.2 → imsciences-0.6.2.3}/setup.py +1 -1
- {imsciences-0.6.2.2 → imsciences-0.6.2.3}/README.md +0 -0
- {imsciences-0.6.2.2 → imsciences-0.6.2.3}/imsciences/__init__.py +0 -0
- {imsciences-0.6.2.2 → imsciences-0.6.2.3}/imsciences.egg-info/SOURCES.txt +0 -0
- {imsciences-0.6.2.2 → imsciences-0.6.2.3}/imsciences.egg-info/dependency_links.txt +0 -0
- {imsciences-0.6.2.2 → imsciences-0.6.2.3}/imsciences.egg-info/requires.txt +0 -0
- {imsciences-0.6.2.2 → imsciences-0.6.2.3}/imsciences.egg-info/top_level.txt +0 -0
- {imsciences-0.6.2.2 → imsciences-0.6.2.3}/setup.cfg +0 -0
|
@@ -177,42 +177,42 @@ class dataprocessing:
|
|
|
177
177
|
print(" - Usage: read_and_concatenate_files(folder_path, file_type='csv')")
|
|
178
178
|
print(" - Example: read_and_concatenate_files(folder_path, file_type='csv')")
|
|
179
179
|
|
|
180
|
-
print("\n31.
|
|
180
|
+
print("\n31. remove_zero_values")
|
|
181
181
|
print(" - Description: Remove zero values in a specified column.")
|
|
182
182
|
print(" - Usage: remove_zero_values(self, data_frame, column_to_filter)")
|
|
183
183
|
print(" - Example: remove_zero_values(None, df, 'Funeral_Delivery')")
|
|
184
184
|
|
|
185
|
-
print("\n32.
|
|
185
|
+
print("\n32. upgrade_outdated_packages")
|
|
186
186
|
print(" - Description: Upgrades all packages.")
|
|
187
187
|
print(" - Usage: upgrade_outdated_packages()")
|
|
188
188
|
print(" - Example: upgrade_outdated_packages()")
|
|
189
189
|
|
|
190
|
-
print("\n33.
|
|
190
|
+
print("\n33. convert_mixed_formats_dates")
|
|
191
191
|
print(" - Description: Convert a mix of US and UK dates to datetime.")
|
|
192
192
|
print(" - Usage: convert_mixed_formats_dates(df, datecol)")
|
|
193
193
|
print(" - Example: convert_mixed_formats_dates(df, 'OBS')")
|
|
194
194
|
|
|
195
|
-
print("\n34.
|
|
195
|
+
print("\n34. fill_weekly_date_range")
|
|
196
196
|
print(" - Description: Fill in any missing weeks with 0.")
|
|
197
197
|
print(" - Usage: fill_weekly_date_range(df, date_column, freq)")
|
|
198
198
|
print(" - Example: fill_weekly_date_range(df, 'OBS', 'W-MON')")
|
|
199
199
|
|
|
200
|
-
print("\n35.
|
|
200
|
+
print("\n35. add_prefix_and_suffix")
|
|
201
201
|
print(" - Description: Add Prefix and/or Suffix to Column Headers.")
|
|
202
202
|
print(" - Usage: add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)")
|
|
203
203
|
print(" - Example: add_prefix_and_suffix(df, prefix='media_', suffix='_spd', date_col='obs')")
|
|
204
204
|
|
|
205
|
-
print("\n36.
|
|
205
|
+
print("\n36. create_dummies")
|
|
206
206
|
print(" - Description: Changes time series to 0s and 1s based off threshold")
|
|
207
207
|
print(" - Usage: create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')")
|
|
208
208
|
print(" - Example: create_dummies(df, date_col='obs', dummy_threshold=100, add_total_dummy_col='Yes', total_col_name='med_total_dum')")
|
|
209
209
|
|
|
210
|
-
print("\n37.
|
|
210
|
+
print("\n37. replace_substrings")
|
|
211
211
|
print(" - Description: Replace substrings in column of strings based off dictionary, can also change column to lower")
|
|
212
212
|
print(" - Usage: replace_substrings(df, column, replacements, to_lower=False, new_column=None)")
|
|
213
213
|
print(" - Example: replace_substrings(df, 'Influencer Handle', replacement_dict, to_lower=True, new_column='Short Version')")
|
|
214
214
|
|
|
215
|
-
print("\n38.
|
|
215
|
+
print("\n38. add_total_column")
|
|
216
216
|
print(" - Description: Sums all columns with the option to exclude an date column to create a total column")
|
|
217
217
|
print(" - Usage: add_total_column(df, exclude_col=None, total_col_name='Total')")
|
|
218
218
|
print(" - Example: add_total_column(df, exclude_col='obs', total_col_name='total_media_spd')")
|
|
@@ -221,6 +221,13 @@ class dataprocessing:
|
|
|
221
221
|
print(" - Description: Equivalent of xlookup in excel, but only based on substrings. If a substring is found in a cell, than look it up in the dictionary. Otherwise use the other label")
|
|
222
222
|
print(" - Usage: apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')")
|
|
223
223
|
print(" - Example: apply_lookup_table_based_on_substring(df, 'Campaign Name', campaign_dict, new_col_name='Campaign Name Short', other_label='Full Funnel')")
|
|
224
|
+
|
|
225
|
+
print("\n40. compare_overlap")
|
|
226
|
+
print(" - Description: With two matching dataset, it takes the common columns and rows and takes the difference between them, outputing a differences and total differences table")
|
|
227
|
+
print(" - Usage: compare_overlap(df1, df2, date_col)")
|
|
228
|
+
print(" - Example: compare_overlap(df_1, df_2, 'obs')")
|
|
229
|
+
|
|
230
|
+
|
|
224
231
|
|
|
225
232
|
def get_wd_levels(self, levels):
|
|
226
233
|
"""
|
|
@@ -1475,7 +1482,59 @@ class dataprocessing:
|
|
|
1475
1482
|
df[new_col_name] = df[column_name].apply(categorize_text)
|
|
1476
1483
|
return df
|
|
1477
1484
|
|
|
1485
|
+
def compare_overlap(df1, df2, date_col):
    """
    Compare overlapping periods between two DataFrames and provide a summary of total differences.

    Only rows whose dates fall inside the period covered by both DataFrames,
    and only columns present in both DataFrames, are compared. The inputs are
    not modified.

    Args:
        df1 (pandas.DataFrame): First DataFrame containing date-based data.
        df2 (pandas.DataFrame): Second DataFrame containing date-based data.
        date_col (str): The name of the date column used for aligning data.

    Returns:
        tuple: A tuple ``(diff_df, total_diff_df)``.
            ``diff_df`` holds the date column plus one ``diff_<col>`` column per
            shared column (``df1`` value minus ``df2`` value), formatted as
            strings with two decimals and a thousands separator.
            ``total_diff_df`` has the columns ``'Column'`` and
            ``'Total Difference'`` (the formatted sum of each ``diff_<col>``).
    """
    # NOTE(review): the diff context places this definition inside
    # `class dataprocessing`, yet it takes no `self` -- confirm how callers
    # are expected to invoke it.

    # Work on copies so the date conversion does not leak into the callers' data.
    df1 = df1.copy()
    df2 = df2.copy()
    df1[date_col] = pd.to_datetime(df1[date_col])
    df2[date_col] = pd.to_datetime(df2[date_col])

    # Determine the overlap period
    start_date = max(df1[date_col].min(), df2[date_col].min())
    end_date = min(df1[date_col].max(), df2[date_col].max())

    # Only columns present in both frames can be compared; keep df1's order.
    df2_columns = set(df2.columns)
    common_cols = [col for col in df1.columns if col != date_col and col in df2_columns]

    # Filter both frames to the overlapping period and the shared columns, so
    # that every value column receives a suffix in the merge below.
    keep_cols = [date_col] + common_cols
    df1_overlap = df1.loc[(df1[date_col] >= start_date) & (df1[date_col] <= end_date), keep_cols]
    df2_overlap = df2.loc[(df2[date_col] >= start_date) & (df2[date_col] <= end_date), keep_cols]

    # Merge the dataframes on the date column to align data for comparison
    merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))

    # Format to 2 decimal places with comma as thousand separator
    float_format = "{:,.2f}".format

    diff_df = pd.DataFrame({date_col: merged_df[date_col]})
    total_diff_list = []

    for col in common_cols:
        diff_col = f'diff_{col}'
        differences = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2']

        # Sum the raw numbers before they are turned into display strings.
        total_diff_list.append({'Column': col, 'Total Difference': differences.sum()})

        # Assign the formatted column as a whole rather than writing strings
        # into an existing float column.
        diff_df[diff_col] = differences.map(float_format)

    # Explicit columns keep the summary well-formed even with nothing to compare.
    total_diff_df = pd.DataFrame(total_diff_list, columns=['Column', 'Total Difference'])
    total_diff_df['Total Difference'] = total_diff_df['Total Difference'].map(float_format)

    return diff_df, total_diff_df
|
|
1537
|
+
|
|
1479
1538
|
########################################################################################################################################
|
|
1480
1539
|
########################################################################################################################################
|
|
1481
1540
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|