imsciences 0.6.2.5__py3-none-any.whl → 0.6.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imsciences/datafunctions.py +190 -16
- {imsciences-0.6.2.5.dist-info → imsciences-0.6.2.7.dist-info}/METADATA +1 -1
- {imsciences-0.6.2.5.dist-info → imsciences-0.6.2.7.dist-info}/RECORD +6 -6
- {imsciences-0.6.2.5.dist-info → imsciences-0.6.2.7.dist-info}/PKG-INFO-IMS-24Ltp-3 +0 -0
- {imsciences-0.6.2.5.dist-info → imsciences-0.6.2.7.dist-info}/WHEEL +0 -0
- {imsciences-0.6.2.5.dist-info → imsciences-0.6.2.7.dist-info}/top_level.txt +0 -0
imsciences/datafunctions.py
CHANGED
|
@@ -109,13 +109,13 @@ class dataprocessing:
|
|
|
109
109
|
|
|
110
110
|
print("\n17. pivot_table")
|
|
111
111
|
print(" - Description: Dynamically pivots a DataFrame based on specified columns.")
|
|
112
|
-
print(" - Usage: pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True,reverse_header_order = 'False')")
|
|
113
|
-
print(" - Example: pivot_table(df, 'OBS', 'Channel Short Names', 'Value',filters_dict={'Master Include':' == 1','OBS':' >= datetime(2019,9,9)','Metric Short Names':' == 'spd''}, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True,reverse_header_order = 'True')")
|
|
112
|
+
print(" - Usage: pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True,reverse_header_order = 'False',fill_missing_weekly_dates=False,week_commencing='W-MON')")
|
|
113
|
+
print(" - Example: pivot_table(df, 'OBS', 'Channel Short Names', 'Value',filters_dict={'Master Include':' == 1','OBS':' >= datetime(2019,9,9)','Metric Short Names':' == 'spd''}, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True,reverse_header_order = 'True',fill_missing_weekly_dates=True,week_commencing='W-MON')")
|
|
114
114
|
|
|
115
115
|
print("\n18. apply_lookup_table_for_columns")
|
|
116
116
|
print(" - Description: Equivalent of xlookup in excel. Allows you to map a dictionary of substrings within a column. If multiple columns are need for the LUT then a | seperator is needed.")
|
|
117
|
-
print(" - Usage:
|
|
118
|
-
print(" - Example:
|
|
117
|
+
print(" - Usage: apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')")
|
|
118
|
+
print(" - Example: apply_lookup_table_for_columns(df, col_names, {'spend':'spd','clicks':'clk'}, if_not_in_dict='Other', new_column_name='Metrics Short')")
|
|
119
119
|
|
|
120
120
|
print("\n19. aggregate_daily_to_wc_wide")
|
|
121
121
|
print(" - Description: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.")
|
|
@@ -226,6 +226,16 @@ class dataprocessing:
|
|
|
226
226
|
print(" - Description: With two matching dataset, it takes the common columns and rows and takes the difference between them, outputing a differences and total differences table")
|
|
227
227
|
print(" - Usage: compare_overlap(df1, df2, date_col)")
|
|
228
228
|
print(" - Example: compare_overlap(df_1, df_2, 'obs')")
|
|
229
|
+
|
|
230
|
+
print("\n41. week_commencing_2_week_commencing_conversion")
|
|
231
|
+
print(" - Description: Take a week commencing column say sunday and creates a new column with a different week commencing e.g. monday")
|
|
232
|
+
print(" - Usage: week_commencing_2_week_commencing_conversion(df,date_col,week_commencing='sun')")
|
|
233
|
+
print(" - Example: week_commencing_2_week_commencing_conversion(df,'obs,week_commencing='mon')")
|
|
234
|
+
|
|
235
|
+
print("\n42. week_commencing_2_week_commencing_conversion")
|
|
236
|
+
print(" - Description: Take a week commencing column say sunday and creates a new column with a different week commencing e.g. monday")
|
|
237
|
+
print(" - Usage: week_commencing_2_week_commencing_conversion(df,date_col,week_commencing='sun')")
|
|
238
|
+
print(" - Example: week_commencing_2_week_commencing_conversion(df,'obs,week_commencing='mon')")
|
|
229
239
|
|
|
230
240
|
|
|
231
241
|
|
|
@@ -664,7 +674,7 @@ class dataprocessing:
|
|
|
664
674
|
|
|
665
675
|
return combined_df
|
|
666
676
|
|
|
667
|
-
def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name="Total", datetime_trans_needed=True, reverse_header_order=False):
|
|
677
|
+
def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name="Total", datetime_trans_needed=True, reverse_header_order=False,fill_missing_weekly_dates=False,week_commencing='W-MON'):
|
|
668
678
|
"""
|
|
669
679
|
Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns
|
|
670
680
|
|
|
@@ -680,6 +690,8 @@ class dataprocessing:
|
|
|
680
690
|
margins_name (str, optional): The name of the Totals columns. Defaults to "Total".
|
|
681
691
|
datetime_trans_needed (bool, optional): Whether the index column needs to be transformed into datetime format. Defaults to False.
|
|
682
692
|
reverse_header_order (bool, optional): Reverses the order of the column headers. Defaults to False.
|
|
693
|
+
fill_missing_weekly_dates (bool, optional): Fills in any weekly missing dates. Defaults to False.
|
|
694
|
+
week_commencing (str,optional): Fills in missing weeks if option is specified. Defaults to 'W-MON'.
|
|
683
695
|
|
|
684
696
|
Returns:
|
|
685
697
|
pandas.DataFrame: The pivot table specified
|
|
@@ -730,6 +742,10 @@ class dataprocessing:
|
|
|
730
742
|
# Fill in any NaNs
|
|
731
743
|
pivoted_df = pivoted_df.fillna(fill_value)
|
|
732
744
|
|
|
745
|
+
# If there is a need to fill in missing weeks
|
|
746
|
+
if fill_missing_weekly_dates == True:
|
|
747
|
+
pivoted_df = self.fill_weekly_date_range(pivoted_df, index_col, freq=week_commencing)
|
|
748
|
+
|
|
733
749
|
return pivoted_df
|
|
734
750
|
|
|
735
751
|
def apply_lookup_table_for_columns(self, df, col_names, to_find_dict, if_not_in_dict="Other", new_column_name="Mapping"):
|
|
@@ -1482,7 +1498,7 @@ class dataprocessing:
|
|
|
1482
1498
|
df[new_col_name] = df[column_name].apply(categorize_text)
|
|
1483
1499
|
return df
|
|
1484
1500
|
|
|
1485
|
-
def compare_overlap(
|
|
1501
|
+
def compare_overlap(df1, df2, date_col):
|
|
1486
1502
|
"""
|
|
1487
1503
|
Compare overlapping periods between two DataFrames and provide a summary of total differences.
|
|
1488
1504
|
|
|
@@ -1509,21 +1525,23 @@ class dataprocessing:
|
|
|
1509
1525
|
# Merge the dataframes on the date column to align data for comparison
|
|
1510
1526
|
merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))
|
|
1511
1527
|
|
|
1528
|
+
# Get the common columns between the two DataFrames, excluding the date column
|
|
1529
|
+
common_cols = [col for col in df1.columns if col != date_col and col in df2.columns]
|
|
1530
|
+
|
|
1512
1531
|
# Initialize a list to collect total differences for each column
|
|
1513
1532
|
total_diff_list = []
|
|
1514
1533
|
|
|
1515
|
-
#
|
|
1534
|
+
# Create a DataFrame for the differences
|
|
1516
1535
|
diff_df = pd.DataFrame({date_col: merged_df[date_col]}) # Initialize diff_df with the date column
|
|
1517
1536
|
|
|
1518
|
-
for col in
|
|
1519
|
-
|
|
1520
|
-
|
|
1521
|
-
|
|
1522
|
-
|
|
1523
|
-
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
total_diff_list.append({'Column': col, 'Total Difference': total_diff})
|
|
1537
|
+
for col in common_cols:
|
|
1538
|
+
# Calculate the difference for each row
|
|
1539
|
+
diff_col = f'diff_{col}'
|
|
1540
|
+
diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2']
|
|
1541
|
+
|
|
1542
|
+
# Calculate the total difference for the column and add it to the list
|
|
1543
|
+
total_diff = diff_df[diff_col].sum()
|
|
1544
|
+
total_diff_list.append({'Column': col, 'Total Difference': total_diff})
|
|
1527
1545
|
|
|
1528
1546
|
# Create a DataFrame for the summary of total differences
|
|
1529
1547
|
total_diff_df = pd.DataFrame(total_diff_list)
|
|
@@ -1535,6 +1553,162 @@ class dataprocessing:
|
|
|
1535
1553
|
|
|
1536
1554
|
return diff_df, total_diff_df
|
|
1537
1555
|
|
|
1556
|
+
# Convert week commencing col (should be most likely monday to sunday or vice versa)
|
|
1557
|
+
def week_commencing_2_week_commencing_conversion(df,date_col,week_commencing='sun'):
    """
    Add a column holding the start of the week for every date in ``date_col``.

    Each date is moved back to the most recent occurrence of the requested
    weekday (a date that already falls on that weekday is left unchanged).

    Args:
        df (pandas.DataFrame): The DataFrame containing the date-based data.
        date_col (str): The name of the date column in the DataFrame. Values are
            passed through ``pd.to_datetime`` for the calculation; the column
            itself is not modified.
        week_commencing (str, optional): The day of the week that the week starts on:
            'mon', 'tue', 'wed', 'thu' (or 'thur'), 'fri', 'sat', 'sun'.
            Matching is case-insensitive. Defaults to 'sun'.

    Returns:
        pandas.DataFrame: The same DataFrame object, with an additional column named
        ``'week_start_' + week_commencing``.

    Raises:
        KeyError: If ``week_commencing`` is not a recognised day abbreviation.
    """
    # Week commencing dictionary (Monday=0 ... Sunday=6, matching Timestamp.weekday()).
    # 'thur' is kept for backward compatibility; 'thu' is accepted as the usual abbreviation.
    day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "thur": 3, "fri": 4, "sat": 5, "sun": 6}

    day_key = str(week_commencing).strip().lower()
    if day_key not in day_dict:
        raise KeyError(
            f"Unknown week_commencing value {week_commencing!r}; expected one of {sorted(day_dict)}"
        )
    target_weekday = day_dict[day_key]

    # Vectorised equivalent of stepping each date back (weekday - target) % 7 days
    dates = pd.to_datetime(df[date_col])
    days_back = (dates.dt.weekday - target_weekday) % 7
    df['week_start_'+ week_commencing] = dates - pd.to_timedelta(days_back, unit='D')

    return df
|
|
1574
|
+
|
|
1575
|
+
def plot_chart(df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values', **kwargs):
    """
    Plot various types of charts using Plotly.

    Args:
        df (pandas.DataFrame): DataFrame containing the data. It is not modified;
            the date conversion is done on a copy.
        date_col (str): The name of the column with date information.
        value_cols (list): List of columns to plot. ``date_col`` is dropped from this
            list if present.
        chart_type (str): Type of chart to plot.
            One trace per value column: 'line', 'bar', 'scatter', 'histogram', 'pie',
            'box', 'area', 'funnel', 'waterfall' ('waterfall' also reads
            ``value_cols[1]`` as the measure column).
            One trace built from several columns: 'heatmap' (reads ``value_cols[0]``
            and ``value_cols[1]``), 'bubble', 'contour', 'scatter3d' (read
            ``value_cols[0]`` to ``value_cols[2]``).
        title (str): Title of the chart.
        x_title (str): Title of the x-axis.
        y_title (str): Title of the y-axis.
        **kwargs: Additional keyword arguments, forwarded unchanged to every trace constructor.

    Returns:
        plotly.graph_objects.Figure: The Plotly figure object.

    Raises:
        ValueError: If ``chart_type`` is not one of the supported types.
        IndexError: If a multi-column chart type is given fewer value columns than it reads.
    """
    per_column_types = ('line', 'bar', 'scatter', 'histogram', 'pie', 'box', 'area', 'funnel', 'waterfall')
    combined_types = ('heatmap', 'bubble', 'contour', 'scatter3d')

    # Reject unknown chart types up front, even when there is nothing to plot
    if chart_type not in per_column_types + combined_types:
        raise ValueError(f"Unsupported chart type: {chart_type}")

    # Ensure the date column is in datetime format, without touching the caller's DataFrame
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col])

    # Initialize the figure
    fig = go.Figure()

    # Make sure the date col is excluded from the line cols
    value_cols = [x for x in value_cols if x!=date_col]

    if chart_type in combined_types:
        # These charts are built from several columns at once, so the trace is added
        # exactly once rather than once per value column.
        if value_cols:
            if chart_type == 'heatmap':
                # NOTE(review): x/y are the raw columns rather than the pivot's
                # columns/index - confirm this lines up with z as intended.
                trace = go.Heatmap(
                    z=df.pivot_table(index=date_col, columns=value_cols[0], values=value_cols[1]),
                    x=df[value_cols[0]],
                    y=df[date_col],
                    **kwargs
                )
            elif chart_type == 'bubble':
                trace = go.Scatter(
                    x=df[value_cols[0]],
                    y=df[value_cols[1]],
                    mode='markers',
                    marker=dict(size=df[value_cols[2]]),
                    name='Bubble Chart',
                    **kwargs
                )
            elif chart_type == 'contour':
                # NOTE(review): same axis/pivot alignment question as the heatmap - confirm.
                trace = go.Contour(
                    z=df.pivot_table(index=value_cols[0], columns=value_cols[1], values=value_cols[2]),
                    x=df[value_cols[0]],
                    y=df[value_cols[1]],
                    **kwargs
                )
            else:  # 'scatter3d'
                trace = go.Scatter3d(
                    x=df[value_cols[0]],
                    y=df[value_cols[1]],
                    z=df[value_cols[2]],
                    mode='markers',
                    **kwargs
                )
            fig.add_trace(trace)
    else:
        # Add each value column to the plot based on the chart type
        for col in value_cols:
            if chart_type == 'line':
                trace = go.Scatter(
                    x=df[date_col],
                    y=df[col],
                    mode='lines',
                    name=col,
                    **kwargs
                )
            elif chart_type == 'bar':
                trace = go.Bar(
                    x=df[date_col],
                    y=df[col],
                    name=col,
                    **kwargs
                )
            elif chart_type == 'scatter':
                trace = go.Scatter(
                    x=df[date_col],
                    y=df[col],
                    mode='markers',
                    name=col,
                    **kwargs
                )
            elif chart_type == 'histogram':
                trace = go.Histogram(
                    x=df[col],
                    name=col,
                    **kwargs
                )
            elif chart_type == 'pie':
                trace = go.Pie(
                    labels=df[date_col],  # or another column for labels
                    values=df[col],
                    name=col,
                    **kwargs
                )
            elif chart_type == 'box':
                trace = go.Box(
                    y=df[col],
                    name=col,
                    **kwargs
                )
            elif chart_type == 'area':
                trace = go.Scatter(
                    x=df[date_col],
                    y=df[col],
                    mode='lines',  # Use 'lines+markers' if you want markers
                    fill='tozeroy',  # Fill the area under the line
                    name=col,
                    **kwargs
                )
            elif chart_type == 'funnel':
                trace = go.Funnel(
                    y=df[date_col],
                    x=df[col],
                    **kwargs
                )
            else:  # 'waterfall'
                trace = go.Waterfall(
                    x=df[date_col],
                    y=df[col],
                    measure=df[value_cols[1]],  # measures like 'increase', 'decrease', 'total'
                    **kwargs
                )
            fig.add_trace(trace)

    # Update the layout of the figure
    fig.update_layout(
        title=title,
        xaxis_title=x_title,
        yaxis_title=y_title,
        legend_title='Series',
        template='plotly_dark'
    )

    return fig
|
|
1711
|
+
|
|
1538
1712
|
########################################################################################################################################
|
|
1539
1713
|
########################################################################################################################################
|
|
1540
1714
|
|
|
@@ -3,14 +3,14 @@ dataprocessing/data-processing-functions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nF
|
|
|
3
3
|
dataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
|
|
4
4
|
imsciences/__init__.py,sha256=GIPbLmWc06sVcOySWwNvMNUr6XGOHqPLryFIWgtpHh8,78
|
|
5
5
|
imsciences/datafunctions-IMS-24Ltp-3.py,sha256=3Snv-0iE_03StmyjtT-riOU9f4v8TaJWLoyZLJp6l8Y,141406
|
|
6
|
-
imsciences/datafunctions.py,sha256=
|
|
6
|
+
imsciences/datafunctions.py,sha256=PGuvgJIurXGWM8E1M_w9BijUJGBm5FTaZVE-C1_sPog,151382
|
|
7
7
|
imsciences/datapull.py,sha256=TPY0LDgOkcKTBk8OekbD0Grg5x0SomAK2dZ7MuT6X1E,19000
|
|
8
8
|
imsciencesdataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
|
|
9
9
|
imsciencesdataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
|
|
10
10
|
imsdataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
|
|
11
11
|
imsdataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
|
|
12
|
-
imsciences-0.6.2.
|
|
13
|
-
imsciences-0.6.2.
|
|
14
|
-
imsciences-0.6.2.
|
|
15
|
-
imsciences-0.6.2.
|
|
16
|
-
imsciences-0.6.2.
|
|
12
|
+
imsciences-0.6.2.7.dist-info/METADATA,sha256=0IT7pWYxsHXerkqBVKsS2Zh1_6qwn1u7NL3mK44c4tk,854
|
|
13
|
+
imsciences-0.6.2.7.dist-info/PKG-INFO-IMS-24Ltp-3,sha256=yqZbigwHjnYoqyI81PGz_AeofRFfOrwH_Vyawyef-mg,854
|
|
14
|
+
imsciences-0.6.2.7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
15
|
+
imsciences-0.6.2.7.dist-info/top_level.txt,sha256=hsENS-AlDVRh8tQJ6-426iUQlla9bPcGc0-UlFF0_iU,11
|
|
16
|
+
imsciences-0.6.2.7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|