imsciences 1.0.1__tar.gz → 1.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {imsciences-1.0.1/imsciences.egg-info → imsciences-1.0.9}/PKG-INFO +11 -2
- imsciences-1.0.1/imsciences/pull.py → imsciences-1.0.9/imsciences/pull-IMS-24Ltp-3.py +64 -23
- imsciences-1.0.9/imsciences/pull.py +0 -0
- {imsciences-1.0.1 → imsciences-1.0.9}/imsciences/vis.py +16 -3
- imsciences-1.0.9/imsciences.egg-info/PKG-INFO +365 -0
- imsciences-1.0.1/PKG-INFO → imsciences-1.0.9/imsciences.egg-info/PKG-INFO-IMS-24Ltp-3 +1 -1
- {imsciences-1.0.1 → imsciences-1.0.9}/imsciences.egg-info/SOURCES.txt +2 -0
- {imsciences-1.0.1 → imsciences-1.0.9}/setup.py +1 -2
- {imsciences-1.0.1 → imsciences-1.0.9}/LICENSE.txt +0 -0
- {imsciences-1.0.1 → imsciences-1.0.9}/README.md +0 -0
- {imsciences-1.0.1 → imsciences-1.0.9}/imsciences/__init__.py +0 -0
- {imsciences-1.0.1 → imsciences-1.0.9}/imsciences/geo.py +0 -0
- {imsciences-1.0.1 → imsciences-1.0.9}/imsciences/mmm.py +0 -0
- {imsciences-1.0.1 → imsciences-1.0.9}/imsciences.egg-info/PKG-INFO-TomG-HP-290722 +0 -0
- {imsciences-1.0.1 → imsciences-1.0.9}/imsciences.egg-info/dependency_links.txt +0 -0
- {imsciences-1.0.1 → imsciences-1.0.9}/imsciences.egg-info/requires.txt +0 -0
- {imsciences-1.0.1 → imsciences-1.0.9}/imsciences.egg-info/top_level.txt +0 -0
- {imsciences-1.0.1 → imsciences-1.0.9}/pyproject.toml +0 -0
- {imsciences-1.0.1 → imsciences-1.0.9}/setup.cfg +0 -0
{imsciences-1.0.1/imsciences.egg-info → imsciences-1.0.9}/PKG-INFO:

```diff
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: imsciences
-Version: 1.0.1
+Version: 1.0.9
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com
```
```diff
@@ -26,6 +26,15 @@ Requires-Dist: google-analytics-data
 Requires-Dist: geopandas
 Requires-Dist: geopy
 Requires-Dist: workalendar
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: keywords
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: summary
 
 # IMS Package Documentation
 
```
imsciences-1.0.1/imsciences/pull.py → imsciences-1.0.9/imsciences/pull-IMS-24Ltp-3.py:

```diff
@@ -2397,13 +2397,13 @@ class datapull:
             cdid_list (list, optional): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
             week_start_day (str, optional): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'). Defaults to 'mon'.
             sector (str or list, optional): The sector(s) for which the standard CDIDs are fetched
-
+                (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
 
         Returns
         -------
             pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column (week commencing date)
-
-
+                and all series as renamed columns (e.g., 'macro_retail_sales_uk').
+                Returns an empty DataFrame if no data is fetched or processed.
 
         """
         # Define CDIDs for sectors and defaults
```
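The reworked docstring pins down the call shape; a minimal usage sketch (call signature as documented above and in the README further down, actual contents depend on what the live ONS API returns):

```python
# Usage sketch for pull_macro_ons_uk as documented above; the README gives
# the positional form pull_macro_ons_uk(additional_list, week_commencing, sector).
from imsciences import datapull

ims_pull = datapull()

# Default CDIDs plus the 'fast_food' sector set, plus one extra CDID ('HBOI'),
# aggregated to weeks commencing Monday.
df = ims_pull.pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')

# 'OBS' holds the week-commencing date; series come back as renamed columns
# such as 'macro_retail_sales_uk'. An empty DataFrame means nothing was fetched.
print(df.head())
```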
```diff
@@ -2436,16 +2436,11 @@ class datapull:
                 sector_cdids_map.get(sec, []),
             )  # Use extend to add items from the list
 
-        standard_cdids = list(
-            set(default_cdids + sector_specific_cdids),
-        )  # Combine default and selected sector CDIDs, ensure uniqueness
-
         # Combine standard CDIDs and any additional user-provided CDIDs
+        standard_cdids = list(dict.fromkeys(default_cdids + sector_specific_cdids))
         if cdid_list is None:
             cdid_list = []
-        final_cdid_list = list(
-            set(standard_cdids + cdid_list),
-        )  # Ensure uniqueness in the final list
+        final_cdid_list = list(dict.fromkeys(standard_cdids + cdid_list))
 
         base_search_url = (
            "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
```
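Worth noting on this hunk: `list(set(...))` deduplicates but does not preserve order, while `list(dict.fromkeys(...))` keeps first-seen order. That matters here because a later hunk (below) suffixes the data columns positionally against `final_cdid_list[i]`. A quick illustration:

```python
default_cdids = ["JP9Z", "UKPOP", "JP9Z"]
extra_cdids = ["HBOI", "UKPOP"]

# set(): unique, but ordering is arbitrary (and varies across runs for strings)
print(list(set(default_cdids + extra_cdids)))

# dict.fromkeys(): unique AND stable first-seen order
print(list(dict.fromkeys(default_cdids + extra_cdids)))  # ['JP9Z', 'UKPOP', 'HBOI']
```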
```diff
@@ -2670,26 +2665,59 @@ class datapull:
         )
 
         def clean_column_name(name):
-            # Remove content within parentheses
+            # Remove content within parentheses
             name = re.sub(r"\(.*?\)", "", name)
-
-
-
-
-
-
+
+            # Special handling for ANY CPI items (not just CPI INDEX)
+            if "CPI" in name.upper():
+                # Extract the description part after the colon for CPI items
+                if ":" in name:
+                    parts = name.split(":")
+                    if len(parts) >= 2:
+                        # Take the description part (usually the second part)
+                        description = parts[1].strip()
+                        # Remove any remaining colons and everything after
+                        description = description.split(":")[0].strip()
+                        name = f"CPI {description}"
+
+                # Remove numbers and dots for ALL CPI items (like 00, 06.2.2, 12.5.3/5)
+                name = re.sub(r"\d+\.?\d*/?\.?\d*", "", name)
+
+            else:
+                # For non-CPI items, take only the part before the first colon
+                name = re.split(r":", name)[0]
+                # Remove all digits for non-CPI items too
+                name = re.sub(r"\d+", "", name)
+
+            # Remove year references like "2015=100"
+            name = re.sub(r"\d{4}=\d+", "", name)
+
+            # Remove specific words case-insensitively
+            name = re.sub(r"\b(annual|rate|index|seasonally|adjusted|sa|cvm)\b", "", name, flags=re.IGNORECASE)
+
+            # Remove percentage symbols and "%"
+            name = re.sub(r"%", "percent", name)
+
             # Remove non-alphanumeric characters (except underscore and space)
             name = re.sub(r"[^\w\s]", "", name)
+
             # Replace spaces with underscores
-            name = name.strip()
-
+            name = name.strip().replace(" ", "_")
+
             # Replace multiple underscores with a single one
             name = re.sub(r"_+", "_", name)
-
-
-
+
+            # Remove leading/trailing underscores
+            name = name.strip("_")
+
+            # Truncate very long names (optional)
+            if len(name) > 50:
+                words = name.split("_")
+                # Keep first few meaningful words
+                name = "_".join(words[:4])
+
             return f"macro_{name.lower()}_uk"
-
+
         # Apply cleaning function to relevant columns
         weekly_df.columns = [
             clean_column_name(col) if col != "week_commencing" else col
```
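To see what the new cleaner produces, here is the added logic lifted into a standalone function, tried against two ONS-style titles (the sample titles are made up for illustration):

```python
import re

# The clean_column_name logic from the hunk above, copied out so it can be
# exercised on its own; comments are condensed but the steps are identical.
def clean_column_name(name):
    name = re.sub(r"\(.*?\)", "", name)                  # drop content in parentheses
    if "CPI" in name.upper():
        if ":" in name:
            parts = name.split(":")
            if len(parts) >= 2:
                description = parts[1].strip()           # keep the description part
                description = description.split(":")[0].strip()
                name = f"CPI {description}"
        name = re.sub(r"\d+\.?\d*/?\.?\d*", "", name)    # item codes like 06.2.2
    else:
        name = re.split(r":", name)[0]                   # keep text before first colon
        name = re.sub(r"\d+", "", name)                  # drop all digits
    name = re.sub(r"\d{4}=\d+", "", name)                # year bases like 2015=100
    name = re.sub(r"\b(annual|rate|index|seasonally|adjusted|sa|cvm)\b", "", name, flags=re.IGNORECASE)
    name = re.sub(r"%", "percent", name)
    name = re.sub(r"[^\w\s]", "", name)                  # keep word chars and spaces
    name = name.strip().replace(" ", "_")
    name = re.sub(r"_+", "_", name)
    name = name.strip("_")
    if len(name) > 50:
        name = "_".join(name.split("_")[:4])             # truncate very long names
    return f"macro_{name.lower()}_uk"

print(clean_column_name("CPI INDEX 06.2.2: Medical services (2015=100)"))
# -> macro_cpi_medical_services_uk
print(clean_column_name("Retail Sales Index: All Retailing (2019=100)"))
# -> macro_retail_sales_uk
```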
```diff
@@ -2704,6 +2732,19 @@ class datapull:
             # Consider if 0 is the appropriate fill value for your use case
             # weekly_df = weekly_df.fillna(0)
 
+            # Get only the data columns (excluding OBS)
+            data_columns = [col for col in weekly_df.columns if col != "OBS"]
+
+            new_columns = ["OBS"]
+            for i, col in enumerate(data_columns):
+                if i < len(final_cdid_list):
+                    new_columns.append(f"{col}_{final_cdid_list[i]}")
+                else:
+                    new_columns.append(col)  # Keep original if no matching CDID
+
+            # Apply the new column names to the DataFrame
+            weekly_df.columns = new_columns
+
             return weekly_df
         print("No data successfully fetched or processed.")
         return pd.DataFrame()
```
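A toy run of the positional CDID suffixing added above. It assumes the data columns come back in the same order as `final_cdid_list`, which is exactly why the order-preserving `dict.fromkeys()` dedupe earlier matters; the CDIDs and values here are hypothetical:

```python
import pandas as pd

weekly_df = pd.DataFrame({
    "OBS": pd.to_datetime(["2024-01-01", "2024-01-08"]),
    "macro_retail_sales_uk": [101.2, 99.8],
    "macro_cpi_medical_services_uk": [130.1, 130.4],
})
final_cdid_list = ["J5EK", "D7G8"]  # hypothetical CDIDs

# Same logic as the hunk: suffix each data column with its CDID by position.
data_columns = [col for col in weekly_df.columns if col != "OBS"]
new_columns = ["OBS"]
for i, col in enumerate(data_columns):
    new_columns.append(f"{col}_{final_cdid_list[i]}" if i < len(final_cdid_list) else col)
weekly_df.columns = new_columns

print(list(weekly_df.columns))
# ['OBS', 'macro_retail_sales_uk_J5EK', 'macro_cpi_medical_services_uk_D7G8']
```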
{imsciences-1.0.1 → imsciences-1.0.9}/imsciences/vis.py:
```diff
@@ -51,7 +51,7 @@ class datavis:
                 "font_size": 10,
             },
         }
-        self.current_theme = "
+        self.current_theme = "default"
 
     def help(self, method=None, *, show_examples=True):
         """
```
```diff
@@ -546,7 +546,7 @@ class datavis:
 
         return fig
 
-    def plot_two(self, data_config, *, same_axis=True):
+    def plot_two(self, data_config, *, same_axis=True, title="Comparison Plot"):
         """
         Plots specified columns from two different DataFrames with themed styling.
 
```
```diff
@@ -556,6 +556,8 @@ class datavis:
             Dictionary with keys: 'df1', 'col1', 'df2', 'col2', 'date_column'
         same_axis : bool, default True
             If True, plot both traces on the same y-axis; otherwise, use separate y-axes
+        title : str, default "Comparison Plot"
+            Custom title for the plot
 
         Returns
         -------
```
```diff
@@ -636,7 +638,18 @@ class datavis:
             }
         )
 
-
+        # Update layout with custom title and legend positioning
+        fig.update_layout(
+            title=title,
+            showlegend=True,
+            legend={
+                "orientation": "h",
+                "yanchor": "bottom",
+                "y": 1.02,
+                "xanchor": "center",
+                "x": 0.5
+            }
+        )
 
         return fig
 
```
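Taken together, a hedged usage sketch of the updated `plot_two` (the `data_config` keys follow the docstring above; the DataFrames are made up):

```python
import pandas as pd
from imsciences import datavis

ims_vis = datavis()

# Two made-up weekly series sharing a date column, per the docstring's
# data_config keys ('df1', 'col1', 'df2', 'col2', 'date_column').
dates = pd.date_range("2024-01-01", periods=8, freq="W-MON")
df_sales = pd.DataFrame({"date": dates, "sales": range(8)})
df_revenue = pd.DataFrame({"date": dates, "revenue": [x * 1.5 for x in range(8)]})

fig = ims_vis.plot_two(
    {"df1": df_sales, "col1": "sales", "df2": df_revenue, "col2": "revenue", "date_column": "date"},
    same_axis=False,
    title="Sales vs Revenue",  # new in 1.0.9; defaults to "Comparison Plot"
)
fig.show()  # legend now sits horizontally, centred above the plot area
```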
imsciences-1.0.9/imsciences.egg-info/PKG-INFO (new file, +365 lines):

```
Metadata-Version: 2.4
Name: imsciences
Version: 1.0.9
Summary: IMS Data Processing Package
Author: IMS
Author-email: cam@im-sciences.com
Keywords: data processing,apis,data analysis,data visualization,machine learning
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Programming Language :: Python :: 3
Classifier: Operating System :: Unix
Classifier: Operating System :: MacOS :: MacOS X
Classifier: Operating System :: Microsoft :: Windows
Description-Content-Type: text/markdown
License-File: LICENSE.txt
Requires-Dist: pandas
Requires-Dist: plotly
Requires-Dist: numpy
Requires-Dist: fredapi
Requires-Dist: xgboost
Requires-Dist: scikit-learn
Requires-Dist: bs4
Requires-Dist: yfinance
Requires-Dist: holidays
Requires-Dist: google-analytics-data
Requires-Dist: geopandas
Requires-Dist: geopy
Requires-Dist: workalendar
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: keywords
Dynamic: license-file
Dynamic: requires-dist
Dynamic: summary
```

# IMS Package Documentation

The **Independent Marketing Sciences** package is a Python library designed to process incoming data into a format tailored for projects, particularly those utilising weekly time series data. This package offers a suite of functions for efficient data collection, manipulation, visualisation and analysis.

---

## Key Features
- Seamless data processing for time series workflows.
- Aggregation, filtering, and transformation of time series data.
- Visualising data.
- Integration with external data sources like FRED, Bank of England and ONS.

---

Table of Contents
=================

1. [Usage](#usage)
2. [Data Processing for Time Series](#data-processing-for-time-series)
3. [Data Processing for Incrementality Testing](#data-processing-for-incrementality-testing)
4. [Data Visualisations](#data-visualisations)
5. [Data Pulling](#data-pulling)
6. [Installation](#installation)
7. [License](#license)
8. [Roadmap](#roadmap)

---

## Usage

```python
from imsciences import dataprocessing, geoprocessing, datapull, datavis
ims_proc = dataprocessing()
ims_geo = geoprocessing()
ims_pull = datapull()
ims_vis = datavis()
```

## Data Processing for Time Series

## 1. `get_wd_levels`
- **Description**: Get the working directory with the option of moving up parents.
- **Usage**: `get_wd_levels(levels)`
- **Example**: `get_wd_levels(0)`

## 2. `aggregate_daily_to_wc_long`
- **Description**: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.
- **Usage**: `aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')`
- **Example**: `aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')`

## 3. `convert_monthly_to_daily`
- **Description**: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.
- **Usage**: `convert_monthly_to_daily(df, date_column, divide=True)`
- **Example**: `convert_monthly_to_daily(df, 'date')`

## 4. `week_of_year_mapping`
- **Description**: Converts a week column in 'yyyy-Www' or 'yyyy-ww' format to week commencing date.
- **Usage**: `week_of_year_mapping(df, week_col, start_day_str)`
- **Example**: `week_of_year_mapping(df, 'week', 'mon')`

## 5. `rename_cols`
- **Description**: Renames columns in a pandas DataFrame with a specified prefix or format.
- **Usage**: `rename_cols(df, name='ame_')`
- **Example**: `rename_cols(df, 'ame_facebook')`

## 6. `merge_new_and_old`
- **Description**: Creates a new DataFrame by merging old and new dataframes based on a cutoff date.
- **Usage**: `merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')`
- **Example**: `merge_new_and_old(df1, 'old_col', df2, 'new_col', '2023-01-15')`

## 7. `merge_dataframes_on_column`
- **Description**: Merge a list of DataFrames on a common column.
- **Usage**: `merge_dataframes_on_column(dataframes, common_column='OBS', merge_how='outer')`
- **Example**: `merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='outer')`

## 8. `merge_and_update_dfs`
- **Description**: Merges two dataframes, updating columns from the second dataframe where values are available.
- **Usage**: `merge_and_update_dfs(df1, df2, key_column)`
- **Example**: `merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')`

## 9. `convert_us_to_uk_dates`
- **Description**: Convert a DataFrame column with mixed US and UK date formats to datetime.
- **Usage**: `convert_us_to_uk_dates(df, date_col)`
- **Example**: `convert_us_to_uk_dates(df, 'date')`

## 10. `combine_sheets`
- **Description**: Combines multiple DataFrames from a dictionary into a single DataFrame.
- **Usage**: `combine_sheets(all_sheets)`
- **Example**: `combine_sheets({'Sheet1': df1, 'Sheet2': df2})`

## 11. `pivot_table`
- **Description**: Dynamically pivots a DataFrame based on specified columns.
- **Usage**: `pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing='W-MON')`
- **Example**: `pivot_table(df, 'OBS', 'Channel Short Names', 'Value', filters_dict={'Master Include': ' == 1'}, fill_value=0)`

## 12. `apply_lookup_table_for_columns`
- **Description**: Maps substrings in columns to new values based on a dictionary.
- **Usage**: `apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')`
- **Example**: `apply_lookup_table_for_columns(df, col_names, {'spend': 'spd'}, if_not_in_dict='Other', new_column_name='Metrics Short')`

## 13. `aggregate_daily_to_wc_wide`
- **Description**: Aggregates daily data into weekly data and pivots it to wide format.
- **Usage**: `aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc='sun', aggregation='sum', include_totals=False)`
- **Example**: `aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions'], 'mon', 'average', True)`

## 14. `merge_cols_with_seperator`
- **Description**: Merges multiple columns in a DataFrame into one column with a specified separator.
- **Usage**: `merge_cols_with_seperator(df, col_names, separator='_', output_column_name='Merged')`
- **Example**: `merge_cols_with_seperator(df, ['Campaign', 'Product'], separator='|', output_column_name='Merged Columns')`

## 15. `check_sum_of_df_cols_are_equal`
- **Description**: Checks if the sum of two columns in two DataFrames are equal and provides the difference.
- **Usage**: `check_sum_of_df_cols_are_equal(df_1, df_2, cols_1, cols_2)`
- **Example**: `check_sum_of_df_cols_are_equal(df_1, df_2, 'Media Cost', 'Spend')`

## 16. `convert_2_df_cols_to_dict`
- **Description**: Creates a dictionary from two DataFrame columns.
- **Usage**: `convert_2_df_cols_to_dict(df, key_col, value_col)`
- **Example**: `convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')`

## 17. `create_FY_and_H_columns`
- **Description**: Adds financial year and half-year columns to a DataFrame based on a start date.
- **Usage**: `create_FY_and_H_columns(df, index_col, start_date, starting_FY, short_format='No', half_years='No', combined_FY_and_H='No')`
- **Example**: `create_FY_and_H_columns(df, 'Week', '2022-10-03', 'FY2023', short_format='Yes')`

## 18. `keyword_lookup_replacement`
- **Description**: Updates values in a column based on a lookup dictionary with conditional logic.
- **Usage**: `keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name='Updated Column')`
- **Example**: `keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel', 'segment'], lookup_dict, output_column_name='Channel New')`

## 19. `create_new_version_of_col_using_LUT`
- **Description**: Creates a new column based on a lookup table applied to an existing column.
- **Usage**: `create_new_version_of_col_using_LUT(df, keys_col, value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')`
- **Example**: `create_new_version_of_col_using_LUT(df, 'Campaign Name', 'Campaign Type', lookup_dict)`

## 20. `convert_df_wide_2_long`
- **Description**: Converts a wide-format DataFrame into a long-format DataFrame.
- **Usage**: `convert_df_wide_2_long(df, value_cols, variable_col_name='Stacked', value_col_name='Value')`
- **Example**: `convert_df_wide_2_long(df, ['col1', 'col2'], variable_col_name='Var', value_col_name='Val')`

## 21. `manually_edit_data`
- **Description**: Manually updates specified cells in a DataFrame based on filters.
- **Usage**: `manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)`
- **Example**: `manually_edit_data(df, {'col1': '== 1'}, 'col2', 'new_val', add_notes='Yes', note='Manual Update')`

## 22. `format_numbers_with_commas`
- **Description**: Formats numerical columns with commas and a specified number of decimal places.
- **Usage**: `format_numbers_with_commas(df, decimal_length_chosen=2)`
- **Example**: `format_numbers_with_commas(df, decimal_length_chosen=1)`

## 23. `filter_df_on_multiple_conditions`
- **Description**: Filters a DataFrame based on multiple column conditions.
- **Usage**: `filter_df_on_multiple_conditions(df, filters_dict)`
- **Example**: `filter_df_on_multiple_conditions(df, {'col1': '>= 5', 'col2': "== 'val'"})`

## 24. `read_and_concatenate_files`
- **Description**: Reads and concatenates files from a specified folder into a single DataFrame.
- **Usage**: `read_and_concatenate_files(folder_path, file_type='csv')`
- **Example**: `read_and_concatenate_files('/path/to/files', file_type='xlsx')`

## 25. `upgrade_outdated_packages`
- **Description**: Upgrades all outdated Python packages except specified ones.
- **Usage**: `upgrade_outdated_packages(exclude_packages=['twine'])`
- **Example**: `upgrade_outdated_packages(exclude_packages=['pip', 'setuptools'])`

## 26. `convert_mixed_formats_dates`
- **Description**: Converts mixed-format date columns into standardized datetime format.
- **Usage**: `convert_mixed_formats_dates(df, column_name)`
- **Example**: `convert_mixed_formats_dates(df, 'date_col')`

## 27. `fill_weekly_date_range`
- **Description**: Fills in missing weekly dates in a DataFrame with a specified frequency.
- **Usage**: `fill_weekly_date_range(df, date_column, freq='W-MON')`
- **Example**: `fill_weekly_date_range(df, 'date_col')`

## 28. `add_prefix_and_suffix`
- **Description**: Adds prefixes and/or suffixes to column names, with an option to exclude a date column.
- **Usage**: `add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)`
- **Example**: `add_prefix_and_suffix(df, prefix='pre_', suffix='_suf', date_col='date_col')`

## 29. `create_dummies`
- **Description**: Creates dummy variables for columns, with an option to add a total dummy column.
- **Usage**: `create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')`
- **Example**: `create_dummies(df, date_col='date_col', dummy_threshold=1)`

## 30. `replace_substrings`
- **Description**: Replaces substrings in a column based on a dictionary, with options for case conversion and new column creation.
- **Usage**: `replace_substrings(df, column, replacements, to_lower=False, new_column=None)`
- **Example**: `replace_substrings(df, 'text_col', {'old': 'new'}, to_lower=True, new_column='updated_text')`

## 31. `add_total_column`
- **Description**: Adds a total column to a DataFrame by summing values across columns, optionally excluding one.
- **Usage**: `add_total_column(df, exclude_col=None, total_col_name='Total')`
- **Example**: `add_total_column(df, exclude_col='date_col')`

## 32. `apply_lookup_table_based_on_substring`
- **Description**: Categorizes text in a column using a lookup table based on substrings.
- **Usage**: `apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')`
- **Example**: `apply_lookup_table_based_on_substring(df, 'text_col', {'sub1': 'cat1', 'sub2': 'cat2'})`

## 33. `compare_overlap`
- **Description**: Compares overlapping periods between two DataFrames and summarizes differences.
- **Usage**: `compare_overlap(df1, df2, date_col)`
- **Example**: `compare_overlap(df1, df2, 'date_col')`

## 34. `week_commencing_2_week_commencing_conversion_isoweekday`
- **Description**: Maps dates to the start of the current ISO week based on a specified weekday.
- **Usage**: `week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')`
- **Example**: `week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')`

## 35. `seasonality_feature_extraction`
- **Description**: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts top features based on feature importance, merges them, optionally retrains models on top and combined features, and returns a dict of results.
- **Usage**: `seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)`
- **Example**: `seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)`

---

## Data Processing for Incrementality Testing

## 1. `pull_ga`
- **Description**: Pull in GA4 data for geo experiments.
- **Usage**: `pull_ga(credentials_file, property_id, start_date, country, metrics)`
- **Example**: `pull_ga('GeoExperiment-31c5f5db2c39.json', '111111111', '2023-10-15', 'United Kingdom', ['totalUsers', 'newUsers'])`

## 2. `process_itv_analysis`
- **Description**: Processes region-level data for geo experiments by mapping ITV regions, grouping selected metrics, merging with media spend data, and saving the result.
- **Usage**: `process_itv_analysis(raw_df, itv_path, cities_path, media_spend_path, output_path, test_group, control_group, columns_to_aggregate, aggregator_list)`
- **Example**: `process_itv_analysis(df, 'itv regional mapping.csv', 'Geo_Mappings_with_Coordinates.xlsx', 'IMS.xlsx', 'itv_for_test_analysis_itvx.csv', ['West', 'Westcountry', 'Tyne Tees'], ['Central Scotland', 'North Scotland'], ['newUsers', 'transactions'], ['sum', 'sum'])`

## 3. `process_city_analysis`
- **Description**: Processes city-level data for geo experiments by grouping selected metrics, merging with media spend data, and saving the result.
- **Usage**: `process_city_analysis(raw_df, spend_df, output_path, test_group, control_group, columns_to_aggregate, aggregator_list)`
- **Example**: `process_city_analysis(df, spend, output, ['Barnsley'], ['Aberdeen'], ['newUsers', 'transactions'], ['sum', 'sum'])`

---

## Data Visualisations

## 1. `plot_one`
- **Description**: Plots a specified column from a DataFrame with white background and black axes.
- **Usage**: `plot_one(df1, col1, date_column)`
- **Example**: `plot_one(df, 'sales', 'date')`

## 2. `plot_two`
- **Description**: Plots specified columns from two DataFrames, optionally on the same or separate y-axes.
- **Usage**: `plot_two(df1, col1, df2, col2, date_column, same_axis=True)`
- **Example**: `plot_two(df1, 'sales', df2, 'revenue', 'date', same_axis=False)`

## 3. `plot_chart`
- **Description**: Plots various chart types using Plotly, including line, bar, scatter, area, pie, etc.
- **Usage**: `plot_chart(df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values')`
- **Example**: `plot_chart(df, 'date', ['sales', 'revenue'], chart_type='line', title='Sales and Revenue')`

---

## Data Pulling

## 1. `pull_fred_data`
- **Description**: Fetch data from FRED using series ID tokens.
- **Usage**: `pull_fred_data(week_commencing, series_id_list)`
- **Example**: `pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])`

## 2. `pull_boe_data`
- **Description**: Fetch and process Bank of England interest rate data.
- **Usage**: `pull_boe_data(week_commencing)`
- **Example**: `pull_boe_data('mon')`

## 3. `pull_oecd`
- **Description**: Fetch macroeconomic data from OECD for a specified country.
- **Usage**: `pull_oecd(country='GBR', week_commencing='mon', start_date='2020-01-01')`
- **Example**: `pull_oecd('GBR', 'mon', '2000-01-01')`

## 4. `get_google_mobility_data`
- **Description**: Fetch Google Mobility data for the specified country.
- **Usage**: `get_google_mobility_data(country, wc)`
- **Example**: `get_google_mobility_data('United Kingdom', 'mon')`

## 5. `pull_seasonality`
- **Description**: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.
- **Usage**: `pull_seasonality(week_commencing, start_date, countries)`
- **Example**: `pull_seasonality('mon', '2020-01-01', ['US', 'GB'])`

## 6. `pull_weather`
- **Description**: Fetch and process historical weather data for the specified country.
- **Usage**: `pull_weather(week_commencing, start_date, country)`
- **Example**: `pull_weather('mon', '2020-01-01', 'GBR')`

## 7. `pull_macro_ons_uk`
- **Description**: Fetch and process time series data from the Beta ONS API.
- **Usage**: `pull_macro_ons_uk(additional_list, week_commencing, sector)`
- **Example**: `pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')`

## 8. `pull_yfinance`
- **Description**: Fetch and process time series data from Yahoo Finance.
- **Usage**: `pull_yfinance(tickers, week_start_day)`
- **Example**: `pull_yfinance(['^FTMC', '^IXIC'], 'mon')`

## 9. `pull_sports_events`
- **Description**: Pull a variety of sports events, primarily football and rugby.
- **Usage**: `pull_sports_events(start_date, week_commencing)`
- **Example**: `pull_sports_events('2020-01-01', 'mon')`

---

## Installation

Install the IMS package via pip:

```bash
pip install imsciences
```

---

## License

This project is licensed under the MIT License.

---

## Roadmap

- [Fixes]: Naming conventions are inconsistent and have changed from previous seasonality tools (e.g. 'seas_nyd' is now named 'seas_new_years_day', 'week_1' is now named 'seas_1').
- [Fixes]: Naming conventions can be inconsistent within the data pull (the suffix on some variables is 'gb', on others 'uk', and others have no suffix); there is also a lack of consistency for global holidays/events (Christmas, Easter, Halloween, etc.): some have a regional suffix and others don't.
- [Additions]: Need to add new data pulls for more macro and seasonal variables.

---
{imsciences-1.0.1 → imsciences-1.0.9}/imsciences.egg-info/SOURCES.txt:

```diff
@@ -5,9 +5,11 @@ setup.py
 imsciences/__init__.py
 imsciences/geo.py
 imsciences/mmm.py
+imsciences/pull-IMS-24Ltp-3.py
 imsciences/pull.py
 imsciences/vis.py
 imsciences.egg-info/PKG-INFO
+imsciences.egg-info/PKG-INFO-IMS-24Ltp-3
 imsciences.egg-info/PKG-INFO-TomG-HP-290722
 imsciences.egg-info/SOURCES.txt
 imsciences.egg-info/dependency_links.txt
```
{imsciences-1.0.1 → imsciences-1.0.9}/setup.py:

```diff
@@ -15,14 +15,13 @@ def read_md(file_name):
     except FileNotFoundError:
         return ""
 
-
 def get_version():
     """Get version from __init__.py file."""
     with Path("imsciences/__init__.py").open("r", encoding="utf-8") as f:
         for line in f:
             if line.startswith("__version__"):
                 return line.split("=")[1].strip().strip('"').strip("'")
-    return "1.0.
+    return "1.0.8"  # Start from 1.0.0 instead of 0.0.0
 
 
 def increment_version():
```
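For reference, the parsing loop in `get_version()` can be exercised standalone; the sample string below is illustrative rather than read from the real package:

```python
# Rerun of the get_version() parsing shown above on a sample __init__.py line.
sample_init = '__version__ = "1.0.9"\n'

for line in sample_init.splitlines(keepends=True):
    if line.startswith("__version__"):
        # split on '=', then strip whitespace and both quote styles
        print(line.split("=")[1].strip().strip('"').strip("'"))  # -> 1.0.9
```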