imsciences 0.9.6.0__py3-none-any.whl → 0.9.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of imsciences might be problematic.

imsciences/pull.py CHANGED
@@ -1171,20 +1171,22 @@ class datapull:
 
      def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
          """
-         Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
+         Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
          aggregates it to weekly averages, and renames variables based on specified rules.
 
          Parameters:
-             cdid_list (list): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
-             week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').
-             sector (str): The sector for which the standard CDIDs are fetched (e.g., 'fast_food', 'retail').
+             cdid_list (list, optional): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
+             week_start_day (str, optional): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'). Defaults to 'mon'.
+             sector (str or list, optional): The sector(s) for which the standard CDIDs are fetched
+                 (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
 
          Returns:
-             pd.DataFrame: A DataFrame with weekly frequency, containing a 'week_commencing' column
-             and all series as renamed columns.
+             pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column (week commencing date)
+             and all series as renamed columns (e.g., 'macro_retail_sales_uk').
+             Returns an empty DataFrame if no data is fetched or processed.
          """
          # Define CDIDs for sectors and defaults
-         sector_cdids = {
+         sector_cdids_map = {
              "fast_food": ["L7TD", "L78Q", "DOAD"],
              "clothing_footwear": ["D7BW","D7GO","CHBJ"],
              "fuel": ["A9FS","L7FP","CHOL"],
@@ -1192,14 +1194,29 @@ class datapull:
              "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
          }
 
-         default_cdids = sector_cdids["default"]
-         sector_specific_cdids = sector_cdids.get(sector, [])
-         standard_cdids = list(set(default_cdids + sector_specific_cdids)) # Avoid duplicates
+         default_cdids = sector_cdids_map["default"]
+         sector_specific_cdids = [] # Initialize empty list for sector CDIDs
+
+         if sector: # Check if sector is not None or empty
+             if isinstance(sector, str):
+                 # If it's a single string, wrap it in a list
+                 sector_list = [sector]
+             elif isinstance(sector, list):
+                 # If it's already a list, use it directly
+                 sector_list = sector
+             else:
+                 raise TypeError("`sector` parameter must be a string or a list of strings.")
+
+             # Iterate through the list of sectors and collect their CDIDs
+             for sec in sector_list:
+                 sector_specific_cdids.extend(sector_cdids_map.get(sec, [])) # Use extend to add items from the list
+
+         standard_cdids = list(set(default_cdids + sector_specific_cdids)) # Combine default and selected sector CDIDs, ensure uniqueness
 
-         # Combine standard CDIDs and additional CDIDs
+         # Combine standard CDIDs and any additional user-provided CDIDs
          if cdid_list is None:
              cdid_list = []
-         cdid_list = list(set(standard_cdids + cdid_list)) # Avoid duplicates
+         final_cdid_list = list(set(standard_cdids + cdid_list)) # Ensure uniqueness in the final list
 
          base_search_url = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
          base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
@@ -1207,41 +1224,57 @@ class datapull:
 
          # Map week start day to pandas weekday convention
          days_map = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
-         if week_start_day not in days_map:
+         if week_start_day.lower() not in days_map:
              raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
-         week_start = days_map[week_start_day]
+         week_start = days_map[week_start_day.lower()] # Use lower() for case-insensitivity
 
-         for cdid in cdid_list:
+         for cdid in final_cdid_list: # Use the final combined list
              try:
                  # Search for the series
                  search_url = f"{base_search_url}{cdid}"
-                 search_response = requests.get(search_url)
+                 search_response = requests.get(search_url, timeout=30) # Add timeout
                  search_response.raise_for_status()
                  search_data = search_response.json()
 
                  items = search_data.get("items", [])
                  if not items:
-                     print(f"No data found for CDID: {cdid}")
+                     print(f"Warning: No data found for CDID: {cdid}")
                      continue
 
                  # Extract series name and latest release URI
-                 series_name = items[0].get("title", f"Series_{cdid}")
-                 latest_date = max(
-                     datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
-                     for item in items if "release_date" in item
-                 )
-                 latest_uri = next(
-                     item["uri"] for item in items
-                     if "release_date" in item and datetime.fromisoformat(item["release_date"].replace("Z", "+00:00")) == latest_date
-                 )
+                 # Find the item with the most recent release_date
+                 latest_item = None
+                 latest_date = None
+                 for item in items:
+                     if "release_date" in item:
+                         try:
+                             # Ensure timezone awareness for comparison
+                             current_date = datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
+                             if latest_date is None or current_date > latest_date:
+                                 latest_date = current_date
+                                 latest_item = item
+                         except ValueError:
+                             print(f"Warning: Could not parse release_date '{item['release_date']}' for CDID {cdid}")
+                             continue # Skip this item if date is invalid
+
+                 if latest_item is None:
+                     print(f"Warning: No valid release date found for CDID: {cdid}")
+                     continue
+
+                 series_name = latest_item.get("title", f"Series_{cdid}") # Use title from the latest item
+                 latest_uri = latest_item.get("uri")
+                 if not latest_uri:
+                     print(f"Warning: No URI found for the latest release of CDID: {cdid}")
+                     continue
 
                  # Fetch the dataset
                  data_url = f"{base_data_url}{latest_uri}"
-                 data_response = requests.get(data_url)
+                 data_response = requests.get(data_url, timeout=30) # Add timeout
                  data_response.raise_for_status()
                  data_json = data_response.json()
 
                  # Detect the frequency and process accordingly
+                 frequency_key = None
                  if "months" in data_json and data_json["months"]:
                      frequency_key = "months"
                  elif "quarters" in data_json and data_json["quarters"]:
@@ -1249,72 +1282,142 @@ class datapull:
                  elif "years" in data_json and data_json["years"]:
                      frequency_key = "years"
                  else:
-                     print(f"Unsupported frequency or no data for CDID: {cdid}")
+                     print(f"Warning: Unsupported frequency or no data values found for CDID: {cdid} at URI {latest_uri}")
                      continue
 
                  # Prepare the DataFrame
+                 if not data_json[frequency_key]: # Check if the list of values is empty
+                     print(f"Warning: Empty data list for frequency '{frequency_key}' for CDID: {cdid}")
+                     continue
+
                  df = pd.DataFrame(data_json[frequency_key])
 
-                 # Parse the 'date' field based on frequency
-                 if frequency_key == "months":
-                     df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
-                 elif frequency_key == "quarters":
-                     def parse_quarter(quarter_str):
-                         year, qtr = quarter_str.split(" Q")
-                         month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
-                         return datetime(int(year), month, 1)
-                     df["date"] = df["date"].apply(parse_quarter)
-                 elif frequency_key == "years":
-                     df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+                 # Check if essential columns exist
+                 if "date" not in df.columns or "value" not in df.columns:
+                     print(f"Warning: Missing 'date' or 'value' column for CDID: {cdid}")
+                     continue
 
+                 # Parse the 'date' field based on frequency
+                 try:
+                     if frequency_key == "months":
+                         # Handles "YYYY Mon" format (e.g., "2023 FEB") - adjust if format differs
+                         df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
+                     elif frequency_key == "quarters":
+                         def parse_quarter(quarter_str):
+                             try:
+                                 year, qtr = quarter_str.split(" Q")
+                                 month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
+                                 return datetime(int(year), month, 1)
+                             except (ValueError, KeyError):
+                                 return pd.NaT # Return Not a Time for parsing errors
+                         df["date"] = df["date"].apply(parse_quarter)
+                     elif frequency_key == "years":
+                         df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+                 except Exception as e:
+                     print(f"Error parsing date for CDID {cdid} with frequency {frequency_key}: {e}")
+                     continue # Skip this series if date parsing fails
+
+                 # Coerce value to numeric, handle potential errors
                  df["value"] = pd.to_numeric(df["value"], errors="coerce")
+
+                 # Drop rows where date or value parsing failed
+                 df.dropna(subset=["date", "value"], inplace=True)
+
+                 if df.empty:
+                     print(f"Warning: No valid data points after processing for CDID: {cdid}")
+                     continue
+
                  df.rename(columns={"value": series_name}, inplace=True)
 
                  # Combine data
-                 df = df.loc[:, ["date", series_name]].dropna().reset_index(drop=True)
+                 df_subset = df.loc[:, ["date", series_name]].reset_index(drop=True) # Explicitly select columns
                  if combined_df.empty:
-                     combined_df = df
+                     combined_df = df_subset
                  else:
-                     combined_df = pd.merge(combined_df, df, on="date", how="outer")
+                     # Use outer merge to keep all dates, sort afterwards
+                     combined_df = pd.merge(combined_df, df_subset, on="date", how="outer")
 
              except requests.exceptions.RequestException as e:
                  print(f"Error fetching data for CDID {cdid}: {e}")
-             except (KeyError, ValueError) as e:
+             except (KeyError, ValueError, TypeError) as e: # Added TypeError
                  print(f"Error processing data for CDID {cdid}: {e}")
+             except Exception as e: # Catch unexpected errors
+                 print(f"An unexpected error occurred for CDID {cdid}: {e}")
+
 
          if not combined_df.empty:
+             # Sort by date after merging to ensure correct forward fill
+             combined_df.sort_values(by="date", inplace=True)
+             combined_df.reset_index(drop=True, inplace=True)
+
+             # Create a complete daily date range
              min_date = combined_df["date"].min()
-             max_date = datetime.today()
+             # Ensure max_date is timezone-naive if min_date is, or consistent otherwise
+             max_date = pd.Timestamp(datetime.today().date()) # Use today's date, timezone-naive
+
+             if pd.isna(min_date):
+                 print("Error: Minimum date is NaT, cannot create date range.")
+                 return pd.DataFrame()
+
+             # Make sure min_date is not NaT before creating the range
             date_range = pd.date_range(start=min_date, end=max_date, freq='D')
             daily_df = pd.DataFrame(date_range, columns=['date'])
+
+             # Merge with original data and forward fill
             daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
             daily_df = daily_df.ffill()
 
+             # Drop rows before the first valid data point after ffill
+             first_valid_index = daily_df.dropna(subset=daily_df.columns.difference(['date'])).index.min()
+             if pd.notna(first_valid_index):
+                 daily_df = daily_df.loc[first_valid_index:]
+             else:
+                 print("Warning: No valid data points found after forward filling.")
+                 return pd.DataFrame() # Return empty if ffill results in no data
+
+
             # Aggregate to weekly frequency
-             daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start) % 7, unit='D')
+             # Ensure 'date' column is datetime type before dt accessor
+             daily_df['date'] = pd.to_datetime(daily_df['date'])
+             daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start + 7) % 7, unit='D') # Corrected logic for week start
+             # Group by week_commencing and calculate mean for numeric columns only
             weekly_df = daily_df.groupby("week_commencing").mean(numeric_only=True).reset_index()
 
+
             def clean_column_name(name):
+                 # Remove content within parentheses (e.g., CPI INDEX 00: ALL ITEMS 2015=100)
                 name = re.sub(r"\(.*?\)", "", name)
+                 # Take only the part before the first colon if present
                 name = re.split(r":", name)[0]
-                 name = re.sub(r"\d+", "", name)
+                 # Remove digits
+                 #name = re.sub(r"\d+", "", name) # Reconsider removing all digits, might be needed for some series
+                 # Remove specific words like 'annual', 'rate' case-insensitively
                 name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
+                 # Remove non-alphanumeric characters (except underscore and space)
                 name = re.sub(r"[^\w\s]", "", name)
+                 # Replace spaces with underscores
+                 name = name.strip() # Remove leading/trailing whitespace
                 name = name.replace(" ", "_")
+                 # Replace multiple underscores with a single one
                 name = re.sub(r"_+", "_", name)
+                 # Remove trailing underscores
                 name = name.rstrip("_")
+                 # Add prefix and suffix
                 return f"macro_{name.lower()}_uk"
 
+             # Apply cleaning function to relevant columns
             weekly_df.columns = [clean_column_name(col) if col != "week_commencing" else col for col in weekly_df.columns]
-             weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)
+             weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True) # Rename week commencing col
 
-             weekly_df = weekly_df.fillna(0)
+             # Optional: Fill remaining NaNs (e.g., at the beginning if ffill didn't cover) with 0
+             # Consider if 0 is the appropriate fill value for your use case
+             # weekly_df = weekly_df.fillna(0)
 
             return weekly_df
         else:
-             print("No data available to process.")
+             print("No data successfully fetched or processed.")
             return pd.DataFrame()
-
+
      def pull_yfinance(self, tickers=None, week_start_day="mon"):
          """
          Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,
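
For orientation, here is a minimal usage sketch of the method as it behaves in 0.9.6.3. It is not part of the diff: it assumes the package is installed and the ONS beta API is reachable, and the 'HBOI' CDID and the sector names are taken from the package README and docstring.

```python
# Minimal sketch of the updated pull_macro_ons_uk behaviour (assumes network
# access to api.beta.ons.gov.uk; 'HBOI' and the sector names come from the README).
from imsciences import datapull

ims_pull = datapull()

# `sector` now accepts either a single string or a list of sectors, and
# `week_start_day` is matched case-insensitively ('Mon' works like 'mon').
weekly = ims_pull.pull_macro_ons_uk(
    cdid_list=["HBOI"],                  # extra CDIDs on top of the defaults
    week_start_day="Mon",
    sector=["fast_food", "retail"],      # sector CDIDs are combined and de-duplicated
)

# The result is keyed by 'OBS' (week commencing date); an empty DataFrame is
# returned if nothing could be fetched or processed.
print(weekly.head())
```

Unknown sector keys simply contribute no extra CDIDs (the lookup falls back to an empty list), while passing anything other than a string or a list for `sector` now raises a `TypeError`.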
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: imsciences
- Version: 0.9.6.0
+ Version: 0.9.6.3
  Summary: IMS Data Processing Package
  Author: IMS
  Author-email: cam@im-sciences.com
@@ -0,0 +1,355 @@
+ Metadata-Version: 2.1
+ Name: imsciences
+ Version: 0.9.6.3
+ Summary: IMS Data Processing Package
+ Author: IMS
+ Author-email: cam@im-sciences.com
+ Keywords: data processing,apis,data analysis,data visualization,machine learning
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Operating System :: Unix
+ Classifier: Operating System :: MacOS :: MacOS X
+ Classifier: Operating System :: Microsoft :: Windows
+ Description-Content-Type: text/markdown
+ License-File: LICENSE.txt
+ Requires-Dist: pandas
+ Requires-Dist: plotly
+ Requires-Dist: numpy
+ Requires-Dist: fredapi
+ Requires-Dist: xgboost
+ Requires-Dist: scikit-learn
+ Requires-Dist: bs4
+ Requires-Dist: yfinance
+ Requires-Dist: holidays
+ Requires-Dist: google-analytics-data
+ Requires-Dist: geopandas
+ Requires-Dist: geopy
+
+ # IMS Package Documentation
+
+ The **Independent Marketing Sciences** package is a Python library designed to process incoming data into a format tailored for projects, particularly those utilising weekly time series data. This package offers a suite of functions for efficient data collection, manipulation, visualisation and analysis.
+
+ ---
+
+ ## Key Features
+ - Seamless data processing for time series workflows.
+ - Aggregation, filtering, and transformation of time series data.
+ - Visualising Data
+ - Integration with external data sources like FRED, Bank of England and ONS.
+
+ ---
+
+ Table of Contents
+ =================
+
+ 1. [Usage](#usage)
+ 2. [Data Processing for Time Series](#data-processing-for-time-series)
+ 3. [Data Processing for Incrementality Testing](#data-processing-for-incrementality-testing)
+ 4. [Data Visualisations](#data-visualisations)
+ 5. [Data Pulling](#data-pulling)
+ 6. [Installation](#installation)
+ 7. [License](#license)
+ 8. [Roadmap](#roadmap)
+
+ ---
+
+ ## Usage
+
+ ```python
+ from imsciences import dataprocessing, geoprocessing, datapull, datavis
+ ims_proc = dataprocessing()
+ ims_geo = geoprocessing()
+ ims_pull = datapull()
+ ims_vis = datavis()
+ ```
+
+ ## Data Processing for Time Series
+
+ ## 1. `get_wd_levels`
+ - **Description**: Get the working directory with the option of moving up parents.
+ - **Usage**: `get_wd_levels(levels)`
+ - **Example**: `get_wd_levels(0)`
+
+ ## 2. `aggregate_daily_to_wc_long`
+ - **Description**: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.
+ - **Usage**: `aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')`
+ - **Example**: `aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')`
+
+ ## 3. `convert_monthly_to_daily`
+ - **Description**: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.
+ - **Usage**: `convert_monthly_to_daily(df, date_column, divide=True)`
+ - **Example**: `convert_monthly_to_daily(df, 'date')`
+
+ ## 4. `week_of_year_mapping`
+ - **Description**: Converts a week column in 'yyyy-Www' or 'yyyy-ww' format to week commencing date.
+ - **Usage**: `week_of_year_mapping(df, week_col, start_day_str)`
+ - **Example**: `week_of_year_mapping(df, 'week', 'mon')`
+
+ ## 5. `rename_cols`
+ - **Description**: Renames columns in a pandas DataFrame with a specified prefix or format.
+ - **Usage**: `rename_cols(df, name='ame_')`
+ - **Example**: `rename_cols(df, 'ame_facebook')`
+
+ ## 6. `merge_new_and_old`
+ - **Description**: Creates a new DataFrame by merging old and new dataframes based on a cutoff date.
+ - **Usage**: `merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')`
+ - **Example**: `merge_new_and_old(df1, 'old_col', df2, 'new_col', '2023-01-15')`
+
+ ## 7. `merge_dataframes_on_column`
+ - **Description**: Merge a list of DataFrames on a common column.
+ - **Usage**: `merge_dataframes_on_column(dataframes, common_column='OBS', merge_how='outer')`
+ - **Example**: `merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='outer')`
+
+ ## 8. `merge_and_update_dfs`
+ - **Description**: Merges two dataframes, updating columns from the second dataframe where values are available.
+ - **Usage**: `merge_and_update_dfs(df1, df2, key_column)`
+ - **Example**: `merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')`
+
+ ## 9. `convert_us_to_uk_dates`
+ - **Description**: Convert a DataFrame column with mixed US and UK date formats to datetime.
+ - **Usage**: `convert_us_to_uk_dates(df, date_col)`
+ - **Example**: `convert_us_to_uk_dates(df, 'date')`
+
+ ## 10. `combine_sheets`
+ - **Description**: Combines multiple DataFrames from a dictionary into a single DataFrame.
+ - **Usage**: `combine_sheets(all_sheets)`
+ - **Example**: `combine_sheets({'Sheet1': df1, 'Sheet2': df2})`
+
+ ## 11. `pivot_table`
+ - **Description**: Dynamically pivots a DataFrame based on specified columns.
+ - **Usage**: `pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing='W-MON')`
+ - **Example**: `pivot_table(df, 'OBS', 'Channel Short Names', 'Value', filters_dict={'Master Include': ' == 1'}, fill_value=0)`
+
+ ## 12. `apply_lookup_table_for_columns`
+ - **Description**: Maps substrings in columns to new values based on a dictionary.
+ - **Usage**: `apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')`
+ - **Example**: `apply_lookup_table_for_columns(df, col_names, {'spend': 'spd'}, if_not_in_dict='Other', new_column_name='Metrics Short')`
+
+ ## 13. `aggregate_daily_to_wc_wide`
+ - **Description**: Aggregates daily data into weekly data and pivots it to wide format.
+ - **Usage**: `aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc='sun', aggregation='sum', include_totals=False)`
+ - **Example**: `aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions'], 'mon', 'average', True)`
+
+ ## 14. `merge_cols_with_seperator`
+ - **Description**: Merges multiple columns in a DataFrame into one column with a specified separator.
+ - **Usage**: `merge_cols_with_seperator(df, col_names, separator='_', output_column_name='Merged')`
+ - **Example**: `merge_cols_with_seperator(df, ['Campaign', 'Product'], separator='|', output_column_name='Merged Columns')`
+
+ ## 15. `check_sum_of_df_cols_are_equal`
+ - **Description**: Checks if the sum of two columns in two DataFrames are equal and provides the difference.
+ - **Usage**: `check_sum_of_df_cols_are_equal(df_1, df_2, cols_1, cols_2)`
+ - **Example**: `check_sum_of_df_cols_are_equal(df_1, df_2, 'Media Cost', 'Spend')`
+
+ ## 16. `convert_2_df_cols_to_dict`
+ - **Description**: Creates a dictionary from two DataFrame columns.
+ - **Usage**: `convert_2_df_cols_to_dict(df, key_col, value_col)`
+ - **Example**: `convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')`
+
+ ## 17. `create_FY_and_H_columns`
+ - **Description**: Adds financial year and half-year columns to a DataFrame based on a start date.
+ - **Usage**: `create_FY_and_H_columns(df, index_col, start_date, starting_FY, short_format='No', half_years='No', combined_FY_and_H='No')`
+ - **Example**: `create_FY_and_H_columns(df, 'Week', '2022-10-03', 'FY2023', short_format='Yes')`
+
+ ## 18. `keyword_lookup_replacement`
+ - **Description**: Updates values in a column based on a lookup dictionary with conditional logic.
+ - **Usage**: `keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name='Updated Column')`
+ - **Example**: `keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel', 'segment'], lookup_dict, output_column_name='Channel New')`
+
+ ## 19. `create_new_version_of_col_using_LUT`
+ - **Description**: Creates a new column based on a lookup table applied to an existing column.
+ - **Usage**: `create_new_version_of_col_using_LUT(df, keys_col, value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')`
+ - **Example**: `create_new_version_of_col_using_LUT(df, 'Campaign Name', 'Campaign Type', lookup_dict)`
+
+ ## 20. `convert_df_wide_2_long`
+ - **Description**: Converts a wide-format DataFrame into a long-format DataFrame.
+ - **Usage**: `convert_df_wide_2_long(df, value_cols, variable_col_name='Stacked', value_col_name='Value')`
+ - **Example**: `convert_df_wide_2_long(df, ['col1', 'col2'], variable_col_name='Var', value_col_name='Val')`
+
+ ## 21. `manually_edit_data`
+ - **Description**: Manually updates specified cells in a DataFrame based on filters.
+ - **Usage**: `manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)`
+ - **Example**: `manually_edit_data(df, {'col1': '== 1'}, 'col2', 'new_val', add_notes='Yes', note='Manual Update')`
+
+ ## 22. `format_numbers_with_commas`
+ - **Description**: Formats numerical columns with commas and a specified number of decimal places.
+ - **Usage**: `format_numbers_with_commas(df, decimal_length_chosen=2)`
+ - **Example**: `format_numbers_with_commas(df, decimal_length_chosen=1)`
+
+ ## 23. `filter_df_on_multiple_conditions`
+ - **Description**: Filters a DataFrame based on multiple column conditions.
+ - **Usage**: `filter_df_on_multiple_conditions(df, filters_dict)`
+ - **Example**: `filter_df_on_multiple_conditions(df, {'col1': '>= 5', 'col2': "== 'val'"})`
+
+ ## 24. `read_and_concatenate_files`
+ - **Description**: Reads and concatenates files from a specified folder into a single DataFrame.
+ - **Usage**: `read_and_concatenate_files(folder_path, file_type='csv')`
+ - **Example**: `read_and_concatenate_files('/path/to/files', file_type='xlsx')`
+
+ ## 25. `upgrade_outdated_packages`
+ - **Description**: Upgrades all outdated Python packages except specified ones.
+ - **Usage**: `upgrade_outdated_packages(exclude_packages=['twine'])`
+ - **Example**: `upgrade_outdated_packages(exclude_packages=['pip', 'setuptools'])`
+
+ ## 26. `convert_mixed_formats_dates`
+ - **Description**: Converts mixed-format date columns into standardized datetime format.
+ - **Usage**: `convert_mixed_formats_dates(df, column_name)`
+ - **Example**: `convert_mixed_formats_dates(df, 'date_col')`
+
+ ## 27. `fill_weekly_date_range`
+ - **Description**: Fills in missing weekly dates in a DataFrame with a specified frequency.
+ - **Usage**: `fill_weekly_date_range(df, date_column, freq='W-MON')`
+ - **Example**: `fill_weekly_date_range(df, 'date_col')`
+
+ ## 28. `add_prefix_and_suffix`
+ - **Description**: Adds prefixes and/or suffixes to column names, with an option to exclude a date column.
+ - **Usage**: `add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)`
+ - **Example**: `add_prefix_and_suffix(df, prefix='pre_', suffix='_suf', date_col='date_col')`
+
+ ## 29. `create_dummies`
+ - **Description**: Creates dummy variables for columns, with an option to add a total dummy column.
+ - **Usage**: `create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')`
+ - **Example**: `create_dummies(df, date_col='date_col', dummy_threshold=1)`
+
+ ## 30. `replace_substrings`
+ - **Description**: Replaces substrings in a column based on a dictionary, with options for case conversion and new column creation.
+ - **Usage**: `replace_substrings(df, column, replacements, to_lower=False, new_column=None)`
+ - **Example**: `replace_substrings(df, 'text_col', {'old': 'new'}, to_lower=True, new_column='updated_text')`
+
+ ## 31. `add_total_column`
+ - **Description**: Adds a total column to a DataFrame by summing values across columns, optionally excluding one.
+ - **Usage**: `add_total_column(df, exclude_col=None, total_col_name='Total')`
+ - **Example**: `add_total_column(df, exclude_col='date_col')`
+
+ ## 32. `apply_lookup_table_based_on_substring`
+ - **Description**: Categorizes text in a column using a lookup table based on substrings.
+ - **Usage**: `apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')`
+ - **Example**: `apply_lookup_table_based_on_substring(df, 'text_col', {'sub1': 'cat1', 'sub2': 'cat2'})`
+
+ ## 33. `compare_overlap`
+ - **Description**: Compares overlapping periods between two DataFrames and summarizes differences.
+ - **Usage**: `compare_overlap(df1, df2, date_col)`
+ - **Example**: `compare_overlap(df1, df2, 'date_col')`
+
+ ## 34. `week_commencing_2_week_commencing_conversion_isoweekday`
+ - **Description**: Maps dates to the start of the current ISO week based on a specified weekday.
+ - **Usage**: `week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')`
+ - **Example**: `week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')`
+
+ ## 35. `seasonality_feature_extraction`
+ - **Description**: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts top features based on feature importance, merges them, optionally retrains models on top and combined features, and returns a dict of results.
+ - **Usage**: `seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)`
+ - **Example**: `seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)`
+
+ ---
+
+ ## Data Processing for Incrementality Testing
+
+ ## 1. `pull_ga`
+ - **Description**: Pull in GA4 data for geo experiments.
+ - **Usage**: `pull_ga(credentials_file, property_id, start_date, country, metrics)`
+ - **Example**: `pull_ga('GeoExperiment-31c5f5db2c39.json', '111111111', '2023-10-15', 'United Kingdom', ['totalUsers', 'newUsers'])`
+
+ ## 2. `process_itv_analysis`
+ - **Description**: Processes region-level data for geo experiments by mapping ITV regions, grouping selected metrics, merging with media spend data, and saving the result.
+ - **Usage**: `process_itv_analysis(self, raw_df, itv_path, cities_path, media_spend_path, output_path, test_group, control_group, columns_to_aggregate, aggregator_list)`
+ - **Example**: `process_itv_analysis(df, 'itv regional mapping.csv', 'Geo_Mappings_with_Coordinates.xlsx', 'IMS.xlsx', 'itv_for_test_analysis_itvx.csv', ['West', 'Westcountry', 'Tyne Tees'], ['Central Scotland', 'North Scotland'], ['newUsers', 'transactions'], ['sum', 'sum'])`
+
+ ## 3. `process_city_analysis`
+ - **Description**: Processes city-level data for geo experiments by grouping selected metrics, merging with media spend data, and saving the result.
+ - **Usage**: `process_city_analysis(raw_df, spend_df, output_path, test_group, control_group, columns_to_aggregate, aggregator_list)`
+ - **Example**: `process_city_analysis(df, spend, output, ['Barnsley'], ['Aberdeen'], ['newUsers', 'transactions'], ['sum', 'sum'])`
+
+ ---
+
+ ## Data Visualisations
+
+ ## 1. `plot_one`
+ - **Description**: Plots a specified column from a DataFrame with white background and black axes.
+ - **Usage**: `plot_one(df1, col1, date_column)`
+ - **Example**: `plot_one(df, 'sales', 'date')`
+
+ ## 2. `plot_two`
+ - **Description**: Plots specified columns from two DataFrames, optionally on the same or separate y-axes.
+ - **Usage**: `plot_two(df1, col1, df2, col2, date_column, same_axis=True)`
+ - **Example**: `plot_two(df1, 'sales', df2, 'revenue', 'date', same_axis=False)`
+
+ ## 3. `plot_chart`
+ - **Description**: Plots various chart types using Plotly, including line, bar, scatter, area, pie, etc.
+ - **Usage**: `plot_chart(df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values')`
+ - **Example**: `plot_chart(df, 'date', ['sales', 'revenue'], chart_type='line', title='Sales and Revenue')`
+
+ ---
+
+ ## Data Pulling
+
+ ## 1. `pull_fred_data`
+ - **Description**: Fetch data from FRED using series ID tokens.
+ - **Usage**: `pull_fred_data(week_commencing, series_id_list)`
+ - **Example**: `pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])`
+
+ ## 2. `pull_boe_data`
+ - **Description**: Fetch and process Bank of England interest rate data.
+ - **Usage**: `pull_boe_data(week_commencing)`
+ - **Example**: `pull_boe_data('mon')`
+
+ ## 3. `pull_oecd`
+ - **Description**: Fetch macroeconomic data from OECD for a specified country.
+ - **Usage**: `pull_oecd(country='GBR', week_commencing='mon', start_date='2020-01-01')`
+ - **Example**: `pull_oecd('GBR', 'mon', '2000-01-01')`
+
+ ## 4. `get_google_mobility_data`
+ - **Description**: Fetch Google Mobility data for the specified country.
+ - **Usage**: `get_google_mobility_data(country, wc)`
+ - **Example**: `get_google_mobility_data('United Kingdom', 'mon')`
+
+ ## 5. `pull_seasonality`
+ - **Description**: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.
+ - **Usage**: `pull_seasonality(week_commencing, start_date, countries)`
+ - **Example**: `pull_seasonality('mon', '2020-01-01', ['US', 'GB'])`
+
+ ## 6. `pull_weather`
+ - **Description**: Fetch and process historical weather data for the specified country.
+ - **Usage**: `pull_weather(week_commencing, start_date, country)`
+ - **Example**: `pull_weather('mon', '2020-01-01', 'GBR')`
+
+ ## 7. `pull_macro_ons_uk`
+ - **Description**: Fetch and process time series data from the Beta ONS API.
+ - **Usage**: `pull_macro_ons_uk(additional_list, week_commencing, sector)`
+ - **Example**: `pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')`
+
+ ## 8. `pull_yfinance`
+ - **Description**: Fetch and process time series data from Yahoo Finance.
+ - **Usage**: `pull_yfinance(tickers, week_start_day)`
+ - **Example**: `pull_yfinance(['^FTMC', '^IXIC'], 'mon')`
+
+ ## 9. `pull_sports_events`
+ - **Description**: Pull a variety of sports events, primarily football and rugby.
+ - **Usage**: `pull_sports_events(start_date, week_commencing)`
+ - **Example**: `pull_sports_events('2020-01-01', 'mon')`
+
+ ---
+
+ ## Installation
+
+ Install the IMS package via pip:
+
+ ```bash
+ pip install imsciences
+ ```
+
+ ---
+
+ ## License
+
+ This project is licensed under the MIT License. ![License](https://img.shields.io/badge/license-MIT-blue.svg)
+
+ ---
+
+ ## Roadmap
+
+ - [Fixes]: Naming conventions are inconsistent and have changed from previous seasonality tools (e.g. 'seas_nyd' is named 'seas_new_years_day', 'week_1' is named 'seas_1').
+ - [Fixes]: Naming conventions can be inconsistent within the data pull (the suffix on some variables is 'gb', on some it is 'uk', and for others there is no suffix). Furthermore, there is a lack of consistency for global holidays/events (Christmas, Easter, Halloween, etc.): some have a regional suffix and others don't.
+ - [Additions]: Need to add new data pulls for more macro and seasonal variables.
+
+ ---
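
Tying the documented pieces together, here is a minimal end-to-end sketch. It is not part of the diff: the calls are taken verbatim from the usage lines above, while the merge and plot choices are purely illustrative.

```python
# Illustrative workflow combining the documented pulls; signatures follow the
# README usage lines, and the column choices are examples only.
from imsciences import dataprocessing, datapull, datavis

ims_proc = dataprocessing()
ims_pull = datapull()
ims_vis = datavis()

# Weekly macro series from the ONS and Bank of England rates, both on a Monday week commencing.
macro = ims_pull.pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')
boe = ims_pull.pull_boe_data('mon')

# Merge on the shared 'OBS' column and plot the first pulled series.
combined = ims_proc.merge_dataframes_on_column([macro, boe], common_column='OBS', merge_how='outer')
ims_vis.plot_one(combined, combined.columns[1], 'OBS')
```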
@@ -0,0 +1,12 @@
+ imsciences/__init__.py,sha256=_HuYeLbDMTdt7GpKI4r6-d7yRPZgcAQ7yOW0-ydR2Yo,117
+ imsciences/geo.py,sha256=eenng7_BP_E2WD5Wt1G_oNxQS8W3t6lycRwJ91ngysY,15808
+ imsciences/mmm.py,sha256=qMh0ccOepehfCcux7EeG8cq6piSEoFEz5iiJbDBWOS4,82214
+ imsciences/pull.py,sha256=F83xlklM_lyPffMMZasHWLxDaeUHtOnUQGAUsiV7ves,88073
+ imsciences/unittesting.py,sha256=U177_Txg0Lqn49zYRu5bl9OVe_X7MkNJ6V_Zd6DHOsU,45656
+ imsciences/vis.py,sha256=2izdHQhmWEReerRqIxhY4Ai10VjL7xoUqyWyZC7-2XI,8931
+ imsciences-0.9.6.3.dist-info/LICENSE.txt,sha256=lVq2QwcExPX4Kl2DHeEkRrikuItcDB1Pr7yF7FQ8_z8,1108
+ imsciences-0.9.6.3.dist-info/METADATA,sha256=RMcthCSyWmU6IBsXGL-nYqw0RP06pzjPKK3dzOQcU-8,18846
+ imsciences-0.9.6.3.dist-info/PKG-INFO-TomG-HP-290722,sha256=RMcthCSyWmU6IBsXGL-nYqw0RP06pzjPKK3dzOQcU-8,18846
+ imsciences-0.9.6.3.dist-info/WHEEL,sha256=ixB2d4u7mugx_bCBycvM9OzZ5yD7NmPXFRtKlORZS2Y,91
+ imsciences-0.9.6.3.dist-info/top_level.txt,sha256=hsENS-AlDVRh8tQJ6-426iUQlla9bPcGc0-UlFF0_iU,11
+ imsciences-0.9.6.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.2.0)
+ Generator: setuptools (74.1.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
@@ -1,11 +0,0 @@
- imsciences/__init__.py,sha256=_HuYeLbDMTdt7GpKI4r6-d7yRPZgcAQ7yOW0-ydR2Yo,117
- imsciences/geo.py,sha256=eenng7_BP_E2WD5Wt1G_oNxQS8W3t6lycRwJ91ngysY,15808
- imsciences/mmm.py,sha256=qMh0ccOepehfCcux7EeG8cq6piSEoFEz5iiJbDBWOS4,82214
- imsciences/pull.py,sha256=B05cjuWCihFfZp8pyO118QYHJiASsWn94s1o5hd1n1Q,81788
- imsciences/unittesting.py,sha256=U177_Txg0Lqn49zYRu5bl9OVe_X7MkNJ6V_Zd6DHOsU,45656
- imsciences/vis.py,sha256=2izdHQhmWEReerRqIxhY4Ai10VjL7xoUqyWyZC7-2XI,8931
- imsciences-0.9.6.0.dist-info/LICENSE.txt,sha256=lVq2QwcExPX4Kl2DHeEkRrikuItcDB1Pr7yF7FQ8_z8,1108
- imsciences-0.9.6.0.dist-info/METADATA,sha256=Khfs0zUye-2GAdswojmCutDo3JBq2OF0fEjuK0pkBR4,18846
- imsciences-0.9.6.0.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
- imsciences-0.9.6.0.dist-info/top_level.txt,sha256=hsENS-AlDVRh8tQJ6-426iUQlla9bPcGc0-UlFF0_iU,11
- imsciences-0.9.6.0.dist-info/RECORD,,