imsciences 0.9.5.9__tar.gz → 0.9.6.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: imsciences
- Version: 0.9.5.9
+ Version: 0.9.6.3
  Summary: IMS Data Processing Package
  Author: IMS
  Author-email: cam@im-sciences.com
@@ -133,7 +133,7 @@ class datapull:
 
  Args:
  week_commencing (str): The starting day of the week for aggregation.
- Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+ Options are "mon", "tue", "wed", "thu", "fri", "sat", "sun".
  Default is "mon".
  max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 5.
  delay (int): Delay in seconds between retry attempts. Default is 5.
@@ -144,7 +144,7 @@ class datapull:
  and 'macro_boe_intr_rate' contains the average interest rate for the week.
  """
  # Week commencing dictionary
- day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+ day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
 
  # URL of the Bank of England data page
  url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
@@ -209,7 +209,7 @@ class datapull:
  Args:
  country (list): A string containing the 3-letter code of the country of interest (e.g. "GBR", "FRA", "USA", "DEU")
  week_commencing (str): The starting day of the week for aggregation.
- Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+ Options are "mon", "tue", "wed", "thu", "fri", "sat", "sun".
  start_date (str): Dataset start date in the format "YYYY-MM-DD"
 
  Returns:
@@ -383,7 +383,7 @@ class datapull:
  # ---------------------------------------------------------------------
  # 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
  # ---------------------------------------------------------------------
- day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+ day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
 
  # ---------------------------------------------------------------------
  # 1. Create daily date range from start_date to today
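The recurring change in these hunks is the rename of the "thur" key to "thu", making every accepted day string a consistent three letters. The dictionary values follow Python's `weekday()` convention (Monday = 0); a minimal standard-library sketch of the week-commencing roll-back these functions share (illustrative, not part of the package diff):

```python
from datetime import date, timedelta

# Keys match the renamed three-letter options; values are date.weekday() numbers.
day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}

def week_commencing(d: date, wc: str = "mon") -> date:
    """Roll a date back to the most recent occurrence of the chosen week start."""
    return d - timedelta(days=(d.weekday() - day_dict[wc]) % 7)

print(week_commencing(date(2024, 5, 18), "mon"))  # 2024-05-13 (Saturday -> Monday)
print(week_commencing(date(2024, 5, 16), "thu"))  # 2024-05-16 (already a Thursday)
```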
@@ -668,7 +668,7 @@ class datapull:
  raise ValueError("country_codes must be a list/tuple or a single string.")
 
  # --- Setup / Constants --- #
- day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+ day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
  # Map each 2-letter code to a key
  country_dict = {
  "US": "US_STATES",
@@ -1171,74 +1171,110 @@ class datapull:
 
  def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
  """
- Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
+ Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
  aggregates it to weekly averages, and renames variables based on specified rules.
 
  Parameters:
- cdid_list (list): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
- week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').
- sector (str): The sector for which the standard CDIDs are fetched (e.g., 'fast_food', 'retail').
+ cdid_list (list, optional): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
+ week_start_day (str, optional): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'). Defaults to 'mon'.
+ sector (str or list, optional): The sector(s) for which the standard CDIDs are fetched
+ (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
 
  Returns:
- pd.DataFrame: A DataFrame with weekly frequency, containing a 'week_commencing' column
- and all series as renamed columns.
+ pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column (week commencing date)
+ and all series as renamed columns (e.g., 'macro_retail_sales_uk').
+ Returns an empty DataFrame if no data is fetched or processed.
  """
  # Define CDIDs for sectors and defaults
- sector_cdids = {
+ sector_cdids_map = {
  "fast_food": ["L7TD", "L78Q", "DOAD"],
+ "clothing_footwear": ["D7BW","D7GO","CHBJ"],
+ "fuel": ["A9FS","L7FP","CHOL"],
+ "cars":["D7E8","D7E9","D7CO"],
  "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
  }
 
- default_cdids = sector_cdids["default"]
- sector_specific_cdids = sector_cdids.get(sector, [])
- standard_cdids = list(set(default_cdids + sector_specific_cdids)) # Avoid duplicates
+ default_cdids = sector_cdids_map["default"]
+ sector_specific_cdids = [] # Initialize empty list for sector CDIDs
+
+ if sector: # Check if sector is not None or empty
+ if isinstance(sector, str):
+ # If it's a single string, wrap it in a list
+ sector_list = [sector]
+ elif isinstance(sector, list):
+ # If it's already a list, use it directly
+ sector_list = sector
+ else:
+ raise TypeError("`sector` parameter must be a string or a list of strings.")
+
+ # Iterate through the list of sectors and collect their CDIDs
+ for sec in sector_list:
+ sector_specific_cdids.extend(sector_cdids_map.get(sec, [])) # Use extend to add items from the list
+
+ standard_cdids = list(set(default_cdids + sector_specific_cdids)) # Combine default and selected sector CDIDs, ensure uniqueness
 
- # Combine standard CDIDs and additional CDIDs
+ # Combine standard CDIDs and any additional user-provided CDIDs
  if cdid_list is None:
  cdid_list = []
- cdid_list = list(set(standard_cdids + cdid_list)) # Avoid duplicates
+ final_cdid_list = list(set(standard_cdids + cdid_list)) # Ensure uniqueness in the final list
 
  base_search_url = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
  base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
  combined_df = pd.DataFrame()
 
  # Map week start day to pandas weekday convention
- days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
- if week_start_day not in days_map:
+ days_map = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
+ if week_start_day.lower() not in days_map:
  raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
- week_start = days_map[week_start_day]
+ week_start = days_map[week_start_day.lower()] # Use lower() for case-insensitivity
 
- for cdid in cdid_list:
+ for cdid in final_cdid_list: # Use the final combined list
  try:
  # Search for the series
  search_url = f"{base_search_url}{cdid}"
- search_response = requests.get(search_url)
+ search_response = requests.get(search_url, timeout=30) # Add timeout
  search_response.raise_for_status()
  search_data = search_response.json()
 
  items = search_data.get("items", [])
  if not items:
- print(f"No data found for CDID: {cdid}")
+ print(f"Warning: No data found for CDID: {cdid}")
  continue
 
  # Extract series name and latest release URI
- series_name = items[0].get("title", f"Series_{cdid}")
- latest_date = max(
- datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
- for item in items if "release_date" in item
- )
- latest_uri = next(
- item["uri"] for item in items
- if "release_date" in item and datetime.fromisoformat(item["release_date"].replace("Z", "+00:00")) == latest_date
- )
+ # Find the item with the most recent release_date
+ latest_item = None
+ latest_date = None
+ for item in items:
+ if "release_date" in item:
+ try:
+ # Ensure timezone awareness for comparison
+ current_date = datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
+ if latest_date is None or current_date > latest_date:
+ latest_date = current_date
+ latest_item = item
+ except ValueError:
+ print(f"Warning: Could not parse release_date '{item['release_date']}' for CDID {cdid}")
+ continue # Skip this item if date is invalid
+
+ if latest_item is None:
+ print(f"Warning: No valid release date found for CDID: {cdid}")
+ continue
+
+ series_name = latest_item.get("title", f"Series_{cdid}") # Use title from the latest item
+ latest_uri = latest_item.get("uri")
+ if not latest_uri:
+ print(f"Warning: No URI found for the latest release of CDID: {cdid}")
+ continue
 
  # Fetch the dataset
  data_url = f"{base_data_url}{latest_uri}"
- data_response = requests.get(data_url)
+ data_response = requests.get(data_url, timeout=30) # Add timeout
  data_response.raise_for_status()
  data_json = data_response.json()
 
  # Detect the frequency and process accordingly
+ frequency_key = None
  if "months" in data_json and data_json["months"]:
  frequency_key = "months"
  elif "quarters" in data_json and data_json["quarters"]:
@@ -1246,72 +1282,142 @@ class datapull:
  elif "years" in data_json and data_json["years"]:
  frequency_key = "years"
  else:
- print(f"Unsupported frequency or no data for CDID: {cdid}")
+ print(f"Warning: Unsupported frequency or no data values found for CDID: {cdid} at URI {latest_uri}")
  continue
 
  # Prepare the DataFrame
+ if not data_json[frequency_key]: # Check if the list of values is empty
+ print(f"Warning: Empty data list for frequency '{frequency_key}' for CDID: {cdid}")
+ continue
+
  df = pd.DataFrame(data_json[frequency_key])
 
- # Parse the 'date' field based on frequency
- if frequency_key == "months":
- df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
- elif frequency_key == "quarters":
- def parse_quarter(quarter_str):
- year, qtr = quarter_str.split(" Q")
- month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
- return datetime(int(year), month, 1)
- df["date"] = df["date"].apply(parse_quarter)
- elif frequency_key == "years":
- df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+ # Check if essential columns exist
+ if "date" not in df.columns or "value" not in df.columns:
+ print(f"Warning: Missing 'date' or 'value' column for CDID: {cdid}")
+ continue
 
+ # Parse the 'date' field based on frequency
+ try:
+ if frequency_key == "months":
+ # Handles "YYYY Mon" format (e.g., "2023 FEB") - adjust if format differs
+ df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
+ elif frequency_key == "quarters":
+ def parse_quarter(quarter_str):
+ try:
+ year, qtr = quarter_str.split(" Q")
+ month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
+ return datetime(int(year), month, 1)
+ except (ValueError, KeyError):
+ return pd.NaT # Return Not a Time for parsing errors
+ df["date"] = df["date"].apply(parse_quarter)
+ elif frequency_key == "years":
+ df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+ except Exception as e:
+ print(f"Error parsing date for CDID {cdid} with frequency {frequency_key}: {e}")
+ continue # Skip this series if date parsing fails
+
+ # Coerce value to numeric, handle potential errors
  df["value"] = pd.to_numeric(df["value"], errors="coerce")
+
+ # Drop rows where date or value parsing failed
+ df.dropna(subset=["date", "value"], inplace=True)
+
+ if df.empty:
+ print(f"Warning: No valid data points after processing for CDID: {cdid}")
+ continue
+
  df.rename(columns={"value": series_name}, inplace=True)
 
  # Combine data
- df = df.loc[:, ["date", series_name]].dropna().reset_index(drop=True)
+ df_subset = df.loc[:, ["date", series_name]].reset_index(drop=True) # Explicitly select columns
  if combined_df.empty:
- combined_df = df
+ combined_df = df_subset
  else:
- combined_df = pd.merge(combined_df, df, on="date", how="outer")
+ # Use outer merge to keep all dates, sort afterwards
+ combined_df = pd.merge(combined_df, df_subset, on="date", how="outer")
 
  except requests.exceptions.RequestException as e:
  print(f"Error fetching data for CDID {cdid}: {e}")
- except (KeyError, ValueError) as e:
+ except (KeyError, ValueError, TypeError) as e: # Added TypeError
  print(f"Error processing data for CDID {cdid}: {e}")
+ except Exception as e: # Catch unexpected errors
+ print(f"An unexpected error occurred for CDID {cdid}: {e}")
+
 
  if not combined_df.empty:
+ # Sort by date after merging to ensure correct forward fill
+ combined_df.sort_values(by="date", inplace=True)
+ combined_df.reset_index(drop=True, inplace=True)
+
+ # Create a complete daily date range
  min_date = combined_df["date"].min()
- max_date = datetime.today()
+ # Ensure max_date is timezone-naive if min_date is, or consistent otherwise
+ max_date = pd.Timestamp(datetime.today().date()) # Use today's date, timezone-naive
+
+ if pd.isna(min_date):
+ print("Error: Minimum date is NaT, cannot create date range.")
+ return pd.DataFrame()
+
+ # Make sure min_date is not NaT before creating the range
  date_range = pd.date_range(start=min_date, end=max_date, freq='D')
  daily_df = pd.DataFrame(date_range, columns=['date'])
+
+ # Merge with original data and forward fill
  daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
  daily_df = daily_df.ffill()
 
+ # Drop rows before the first valid data point after ffill
+ first_valid_index = daily_df.dropna(subset=daily_df.columns.difference(['date'])).index.min()
+ if pd.notna(first_valid_index):
+ daily_df = daily_df.loc[first_valid_index:]
+ else:
+ print("Warning: No valid data points found after forward filling.")
+ return pd.DataFrame() # Return empty if ffill results in no data
+
+
  # Aggregate to weekly frequency
- daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start) % 7, unit='D')
+ # Ensure 'date' column is datetime type before dt accessor
+ daily_df['date'] = pd.to_datetime(daily_df['date'])
+ daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start + 7) % 7, unit='D') # Corrected logic for week start
+ # Group by week_commencing and calculate mean for numeric columns only
  weekly_df = daily_df.groupby("week_commencing").mean(numeric_only=True).reset_index()
 
+
  def clean_column_name(name):
+ # Remove content within parentheses (e.g., CPI INDEX 00: ALL ITEMS 2015=100)
  name = re.sub(r"\(.*?\)", "", name)
+ # Take only the part before the first colon if present
  name = re.split(r":", name)[0]
- name = re.sub(r"\d+", "", name)
+ # Remove digits
+ #name = re.sub(r"\d+", "", name) # Reconsider removing all digits, might be needed for some series
+ # Remove specific words like 'annual', 'rate' case-insensitively
  name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
+ # Remove non-alphanumeric characters (except underscore and space)
  name = re.sub(r"[^\w\s]", "", name)
+ # Replace spaces with underscores
+ name = name.strip() # Remove leading/trailing whitespace
  name = name.replace(" ", "_")
+ # Replace multiple underscores with a single one
  name = re.sub(r"_+", "_", name)
+ # Remove trailing underscores
  name = name.rstrip("_")
+ # Add prefix and suffix
  return f"macro_{name.lower()}_uk"
 
+ # Apply cleaning function to relevant columns
  weekly_df.columns = [clean_column_name(col) if col != "week_commencing" else col for col in weekly_df.columns]
- weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)
+ weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True) # Rename week commencing col
 
- weekly_df = weekly_df.fillna(0)
+ # Optional: Fill remaining NaNs (e.g., at the beginning if ffill didn't cover) with 0
+ # Consider if 0 is the appropriate fill value for your use case
+ # weekly_df = weekly_df.fillna(0)
 
  return weekly_df
  else:
- print("No data available to process.")
+ print("No data successfully fetched or processed.")
  return pd.DataFrame()
-
+
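The rewritten `pull_macro_ons_uk` keeps the same two-step ONS beta API flow: search by CDID to find the series URI, then fetch the timeseries JSON at that URI. A stripped-down sketch of that flow using only `requests` (the function above additionally picks the most recent release, validates columns, and handles monthly, quarterly and yearly frequencies):

```python
import requests

BASE_SEARCH = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
BASE_DATA = "https://api.beta.ons.gov.uk/v1/data?uri="

def fetch_cdid(cdid: str) -> dict:
    """Look up a CDID, then fetch the JSON for the first matching series."""
    items = requests.get(f"{BASE_SEARCH}{cdid}", timeout=30).json().get("items", [])
    if not items:
        raise LookupError(f"No series found for CDID {cdid}")
    return requests.get(f"{BASE_DATA}{items[0]['uri']}", timeout=30).json()

data = fetch_cdid("D7G7")  # one of the function's default CDIDs
# One of 'months', 'quarters' or 'years' holds the observations:
print([k for k in ("months", "quarters", "years") if data.get(k)])
```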
  def pull_yfinance(self, tickers=None, week_start_day="mon"):
  """
  Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,
@@ -1337,7 +1443,7 @@ class datapull:
  end_date = datetime.today().strftime("%Y-%m-%d")
 
  # Mapping week start day to pandas weekday convention
- days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+ days_map = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
  if week_start_day not in days_map:
  raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
  week_start = days_map[week_start_day]
@@ -1497,9 +1603,9 @@ class datapull:
  # Aggregate by week commencing
  day_offsets = {
  'mon': 'W-MON',
- 'tues': 'W-TUE',
+ 'tue': 'W-TUE',
  'wed': 'W-WED',
- 'thurs': 'W-THU',
+ 'thu': 'W-THU',
  'fri': 'W-FRI',
  'sat': 'W-SAT',
  'sun': 'W-SUN'
@@ -1592,9 +1698,9 @@ class datapull:
  # Resample by week
  day_offsets = {
  'mon': 'W-MON',
- 'tues': 'W-TUE',
+ 'tue': 'W-TUE',
  'wed': 'W-WED',
- 'thurs': 'W-THU',
+ 'thu': 'W-THU',
  'fri': 'W-FRI',
  'sat': 'W-SAT',
  'sun': 'W-SUN'
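The `day_offsets` values here are pandas anchored weekly frequencies; both functions use them to bucket daily rows into weeks labelled by their start date. A small plain-pandas illustration of how such an anchor behaves (not package code):

```python
import pandas as pd

# 14 days of daily data starting on a Monday.
s = pd.Series(1, index=pd.date_range("2024-01-01", periods=14, freq="D"))

# 'W-MON' anchors weeks on Mondays; label/closed='left' labels each bucket
# by its week-commencing Monday rather than by the week's end.
weekly = s.resample("W-MON", label="left", closed="left").sum()
print(weekly)  # two buckets of 7, labelled 2024-01-01 and 2024-01-08
```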
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: imsciences
- Version: 0.9.5.9
+ Version: 0.9.6.3
  Summary: IMS Data Processing Package
  Author: IMS
  Author-email: cam@im-sciences.com
@@ -0,0 +1,355 @@
+ Metadata-Version: 2.1
+ Name: imsciences
+ Version: 0.9.6.3
+ Summary: IMS Data Processing Package
+ Author: IMS
+ Author-email: cam@im-sciences.com
+ Keywords: data processing,apis,data analysis,data visualization,machine learning
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Operating System :: Unix
+ Classifier: Operating System :: MacOS :: MacOS X
+ Classifier: Operating System :: Microsoft :: Windows
+ Description-Content-Type: text/markdown
+ License-File: LICENSE.txt
+ Requires-Dist: pandas
+ Requires-Dist: plotly
+ Requires-Dist: numpy
+ Requires-Dist: fredapi
+ Requires-Dist: xgboost
+ Requires-Dist: scikit-learn
+ Requires-Dist: bs4
+ Requires-Dist: yfinance
+ Requires-Dist: holidays
+ Requires-Dist: google-analytics-data
+ Requires-Dist: geopandas
+ Requires-Dist: geopy
+
+ # IMS Package Documentation
+
+ The **Independent Marketing Sciences** package is a Python library designed to process incoming data into a format tailored for projects, particularly those utilising weekly time series data. This package offers a suite of functions for efficient data collection, manipulation, visualisation and analysis.
+
+ ---
+
+ ## Key Features
+ - Seamless data processing for time series workflows.
+ - Aggregation, filtering, and transformation of time series data.
+ - Visualisation of data.
+ - Integration with external data sources like FRED, Bank of England and ONS.
+
+ ---
+
+ Table of Contents
+ =================
+
+ 1. [Usage](#usage)
+ 2. [Data Processing for Time Series](#data-processing-for-time-series)
+ 3. [Data Processing for Incrementality Testing](#data-processing-for-incrementality-testing)
+ 4. [Data Visualisations](#data-visualisations)
+ 5. [Data Pulling](#data-pulling)
+ 6. [Installation](#installation)
+ 7. [License](#license)
+ 8. [Roadmap](#roadmap)
+
+ ---
+
+ ## Usage
+
+ ```python
+ from imsciences import dataprocessing, geoprocessing, datapull, datavis
+ ims_proc = dataprocessing()
+ ims_geo = geoprocessing()
+ ims_pull = datapull()
+ ims_vis = datavis()
+ ```
+
+ ## Data Processing for Time Series
+
+ ## 1. `get_wd_levels`
+ - **Description**: Get the working directory with the option of moving up parents.
+ - **Usage**: `get_wd_levels(levels)`
+ - **Example**: `get_wd_levels(0)`
+
+ ## 2. `aggregate_daily_to_wc_long`
+ - **Description**: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.
+ - **Usage**: `aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')`
+ - **Example**: `aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')`
+
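For intuition, here is the kind of daily-to-weekly roll-up the aggregation helper above performs, written in plain pandas rather than via the package (illustrative only; column names are invented):

```python
import pandas as pd

daily = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=14, freq="D"),  # starts on a Monday
    "platform": ["search"] * 14,
    "cost": range(14),
})

# Snap each date back to its Monday week-commencing, then group and sum.
wc = daily["date"] - pd.to_timedelta(daily["date"].dt.weekday, unit="D")
weekly = daily.assign(OBS=wc).groupby(["OBS", "platform"], as_index=False)["cost"].sum()
print(weekly)  # one row per (week, platform) with the summed cost
```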
+ ## 3. `convert_monthly_to_daily`
+ - **Description**: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.
+ - **Usage**: `convert_monthly_to_daily(df, date_column, divide=True)`
+ - **Example**: `convert_monthly_to_daily(df, 'date')`
+
+ ## 4. `week_of_year_mapping`
+ - **Description**: Converts a week column in 'yyyy-Www' or 'yyyy-ww' format to week commencing date.
+ - **Usage**: `week_of_year_mapping(df, week_col, start_day_str)`
+ - **Example**: `week_of_year_mapping(df, 'week', 'mon')`
+
+ ## 5. `rename_cols`
+ - **Description**: Renames columns in a pandas DataFrame with a specified prefix or format.
+ - **Usage**: `rename_cols(df, name='ame_')`
+ - **Example**: `rename_cols(df, 'ame_facebook')`
+
+ ## 6. `merge_new_and_old`
+ - **Description**: Creates a new DataFrame by merging old and new dataframes based on a cutoff date.
+ - **Usage**: `merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')`
+ - **Example**: `merge_new_and_old(df1, 'old_col', df2, 'new_col', '2023-01-15')`
+
+ ## 7. `merge_dataframes_on_column`
+ - **Description**: Merge a list of DataFrames on a common column.
+ - **Usage**: `merge_dataframes_on_column(dataframes, common_column='OBS', merge_how='outer')`
+ - **Example**: `merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='outer')`
+
+ ## 8. `merge_and_update_dfs`
+ - **Description**: Merges two dataframes, updating columns from the second dataframe where values are available.
+ - **Usage**: `merge_and_update_dfs(df1, df2, key_column)`
+ - **Example**: `merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')`
+
+ ## 9. `convert_us_to_uk_dates`
+ - **Description**: Convert a DataFrame column with mixed US and UK date formats to datetime.
+ - **Usage**: `convert_us_to_uk_dates(df, date_col)`
+ - **Example**: `convert_us_to_uk_dates(df, 'date')`
+
+ ## 10. `combine_sheets`
+ - **Description**: Combines multiple DataFrames from a dictionary into a single DataFrame.
+ - **Usage**: `combine_sheets(all_sheets)`
+ - **Example**: `combine_sheets({'Sheet1': df1, 'Sheet2': df2})`
+
+ ## 11. `pivot_table`
+ - **Description**: Dynamically pivots a DataFrame based on specified columns.
+ - **Usage**: `pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing='W-MON')`
+ - **Example**: `pivot_table(df, 'OBS', 'Channel Short Names', 'Value', filters_dict={'Master Include': ' == 1'}, fill_value=0)`
+
+ ## 12. `apply_lookup_table_for_columns`
+ - **Description**: Maps substrings in columns to new values based on a dictionary.
+ - **Usage**: `apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')`
+ - **Example**: `apply_lookup_table_for_columns(df, col_names, {'spend': 'spd'}, if_not_in_dict='Other', new_column_name='Metrics Short')`
+
+ ## 13. `aggregate_daily_to_wc_wide`
+ - **Description**: Aggregates daily data into weekly data and pivots it to wide format.
+ - **Usage**: `aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc='sun', aggregation='sum', include_totals=False)`
+ - **Example**: `aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions'], 'mon', 'average', True)`
+
+ ## 14. `merge_cols_with_seperator`
+ - **Description**: Merges multiple columns in a DataFrame into one column with a specified separator.
+ - **Usage**: `merge_cols_with_seperator(df, col_names, separator='_', output_column_name='Merged')`
+ - **Example**: `merge_cols_with_seperator(df, ['Campaign', 'Product'], separator='|', output_column_name='Merged Columns')`
+
+ ## 15. `check_sum_of_df_cols_are_equal`
+ - **Description**: Checks if the sums of two columns in two DataFrames are equal and provides the difference.
+ - **Usage**: `check_sum_of_df_cols_are_equal(df_1, df_2, cols_1, cols_2)`
+ - **Example**: `check_sum_of_df_cols_are_equal(df_1, df_2, 'Media Cost', 'Spend')`
+
+ ## 16. `convert_2_df_cols_to_dict`
+ - **Description**: Creates a dictionary from two DataFrame columns.
+ - **Usage**: `convert_2_df_cols_to_dict(df, key_col, value_col)`
+ - **Example**: `convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')`
+
+ ## 17. `create_FY_and_H_columns`
+ - **Description**: Adds financial year and half-year columns to a DataFrame based on a start date.
+ - **Usage**: `create_FY_and_H_columns(df, index_col, start_date, starting_FY, short_format='No', half_years='No', combined_FY_and_H='No')`
+ - **Example**: `create_FY_and_H_columns(df, 'Week', '2022-10-03', 'FY2023', short_format='Yes')`
+
+ ## 18. `keyword_lookup_replacement`
+ - **Description**: Updates values in a column based on a lookup dictionary with conditional logic.
+ - **Usage**: `keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name='Updated Column')`
+ - **Example**: `keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel', 'segment'], lookup_dict, output_column_name='Channel New')`
+
+ ## 19. `create_new_version_of_col_using_LUT`
+ - **Description**: Creates a new column based on a lookup table applied to an existing column.
+ - **Usage**: `create_new_version_of_col_using_LUT(df, keys_col, value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')`
+ - **Example**: `create_new_version_of_col_using_LUT(df, 'Campaign Name', 'Campaign Type', lookup_dict)`
+
+ ## 20. `convert_df_wide_2_long`
+ - **Description**: Converts a wide-format DataFrame into a long-format DataFrame.
+ - **Usage**: `convert_df_wide_2_long(df, value_cols, variable_col_name='Stacked', value_col_name='Value')`
+ - **Example**: `convert_df_wide_2_long(df, ['col1', 'col2'], variable_col_name='Var', value_col_name='Val')`
+
+ ## 21. `manually_edit_data`
+ - **Description**: Manually updates specified cells in a DataFrame based on filters.
+ - **Usage**: `manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)`
+ - **Example**: `manually_edit_data(df, {'col1': '== 1'}, 'col2', 'new_val', add_notes='Yes', note='Manual Update')`
+
+ ## 22. `format_numbers_with_commas`
+ - **Description**: Formats numerical columns with commas and a specified number of decimal places.
+ - **Usage**: `format_numbers_with_commas(df, decimal_length_chosen=2)`
+ - **Example**: `format_numbers_with_commas(df, decimal_length_chosen=1)`
+
+ ## 23. `filter_df_on_multiple_conditions`
+ - **Description**: Filters a DataFrame based on multiple column conditions.
+ - **Usage**: `filter_df_on_multiple_conditions(df, filters_dict)`
+ - **Example**: `filter_df_on_multiple_conditions(df, {'col1': '>= 5', 'col2': "== 'val'"})`
+
+ ## 24. `read_and_concatenate_files`
+ - **Description**: Reads and concatenates files from a specified folder into a single DataFrame.
+ - **Usage**: `read_and_concatenate_files(folder_path, file_type='csv')`
+ - **Example**: `read_and_concatenate_files('/path/to/files', file_type='xlsx')`
+
+ ## 25. `upgrade_outdated_packages`
+ - **Description**: Upgrades all outdated Python packages except specified ones.
+ - **Usage**: `upgrade_outdated_packages(exclude_packages=['twine'])`
+ - **Example**: `upgrade_outdated_packages(exclude_packages=['pip', 'setuptools'])`
+
+ ## 26. `convert_mixed_formats_dates`
+ - **Description**: Converts mixed-format date columns into standardized datetime format.
+ - **Usage**: `convert_mixed_formats_dates(df, column_name)`
+ - **Example**: `convert_mixed_formats_dates(df, 'date_col')`
+
+ ## 27. `fill_weekly_date_range`
+ - **Description**: Fills in missing weekly dates in a DataFrame with a specified frequency.
+ - **Usage**: `fill_weekly_date_range(df, date_column, freq='W-MON')`
+ - **Example**: `fill_weekly_date_range(df, 'date_col')`
+
+ ## 28. `add_prefix_and_suffix`
+ - **Description**: Adds prefixes and/or suffixes to column names, with an option to exclude a date column.
+ - **Usage**: `add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)`
+ - **Example**: `add_prefix_and_suffix(df, prefix='pre_', suffix='_suf', date_col='date_col')`
+
+ ## 29. `create_dummies`
+ - **Description**: Creates dummy variables for columns, with an option to add a total dummy column.
+ - **Usage**: `create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')`
+ - **Example**: `create_dummies(df, date_col='date_col', dummy_threshold=1)`
+
+ ## 30. `replace_substrings`
+ - **Description**: Replaces substrings in a column based on a dictionary, with options for case conversion and new column creation.
+ - **Usage**: `replace_substrings(df, column, replacements, to_lower=False, new_column=None)`
+ - **Example**: `replace_substrings(df, 'text_col', {'old': 'new'}, to_lower=True, new_column='updated_text')`
+
+ ## 31. `add_total_column`
+ - **Description**: Adds a total column to a DataFrame by summing values across columns, optionally excluding one.
+ - **Usage**: `add_total_column(df, exclude_col=None, total_col_name='Total')`
+ - **Example**: `add_total_column(df, exclude_col='date_col')`
+
+ ## 32. `apply_lookup_table_based_on_substring`
+ - **Description**: Categorizes text in a column using a lookup table based on substrings.
+ - **Usage**: `apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')`
+ - **Example**: `apply_lookup_table_based_on_substring(df, 'text_col', {'sub1': 'cat1', 'sub2': 'cat2'})`
+
+ ## 33. `compare_overlap`
+ - **Description**: Compares overlapping periods between two DataFrames and summarizes differences.
+ - **Usage**: `compare_overlap(df1, df2, date_col)`
+ - **Example**: `compare_overlap(df1, df2, 'date_col')`
+
+ ## 34. `week_commencing_2_week_commencing_conversion_isoweekday`
+ - **Description**: Maps dates to the start of the current ISO week based on a specified weekday.
+ - **Usage**: `week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')`
+ - **Example**: `week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')`
+
+ ## 35. `seasonality_feature_extraction`
+ - **Description**: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts the top features by importance, merges them, optionally retrains the models on the top and combined feature sets, and returns a dict of results.
+ - **Usage**: `seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)`
+ - **Example**: `seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)`
+
+ ---
+
+ ## Data Processing for Incrementality Testing
+
+ ## 1. `pull_ga`
+ - **Description**: Pull in GA4 data for geo experiments.
+ - **Usage**: `pull_ga(credentials_file, property_id, start_date, country, metrics)`
+ - **Example**: `pull_ga('GeoExperiment-31c5f5db2c39.json', '111111111', '2023-10-15', 'United Kingdom', ['totalUsers', 'newUsers'])`
+
+ ## 2. `process_itv_analysis`
+ - **Description**: Processes region-level data for geo experiments by mapping ITV regions, grouping selected metrics, merging with media spend data, and saving the result.
+ - **Usage**: `process_itv_analysis(raw_df, itv_path, cities_path, media_spend_path, output_path, test_group, control_group, columns_to_aggregate, aggregator_list)`
+ - **Example**: `process_itv_analysis(df, 'itv regional mapping.csv', 'Geo_Mappings_with_Coordinates.xlsx', 'IMS.xlsx', 'itv_for_test_analysis_itvx.csv', ['West', 'Westcountry', 'Tyne Tees'], ['Central Scotland', 'North Scotland'], ['newUsers', 'transactions'], ['sum', 'sum'])`
+
+ ## 3. `process_city_analysis`
+ - **Description**: Processes city-level data for geo experiments by grouping selected metrics, merging with media spend data, and saving the result.
+ - **Usage**: `process_city_analysis(raw_df, spend_df, output_path, test_group, control_group, columns_to_aggregate, aggregator_list)`
+ - **Example**: `process_city_analysis(df, spend, output, ['Barnsley'], ['Aberdeen'], ['newUsers', 'transactions'], ['sum', 'sum'])`
+
+ ---
+
+ ## Data Visualisations
+
+ ## 1. `plot_one`
+ - **Description**: Plots a specified column from a DataFrame with a white background and black axes.
+ - **Usage**: `plot_one(df1, col1, date_column)`
+ - **Example**: `plot_one(df, 'sales', 'date')`
+
+ ## 2. `plot_two`
+ - **Description**: Plots specified columns from two DataFrames, optionally on the same or separate y-axes.
+ - **Usage**: `plot_two(df1, col1, df2, col2, date_column, same_axis=True)`
+ - **Example**: `plot_two(df1, 'sales', df2, 'revenue', 'date', same_axis=False)`
+
+ ## 3. `plot_chart`
+ - **Description**: Plots various chart types using Plotly, including line, bar, scatter, area, pie, etc.
+ - **Usage**: `plot_chart(df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values')`
+ - **Example**: `plot_chart(df, 'date', ['sales', 'revenue'], chart_type='line', title='Sales and Revenue')`
+
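A hedged sketch of how the plotting helpers slot into a workflow, assuming a weekly DataFrame `df` with an 'OBS' date column and two metric columns (hypothetical names):

```python
# 'df' is assumed: weekly data with an 'OBS' date column plus two metrics.
ims_vis.plot_one(df, "kpi_sales", "OBS")
ims_vis.plot_chart(df, "OBS", ["kpi_sales", "media_spend"],
                   chart_type="line", title="Sales vs Spend",
                   x_title="Week commencing", y_title="Value")
```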
+ ---
+
+ ## Data Pulling
+
+ ## 1. `pull_fred_data`
+ - **Description**: Fetch data from FRED using series ID tokens.
+ - **Usage**: `pull_fred_data(week_commencing, series_id_list)`
+ - **Example**: `pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])`
+
+ ## 2. `pull_boe_data`
+ - **Description**: Fetch and process Bank of England interest rate data.
+ - **Usage**: `pull_boe_data(week_commencing)`
+ - **Example**: `pull_boe_data('mon')`
+
+ ## 3. `pull_oecd`
+ - **Description**: Fetch macroeconomic data from OECD for a specified country.
+ - **Usage**: `pull_oecd(country='GBR', week_commencing='mon', start_date='2020-01-01')`
+ - **Example**: `pull_oecd('GBR', 'mon', '2000-01-01')`
+
+ ## 4. `get_google_mobility_data`
+ - **Description**: Fetch Google Mobility data for the specified country.
+ - **Usage**: `get_google_mobility_data(country, wc)`
+ - **Example**: `get_google_mobility_data('United Kingdom', 'mon')`
+
+ ## 5. `pull_seasonality`
+ - **Description**: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.
+ - **Usage**: `pull_seasonality(week_commencing, start_date, countries)`
+ - **Example**: `pull_seasonality('mon', '2020-01-01', ['US', 'GB'])`
+
+ ## 6. `pull_weather`
+ - **Description**: Fetch and process historical weather data for the specified country.
+ - **Usage**: `pull_weather(week_commencing, start_date, country)`
+ - **Example**: `pull_weather('mon', '2020-01-01', 'GBR')`
+
+ ## 7. `pull_macro_ons_uk`
+ - **Description**: Fetch and process time series data from the beta ONS API.
+ - **Usage**: `pull_macro_ons_uk(cdid_list, week_start_day, sector)`
+ - **Example**: `pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')`
+
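As of 0.9.6.3, `sector` also accepts a list, and sector CDID maps exist for 'fast_food', 'clothing_footwear', 'fuel' and 'cars'. A sketch of a call using the new form; per the updated docstring, columns come back renamed 'macro_<series>_uk' alongside an 'OBS' week-commencing column:

```python
# Default UK macro CDIDs plus two sector bundles and one extra series.
ims_pull = datapull()
macro = ims_pull.pull_macro_ons_uk(cdid_list=["HBOI"],
                                   week_start_day="mon",
                                   sector=["fast_food", "fuel"])
print(macro.columns)  # 'OBS' plus 'macro_..._uk' series
```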
+ ## 8. `pull_yfinance`
+ - **Description**: Fetch and process time series data from Yahoo Finance.
+ - **Usage**: `pull_yfinance(tickers, week_start_day)`
+ - **Example**: `pull_yfinance(['^FTMC', '^IXIC'], 'mon')`
+
+ ## 9. `pull_sports_events`
+ - **Description**: Pull a variety of sports events, primarily football and rugby.
+ - **Usage**: `pull_sports_events(start_date, week_commencing)`
+ - **Example**: `pull_sports_events('2020-01-01', 'mon')`
+
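These pulls are designed to be joined on their shared week-commencing column; a sketch of building a combined weekly base table with the functions as documented above ('OBS' is the common key):

```python
ims_proc = dataprocessing()
ims_pull = datapull()

boe = ims_pull.pull_boe_data("mon")
seas = ims_pull.pull_seasonality("mon", "2020-01-01", ["GB"])
macro = ims_pull.pull_macro_ons_uk(["HBOI"], "mon", "fast_food")

base = ims_proc.merge_dataframes_on_column([boe, seas, macro],
                                           common_column="OBS",
                                           merge_how="outer")
```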
+ ---
+
+ ## Installation
+
+ Install the IMS package via pip:
+
+ ```bash
+ pip install imsciences
+ ```
+
+ ---
+
+ ## License
+
+ This project is licensed under the MIT License. ![License](https://img.shields.io/badge/license-MIT-blue.svg)
+
+ ---
+
+ ## Roadmap
+
+ - [Fixes]: Naming conventions are inconsistent and have changed from previous seasonality tools (e.g. 'seas_nyd' is now 'seas_new_years_day', 'week_1' is now 'seas_1').
+ - [Fixes]: Naming conventions can be inconsistent within the data pulls: some variables carry a 'gb' suffix, some 'uk', and others none. Global holidays and events (Christmas, Easter, Halloween, etc.) are similarly inconsistent - some have a regional suffix and others don't.
+ - [Additions]: Add new data pulls for more macro and seasonal variables.
+
+ ---
@@ -8,7 +8,7 @@ imsciences/pull.py
  imsciences/unittesting.py
  imsciences/vis.py
  imsciences.egg-info/PKG-INFO
- imsciences.egg-info/PKG-INFO-IMS-24Ltp-3
+ imsciences.egg-info/PKG-INFO-TomG-HP-290722
  imsciences.egg-info/SOURCES.txt
  imsciences.egg-info/dependency_links.txt
  imsciences.egg-info/requires.txt
@@ -8,7 +8,7 @@ def read_md(file_name):
  return f.read()
  return ''
 
- VERSION = '0.9.5.9'
+ VERSION = '0.9.6.3'
  DESCRIPTION = 'IMS Data Processing Package'
  LONG_DESCRIPTION = read_md('README.md')
 
@@ -34,5 +34,5 @@ setup(
  "Operating System :: Unix",
  "Operating System :: MacOS :: MacOS X",
  "Operating System :: Microsoft :: Windows",
- ]
+ ],
  )
@@ -1,24 +0,0 @@
- Metadata-Version: 2.1
- Name: imsciences
- Version: 0.6.1.1
- Summary: IMS Data Processing Package
- Author: IMS
- Author-email: cam@im-sciences.com
- Keywords: python,data processing
- Classifier: Development Status :: 3 - Alpha
- Classifier: Intended Audience :: Developers
- Classifier: Programming Language :: Python :: 3
- Classifier: Operating System :: Unix
- Classifier: Operating System :: MacOS :: MacOS X
- Classifier: Operating System :: Microsoft :: Windows
- Description-Content-Type: text/markdown
- Requires-Dist: pandas
-
- # IMS Package Documentation
-
- The IMS package is a python library for processing incoming data into a format that can be used for projects. IMS processing offers a variety of functions to manipulate and analyze data efficiently. Here are the functionalities provided by the package:
-
- ## Data Processing
-
- ## Data Pulling
-
File without changes
File without changes
File without changes