imsciences 1.0.2__py3-none-any.whl → 1.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imsciences/pull.py CHANGED
@@ -1,4 +1,5 @@
  import importlib
+ import os
  import re
  import time
  import urllib.request
@@ -19,7 +20,6 @@ from imsciences.mmm import dataprocessing
 
  ims_proc = dataprocessing()
 
-
  class datapull:
      def help(self):
          print("This is the help section. The functions in the package are as follows:")
@@ -281,10 +281,16 @@ class datapull:
          start_date: str = "2020-01-01",
      ) -> pd.DataFrame:
          """
-         Fetch and process time series data from the OECD API.
+         Load and process time series data from the cached OECD data file.
+
+         This method loads pre-fetched OECD data from either:
+         1. Shared network path (if accessible)
+         2. Local cache directory (fallback)
+
+         If the cache doesn't exist anywhere, it automatically runs the OECDDataPuller to generate it.
 
          Args:
-             country (list): A string containing a 3-letter code the of country of interest (E.g: "GBR", "FRA", "USA", "DEU")
+             country (str): A string containing the 3-letter code of the country of interest (e.g. "GBR", "FRA", "USA", "DEU")
              week_commencing (str): The starting day of the week for aggregation.
                  Options are "mon", "tue", "wed", "thu", "fri", "sat", "sun".
              start_date (str): Dataset start date in the format "YYYY-MM-DD"
@@ -294,196 +300,59 @@ class datapull:
                  commencing dates, and other columns contain the aggregated time series values.
 
          """
-
-         def parse_quarter(date_str):
-             """Parses a string in 'YYYY-Q#' format into a datetime object."""
-             year, quarter = date_str.split("-")
-             quarter_number = int(quarter[1])
-             month = (quarter_number - 1) * 3 + 1
-             return pd.Timestamp(f"{year}-{month:02d}-01")
-
-         # Generate a date range from 1950-01-01 to today
-         date_range = pd.date_range(start=start_date, end=datetime.today(), freq="D")
-
-         url_details = [
-             [
-                 "BCICP",
-                 "SDD.STES,DSD_STES@DF_CLI,",
-                 ".....",
-                 "macro_business_confidence_index",
-             ],
-             [
-                 "CCICP",
-                 "SDD.STES,DSD_STES@DF_CLI,",
-                 ".....",
-                 "macro_consumer_confidence_index",
-             ],
-             [
-                 "N.CPI",
-                 "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
-                 "PA._T.N.GY",
-                 "macro_cpi_total",
-             ],
-             [
-                 "N.CPI",
-                 "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
-                 "PA.CP041T043.N.GY",
-                 "macro_cpi_housing",
-             ],
-             [
-                 "N.CPI",
-                 "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
-                 "PA.CP01.N.GY",
-                 "macro_cpi_food",
-             ],
-             [
-                 "N.CPI",
-                 "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
-                 "PA.CP045_0722.N.GY",
-                 "macro_cpi_energy",
-             ],
-             [
-                 "UNE_LF_M",
-                 "SDD.TPS,DSD_LFS@DF_IALFS_UNE_M,",
-                 "._Z.Y._T.Y_GE15.",
-                 "macro_unemployment_rate",
-             ],
-             [
-                 "EAR",
-                 "SDD.TPS,DSD_EAR@DF_HOU_EAR,",
-                 ".Y..S1D",
-                 "macro_private_hourly_earnings",
-             ],
-             [
-                 "RHP",
-                 "ECO.MPD,DSD_AN_HOUSE_PRICES@DF_HOUSE_PRICES,1.0",
-                 "",
-                 "macro_real_house_prices",
-             ],
-             [
-                 "PRVM",
-                 "SDD.STES,DSD_KEI@DF_KEI,4.0",
-                 "IX.C..",
-                 "macro_manufacturing_production_volume",
-             ],
-             [
-                 "TOVM",
-                 "SDD.STES,DSD_KEI@DF_KEI,4.0",
-                 "IX...",
-                 "macro_retail_trade_volume",
-             ],
-             ["IRSTCI", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_interbank_rate"],
-             [
-                 "IRLT",
-                 "SDD.STES,DSD_KEI@DF_KEI,4.0",
-                 "PA...",
-                 "macro_long_term_interest_rate",
-             ],
-             [
-                 "B1GQ",
-                 "SDD.NAD,DSD_NAMAIN1@DF_QNA,1.1",
-                 "._Z....GY.T0102",
-                 "macro_gdp_growth_yoy",
-             ],
-         ]
-
-         # Create empty final dataframe
-         oecd_df_final = pd.DataFrame()
-
-         daily_df = pd.DataFrame({"OBS": date_range})
-         value_columns = []
-
-         # Iterate for each variable of interest
-         for series_details in url_details:
-             series = series_details[0]
-             dataset_id = series_details[1]
-             filter = series_details[2]
-             col_name = series_details[3]
-
-             # check if request was successful and determine the most granular data available
-             for freq in ["M", "Q", "A"]:
-                 if series in ["UNE_LF_M", "EAR"]:
-                     data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{series}.{filter}.{freq}?startPeriod=1950-01"
-                 elif series in ["B1GQ"]:
-                     data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{freq}..{country}...{series}.{filter}?startPeriod=1950-01"
-                 else:
-                     data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{freq}.{series}.{filter}?startPeriod=1950-01"
-
-                 # Make the request to the OECD API for data
-                 data_response = requests.get(data_url)
-
-                 # Check if the request was successful
-                 if data_response.status_code != 200:
-                     print(
-                         f"Failed to fetch data for series {series} with frequency '{freq}' for {country}: {data_response.status_code} {data_response.text}",
-                     )
-                     url_test = False
-                     continue
-                 url_test = True
-                 break
-
-             # get data for the next variable if url doesn't exist
-             if url_test is False:
-                 continue
-
-             root = ET.fromstring(data_response.content)
-
-             # Define namespaces if necessary (the namespace is included in the tags)
-             namespaces = {
-                 "generic": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic",
-             }
-
-             # Lists to store the data
-             dates = []
-             values = []
-
-             # Iterate over all <Obs> elements and extract date and value
-             for obs in root.findall(".//generic:Obs", namespaces):
-                 # Extracting the time period (date)
-                 time_period = obs.find(".//generic:ObsDimension", namespaces).get(
-                     "value",
-                 )
-
-                 # Extracting the observation value
-                 value = obs.find(".//generic:ObsValue", namespaces).get("value")
-
-                 # Storing the data
-                 if time_period and value:
-                     dates.append(time_period)
-                     values.append(float(value))  # Convert value to float
-
-             # Add variable names that were found to a list
-             value_columns.append(col_name)
-
-             # Creating a DataFrame
-             data = pd.DataFrame({"OBS": dates, col_name: values})
-
-             # Convert date strings into datetime format
-             if freq == "Q":
-                 data["OBS"] = data["OBS"].apply(parse_quarter)
+         from pathlib import Path
+
+         # Try shared network path first, then fall back to local
+         user_home = os.path.expanduser("~")
+         shared_path = Path(user_home) / "im-sciences.com" / "FileShare - MasterDrive" / "Central Database" / "Pull All" / "OECD Database"
+         local_path = Path("oecd_data")
+         shared_data_file = shared_path / f"oecd_data_{country}.csv"
+         local_data_file = local_path / f"oecd_data_{country}.csv"
+
+         data_file = None
+         data_location = None
+
+         # Check shared path first
+         if shared_data_file.exists():
+             data_file = shared_data_file
+             data_location = "shared network"
+         # Fall back to local path
+         elif local_data_file.exists():
+             data_file = local_data_file
+             data_location = "local"
+
+         # If no cache found anywhere, run the puller
+         if data_file is None:
+             print(f"OECD data cache not found. Running OECDDataPuller to fetch data...")
+             from .oecd_pull import OECDDataPuller
+             puller = OECDDataPuller(
+                 country=country,
+                 start_date=start_date,
+                 output_dir=None  # Let puller decide between shared/local
+             )
+             puller.run_until_complete(max_iterations=1)  # Run one complete cycle
+             print(f"OECD data fetched and cached successfully.\n")
+
+             # Determine where it was saved
+             if shared_data_file.exists():
+                 data_file = shared_data_file
+                 data_location = "shared network"
              else:
-                 # Display the DataFrame
-                 data["OBS"] = data["OBS"].apply(lambda x: datetime.strptime(x, "%Y-%m"))
+                 data_file = local_data_file
+                 data_location = "local"
 
-             # Sort data by chronological order
-             data.sort_values(by="OBS", inplace=True)
+         print(f"Loading OECD data from {data_location}: {data_file}")
+         daily_df = pd.read_csv(data_file)
+         daily_df['OBS'] = pd.to_datetime(daily_df['OBS'])
 
-             # Merge the data based on the observation date
-             daily_df = pd.merge_asof(
-                 daily_df,
-                 data[["OBS", col_name]],
-                 on="OBS",
-                 direction="backward",
-             )
+         # Get list of value columns (exclude OBS)
+         value_columns = [col for col in daily_df.columns if col != "OBS"]
 
          # Ensure columns are numeric
          for col in value_columns:
-             if col in daily_df.columns:
-                 daily_df[col] = pd.to_numeric(daily_df[col], errors="coerce").fillna(0)
-             else:
-                 print(f"Column {col} not found in daily_df")
+             daily_df[col] = pd.to_numeric(daily_df[col], errors="coerce").fillna(0)
 
-         # Aggregate results by week
+         # Aggregate to weekly
          country_df = ims_proc.aggregate_daily_to_wc_wide(
              df=daily_df,
              date_column="OBS",
@@ -493,11 +362,7 @@ class datapull:
              aggregation="average",
          )
 
-         oecd_df_final = pd.concat(
-             [oecd_df_final, country_df],
-             axis=0,
-             ignore_index=True,
-         )
+         return country_df
 
          return oecd_df_final
 
@@ -909,6 +774,12 @@ class datapull:
              "NG": "Nigeria",
              "ST": "SaoTomeAndPrincipe",
          }
+         # Month order list
+         month_order = [
+             "january", "february", "march", "april",
+             "may", "june", "july", "august",
+             "september", "october", "november", "december",
+         ]
 
          # ---------------------------------------------------------------------
          # 1. Create daily date range from start_date to today
@@ -1124,6 +995,9 @@ class datapull:
              holiday_date = row["Date"]
              # Create column name without modifying original holiday names
              holiday_name = row["Holiday"].lower().replace(" ", "_")
+
+             # Remove all non-alphanumeric characters (except underscores) to keep commas and apostrophes out of column names
+             holiday_name = re.sub(r"[^\w]", "", holiday_name)
 
              # Remove "_shift" or "_substitute" if they appear as standalone suffixes
              if holiday_name.endswith("_shift"):
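The added re.sub call strips punctuation that survives the lowercase/underscore step, so holiday titles with full stops or apostrophes cannot leak stray characters into column names. A quick illustration with an invented holiday title:

```python
import re

# "St. Patrick's Day" (illustrative) -> lowercase + underscores -> strip punctuation
holiday_name = "St. Patrick's Day".lower().replace(" ", "_")   # "st._patrick's_day"
holiday_name = re.sub(r"[^\w]", "", holiday_name)              # drops "." and "'"
print(holiday_name)  # st_patricks_day
```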
@@ -1349,10 +1223,15 @@ class datapull:
          df_weekly_iso_week_year["Year"] = df_weekly_iso_week_year["Year"].astype(int)
 
          # --- Monthly dummies (spread evenly across week) ---
-         df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
+
+         df_daily["Month"] = pd.Categorical(
+             df_daily["Date"].dt.month_name().str.lower(),
+             categories=month_order,
+             ordered=True,
+         )
          df_monthly_dummies_daily = pd.get_dummies(
              df_daily[["week_start", "Month"]],  # Only need these columns
-             prefix="seas_month",
+             prefix="seas",
              columns=["Month"],
              dtype=float,  # Use float for division
          )
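Building the Month column as an ordered Categorical over month_order is what keeps the seasonal dummies in calendar order: pd.get_dummies emits one column per category, in category order, instead of sorting the observed month names alphabetically. A minimal standalone sketch:

```python
import pandas as pd

month_order = [
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
]
months = pd.Categorical(
    ["march", "january", "december"], categories=month_order, ordered=True
)
# One dummy column per category, in calendar order, even for unseen months.
print(pd.get_dummies(months, prefix="seas").columns.tolist())
# ['seas_january', 'seas_february', ..., 'seas_december']
```

This presumably is also why the later hunk drops the sorted() call around the seas_ columns: the order coming out of get_dummies is already the intended calendar order.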
@@ -1435,7 +1314,7 @@ class datapull:
          # Reorder columns - OBS first, then Constant, Trend, then seasonal features
          cols_order = (
              ["OBS", "Constant", "Trend"]
-             + sorted([col for col in df_combined.columns if col.startswith("seas_")])
+             + [col for col in df_combined.columns if col.startswith("seas_")]
              + sorted([col for col in df_combined.columns if col.startswith("dum_")])
          )  # If individual week dummies were enabled
 
@@ -1444,7 +1323,6 @@ class datapull:
          df_combined = df_combined[final_cols]
 
          return df_combined
-
      def pull_weather(self, week_commencing, start_date, country_codes) -> pd.DataFrame:
          """
          Pull weather data for a given week-commencing day and one or more country codes.
@@ -2397,13 +2275,13 @@ class datapull:
              cdid_list (list, optional): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
              week_start_day (str, optional): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'). Defaults to 'mon'.
              sector (str or list, optional): The sector(s) for which the standard CDIDs are fetched
-             (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
+                 (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
 
          Returns
          -------
          pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column (week commencing date)
-         and all series as renamed columns (e.g., 'macro_retail_sales_uk').
-         Returns an empty DataFrame if no data is fetched or processed.
+             and all series as renamed columns (e.g., 'macro_retail_sales_uk').
+             Returns an empty DataFrame if no data is fetched or processed.
 
          """
          # Define CDIDs for sectors and defaults
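The next hunk swaps set() for dict.fromkeys() when assembling the CDID list. Both de-duplicate, but dict.fromkeys() keeps first-seen order, so the CDIDs are requested in a stable order from run to run. A short sketch (JP9Z and UKPOP come from the docstring above; "ABCD" is a placeholder):

```python
cdids = ["JP9Z", "UKPOP", "JP9Z", "ABCD"]
print(list(dict.fromkeys(cdids)))  # ['JP9Z', 'UKPOP', 'ABCD'] - duplicates dropped, order kept
print(list(set(cdids)))            # same items, arbitrary order
```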
@@ -2436,16 +2314,11 @@ class datapull:
                  sector_cdids_map.get(sec, []),
              )  # Use extend to add items from the list
 
-         standard_cdids = list(
-             set(default_cdids + sector_specific_cdids),
-         )  # Combine default and selected sector CDIDs, ensure uniqueness
-
          # Combine standard CDIDs and any additional user-provided CDIDs
+         standard_cdids = list(dict.fromkeys(default_cdids + sector_specific_cdids))
          if cdid_list is None:
              cdid_list = []
-         final_cdid_list = list(
-             set(standard_cdids + cdid_list),
-         )  # Ensure uniqueness in the final list
+         final_cdid_list = list(dict.fromkeys(standard_cdids + cdid_list))
 
          base_search_url = (
              "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
@@ -2670,26 +2543,59 @@ class datapull:
          )
 
          def clean_column_name(name):
-             # Remove content within parentheses (e.g., CPI INDEX 00: ALL ITEMS 2015=100)
+             # Remove content within parentheses
              name = re.sub(r"\(.*?\)", "", name)
-             # Take only the part before the first colon if present
-             name = re.split(r":", name)[0]
-             # Remove digits
-             # name = re.sub(r"\d+", "", name)  # Reconsider removing all digits, might be needed for some series
-             # Remove specific words like 'annual', 'rate' case-insensitively
-             name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
+
+             # Special handling for ANY CPI items (not just CPI INDEX)
+             if "CPI" in name.upper():
+                 # Extract the description part after the colon for CPI items
+                 if ":" in name:
+                     parts = name.split(":")
+                     if len(parts) >= 2:
+                         # Take the description part (usually the second part)
+                         description = parts[1].strip()
+                         # Remove any remaining colons and everything after
+                         description = description.split(":")[0].strip()
+                         name = f"CPI {description}"
+
+                 # Remove numbers and dots for ALL CPI items (like 00, 06.2.2, 12.5.3/5)
+                 name = re.sub(r"\d+\.?\d*/?\.?\d*", "", name)
+
+             else:
+                 # For non-CPI items, take only the part before the first colon
+                 name = re.split(r":", name)[0]
+                 # Remove all digits for non-CPI items too
+                 name = re.sub(r"\d+", "", name)
+
+             # Remove year references like "2015=100"
+             name = re.sub(r"\d{4}=\d+", "", name)
+
+             # Remove specific words case-insensitively
+             name = re.sub(r"\b(annual|rate|index|seasonally|adjusted|sa|cvm)\b", "", name, flags=re.IGNORECASE)
+
+             # Replace "%" with the word "percent"
+             name = re.sub(r"%", "percent", name)
+
              # Remove non-alphanumeric characters (except underscore and space)
              name = re.sub(r"[^\w\s]", "", name)
+
              # Replace spaces with underscores
-             name = name.strip()  # Remove leading/trailing whitespace
-             name = name.replace(" ", "_")
+             name = name.strip().replace(" ", "_")
+
              # Replace multiple underscores with a single one
              name = re.sub(r"_+", "_", name)
-             # Remove trailing underscores
-             name = name.rstrip("_")
-             # Add prefix and suffix
+
+             # Remove leading/trailing underscores
+             name = name.strip("_")
+
+             # Truncate very long names (optional)
+             if len(name) > 50:
+                 words = name.split("_")
+                 # Keep first few meaningful words
+                 name = "_".join(words[:4])
+
              return f"macro_{name.lower()}_uk"
-
+
          # Apply cleaning function to relevant columns
          weekly_df.columns = [
              clean_column_name(col) if col != "week_commencing" else col
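To make the new cleaning rules concrete, here is a condensed, illustrative trace of the CPI branch on an invented ONS-style title; real titles come back from the ONS API, so exact outputs may differ:

```python
import re

name = "CPI INDEX 06.2.2: Out-patient services 2015=100"  # invented example title
name = re.sub(r"\(.*?\)", "", name)                        # no parentheses here
if "CPI" in name.upper() and ":" in name:
    name = f"CPI {name.split(':')[1].strip()}"             # keep the description part
    name = re.sub(r"\d+\.?\d*/?\.?\d*", "", name)          # strip codes like 06.2.2, 2015, 100
name = re.sub(r"\b(annual|rate|index|seasonally|adjusted|sa|cvm)\b", "", name, flags=re.IGNORECASE)
name = re.sub(r"[^\w\s]", "", name)                        # drop "=", "-" and similar
name = re.sub(r"_+", "_", name.strip().replace(" ", "_")).strip("_")
print(f"macro_{name.lower()}_uk")                          # macro_cpi_outpatient_services_uk
```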
@@ -2704,6 +2610,19 @@ class datapull:
              # Consider if 0 is the appropriate fill value for your use case
              # weekly_df = weekly_df.fillna(0)
 
+             # Get only the data columns (excluding OBS)
+             data_columns = [col for col in weekly_df.columns if col != "OBS"]
+
+             new_columns = ["OBS"]
+             for i, col in enumerate(data_columns):
+                 if i < len(final_cdid_list):
+                     new_columns.append(f"{col}_{final_cdid_list[i]}")
+                 else:
+                     new_columns.append(col)  # Keep original if no matching CDID
+
+             # Apply the new column names to the DataFrame
+             weekly_df.columns = new_columns
+
              return weekly_df
          print("No data successfully fetched or processed.")
          return pd.DataFrame()
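The last added block tags each cleaned column with a CDID by position, which relies on weekly_df's data columns arriving in the same order as final_cdid_list. A self-contained sketch with invented column names and the two CDIDs reused from the docstring example:

```python
import pandas as pd

weekly_df = pd.DataFrame({
    "OBS": pd.to_datetime(["2024-01-01", "2024-01-08"]),
    "macro_series_one_uk": [100.2, 100.4],   # invented cleaned names
    "macro_series_two_uk": [67.0, 67.1],
})
final_cdid_list = ["JP9Z", "UKPOP"]          # illustrative, order-matched CDIDs

data_columns = [col for col in weekly_df.columns if col != "OBS"]
weekly_df.columns = ["OBS"] + [
    f"{col}_{final_cdid_list[i]}" if i < len(final_cdid_list) else col
    for i, col in enumerate(data_columns)
]
print(weekly_df.columns.tolist())
# ['OBS', 'macro_series_one_uk_JP9Z', 'macro_series_two_uk_UKPOP']
```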