imsciences 1.0.2__py3-none-any.whl → 1.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imsciences/__init__.py +2 -0
- imsciences/oecd_pull.py +423 -0
- imsciences/pull-IMS-24Ltp-3.py +3132 -0
- imsciences/pull.py +137 -218
- imsciences-1.1.6.dist-info/METADATA +365 -0
- imsciences-1.0.2.dist-info/METADATA → imsciences-1.1.6.dist-info/PKG-INFO-IMS-24Ltp-3 +1 -1
- imsciences-1.1.6.dist-info/RECORD +14 -0
- {imsciences-1.0.2.dist-info → imsciences-1.1.6.dist-info}/WHEEL +1 -1
- imsciences/unittesting.py +0 -1314
- imsciences-1.0.2.dist-info/RECORD +0 -12
- {imsciences-1.0.2.dist-info → imsciences-1.1.6.dist-info}/PKG-INFO-TomG-HP-290722 +0 -0
- {imsciences-1.0.2.dist-info → imsciences-1.1.6.dist-info/licenses}/LICENSE.txt +0 -0
- {imsciences-1.0.2.dist-info → imsciences-1.1.6.dist-info}/top_level.txt +0 -0
imsciences/pull.py
CHANGED
@@ -1,4 +1,5 @@
 import importlib
+import os
 import re
 import time
 import urllib.request
@@ -19,7 +20,6 @@ from imsciences.mmm import dataprocessing
 
 ims_proc = dataprocessing()
 
-
 class datapull:
     def help(self):
         print("This is the help section. The functions in the package are as follows:")
@@ -281,10 +281,16 @@ class datapull:
         start_date: str = "2020-01-01",
     ) -> pd.DataFrame:
         """
-        […]
+        Load and process time series data from the cached OECD parquet file.
+
+        This method loads pre-fetched OECD data from either:
+        1. Shared network path (if accessible)
+        2. Local cache directory (fallback)
+
+        If the cache doesn't exist anywhere, it automatically runs the OECDDataPuller to generate it.
 
         Args:
-            country (
+            country (str): A string containing a 3-letter code of country of interest (E.g: "GBR", "FRA", "USA", "DEU")
             week_commencing (str): The starting day of the week for aggregation.
                 Options are "mon", "tue", "wed", "thu", "fri", "sat", "sun".
             start_date (str): Dataset start date in the format "YYYY-MM-DD"
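Note: the hunk above shows only part of the method signature, so the snippet below is a hedged usage sketch. The method name pull_oecd is an assumption for illustration; only the parameters documented in the docstring (country, week_commencing, start_date) are used.

    from imsciences.pull import datapull

    dp = datapull()
    # Returns a weekly DataFrame: 'OBS' holds the week-commencing dates,
    # the remaining macro_* columns hold the aggregated series.
    df = dp.pull_oecd(country="GBR", week_commencing="mon", start_date="2020-01-01")
    print(df.head())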
@@ -294,196 +300,59 @@ class datapull:
             commencing dates, and other columns contain the aggregated time series values.
 
         """
-        […]
-                "N.CPI",
-                "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
-                "PA.CP01.N.GY",
-                "macro_cpi_food",
-            ],
-            [
-                "N.CPI",
-                "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
-                "PA.CP045_0722.N.GY",
-                "macro_cpi_energy",
-            ],
-            [
-                "UNE_LF_M",
-                "SDD.TPS,DSD_LFS@DF_IALFS_UNE_M,",
-                "._Z.Y._T.Y_GE15.",
-                "macro_unemployment_rate",
-            ],
-            [
-                "EAR",
-                "SDD.TPS,DSD_EAR@DF_HOU_EAR,",
-                ".Y..S1D",
-                "macro_private_hourly_earnings",
-            ],
-            [
-                "RHP",
-                "ECO.MPD,DSD_AN_HOUSE_PRICES@DF_HOUSE_PRICES,1.0",
-                "",
-                "macro_real_house_prices",
-            ],
-            [
-                "PRVM",
-                "SDD.STES,DSD_KEI@DF_KEI,4.0",
-                "IX.C..",
-                "macro_manufacturing_production_volume",
-            ],
-            [
-                "TOVM",
-                "SDD.STES,DSD_KEI@DF_KEI,4.0",
-                "IX...",
-                "macro_retail_trade_volume",
-            ],
-            ["IRSTCI", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_interbank_rate"],
-            [
-                "IRLT",
-                "SDD.STES,DSD_KEI@DF_KEI,4.0",
-                "PA...",
-                "macro_long_term_interest_rate",
-            ],
-            [
-                "B1GQ",
-                "SDD.NAD,DSD_NAMAIN1@DF_QNA,1.1",
-                "._Z....GY.T0102",
-                "macro_gdp_growth_yoy",
-            ],
-        ]
-
-        # Create empty final dataframe
-        oecd_df_final = pd.DataFrame()
-
-        daily_df = pd.DataFrame({"OBS": date_range})
-        value_columns = []
-
-        # Iterate for each variable of interest
-        for series_details in url_details:
-            series = series_details[0]
-            dataset_id = series_details[1]
-            filter = series_details[2]
-            col_name = series_details[3]
-
-            # check if request was successful and determine the most granular data available
-            for freq in ["M", "Q", "A"]:
-                if series in ["UNE_LF_M", "EAR"]:
-                    data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{series}.{filter}.{freq}?startPeriod=1950-01"
-                elif series in ["B1GQ"]:
-                    data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{freq}..{country}...{series}.{filter}?startPeriod=1950-01"
-                else:
-                    data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{freq}.{series}.{filter}?startPeriod=1950-01"
-
-                # Make the request to the OECD API for data
-                data_response = requests.get(data_url)
-
-                # Check if the request was successful
-                if data_response.status_code != 200:
-                    print(
-                        f"Failed to fetch data for series {series} with frequency '{freq}' for {country}: {data_response.status_code} {data_response.text}",
-                    )
-                    url_test = False
-                    continue
-                url_test = True
-                break
-
-            # get data for the next variable if url doesn't exist
-            if url_test is False:
-                continue
-
-            root = ET.fromstring(data_response.content)
-
-            # Define namespaces if necessary (the namespace is included in the tags)
-            namespaces = {
-                "generic": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic",
-            }
-
-            # Lists to store the data
-            dates = []
-            values = []
-
-            # Iterate over all <Obs> elements and extract date and value
-            for obs in root.findall(".//generic:Obs", namespaces):
-                # Extracting the time period (date)
-                time_period = obs.find(".//generic:ObsDimension", namespaces).get(
-                    "value",
-                )
-
-                # Extracting the observation value
-                value = obs.find(".//generic:ObsValue", namespaces).get("value")
-
-                # Storing the data
-                if time_period and value:
-                    dates.append(time_period)
-                    values.append(float(value))  # Convert value to float
-
-            # Add variable names that were found to a list
-            value_columns.append(col_name)
-
-            # Creating a DataFrame
-            data = pd.DataFrame({"OBS": dates, col_name: values})
-
-            # Convert date strings into datetime format
-            if freq == "Q":
-                data["OBS"] = data["OBS"].apply(parse_quarter)
+        from pathlib import Path
+
+        # Try shared network path first, then fall back to local
+        user_home = os.path.expanduser("~")
+        shared_path = Path(user_home) / "im-sciences.com" / "FileShare - MasterDrive" / "Central Database" / "Pull All" / "OECD Database"
+        local_path = Path("oecd_data")
+        shared_data_file = shared_path / f"oecd_data_{country}.csv"
+        local_data_file = local_path / f"oecd_data_{country}.csv"
+
+        data_file = None
+        data_location = None
+
+        # Check shared path first
+        if shared_data_file.exists():
+            data_file = shared_data_file
+            data_location = "shared network"
+        # Fall back to local path
+        elif local_data_file.exists():
+            data_file = local_data_file
+            data_location = "local"
+
+        # If no cache found anywhere, run the puller
+        if data_file is None:
+            print(f"OECD data cache not found. Running OECDDataPuller to fetch data...")
+            from .oecd_pull import OECDDataPuller
+            puller = OECDDataPuller(
+                country=country,
+                start_date=start_date,
+                output_dir=None  # Let puller decide between shared/local
+            )
+            puller.run_until_complete(max_iterations=1)  # Run one complete cycle
+            print(f"OECD data fetched and cached successfully.\n")
+
+            # Determine where it was saved
+            if shared_data_file.exists():
+                data_file = shared_data_file
+                data_location = "shared network"
             else:
-                […]
+                data_file = local_data_file
+                data_location = "local"
 
-        […]
+        print(f"Loading OECD data from {data_location}: {data_file}")
+        daily_df = pd.read_csv(data_file)
+        daily_df['OBS'] = pd.to_datetime(daily_df['OBS'])
 
-        […]
-            daily_df,
-            data[["OBS", col_name]],
-            on="OBS",
-            direction="backward",
-        )
+        # Get list of value columns (exclude OBS)
+        value_columns = [col for col in daily_df.columns if col != "OBS"]
 
         # Ensure columns are numeric
         for col in value_columns:
-            […]
-                daily_df[col] = pd.to_numeric(daily_df[col], errors="coerce").fillna(0)
-            else:
-                print(f"Column {col} not found in daily_df")
+            daily_df[col] = pd.to_numeric(daily_df[col], errors="coerce").fillna(0)
 
-        # Aggregate
+        # Aggregate to weekly
         country_df = ims_proc.aggregate_daily_to_wc_wide(
             df=daily_df,
             date_column="OBS",
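Note: the replacement above no longer queries the SDMX API inline; it reads a per-country CSV cache and, when that cache is missing, falls back to OECDDataPuller from the new oecd_pull module. The sketch below shows how that puller could be invoked directly to refresh the cache; only the constructor arguments visible in this diff are used, and any other options are assumptions.

    from imsciences.oecd_pull import OECDDataPuller

    # Build (or rebuild) the cached oecd_data_<country>.csv up front,
    # so later pulls can load it instead of hitting the OECD API.
    puller = OECDDataPuller(
        country="GBR",
        start_date="2020-01-01",
        output_dir=None,  # as in pull.py: let the puller choose shared vs. local storage
    )
    puller.run_until_complete(max_iterations=1)  # one complete fetch-and-cache cycle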
@@ -493,11 +362,7 @@ class datapull:
             aggregation="average",
         )
 
-        […]
-            [oecd_df_final, country_df],
-            axis=0,
-            ignore_index=True,
-        )
+        return country_df
 
         return oecd_df_final
 
@@ -909,6 +774,12 @@ class datapull:
             "NG": "Nigeria",
             "ST": "SaoTomeAndPrincipe",
         }
+        # Month Order Dictionary
+        month_order = [
+            "january", "february", "march", "april",
+            "may", "june", "july", "august",
+            "september", "october", "november", "december",
+        ]
 
         # ---------------------------------------------------------------------
         # 1. Create daily date range from start_date to today
@@ -1124,6 +995,9 @@ class datapull:
             holiday_date = row["Date"]
             # Create column name without modifying original holiday names
             holiday_name = row["Holiday"].lower().replace(" ", "_")
+
+            # Remove all non-alphanumeric characters (except underscores) to prevent commas and apostrophes
+            holiday_name = re.sub(r"[^\w]", "", holiday_name)
 
             # Remove "_shift" or "_substitute" if they appear as standalone suffixes
             if holiday_name.endswith("_shift"):
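Note: a quick illustration of what the added re.sub call does to a holiday-derived column name (the input string here is made up for the example).

    import re

    holiday_name = "st._patrick's_day"
    holiday_name = re.sub(r"[^\w]", "", holiday_name)  # drop punctuation such as dots and apostrophes
    print(holiday_name)  # st_patricks_day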
@@ -1349,10 +1223,15 @@ class datapull:
         df_weekly_iso_week_year["Year"] = df_weekly_iso_week_year["Year"].astype(int)
 
         # --- Monthly dummies (spread evenly across week) ---
-        […]
+
+        df_daily["Month"] = pd.Categorical(
+            df_daily["Date"].dt.month_name().str.lower(),
+            categories=month_order,
+            ordered=True,
+        )
         df_monthly_dummies_daily = pd.get_dummies(
             df_daily[["week_start", "Month"]],  # Only need these columns
-            prefix="
+            prefix="seas",
             columns=["Month"],
             dtype=float,  # Use float for division
         )
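Note: a minimal sketch of why the Month column is now built with pd.Categorical over the fixed month_order list: pd.get_dummies then emits all twelve seas_* columns even when the date range only covers a few months, so the seasonal design matrix keeps a stable shape. The dates below are illustrative.

    import pandas as pd

    month_order = [
        "january", "february", "march", "april", "may", "june",
        "july", "august", "september", "october", "november", "december",
    ]
    dates = pd.date_range("2024-01-01", "2024-02-29", freq="D")
    months = pd.Categorical(
        dates.month_name().str.lower(), categories=month_order, ordered=True
    )
    dummies = pd.get_dummies(pd.Series(months), prefix="seas", dtype=float)
    print(dummies.shape[1])  # 12 columns, not just the two months present in the data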
@@ -1435,7 +1314,7 @@ class datapull:
         # Reorder columns - OBS first, then Constant, Trend, then seasonal features
         cols_order = (
             ["OBS", "Constant", "Trend"]
-            + […]
+            + [col for col in df_combined.columns if col.startswith("seas_")]
             + sorted([col for col in df_combined.columns if col.startswith("dum_")])
         )  # If individual week dummies were enabled
 
@@ -1444,7 +1323,6 @@ class datapull:
         df_combined = df_combined[final_cols]
 
         return df_combined
-
     def pull_weather(self, week_commencing, start_date, country_codes) -> pd.DataFrame:
         """
         Pull weather data for a given week-commencing day and one or more country codes.
@@ -2397,13 +2275,13 @@ class datapull:
             cdid_list (list, optional): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
             week_start_day (str, optional): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'). Defaults to 'mon'.
             sector (str or list, optional): The sector(s) for which the standard CDIDs are fetched
-                […]
+                (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
 
         Returns
         -------
             pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column (week commencing date)
-                […]
+                and all series as renamed columns (e.g., 'macro_retail_sales_uk').
+                Returns an empty DataFrame if no data is fetched or processed.
 
         """
         # Define CDIDs for sectors and defaults
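Note: a hedged usage sketch for the ONS pull documented above. The method name pull_macro_ons_uk is assumed for illustration; the arguments come straight from the docstring.

    from imsciences.pull import datapull

    dp = datapull()
    df = dp.pull_macro_ons_uk(
        cdid_list=["JP9Z", "UKPOP"],  # extra CDIDs on top of the defaults
        week_start_day="mon",
        sector="fast_food",
    )
    print(df.columns)  # 'OBS' plus renamed macro_*_uk series columns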
@@ -2436,16 +2314,11 @@ class datapull:
                     sector_cdids_map.get(sec, []),
                 )  # Use extend to add items from the list
 
-        standard_cdids = list(
-            set(default_cdids + sector_specific_cdids),
-        )  # Combine default and selected sector CDIDs, ensure uniqueness
-
         # Combine standard CDIDs and any additional user-provided CDIDs
+        standard_cdids = list(dict.fromkeys(default_cdids + sector_specific_cdids))
         if cdid_list is None:
             cdid_list = []
-        final_cdid_list = list(
-            set(standard_cdids + cdid_list),
-        )  # Ensure uniqueness in the final list
+        final_cdid_list = list(dict.fromkeys(standard_cdids + cdid_list))
 
         base_search_url = (
             "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
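Note: the switch from set() to dict.fromkeys() above keeps the de-duplication but also preserves the order the CDIDs were listed in, so the fetch order and resulting column order stay deterministic between runs. The CDIDs below are illustrative.

    default_cdids = ["D7BT", "UKPOP"]
    extra_cdids = ["JP9Z", "UKPOP"]

    print(list(set(default_cdids + extra_cdids)))            # unique, but order is arbitrary
    print(list(dict.fromkeys(default_cdids + extra_cdids)))  # ['D7BT', 'UKPOP', 'JP9Z']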
@@ -2670,26 +2543,59 @@ class datapull:
         )
 
         def clean_column_name(name):
-            # Remove content within parentheses
+            # Remove content within parentheses
            name = re.sub(r"\(.*?\)", "", name)
-            […]
+
+            # Special handling for ANY CPI items (not just CPI INDEX)
+            if "CPI" in name.upper():
+                # Extract the description part after the colon for CPI items
+                if ":" in name:
+                    parts = name.split(":")
+                    if len(parts) >= 2:
+                        # Take the description part (usually the second part)
+                        description = parts[1].strip()
+                        # Remove any remaining colons and everything after
+                        description = description.split(":")[0].strip()
+                        name = f"CPI {description}"
+
+                # Remove numbers and dots for ALL CPI items (like 00, 06.2.2, 12.5.3/5)
+                name = re.sub(r"\d+\.?\d*/?\.?\d*", "", name)
+
+            else:
+                # For non-CPI items, take only the part before the first colon
+                name = re.split(r":", name)[0]
+                # Remove all digits for non-CPI items too
+                name = re.sub(r"\d+", "", name)
+
+            # Remove year references like "2015=100"
+            name = re.sub(r"\d{4}=\d+", "", name)
+
+            # Remove specific words case-insensitively
+            name = re.sub(r"\b(annual|rate|index|seasonally|adjusted|sa|cvm)\b", "", name, flags=re.IGNORECASE)
+
+            # Remove percentage symbols and "%"
+            name = re.sub(r"%", "percent", name)
+
             # Remove non-alphanumeric characters (except underscore and space)
             name = re.sub(r"[^\w\s]", "", name)
+
             # Replace spaces with underscores
-            name = name.strip()
-            […]
+            name = name.strip().replace(" ", "_")
+
             # Replace multiple underscores with a single one
             name = re.sub(r"_+", "_", name)
-            […]
+
+            # Remove leading/trailing underscores
+            name = name.strip("_")
+
+            # Truncate very long names (optional)
+            if len(name) > 50:
+                words = name.split("_")
+                # Keep first few meaningful words
+                name = "_".join(words[:4])
+
             return f"macro_{name.lower()}_uk"
-
+
         # Apply cleaning function to relevant columns
         weekly_df.columns = [
             clean_column_name(col) if col != "week_commencing" else col
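Note: as reconstructed above, the new cleaning logic maps, for example, an ONS series title like "CPI INDEX 00: ALL ITEMS 2015=100" to macro_cpi_all_items_uk, and "Retail sales: volume index (2019=100)" to macro_retail_sales_uk, which matches the column naming quoted in the docstring; the example titles are illustrative.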
@@ -2704,6 +2610,19 @@ class datapull:
             # Consider if 0 is the appropriate fill value for your use case
             # weekly_df = weekly_df.fillna(0)
 
+            # Get only the data columns (excluding OBS)
+            data_columns = [col for col in weekly_df.columns if col != "OBS"]
+
+            new_columns = ["OBS"]
+            for i, col in enumerate(data_columns):
+                if i < len(final_cdid_list):
+                    new_columns.append(f"{col}_{final_cdid_list[i]}")
+                else:
+                    new_columns.append(col)  # Keep original if no matching CDID
+
+            # Apply the new column names to the DataFrame
+            weekly_df.columns = new_columns
+
             return weekly_df
         print("No data successfully fetched or processed.")
         return pd.DataFrame()