imsciences 1.0.2__tar.gz → 1.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {imsciences-1.0.2/imsciences.egg-info → imsciences-1.1.2}/PKG-INFO +11 -2
- {imsciences-1.0.2 → imsciences-1.1.2}/imsciences/__init__.py +2 -0
- imsciences-1.1.2/imsciences/oecd_pull.py +423 -0
- imsciences-1.0.2/imsciences/pull.py → imsciences-1.1.2/imsciences/pull-IMS-24Ltp-3.py +64 -23
- imsciences-1.1.2/imsciences/pull.py +3010 -0
- imsciences-1.1.2/imsciences.egg-info/PKG-INFO +365 -0
- imsciences-1.0.2/PKG-INFO → imsciences-1.1.2/imsciences.egg-info/PKG-INFO-IMS-24Ltp-3 +1 -1
- {imsciences-1.0.2 → imsciences-1.1.2}/imsciences.egg-info/SOURCES.txt +3 -0
- {imsciences-1.0.2 → imsciences-1.1.2}/setup.py +1 -3
- {imsciences-1.0.2 → imsciences-1.1.2}/LICENSE.txt +0 -0
- {imsciences-1.0.2 → imsciences-1.1.2}/README.md +0 -0
- {imsciences-1.0.2 → imsciences-1.1.2}/imsciences/geo.py +0 -0
- {imsciences-1.0.2 → imsciences-1.1.2}/imsciences/mmm.py +0 -0
- {imsciences-1.0.2 → imsciences-1.1.2}/imsciences/vis.py +0 -0
- {imsciences-1.0.2 → imsciences-1.1.2}/imsciences.egg-info/PKG-INFO-TomG-HP-290722 +0 -0
- {imsciences-1.0.2 → imsciences-1.1.2}/imsciences.egg-info/dependency_links.txt +0 -0
- {imsciences-1.0.2 → imsciences-1.1.2}/imsciences.egg-info/requires.txt +0 -0
- {imsciences-1.0.2 → imsciences-1.1.2}/imsciences.egg-info/top_level.txt +0 -0
- {imsciences-1.0.2 → imsciences-1.1.2}/pyproject.toml +0 -0
- {imsciences-1.0.2 → imsciences-1.1.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: imsciences
-Version: 1.0.2
+Version: 1.1.2
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com
@@ -26,6 +26,15 @@ Requires-Dist: google-analytics-data
 Requires-Dist: geopandas
 Requires-Dist: geopy
 Requires-Dist: workalendar
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: keywords
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: summary
 
 # IMS Package Documentation
 
@@ -0,0 +1,423 @@
+import pandas as pd
+import requests
+import xml.etree.ElementTree as ET
+from datetime import datetime
+import time
+import json
+import os
+from pathlib import Path
+
+
+class OECDDataPuller:
+    """
+    OECD data puller that saves progress and retries until all indicators are fetched.
+    Designed to handle API rate limits by saving state between runs.
+    """
+
+    INDICATOR_CONFIG = [
+        {
+            "name": "Business Confidence Index",
+            "series": "BCICP",
+            "dataset": "SDD.STES,DSD_STES@DF_CLI,",
+            "filter": ".....",
+            "col_name": "macro_business_confidence_index",
+        },
+        {
+            "name": "Consumer Confidence Index",
+            "series": "CCICP",
+            "dataset": "SDD.STES,DSD_STES@DF_CLI,",
+            "filter": ".....",
+            "col_name": "macro_consumer_confidence_index",
+        },
+        {
+            "name": "CPI Total",
+            "series": "N.CPI",
+            "dataset": "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
+            "filter": "PA._T.N.GY",
+            "col_name": "macro_cpi_total",
+        },
+        {
+            "name": "CPI Housing",
+            "series": "N.CPI",
+            "dataset": "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
+            "filter": "PA.CP041T043.N.GY",
+            "col_name": "macro_cpi_housing",
+        },
+        {
+            "name": "CPI Food",
+            "series": "N.CPI",
+            "dataset": "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
+            "filter": "PA.CP01.N.GY",
+            "col_name": "macro_cpi_food",
+        },
+        {
+            "name": "CPI Energy",
+            "series": "N.CPI",
+            "dataset": "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
+            "filter": "PA.CP045_0722.N.GY",
+            "col_name": "macro_cpi_energy",
+        },
+        {
+            "name": "Unemployment Rate",
+            "series": "UNE_LF_M",
+            "dataset": "SDD.TPS,DSD_LFS@DF_IALFS_UNE_M,",
+            "filter": "._Z.Y._T.Y_GE15.",
+            "col_name": "macro_unemployment_rate",
+            "special": "SPECIAL_UNE",
+        },
+        {
+            "name": "Real House Prices",
+            "series": "RHP",
+            "dataset": "ECO.MPD,DSD_AN_HOUSE_PRICES@DF_HOUSE_PRICES,1.0",
+            "filter": "",
+            "col_name": "macro_real_house_prices",
+        },
+        {
+            "name": "Manufacturing Production",
+            "series": "PRVM",
+            "dataset": "SDD.STES,DSD_KEI@DF_KEI,4.0",
+            "filter": "IX.C..",
+            "col_name": "macro_manufacturing_production_volume",
+        },
+        {
+            "name": "Retail Trade Volume",
+            "series": "TOVM",
+            "dataset": "SDD.STES,DSD_KEI@DF_KEI,4.0",
+            "filter": "IX...",
+            "col_name": "macro_retail_trade_volume",
+        },
+        {
+            "name": "Interbank Rate",
+            "series": "IRSTCI",
+            "dataset": "SDD.STES,DSD_KEI@DF_KEI,4.0",
+            "filter": "PA...",
+            "col_name": "macro_interbank_rate",
+        },
+        {
+            "name": "Long-term Interest Rate",
+            "series": "IRLT",
+            "dataset": "SDD.STES,DSD_KEI@DF_KEI,4.0",
+            "filter": "PA...",
+            "col_name": "macro_long_term_interest_rate",
+        },
+        {
+            "name": "GDP Growth",
+            "series": "B1GQ",
+            "dataset": "SDD.NAD,DSD_NAMAIN1@DF_QNA,1.1",
+            "filter": "._Z....GY.T0102",
+            "col_name": "macro_gdp_growth_yoy",
+            "special": "SPECIAL_GDP",
+        },
+    ]
+
+    def __init__(self, country="GBR", start_date="2020-01-01", output_dir=None):
+        """
+        Initialize the puller.
+
+        Args:
+            country (str): Country code (e.g., "GBR")
+            start_date (str): Start date for data collection
+            output_dir (str): Directory to save output files and state.
+                Defaults to shared network path if available, else local "oecd_data"
+        """
+        self.country = country
+        self.start_date = start_date
+
+        # Determine output directory: try shared path first, fall back to local
+        if output_dir is None:
+            user_home = os.path.expanduser("~")
+            shared_path = Path(user_home) / "im-sciences.com" / "FileShare - MasterDrive" / "Central Database" / "Pull All" / "OECD Database"
+            local_path = Path("oecd_data")
+
+            # Try to use shared path if it exists and is accessible
+            if shared_path.exists() and shared_path.is_dir():
+                self.output_dir = shared_path
+                print(f"Using shared network path: {self.output_dir}")
+            else:
+                self.output_dir = local_path
+                print(f"Shared path not available. Using local directory: {self.output_dir}")
+        else:
+            self.output_dir = Path(output_dir)
+
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+        self.state_file = self.output_dir / f"state_{country}.json"
+        self.data_file = self.output_dir / f"oecd_data_{country}.csv"
+        self.log_file = self.output_dir / f"log_{country}.txt"
+
+        self.state = self._load_state()
+
+    def _load_state(self):
+        """Load the current state from file, or initialize a new state."""
+        if self.state_file.exists():
+            with open(self.state_file, 'r') as f:
+                return json.load(f)
+        else:
+            return {
+                "completed_indicators": [],
+                "failed_attempts": {},
+                "last_run": None,
+                "fully_complete": False
+            }
+
+    def _save_state(self):
+        """Save the current state to file."""
+        self.state["last_run"] = datetime.now().isoformat()
+        with open(self.state_file, 'w') as f:
+            json.dump(self.state, f, indent=2)
+
+    def _log(self, message):
+        """Write a log message to both console and log file."""
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        log_message = f"[{timestamp}] {message}"
+        print(log_message)
+
+        with open(self.log_file, 'a', encoding='utf-8') as f:
+            f.write(log_message + "\n")
+
+    @staticmethod
+    def parse_quarter(date_str):
+        """Parse a string in 'YYYY-Q#' format into a datetime object."""
+        year, quarter = date_str.split("-")
+        quarter_number = int(quarter[1])
+        month = (quarter_number - 1) * 3 + 1
+        return pd.Timestamp(f"{year}-{month:02d}-01")
+
+    def _build_url(self, series, dataset_id, filter_val, freq, special_flag=None):
+        """Build the appropriate OECD API URL based on indicator type."""
+        if special_flag == "SPECIAL_GDP":
+            return f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{freq}..{self.country}...{series}.{filter_val}?startPeriod=1950-01"
+        elif special_flag == "SPECIAL_UNE":
+            return f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{self.country}.{series}.{filter_val}.{freq}?startPeriod=1950-01"
+        else:
+            return f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{self.country}.{freq}.{series}.{filter_val}?startPeriod=1950-01"
+
+    def _extract_observations(self, xml_content):
+        """Extract dates and values from OECD API XML response."""
+        root = ET.fromstring(xml_content)
+        namespaces = {
+            "generic": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic",
+        }
+
+        dates = []
+        values = []
+
+        for obs in root.findall(".//generic:Obs", namespaces):
+            time_period = obs.find(".//generic:ObsDimension", namespaces).get("value")
+            value = obs.find(".//generic:ObsValue", namespaces).get("value")
+
+            if time_period and value:
+                dates.append(time_period)
+                values.append(float(value))
+
+        return dates, values
+
+    def _fetch_indicator_data(self, series, dataset_id, filter_val, col_name, special_flag=None):
+        """
+        Attempt to fetch data for a single indicator across different frequencies.
+
+        Returns:
+            tuple: (DataFrame or None, frequency_used, success_flag)
+        """
+        for freq in ["M", "Q", "A"]:
+            url = self._build_url(series, dataset_id, filter_val, freq, special_flag)
+
+            try:
+                response = requests.get(url, timeout=15)
+
+                if response.status_code == 429:
+                    self._log(f"Rate limit hit for {col_name}")
+                    return None, None, False
+
+                if response.status_code != 200:
+                    continue
+
+                dates, values = self._extract_observations(response.content)
+
+                if len(dates) == 0:
+                    continue
+
+                data = pd.DataFrame({"OBS": dates, col_name: values})
+
+                if freq == "Q":
+                    data["OBS"] = data["OBS"].apply(self.parse_quarter)
+                else:
+                    data["OBS"] = data["OBS"].apply(lambda x: datetime.strptime(x, "%Y-%m"))
+
+                data.sort_values(by="OBS", inplace=True)
+
+                return data, freq, True
+
+            except Exception as e:
+                self._log(f"Error fetching {col_name}: {str(e)}")
+                continue
+
+        return None, None, False
+
+    def _load_existing_data(self):
+        """Load existing data from CSV file if it exists."""
+        if self.data_file.exists():
+            df = pd.read_csv(self.data_file)
+            df['OBS'] = pd.to_datetime(df['OBS'])
+            return df
+        else:
+            date_range = pd.date_range(start=self.start_date, end=datetime.today(), freq="D")
+            return pd.DataFrame({"OBS": date_range})
+
+    def _save_data(self, df):
+        """Save DataFrame to CSV file."""
+        df.to_csv(self.data_file, index=False)
+
+    def fetch_pending_indicators(self):
+        """Fetch all indicators to ensure the most up-to-date data."""
+        # Reset completed indicators for this fresh run
+        self.state["completed_indicators"] = []
+
+        daily_df = self._load_existing_data()
+
+        # Always attempt to refresh all indicators
+        pending_indicators = self.INDICATOR_CONFIG
+
+        self._log(f"Starting fetch cycle. Pending: {len(pending_indicators)}/{len(self.INDICATOR_CONFIG)}")
+
+        for indicator in pending_indicators:
+            col_name = indicator["col_name"]
+
+            self._log(f"Fetching: {indicator['name']} ({col_name})")
+
+            data, freq_used, success = self._fetch_indicator_data(
+                indicator["series"],
+                indicator["dataset"],
+                indicator["filter"],
+                col_name,
+                indicator.get("special")
+            )
+
+            if success:
+                self._log(f" [OK] Success! {len(data)} observations ({freq_used})")
+
+                daily_df = pd.merge_asof(
+                    daily_df,
+                    data[["OBS", col_name]],
+                    on="OBS",
+                    direction="backward",
+                )
+
+                self.state["completed_indicators"].append(col_name)
+
+                if col_name in self.state["failed_attempts"]:
+                    del self.state["failed_attempts"][col_name]
+
+                self._save_data(daily_df)
+                self._save_state()
+
+                time.sleep(2)
+
+            else:
+                self._log(f" [FAIL] Failed to fetch {col_name}")
+
+                if col_name not in self.state["failed_attempts"]:
+                    self.state["failed_attempts"][col_name] = 0
+                self.state["failed_attempts"][col_name] += 1
+
+                self._save_state()
+
+        remaining = len(self.INDICATOR_CONFIG) - len(self.state["completed_indicators"])
+
+        if remaining == 0:
+            self._log("All indicators successfully fetched!")
+        else:
+            self._log(f"Fetch cycle complete. {remaining} indicators still have failures.")
+
+        return True  # Always return True to indicate the refresh cycle completed
+
+    def run_until_complete(self, max_iterations=None, sleep_seconds=3600):
+        """
+        Run the fetcher to refresh all indicators with up-to-date data.
+
+        Args:
+            max_iterations (int): Maximum number of cycles to run (None = unlimited)
+            sleep_seconds (int): Seconds to wait between cycles (default: 3600 = 1 hour)
+        """
+        iteration = 0
+
+        self._log("=" * 80)
+        self._log(f"Starting OECD data puller for {self.country}")
+        self._log(f"Output directory: {self.output_dir.absolute()}")
+        self._log("=" * 80)
+
+        while True:
+            iteration += 1
+
+            if max_iterations and iteration > max_iterations:
+                self._log(f"Reached maximum iterations ({max_iterations}). Stopping.")
+                break
+
+            self._log(f"{'='*80}")
+            self._log(f"ITERATION {iteration}")
+            self._log(f"{'='*80}")
+
+            self.fetch_pending_indicators()
+
+            # Check if all indicators have data
+            if len(self.state["completed_indicators"]) == len(self.INDICATOR_CONFIG):
+                self._log("=" * 80)
+                self._log("SUCCESS! All indicators have been fetched.")
+                self._log(f"Data saved to: {self.data_file.absolute()}")
+                self._log("=" * 80)
+                break
+
+            self._log(f"Waiting {sleep_seconds} seconds before next attempt...")
+            self._log(f"Next run scheduled for: {(datetime.now() + pd.Timedelta(seconds=sleep_seconds)).strftime('%Y-%m-%d %H:%M:%S')}")
+
+            time.sleep(sleep_seconds)
+
+    def get_status(self):
+        """Get current status of data collection."""
+        total = len(self.INDICATOR_CONFIG)
+        completed = len(self.state["completed_indicators"])
+
+        status = {
+            "country": self.country,
+            "total_indicators": total,
+            "completed_indicators": completed,
+            "remaining_indicators": total - completed,
+            "completion_percentage": (completed / total) * 100,
+            "fully_complete": self.state["fully_complete"],
+            "last_run": self.state["last_run"],
+            "failed_attempts": self.state["failed_attempts"],
+            "completed_list": self.state["completed_indicators"]
+        }
+
+        return status
+
+
+def main():
+    """Main execution function with example usage."""
+
+    # Example 1: Run until complete with 1-hour intervals
+    puller = OECDDataPuller(
+        country="GBR",
+        start_date="2020-01-01",
+        output_dir="oecd_data"
+    )
+
+    # Check current status
+    status = puller.get_status()
+    print(f"Current progress: {status['completed_indicators']}/{status['total_indicators']} "
+          f"({status['completion_percentage']:.1f}%)")
+
+    # Run until complete (will retry every hour)
+    puller.run_until_complete(sleep_seconds=3600)
+
+    # Example 2: Run a single fetch cycle (useful for manual/scheduled execution)
+    # puller = OECDDataPuller(country="GBR")
+    # puller.fetch_pending_indicators()
+
+    # Example 3: Run with a custom sleep interval (e.g., 30 minutes)
+    # puller = OECDDataPuller(country="USA")
+    # puller.run_until_complete(sleep_seconds=1800)
+
+
+if __name__ == "__main__":
+    main()
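
Note on the URL scheme above: the default branch of _build_url concatenates the dataset id, country, frequency, series, and filter into one SDMX REST path. A minimal sketch using the Business Confidence Index entry from INDICATOR_CONFIG and the class defaults (country "GBR"; the monthly frequency "M" is tried first); the expanded string below is our own illustration, not output captured from the package:

    country, freq = "GBR", "M"
    series, filter_val = "BCICP", "....."
    dataset_id = "SDD.STES,DSD_STES@DF_CLI,"
    url = (
        f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/"
        f"{country}.{freq}.{series}.{filter_val}?startPeriod=1950-01"
    )
    # url == "https://sdmx.oecd.org/public/rest/data/OECD.SDD.STES,DSD_STES@DF_CLI,/GBR.M.BCICP......?startPeriod=1950-01"
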
@@ -2397,13 +2397,13 @@ class datapull:
             cdid_list (list, optional): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
             week_start_day (str, optional): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'). Defaults to 'mon'.
             sector (str or list, optional): The sector(s) for which the standard CDIDs are fetched
-
+                (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
 
         Returns
         -------
         pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column (week commencing date)
-
-
+            and all series as renamed columns (e.g., 'macro_retail_sales_uk').
+            Returns an empty DataFrame if no data is fetched or processed.
 
         """
         # Define CDIDs for sectors and defaults
@@ -2436,16 +2436,11 @@ class datapull:
                 sector_cdids_map.get(sec, []),
             )  # Use extend to add items from the list
 
-        standard_cdids = list(
-            set(default_cdids + sector_specific_cdids),
-        )  # Combine default and selected sector CDIDs, ensure uniqueness
-
         # Combine standard CDIDs and any additional user-provided CDIDs
+        standard_cdids = list(dict.fromkeys(default_cdids + sector_specific_cdids))
         if cdid_list is None:
             cdid_list = []
-        final_cdid_list = list(
-            set(standard_cdids + cdid_list),
-        )  # Ensure uniqueness in the final list
+        final_cdid_list = list(dict.fromkeys(standard_cdids + cdid_list))
 
         base_search_url = (
             "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
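
The two replacements above swap set(...) for dict.fromkeys(...) when de-duplicating CDIDs. Both drop repeats, but dict.fromkeys also preserves the order in which the CDIDs were listed (dict insertion order is guaranteed from Python 3.7), which helps keep the fetch order and downstream column order stable between runs. A small illustration with example CDIDs:

    cdids = ["JP9Z", "UKPOP", "JP9Z", "UKPOP"]
    list(dict.fromkeys(cdids))  # -> ['JP9Z', 'UKPOP'] (duplicates dropped, input order kept)
    list(set(cdids))            # also de-duplicated, but the resulting order is not guaranteed
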
@@ -2670,26 +2665,59 @@ class datapull:
         )
 
         def clean_column_name(name):
-            # Remove content within parentheses
+            # Remove content within parentheses
             name = re.sub(r"\(.*?\)", "", name)
-
-
-
-
-
-
+
+            # Special handling for ANY CPI items (not just CPI INDEX)
+            if "CPI" in name.upper():
+                # Extract the description part after the colon for CPI items
+                if ":" in name:
+                    parts = name.split(":")
+                    if len(parts) >= 2:
+                        # Take the description part (usually the second part)
+                        description = parts[1].strip()
+                        # Remove any remaining colons and everything after
+                        description = description.split(":")[0].strip()
+                        name = f"CPI {description}"
+
+                # Remove numbers and dots for ALL CPI items (like 00, 06.2.2, 12.5.3/5)
+                name = re.sub(r"\d+\.?\d*/?\.?\d*", "", name)
+
+            else:
+                # For non-CPI items, take only the part before the first colon
+                name = re.split(r":", name)[0]
+                # Remove all digits for non-CPI items too
+                name = re.sub(r"\d+", "", name)
+
+            # Remove year references like "2015=100"
+            name = re.sub(r"\d{4}=\d+", "", name)
+
+            # Remove specific words case-insensitively
+            name = re.sub(r"\b(annual|rate|index|seasonally|adjusted|sa|cvm)\b", "", name, flags=re.IGNORECASE)
+
+            # Remove percentage symbols and "%"
+            name = re.sub(r"%", "percent", name)
+
             # Remove non-alphanumeric characters (except underscore and space)
             name = re.sub(r"[^\w\s]", "", name)
+
             # Replace spaces with underscores
-            name = name.strip()
-
+            name = name.strip().replace(" ", "_")
+
             # Replace multiple underscores with a single one
             name = re.sub(r"_+", "_", name)
-
-
-
+
+            # Remove leading/trailing underscores
+            name = name.strip("_")
+
+            # Truncate very long names (optional)
+            if len(name) > 50:
+                words = name.split("_")
+                # Keep first few meaningful words
+                name = "_".join(words[:4])
+
             return f"macro_{name.lower()}_uk"
-
+
         # Apply cleaning function to relevant columns
         weekly_df.columns = [
             clean_column_name(col) if col != "week_commencing" else col
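
To make the new CPI handling concrete, here is how the updated clean_column_name would treat two made-up ONS-style titles (the input strings are illustrative and not taken from this diff): for CPI items the text after the colon becomes the description, the "CPI" prefix is re-attached, and digits plus the "2015=100" base-year fragment are stripped; non-CPI items keep only the part before the first colon.

    clean_column_name("CPI INDEX 00: ALL ITEMS 2015=100")
    # -> "macro_cpi_all_items_uk"
    clean_column_name("Retail sales: All retailing (2019=100)")
    # -> "macro_retail_sales_uk"
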
@@ -2704,6 +2732,19 @@ class datapull:
             # Consider if 0 is the appropriate fill value for your use case
             # weekly_df = weekly_df.fillna(0)
 
+            # Get only the data columns (excluding OBS)
+            data_columns = [col for col in weekly_df.columns if col != "OBS"]
+
+            new_columns = ["OBS"]
+            for i, col in enumerate(data_columns):
+                if i < len(final_cdid_list):
+                    new_columns.append(f"{col}_{final_cdid_list[i]}")
+                else:
+                    new_columns.append(col)  # Keep original if no matching CDID
+
+            # Apply the new column names to the DataFrame
+            weekly_df.columns = new_columns
+
             return weekly_df
         print("No data successfully fetched or processed.")
         return pd.DataFrame()
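
One observation on the renaming block above: the CDID suffix is applied positionally, so it relies on the data columns coming back in the same order as final_cdid_list. A hypothetical before/after (the column names and CDIDs here are illustrative only):

    final_cdid_list = ["JP9Z", "UKPOP"]
    # weekly_df.columns before: ["OBS", "macro_retail_sales_uk", "macro_population_uk"]
    # after the loop:           ["OBS", "macro_retail_sales_uk_JP9Z", "macro_population_uk_UKPOP"]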