imsciences 1.0.9__tar.gz → 1.1.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: imsciences
- Version: 1.0.9
+ Version: 1.1.8
  Summary: IMS Data Processing Package
  Author: IMS
  Author-email: cam@im-sciences.com
@@ -1,3 +1,5 @@
+ __version__ = "1.1.8"
+
  from .geo import geoprocessing
  from .mmm import dataprocessing
  from .pull import datapull
@@ -0,0 +1,423 @@
+ import pandas as pd
+ import requests
+ import xml.etree.ElementTree as ET
+ from datetime import datetime
+ import time
+ import json
+ import os
+ from pathlib import Path
+
+
+ class OECDDataPuller:
+     """
+     OECD data puller that saves progress and retries until all indicators are fetched.
+     Designed to handle API rate limits by saving state between runs.
+     """
+
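+     # One entry per indicator: "dataset" and "series" identify the OECD SDMX
+     # dataflow and series, "filter" carries the remaining dimension key, and
+     # "special" flags indicators whose REST key uses a non-standard dimension
+     # order (see _build_url).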
+     INDICATOR_CONFIG = [
+         {
+             "name": "Business Confidence Index",
+             "series": "BCICP",
+             "dataset": "SDD.STES,DSD_STES@DF_CLI,",
+             "filter": ".....",
+             "col_name": "macro_business_confidence_index",
+         },
+         {
+             "name": "Consumer Confidence Index",
+             "series": "CCICP",
+             "dataset": "SDD.STES,DSD_STES@DF_CLI,",
+             "filter": ".....",
+             "col_name": "macro_consumer_confidence_index",
+         },
+         {
+             "name": "CPI Total",
+             "series": "N.CPI",
+             "dataset": "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
+             "filter": "PA._T.N.GY",
+             "col_name": "macro_cpi_total",
+         },
+         {
+             "name": "CPI Housing",
+             "series": "N.CPI",
+             "dataset": "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
+             "filter": "PA.CP041T043.N.GY",
+             "col_name": "macro_cpi_housing",
+         },
+         {
+             "name": "CPI Food",
+             "series": "N.CPI",
+             "dataset": "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
+             "filter": "PA.CP01.N.GY",
+             "col_name": "macro_cpi_food",
+         },
+         {
+             "name": "CPI Energy",
+             "series": "N.CPI",
+             "dataset": "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
+             "filter": "PA.CP045_0722.N.GY",
+             "col_name": "macro_cpi_energy",
+         },
+         {
+             "name": "Unemployment Rate",
+             "series": "UNE_LF_M",
+             "dataset": "SDD.TPS,DSD_LFS@DF_IALFS_UNE_M,",
+             "filter": "._Z.Y._T.Y_GE15.",
+             "col_name": "macro_unemployment_rate",
+             "special": "SPECIAL_UNE",
+         },
+         {
+             "name": "Real House Prices",
+             "series": "RHP",
+             "dataset": "ECO.MPD,DSD_AN_HOUSE_PRICES@DF_HOUSE_PRICES,1.0",
+             "filter": "",
+             "col_name": "macro_real_house_prices",
+         },
+         {
+             "name": "Manufacturing Production",
+             "series": "PRVM",
+             "dataset": "SDD.STES,DSD_KEI@DF_KEI,4.0",
+             "filter": "IX.C..",
+             "col_name": "macro_manufacturing_production_volume",
+         },
+         {
+             "name": "Retail Trade Volume",
+             "series": "TOVM",
+             "dataset": "SDD.STES,DSD_KEI@DF_KEI,4.0",
+             "filter": "IX...",
+             "col_name": "macro_retail_trade_volume",
+         },
+         {
+             "name": "Interbank Rate",
+             "series": "IRSTCI",
+             "dataset": "SDD.STES,DSD_KEI@DF_KEI,4.0",
+             "filter": "PA...",
+             "col_name": "macro_interbank_rate",
+         },
+         {
+             "name": "Long-term Interest Rate",
+             "series": "IRLT",
+             "dataset": "SDD.STES,DSD_KEI@DF_KEI,4.0",
+             "filter": "PA...",
+             "col_name": "macro_long_term_interest_rate",
+         },
+         {
+             "name": "GDP Growth",
+             "series": "B1GQ",
+             "dataset": "SDD.NAD,DSD_NAMAIN1@DF_QNA,1.1",
+             "filter": "._Z....GY.T0102",
+             "col_name": "macro_gdp_growth_yoy",
+             "special": "SPECIAL_GDP",
+         },
+     ]
+
+     def __init__(self, country="GBR", start_date="2020-01-01", output_dir=None):
+         """
+         Initialize the puller.
+
+         Args:
+             country (str): Country code (e.g., "GBR")
+             start_date (str): Start date for data collection
+             output_dir (str): Directory to save output files and state.
+                 Defaults to shared network path if available, else local "oecd_data"
+         """
+         self.country = country
+         self.start_date = start_date
+
+         # Determine output directory: try shared path first, fall back to local
+         if output_dir is None:
+             user_home = os.path.expanduser("~")
+             shared_path = Path(user_home) / "im-sciences.com" / "FileShare - MasterDrive" / "Central Database" / "Pull All" / "OECD Database"
+             local_path = Path("oecd_data")
+
+             # Try to use shared path if it exists and is accessible
+             if shared_path.exists() and shared_path.is_dir():
+                 self.output_dir = shared_path
+                 print(f"Using shared network path: {self.output_dir}")
+             else:
+                 self.output_dir = local_path
+                 print(f"Shared path not available. Using local directory: {self.output_dir}")
+         else:
+             self.output_dir = Path(output_dir)
+
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+
+         self.state_file = self.output_dir / f"state_{country}.json"
+         self.data_file = self.output_dir / f"oecd_data_{country}.csv"
+         self.log_file = self.output_dir / f"log_{country}.txt"
+
+         self.state = self._load_state()
+
+     def _load_state(self):
+         """Load the current state from file, or initialize a new state."""
+         if self.state_file.exists():
+             with open(self.state_file, 'r') as f:
+                 return json.load(f)
+         else:
+             return {
+                 "completed_indicators": [],
+                 "failed_attempts": {},
+                 "last_run": None,
+                 "fully_complete": False
+             }
+
+     def _save_state(self):
+         """Save the current state to file."""
+         self.state["last_run"] = datetime.now().isoformat()
+         with open(self.state_file, 'w') as f:
+             json.dump(self.state, f, indent=2)
+
+     def _log(self, message):
+         """Write a log message to both console and log file."""
+         timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+         log_message = f"[{timestamp}] {message}"
+         print(log_message)
+
+         with open(self.log_file, 'a', encoding='utf-8') as f:
+             f.write(log_message + "\n")
+
+     @staticmethod
+     def parse_quarter(date_str):
+         """Parse a string in 'YYYY-Q#' format into a datetime object."""
+         year, quarter = date_str.split("-")
+         quarter_number = int(quarter[1])
+         month = (quarter_number - 1) * 3 + 1
+         return pd.Timestamp(f"{year}-{month:02d}-01")
+
+     def _build_url(self, series, dataset_id, filter_val, freq, special_flag=None):
+         """Build the appropriate OECD API URL based on indicator type."""
+         if special_flag == "SPECIAL_GDP":
+             return f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{freq}..{self.country}...{series}.{filter_val}?startPeriod=1950-01"
+         elif special_flag == "SPECIAL_UNE":
+             return f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{self.country}.{series}.{filter_val}.{freq}?startPeriod=1950-01"
+         else:
+             return f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{self.country}.{freq}.{series}.{filter_val}?startPeriod=1950-01"
+
+     def _extract_observations(self, xml_content):
+         """Extract dates and values from OECD API XML response."""
+         root = ET.fromstring(xml_content)
+         namespaces = {
+             "generic": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic",
+         }
+
+         dates = []
+         values = []
+
+         for obs in root.findall(".//generic:Obs", namespaces):
+             time_period = obs.find(".//generic:ObsDimension", namespaces).get("value")
+             value = obs.find(".//generic:ObsValue", namespaces).get("value")
+
+             if time_period and value:
+                 dates.append(time_period)
+                 values.append(float(value))
+
+         return dates, values
+
+     def _fetch_indicator_data(self, series, dataset_id, filter_val, col_name, special_flag=None):
+         """
+         Attempt to fetch data for a single indicator across different frequencies.
+
+         Returns:
+             tuple: (DataFrame or None, frequency_used, success_flag)
+         """
+         for freq in ["M", "Q", "A"]:
+             url = self._build_url(series, dataset_id, filter_val, freq, special_flag)
+
+             try:
+                 response = requests.get(url, timeout=15)
+
+                 if response.status_code == 429:
+                     self._log(f"Rate limit hit for {col_name}")
+                     return None, None, False
+
+                 if response.status_code != 200:
+                     continue
+
+                 dates, values = self._extract_observations(response.content)
+
+                 if len(dates) == 0:
+                     continue
+
+                 data = pd.DataFrame({"OBS": dates, col_name: values})
+
+                 if freq == "Q":
+                     data["OBS"] = data["OBS"].apply(self.parse_quarter)
+                 elif freq == "A":
+                     # Annual periods come back as plain years, e.g. "2023"
+                     data["OBS"] = data["OBS"].apply(lambda x: datetime.strptime(x, "%Y"))
+                 else:
+                     data["OBS"] = data["OBS"].apply(lambda x: datetime.strptime(x, "%Y-%m"))
+
+                 data.sort_values(by="OBS", inplace=True)
+
+                 return data, freq, True
+
+             except Exception as e:
+                 self._log(f"Error fetching {col_name}: {str(e)}")
+                 continue
+
+         return None, None, False
+
+     def _load_existing_data(self):
+         """Load existing data from CSV file if it exists."""
+         if self.data_file.exists():
+             df = pd.read_csv(self.data_file)
+             df['OBS'] = pd.to_datetime(df['OBS'])
+             return df
+         else:
+             date_range = pd.date_range(start=self.start_date, end=datetime.today(), freq="D")
+             return pd.DataFrame({"OBS": date_range})
+
+     def _save_data(self, df):
+         """Save DataFrame to CSV file."""
+         df.to_csv(self.data_file, index=False)
+
+     def fetch_pending_indicators(self):
+         """Fetch all indicators to ensure the most up-to-date data."""
+         # Reset completed indicators for this fresh run
+         self.state["completed_indicators"] = []
+
+         daily_df = self._load_existing_data()
+
+         # Always attempt to refresh all indicators
+         pending_indicators = self.INDICATOR_CONFIG
+
+         self._log(f"Starting fetch cycle. Pending: {len(pending_indicators)}/{len(self.INDICATOR_CONFIG)}")
+
+         for indicator in pending_indicators:
+             col_name = indicator["col_name"]
+
+             self._log(f"Fetching: {indicator['name']} ({col_name})")
+
+             data, freq_used, success = self._fetch_indicator_data(
+                 indicator["series"],
+                 indicator["dataset"],
+                 indicator["filter"],
+                 col_name,
+                 indicator.get("special")
+             )
+
+             if success:
+                 self._log(f" [OK] Success! {len(data)} observations ({freq_used})")
+
+                 # Drop any stale copy of this column so merge_asof does not
+                 # produce _x/_y duplicates when refreshing an existing CSV
+                 daily_df = daily_df.drop(columns=[col_name], errors="ignore")
+                 daily_df = pd.merge_asof(
+                     daily_df,
+                     data[["OBS", col_name]],
+                     on="OBS",
+                     direction="backward",
+                 )
+
+                 self.state["completed_indicators"].append(col_name)
+
+                 if col_name in self.state["failed_attempts"]:
+                     del self.state["failed_attempts"][col_name]
+
+                 self._save_data(daily_df)
+                 self._save_state()
+
+                 time.sleep(2)
+
+             else:
+                 self._log(f" [FAIL] Failed to fetch {col_name}")
+
+                 if col_name not in self.state["failed_attempts"]:
+                     self.state["failed_attempts"][col_name] = 0
+                 self.state["failed_attempts"][col_name] += 1
+
+                 self._save_state()
+
+         remaining = len(self.INDICATOR_CONFIG) - len(self.state["completed_indicators"])
+
+         # Persist overall completion so get_status() reflects it between runs
+         self.state["fully_complete"] = remaining == 0
+         self._save_state()
+
+         if remaining == 0:
+             self._log("All indicators successfully fetched!")
+         else:
+             self._log(f"Fetch cycle complete. {remaining} indicators still have failures.")
+
+         return True  # the refresh cycle itself always completes
+
+     def run_until_complete(self, max_iterations=None, sleep_seconds=3600):
+         """
+         Run the fetcher to refresh all indicators with up-to-date data.
+
+         Args:
+             max_iterations (int): Maximum number of cycles to run (None = unlimited)
+             sleep_seconds (int): Seconds to wait between cycles (default: 3600 = 1 hour)
+         """
+         iteration = 0
+
+         self._log("=" * 80)
+         self._log(f"Starting OECD data puller for {self.country}")
+         self._log(f"Output directory: {self.output_dir.absolute()}")
+         self._log("=" * 80)
+
+         while True:
+             iteration += 1
+
+             if max_iterations and iteration > max_iterations:
+                 self._log(f"Reached maximum iterations ({max_iterations}). Stopping.")
+                 break
+
+             self._log(f"{'='*80}")
+             self._log(f"ITERATION {iteration}")
+             self._log(f"{'='*80}")
+
+             self.fetch_pending_indicators()
+
+             # Check if all indicators have data
+             if len(self.state["completed_indicators"]) == len(self.INDICATOR_CONFIG):
+                 self._log("=" * 80)
+                 self._log("SUCCESS! All indicators have been fetched.")
+                 self._log(f"Data saved to: {self.data_file.absolute()}")
+                 self._log("=" * 80)
+                 break
+
+             self._log(f"Waiting {sleep_seconds} seconds before next attempt...")
+             self._log(f"Next run scheduled for: {(datetime.now() + pd.Timedelta(seconds=sleep_seconds)).strftime('%Y-%m-%d %H:%M:%S')}")
+
+             time.sleep(sleep_seconds)
+
+     def get_status(self):
+         """Get current status of data collection."""
+         total = len(self.INDICATOR_CONFIG)
+         completed = len(self.state["completed_indicators"])
+
+         status = {
+             "country": self.country,
+             "total_indicators": total,
+             "completed_indicators": completed,
+             "remaining_indicators": total - completed,
+             "completion_percentage": (completed / total) * 100,
+             "fully_complete": self.state["fully_complete"],
+             "last_run": self.state["last_run"],
+             "failed_attempts": self.state["failed_attempts"],
+             "completed_list": self.state["completed_indicators"]
+         }
+
+         return status
+
+
+ def main():
+     """Main execution function with example usage."""
+
+     # Example 1: Run until complete with 1-hour intervals
+     puller = OECDDataPuller(
+         country="GBR",
+         start_date="2020-01-01",
+         output_dir="oecd_data"
+     )
+
+     # Check current status
+     status = puller.get_status()
+     print(f"Current progress: {status['completed_indicators']}/{status['total_indicators']} "
+           f"({status['completion_percentage']:.1f}%)")
+
+     # Run until complete (will retry every hour)
+     puller.run_until_complete(sleep_seconds=3600)
+
+     # Example 2: Run a single fetch cycle (useful for manual/scheduled execution)
+     # puller = OECDDataPuller(country="GBR")
+     # puller.fetch_pending_indicators()
+
+     # Example 3: Run with a custom sleep interval (e.g., 30 minutes)
+     # puller = OECDDataPuller(country="USA")
+     # puller.run_until_complete(sleep_seconds=1800)
+
+
+ if __name__ == "__main__":
+     main()