python-eia 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
eia/cache.py ADDED
@@ -0,0 +1,399 @@
1
+ """Local parquet cache for EIA API time-series data.
2
+
3
+ Caches query results as parquet files, fetching only missing date ranges
4
+ on subsequent requests. Historical energy data is immutable once
5
+ published (~48h), so caching is safe and enabled by default.
6
+
7
+ Storage layout::
8
+
9
+ {cache_dir}/
10
+ └── electricity/rto/fuel-type-data/
11
+ ├── hourly/
12
+ │ ├── respondent=CISO/
13
+ │ │ ├── data.parquet
14
+ │ │ └── meta.json
15
+ │ └── fueltype=SUN,WND.respondent=PJM/
16
+ │ ├── data.parquet
17
+ │ └── meta.json
18
+ └── monthly/
19
+ └── _all_/
20
+ ├── data.parquet
21
+ └── meta.json
22
+
23
+ Unlike ENTSO-E, EIA stores DataFrames in long format (facet columns +
24
+ value column) rather than wide format, because multiple rows per period
25
+ are common (e.g. one row per fuel type per respondent).
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ import json
31
+ import logging
32
+ import os
33
+ import shutil
34
+ import tempfile
35
+ from dataclasses import dataclass, field
36
+ from datetime import datetime
37
+ from pathlib import Path
38
+
39
+ import pandas as pd
40
+
41
logger = logging.getLogger("eia")

# Default cache location — respects XDG_CACHE_HOME.
# An empty XDG_CACHE_HOME is treated like an unset one: using it verbatim
# would resolve to a relative "eia" directory in the current working
# directory.  The ``or`` also keeps Path.home() from being evaluated when the
# variable is set.
_DEFAULT_CACHE_DIR = (
    Path(os.environ.get("XDG_CACHE_HOME") or Path.home() / ".cache") / "eia"
)

# Data older than this (hours) is considered final and won't be re-fetched
_DEFAULT_RECENT_TTL_HOURS = 48
50
+
51
+
52
def _facets_key(facets: dict | None) -> str:
    """Build a deterministic partition string from facet filters.

    Facet names are sorted, and the values of a multi-valued facet are
    sorted too, so logically equal filters always map to the same key.
    Lists, tuples, sets and frozensets are all treated as multi-valued;
    any other value is rendered with ``str()``.

    Examples:
        None → "_all_"
        {"respondent": "CISO"} → "respondent=CISO"
        {"respondent": "PJM", "fueltype": ["SUN", "WND"]}
            → "fueltype=SUN,WND.respondent=PJM"

    Args:
        facets: Mapping of facet name to a single value or a collection of
            values; ``None`` or an empty mapping means "no filter".

    Returns:
        The partition key, or ``"_all_"`` when there are no facets.
    """
    if not facets:
        return "_all_"

    parts = []
    for name in sorted(facets):
        value = facets[name]
        if isinstance(value, (list, tuple, set, frozenset)):
            # Sort so the key does not depend on the caller's ordering.
            rendered = ",".join(sorted(str(item) for item in value))
        else:
            rendered = str(value)
        parts.append(f"{name}={rendered}")
    return ".".join(parts)
72
+
73
+
74
@dataclass
class CacheConfig:
    """Cache configuration.

    Attributes:
        enabled: Caching on/off switch.  It is not read inside this module;
            presumably the client consults it — verify against callers.
        cache_dir: Root directory of the cache.  Any value accepted by
            ``pathlib.Path`` may be passed; it is normalised to a ``Path``
            with a leading ``~`` expanded.
        recent_ttl_hours: Default window (in hours before "now") used by
            ``CacheStore.find_gaps`` to decide which cached data is still
            considered recent and must be fetched again.
    """

    enabled: bool = True
    cache_dir: Path = field(default_factory=lambda: _DEFAULT_CACHE_DIR)
    recent_ttl_hours: int = _DEFAULT_RECENT_TTL_HOURS

    def __post_init__(self) -> None:
        # Without expanduser() a value such as "~/.cache/eia" would create a
        # directory literally named "~" under the current working directory.
        self.cache_dir = Path(self.cache_dir).expanduser()
84
+
85
+
86
@dataclass(frozen=True)
class DateRange:
    """Immutable pair of timestamps describing one contiguous span.

    Both bounds belong to the span, i.e. it is the closed interval
    [start, end].
    """

    # First timestamp covered by the range.
    start: pd.Timestamp
    # Last timestamp covered by the range (inclusive).
    end: pd.Timestamp
92
+
93
+
94
class CacheStore:
    """Read, write, and merge parquet files for cached EIA data.

    Every (route, frequency, facets_key) partition is a directory below
    ``config.cache_dir`` containing a ``data.parquet`` file and a
    ``meta.json`` sidecar.
    """

    def __init__(self, config: CacheConfig):
        self.config = config

    # -- Path resolution -------------------------------------------------------

    def _partition_dir(self, route: str, frequency: str, facets_key: str) -> Path:
        """Partition directory: {cache_dir}/{route}/{frequency}/{facets_key}.

        Surrounding slashes are stripped from *route*.  Joining an absolute
        component onto a ``Path`` discards everything before it, so a route
        such as ``"/electricity/rto"`` would otherwise resolve outside
        ``cache_dir``.
        """
        return self.config.cache_dir / route.strip("/") / frequency / facets_key

    def _parquet_path(self, route: str, frequency: str, facets_key: str) -> Path:
        """Data file: {cache_dir}/{route}/{frequency}/{facets_key}/data.parquet"""
        return self._partition_dir(route, frequency, facets_key) / "data.parquet"

    def _meta_path(self, route: str, frequency: str, facets_key: str) -> Path:
        """Metadata file: {cache_dir}/{route}/{frequency}/{facets_key}/meta.json"""
        return self._partition_dir(route, frequency, facets_key) / "meta.json"

    # -- Data Read / Write -----------------------------------------------------

    def read(
        self,
        route: str,
        frequency: str,
        facets_key: str,
        start: pd.Timestamp,
        end: pd.Timestamp,
    ) -> pd.DataFrame:
        """Read cached data for a date range.

        Returns DataFrame with ``period`` as DatetimeIndex.
        Returns empty DataFrame on cache miss, on an unreadable file, or when
        the stored frame is empty or not indexed by a ``DatetimeIndex``.
        An unreadable (corrupted) file is removed from disk.
        """
        path = self._parquet_path(route, frequency, facets_key)
        if not path.exists():
            return pd.DataFrame()

        try:
            df = pd.read_parquet(path)
        except ImportError as exc:
            # A missing parquet engine says nothing about the file itself:
            # report a miss but keep the cached data on disk.
            logger.warning("Cannot read cache file %s: %s", path, exc)
            return pd.DataFrame()
        except Exception as exc:
            logger.warning("Corrupted cache file %s: %s — removing.", path, exc)
            path.unlink(missing_ok=True)
            return pd.DataFrame()

        if df.empty or not isinstance(df.index, pd.DatetimeIndex):
            return pd.DataFrame()

        return self._slice(df, start, end)

    def _slice(
        self, df: pd.DataFrame, start: pd.Timestamp, end: pd.Timestamp
    ) -> pd.DataFrame:
        """Slice a DataFrame by [start, end], handling timezone alignment.

        Naive bounds are localized to the index timezone; aware bounds have
        their timezone dropped (wall time kept) when the index is naive.
        """
        index_tz = df.index.tz
        if index_tz is not None:
            if start.tz is None:
                start = start.tz_localize(index_tz)
            if end.tz is None:
                end = end.tz_localize(index_tz)
        else:
            if start.tz is not None:
                start = start.tz_localize(None)
            if end.tz is not None:
                end = end.tz_localize(None)

        # When end is a date-level timestamp (midnight), extend to end of day
        if end.hour == 0 and end.minute == 0 and end.second == 0:
            end = end + pd.Timedelta(days=1) - pd.Timedelta(seconds=1)

        return df.loc[start:end]

    @staticmethod
    def _merge_frames(existing: pd.DataFrame, new: pd.DataFrame) -> pd.DataFrame:
        """Combine cached rows with freshly fetched rows, new data winning.

        The frames are long-format, so several rows legitimately share one
        period.  Every cached row whose period also occurs in *new* is
        replaced by the rows of *new* for that period; all other cached rows
        are kept.  The result is sorted by index with a stable sort so rows
        within one period keep their relative order.
        """
        if existing.empty:
            return new.sort_index(kind="mergesort")
        kept = existing[~existing.index.isin(new.index)]
        return pd.concat([kept, new]).sort_index(kind="mergesort")

    def write(
        self,
        route: str,
        frequency: str,
        facets_key: str,
        df: pd.DataFrame,
    ) -> None:
        """Merge new data with existing cache and persist.

        *df* should have ``period`` as DatetimeIndex.  Cached rows for any
        period present in *df* are replaced by the rows of *df*; periods
        only found in the cache are kept.  An empty *df* is a no-op, and an
        unreadable existing file is overwritten.
        """
        if df.empty:
            return

        path = self._parquet_path(route, frequency, facets_key)
        path.parent.mkdir(parents=True, exist_ok=True)

        existing = pd.DataFrame()
        if path.exists():
            try:
                existing = pd.read_parquet(path)
            except Exception:
                logger.warning("Corrupted cache at %s — overwriting.", path)

        _atomic_write_parquet(path, self._merge_frames(existing, df))

    def write_meta(
        self,
        route: str,
        frequency: str,
        facets_key: str,
        meta: dict,
    ) -> None:
        """Write metadata for a partition.

        A ``cached_at`` entry (ISO-8601, naive local time) is added to a copy
        of *meta*; the caller's dict is not modified.
        """
        meta = {**meta, "cached_at": datetime.now().isoformat()}
        path = self._meta_path(route, frequency, facets_key)
        _atomic_write_json(path, meta)

    def read_meta(
        self,
        route: str,
        frequency: str,
        facets_key: str,
    ) -> dict | None:
        """Read cached metadata for a partition.

        Returns ``None`` when the file is missing, unreadable or not valid
        JSON.
        """
        path = self._meta_path(route, frequency, facets_key)
        if not path.exists():
            return None
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            return None

    # -- Gap detection ---------------------------------------------------------

    def find_gaps(
        self,
        cached_df: pd.DataFrame,
        start: pd.Timestamp,
        end: pd.Timestamp,
        *,
        recent_ttl_hours: int | None = None,
    ) -> list[DateRange]:
        """Find date ranges not covered by cached data.

        Only the span before the first and after the last cached timestamp
        is examined; holes inside the cached span are not detected.  Naive
        timestamps are interpreted as UTC for the comparison.

        Also marks data within ``recent_ttl_hours`` of now as a gap
        (needs re-fetch since it may have been updated).  When
        ``recent_ttl_hours`` is ``None`` the configured default is used.

        Returns the gaps with overlapping/adjacent ranges merged.
        """
        ttl = recent_ttl_hours if recent_ttl_hours is not None else self.config.recent_ttl_hours
        now = pd.Timestamp.now(tz="UTC")
        cutoff = now - pd.Timedelta(hours=ttl)

        if cached_df.empty:
            return [DateRange(start, end)]

        # Normalize to UTC for comparison
        idx = cached_df.index
        if idx.tz is None:
            idx = idx.tz_localize("UTC")
        else:
            idx = idx.tz_convert("UTC")

        start_utc = start.tz_localize("UTC") if start.tz is None else start.tz_convert("UTC")
        end_utc = end.tz_localize("UTC") if end.tz is None else end.tz_convert("UTC")

        cached_start = idx.min()
        cached_end = idx.max()

        gaps: list[DateRange] = []

        # Gap before cached data
        # NOTE(review): the one-hour step is applied for every frequency.
        if start_utc < cached_start:
            gap_end = min(cached_start - pd.Timedelta(hours=1), end_utc)
            if gap_end >= start_utc:
                gaps.append(DateRange(start, _to_tz_aware(gap_end, start)))

        # Gap after cached data
        if end_utc > cached_end:
            gap_start = max(cached_end + pd.Timedelta(hours=1), start_utc)
            if gap_start <= end_utc:
                gaps.append(DateRange(_to_tz_aware(gap_start, end), end))

        # Recent data that may still change
        if cached_end > cutoff and end_utc > cutoff:
            recent_start = max(cutoff, start_utc)
            if recent_start <= end_utc:
                gaps.append(DateRange(_to_tz_aware(recent_start, end), end))

        return _merge_overlapping(gaps)

    # -- Maintenance -----------------------------------------------------------

    def clear(
        self,
        route: str | None = None,
        frequency: str | None = None,
    ) -> int:
        """Remove cached files. Returns number of files removed.

        - No args: clear everything
        - route only: clear all data for that route
        - route + frequency: clear one frequency partition

        Raises:
            ValueError: if *frequency* is given without *route*.  That
                combination used to fall through to "clear everything".
        """
        if frequency and not route:
            raise ValueError("clear(): 'frequency' requires 'route'.")

        target = self.config.cache_dir
        if route:
            # Strip slashes so an absolute-looking route cannot point the
            # recursive delete outside the cache directory.
            target = target / route.strip("/")
            if frequency:
                target = target / frequency

        if not target.exists():
            return 0

        count = sum(1 for f in target.rglob("*") if f.is_file())
        shutil.rmtree(target)
        return count

    def status(self) -> dict:
        """Return cache statistics.

        The result has the keys ``path``, ``files``, ``size_mb`` and
        ``routes``; ``routes`` maps the first path component below the cache
        directory to the number of files stored under it.
        """
        cache_dir = self.config.cache_dir
        if not cache_dir.exists():
            return {"path": str(cache_dir), "files": 0, "size_mb": 0.0, "routes": {}}

        all_files = [f for f in cache_dir.rglob("*") if f.is_file()]
        total_size = sum(f.stat().st_size for f in all_files)

        # Per-route breakdown (first path component)
        routes: dict[str, int] = {}
        for f in all_files:
            parts = f.relative_to(cache_dir).parts
            if len(parts) > 1:
                routes[parts[0]] = routes.get(parts[0], 0) + 1

        return {
            "path": str(cache_dir),
            "files": len(all_files),
            "size_mb": round(total_size / (1024 * 1024), 2),
            "routes": routes,
        }
339
+
340
+
341
+ # -- Helpers -------------------------------------------------------------------
342
+
343
+
344
def _to_tz_aware(ts: pd.Timestamp, reference: pd.Timestamp) -> pd.Timestamp:
    """Give *ts* the same timezone-awareness as *reference*.

    - reference naive: an aware *ts* has its timezone dropped, a naive one
      is returned unchanged.
    - reference aware: an aware *ts* is converted to the reference timezone,
      a naive one is localized to it.
    """
    target_tz = reference.tz
    ts_is_naive = ts.tz is None

    if target_tz is None:
        return ts if ts_is_naive else ts.tz_localize(None)
    if ts_is_naive:
        return ts.tz_localize(target_tz)
    return ts.tz_convert(target_tz)
349
+
350
+
351
def _merge_overlapping(gaps: list[DateRange]) -> list[DateRange]:
    """Collapse date ranges that overlap or lie at most one day apart.

    The ranges are processed in order of their start; a range is folded
    into its predecessor when it starts no later than one day after the
    predecessor's end.  An empty input yields an empty list.
    """
    merged: list[DateRange] = []
    for current in sorted(gaps, key=lambda g: g.start):
        if merged and current.start <= merged[-1].end + pd.Timedelta(days=1):
            last = merged.pop()
            merged.append(DateRange(last.start, max(last.end, current.end)))
        else:
            merged.append(current)
    return merged
367
+
368
+
369
def _atomic_write_json(path: Path, data: dict) -> None:
    """Write JSON atomically via temp file + replace.

    The payload is written to a temporary file in the target directory and
    then moved over *path* with ``os.replace``, which overwrites an existing
    file on every platform (``rename`` fails on Windows when the target
    exists).  ``OSError`` during the write is logged and swallowed; the
    temporary file is removed whenever the write does not complete.

    Args:
        path: Destination file; parent directories are created as needed.
        data: JSON-serializable mapping; values JSON cannot encode are
            rendered with ``str()``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Serialize first so a serialization error cannot leave a temp file behind.
    payload = json.dumps(data, indent=2, ensure_ascii=False, default=str)

    tmp_path = None
    try:
        fd, tmp_name = tempfile.mkstemp(suffix=".json", dir=path.parent)
        tmp_path = Path(tmp_name)
        with os.fdopen(fd, "w", encoding="utf-8") as handle:
            handle.write(payload)
        os.replace(tmp_name, path)
        tmp_path = None  # moved into place, nothing left to clean up
    except OSError as exc:
        logger.warning("Failed to write %s: %s", path, exc)
    finally:
        if tmp_path is not None:
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                pass
385
+
386
+
387
def _atomic_write_parquet(path: Path, df: pd.DataFrame) -> None:
    """Write parquet atomically via temp file + replace.

    *df* is written to a temporary file in the target directory and then
    moved over *path* with ``os.replace``, which overwrites an existing file
    on every platform (``rename`` fails on Windows when the target exists).
    ``OSError`` is logged and swallowed so a broken cache never stops the
    caller; any other exception propagates.  In both cases the temporary
    file is removed.

    Args:
        path: Destination file; parent directories are created as needed.
        df: Frame to persist with ``DataFrame.to_parquet``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    tmp_path = None
    try:
        fd, tmp_name = tempfile.mkstemp(suffix=".parquet", dir=path.parent)
        tmp_path = Path(tmp_name)
        os.close(fd)
        df.to_parquet(tmp_name)
        os.replace(tmp_name, path)
        tmp_path = None  # moved into place, nothing left to clean up
    except OSError as exc:
        logger.warning("Failed to write cache %s: %s — continuing without cache.", path, exc)
    finally:
        if tmp_path is not None:
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                pass
eia/catalog.py ADDED
@@ -0,0 +1,137 @@
1
+ """Built-in data catalog and recipes for the EIA API v2.
2
+
3
+ The EIA API is a tree of routes. This module provides:
4
+ - Curated route metadata with descriptions and key facets
5
+ - Named "recipes" — pre-configured queries for common use cases
6
+ - Facet cheat-sheets so users don't have to discover facet values every time
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass, field
12
+
13
+
14
@dataclass(frozen=True)
class DataColumn:
    """One data column as described by the API schema.

    Only ``id`` is required; every other attribute falls back to an empty
    string when no value is supplied.
    """

    # Column identifier.
    id: str
    # Remaining schema attributes, "" when not provided.
    units: str = ""
    aggregation_method: str = ""
    alias: str = ""
22
+
23
+
24
@dataclass(frozen=True)
class Frequency:
    """One frequency option as described by the API schema.

    Only ``id`` is required; every other attribute falls back to an empty
    string when no value is supplied.
    """

    # Frequency identifier.
    id: str
    # Remaining schema attributes, "" when not provided.
    description: str = ""
    query: str = ""
    format: str = ""
32
+
33
+
34
@dataclass(frozen=True)
class FacetHint:
    """Documents a facet's key values without requiring an API call.

    The two mapping fields are excluded from the generated ``__hash__``
    (they still take part in equality).  A frozen dataclass hashes all of
    its fields by default, and hashing a ``dict`` raises ``TypeError``, so
    without the exclusion instances could not be hashed.
    """

    id: str
    description: str
    # hand-curated subset (value_id → human label)
    common_values: dict[str, str] = field(hash=False)
    # full API values (value_id → name)
    values: dict[str, str] = field(default_factory=dict, hash=False)
42
+
43
+
44
@dataclass(frozen=True)
class RouteInfo:
    """Hand-maintained description of one data route.

    The first group of fields is curated; the second group mirrors the
    schema fetched from the API and stays at its empty default until a
    refresh fills it in.
    """

    # --- Curated metadata ---
    route: str
    name: str
    description: str
    # Frequency used when none is requested explicitly.
    frequency: str
    facets: tuple[FacetHint, ...]
    notes: str = ""

    # --- API-fetched schema (optional, populated by refresh) ---
    data_columns: tuple[DataColumn, ...] = ()
    frequencies: tuple[Frequency, ...] = ()
    start_period: str = ""
    end_period: str = ""
    default_date_format: str = ""
    api_hash: str = ""
    last_refreshed: str = ""
62
+
63
+
64
@dataclass(frozen=True)
class Recipe:
    """A named, pre-configured query for a common use case.

    ``facets`` is excluded from the generated ``__hash__`` (it still takes
    part in equality).  A frozen dataclass hashes all of its fields by
    default, and hashing a ``dict`` raises ``TypeError``, so without the
    exclusion instances could not be hashed.
    """

    id: str
    name: str
    description: str
    route: str
    # Facet filters: facet name → single value or list of values.
    facets: dict[str, str | list[str]] = field(hash=False)
    frequency: str
    notes: str = ""
    cli_example: str = ""
    python_example: str = ""
77
+
78
+
79
# ── Route & Recipe Catalog (loaded from YAML) ─────────────────────────

# NOTE(review): this import sits below the dataclass definitions rather than
# at the top of the module — presumably because eia.catalog_manager imports
# those dataclasses from here; confirm before moving it.
from eia.catalog_manager import EIACatalogManager as _EIACatalogManager

# Built once at import time; importing this module therefore runs the
# manager's loaders.
_mgr = _EIACatalogManager()

# Lookup tables keyed by route path and by recipe id respectively.
ROUTES: dict[str, RouteInfo] = {r.route: r for r in _mgr._load_routes()}
RECIPES: dict[str, Recipe] = {r.id: r for r in _mgr._load_recipes()}
87
+
88
+
89
+ # ── Convenience functions ──────────────────────────────────────────────
90
+
91
def get_route(route: str) -> RouteInfo:
    """Return the catalog entry stored under *route*.

    Raises:
        KeyError: if the route is not part of the catalog.
    """
    try:
        return ROUTES[route]
    except KeyError:
        raise KeyError(
            f"Unknown route '{route}'. Use catalog.list_routes() to see available routes."
        ) from None
98
+
99
+
100
def get_recipe(recipe_id: str) -> Recipe:
    """Return the recipe registered under *recipe_id*.

    Raises:
        KeyError: if no such recipe exists; the message lists the known ids.
    """
    try:
        return RECIPES[recipe_id]
    except KeyError:
        available = ", ".join(RECIPES)
        raise KeyError(
            f"Unknown recipe '{recipe_id}'. Available: {available}"
        ) from None
107
+
108
+
109
def list_routes() -> list[str]:
    """List every route path in the catalog, sorted alphabetically."""
    route_paths = list(ROUTES)
    route_paths.sort()
    return route_paths
112
+
113
+
114
def list_recipes() -> list[str]:
    """List the id of every recipe in the catalog, sorted alphabetically."""
    recipe_ids = list(RECIPES)
    recipe_ids.sort()
    return recipe_ids
117
+
118
+
119
def summary() -> str:
    """Render the catalog as plain text.

    The text starts with a title block, then lists every route (name,
    description, default frequency and, when present, its note) followed by
    every recipe (name and description).  Both sections are ordered by key.
    """
    out = ["EIA Data Catalog", "=" * 50, "", "Routes:"]

    for route_path in sorted(ROUTES):
        info = ROUTES[route_path]
        out += [
            f" {route_path}",
            f" {info.name}: {info.description}",
            f" Default frequency: {info.frequency}",
        ]
        if info.notes:
            out.append(f" Note: {info.notes}")

    out += ["", "Recipes (pre-configured queries):"]

    for recipe_id in sorted(RECIPES):
        recipe = RECIPES[recipe_id]
        out += [f" {recipe_id}: {recipe.name}", f" {recipe.description}"]

    return "\n".join(out)