python-eia 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eia/.agents/skills/eia/SKILL.md +172 -0
- eia/__init__.py +3 -1
- eia/cache.py +399 -0
- eia/catalog.py +137 -0
- eia/catalog_manager.py +464 -0
- eia/cli/app.py +4 -0
- eia/cli/cache_cmd.py +53 -0
- eia/cli/catalog_cmd.py +186 -0
- eia/client.py +309 -19
- {python_eia-0.2.0.dist-info → python_eia-0.3.0.dist-info}/METADATA +7 -3
- python_eia-0.3.0.dist-info/RECORD +22 -0
- python_eia-0.2.0.dist-info/RECORD +0 -16
- {python_eia-0.2.0.dist-info → python_eia-0.3.0.dist-info}/WHEEL +0 -0
- {python_eia-0.2.0.dist-info → python_eia-0.3.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: eia
|
|
3
|
+
description: Query U.S. energy data (EIA API v2). Use when the user asks about U.S. electricity, petroleum, natural gas, or coal data from the EIA.
|
|
4
|
+
version: 2.0.0
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# EIA Data Assistant
|
|
8
|
+
|
|
9
|
+
You have access to the `python-eia` library and CLI for querying the U.S. Energy Information Administration (EIA) API v2.
|
|
10
|
+
|
|
11
|
+
## When to use what
|
|
12
|
+
|
|
13
|
+
- **Python scripts** (default): reproducible, composable, saveable. Use for any data work the user will want to keep or iterate on.
|
|
14
|
+
- **CLI**: quick one-shot lookups, exploration, sanity checks. Use when the user wants a fast answer they won't need again.
|
|
15
|
+
- **If unsure**: ask the user whether they want a script or a quick CLI check.
|
|
16
|
+
|
|
17
|
+
## Built-in Catalog (OFFLINE — no API calls)
|
|
18
|
+
|
|
19
|
+
The library ships with a YAML catalog containing the full API schema for its curated routes: columns, frequencies, periods, and **all facet values**. Always check the catalog first to avoid unnecessary API calls.
|
|
20
|
+
|
|
21
|
+
### Cataloged Routes
|
|
22
|
+
|
|
23
|
+
| Route | Description | Frequency |
|
|
24
|
+
|-------|-------------|-----------|
|
|
25
|
+
| `electricity/rto/fuel-type-data` | Real-time grid generation by fuel type | hourly |
|
|
26
|
+
| `electricity/rto/region-data` | Real-time grid demand/generation by region | hourly |
|
|
27
|
+
| `electricity/rto/interchange-data` | Real-time interchange between regions | hourly |
|
|
28
|
+
| `electricity/retail-sales` | Retail electricity sales by state/sector | monthly |
|
|
29
|
+
| `petroleum/pri/spt` | Spot petroleum prices (crude, gasoline, etc.) | daily |
|
|
30
|
+
| `natural-gas/pri/sum` | Natural gas prices summary | monthly |
|
|
31
|
+
| `natural-gas/move/expc` | US natural gas exports by country | monthly |
|
|
32
|
+
| `natural-gas/move/impc` | US natural gas imports by country | monthly |
|
|
33
|
+
| `total-energy/data` | Total energy overview (production, consumption, etc.) | monthly |
|
|
34
|
+
|
|
35
|
+
For routes **not** in the catalog, use `eia routes` (CLI) to discover and `eia meta` to inspect.
|
|
36
|
+
|
|
37
|
+
## Python Library (default)
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from eia import EIAClient
|
|
41
|
+
|
|
42
|
+
client = EIAClient() # reads config file, then EIA_API_KEY env var
|
|
43
|
+
|
|
44
|
+
# --- Catalog access (offline, no API calls) ---
|
|
45
|
+
|
|
46
|
+
# Get endpoint with cached schema — no API metadata call
|
|
47
|
+
data = client.get_data_endpoint("electricity/rto/fuel-type-data")
|
|
48
|
+
|
|
49
|
+
# Inspect metadata (all cached for cataloged routes)
|
|
50
|
+
data.facets # FacetContainer with attribute access
|
|
51
|
+
data.frequencies # List[FrequencyInfo]
|
|
52
|
+
data.data_columns # Dict[str, DataColumnInfo]
|
|
53
|
+
data.start_period # "2019-01-01T00"
|
|
54
|
+
data.end_period # "2026-03-04T07"
|
|
55
|
+
|
|
56
|
+
# Facet values — cached, no API call
|
|
57
|
+
respondents = data.facets.respondent.get_values()
|
|
58
|
+
# [FacetValue(id='CISO', name='California ISO'), ...]
|
|
59
|
+
|
|
60
|
+
fuel_types = data.facets.fueltype.get_values()
|
|
61
|
+
# [FacetValue(id='SUN', name='Solar'), ...]
|
|
62
|
+
|
|
63
|
+
# Or access catalog directly
|
|
64
|
+
from eia.catalog import get_route, list_routes
|
|
65
|
+
route = get_route("electricity/rto/fuel-type-data")
|
|
66
|
+
route.data_columns # (DataColumn(id='value', units='megawatthours', ...),)
|
|
67
|
+
route.frequencies # (Frequency(id='hourly', ...), Frequency(id='local-hourly', ...))
|
|
68
|
+
route.facets[0].values # {'CISO': 'California ISO', 'PJM': 'PJM Interconnection LLC', ...}
|
|
69
|
+
|
|
70
|
+
# --- Fetch data (hits API) ---
|
|
71
|
+
|
|
72
|
+
df = data.get(
|
|
73
|
+
data_columns=["value"],
|
|
74
|
+
facets={"respondent": "CISO"},
|
|
75
|
+
frequency="hourly",
|
|
76
|
+
start="2024-01-01",
|
|
77
|
+
end="2024-01-31",
|
|
78
|
+
sort=[{"column": "period", "direction": "desc"}],
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
# Multiple facet values
|
|
82
|
+
df = data.get(
|
|
83
|
+
data_columns=["revenue", "sales"],
|
|
84
|
+
facets={"stateid": "CA", "sectorid": ["RES", "COM"]},
|
|
85
|
+
frequency="monthly",
|
|
86
|
+
start="2024-01-01",
|
|
87
|
+
end="2024-12-31",
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# --- Route tree navigation (for discovery — hits API) ---
|
|
91
|
+
route = client.route("electricity/rto/fuel-type-data")
|
|
92
|
+
route.routes # Dict of child routes (if branch node)
|
|
93
|
+
route.data # Data object (if leaf node)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Facet conventions
|
|
97
|
+
|
|
98
|
+
- **Common facets**: `respondent` (grid operator), `fueltype`, `stateid`, `sectorid`, `series`
|
|
99
|
+
- **Multiple values**: pass a list — `facets={"sectorid": ["RES", "COM"]}`
|
|
100
|
+
- **Prefer catalog** for facet discovery: `get_route().facets[i].values` has all valid values offline
|
|
101
|
+
|
|
102
|
+
### Key conventions
|
|
103
|
+
|
|
104
|
+
- The `period` column is auto-converted to datetime (UTC for non-local frequencies)
|
|
105
|
+
- The `value` column is auto-converted to numeric
|
|
106
|
+
- Pagination is automatic by default (fetches all pages)
|
|
107
|
+
- API page limit is 5000 rows per request
|
|
108
|
+
- Custom exception: `EIAError` (includes HTTP status code and API error code)
|
|
109
|
+
|
|
110
|
+
## CLI Reference (quick lookups)
|
|
111
|
+
|
|
112
|
+
### Catalog (offline)
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
eia catalog routes # List all cataloged routes
|
|
116
|
+
eia catalog show electricity/rto/fuel-type-data # Full details (columns, frequencies, facet values)
|
|
117
|
+
eia catalog recipes # Pre-configured query recipes
|
|
118
|
+
eia catalog recipe lng-exports-europe # Show a specific recipe
|
|
119
|
+
eia catalog refresh --apply # Refresh schema from API
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Explore (hits API)
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
eia routes # Top-level routes
|
|
126
|
+
eia routes electricity/rto # Navigate deeper
|
|
127
|
+
eia meta electricity/rto/fuel-type-data # Endpoint metadata
|
|
128
|
+
eia facets electricity/rto/fuel-type-data respondent # Facet values
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Fetch data
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
eia get electricity/rto/fuel-type-data \
|
|
135
|
+
--start 2024-06-01 --end 2024-06-08 \
|
|
136
|
+
--frequency hourly --facet respondent=CISO --data value
|
|
137
|
+
|
|
138
|
+
# Multiple facet values (repeat --facet)
|
|
139
|
+
eia get electricity/retail-sales \
|
|
140
|
+
--start 2024-01-01 --end 2024-12-31 \
|
|
141
|
+
--facet stateid=CA --facet sectorid=RES --facet sectorid=COM \
|
|
142
|
+
--data revenue --data sales
|
|
143
|
+
|
|
144
|
+
# Export
|
|
145
|
+
eia get petroleum/pri/spt --start 2024-01-01 --end 2024-06-01 \
|
|
146
|
+
--format csv --output prices.csv
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Exec (ad-hoc pandas)
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
eia exec electricity/rto/fuel-type-data \
|
|
153
|
+
--start 2024-06-01 --end 2024-06-08 \
|
|
154
|
+
--frequency hourly --facet respondent=CISO --data value \
|
|
155
|
+
-x "df.groupby('fueltype')['value'].mean()"
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Output options
|
|
159
|
+
|
|
160
|
+
```
|
|
161
|
+
--format table|csv|json (default: table)
|
|
162
|
+
--output file.csv (write to file instead of stdout)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## Configuration
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
eia config set api-key YOUR_KEY # Store API key
|
|
169
|
+
eia config get api-key # Verify
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Config file: `~/.config/eia/config.toml`. API key resolution: config file > `EIA_API_KEY` env var.
|
eia/__init__.py
CHANGED
|
@@ -5,6 +5,8 @@ A Python client for interacting with the U.S. Energy Information Administration
|
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
from .client import EIAClient, EIAError
|
|
8
|
+
from .cache import CacheConfig
|
|
9
|
+
from . import catalog
|
|
8
10
|
|
|
9
11
|
__version__ = "0.1.0"
|
|
10
|
-
__all__ = ["EIAClient", "EIAError"]
|
|
12
|
+
__all__ = ["EIAClient", "EIAError", "CacheConfig", "catalog"]
|
eia/cache.py
ADDED
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
"""Local parquet cache for EIA API time-series data.
|
|
2
|
+
|
|
3
|
+
Caches query results as parquet files, fetching only missing date ranges
|
|
4
|
+
on subsequent requests. Historical energy data is immutable once
|
|
5
|
+
published (~48h), so caching is safe and enabled by default.
|
|
6
|
+
|
|
7
|
+
Storage layout::
|
|
8
|
+
|
|
9
|
+
{cache_dir}/
|
|
10
|
+
└── electricity/rto/fuel-type-data/
|
|
11
|
+
├── hourly/
|
|
12
|
+
│ ├── respondent=CISO/
|
|
13
|
+
│ │ ├── data.parquet
|
|
14
|
+
│ │ └── meta.json
|
|
15
|
+
│ └── fueltype=SUN,WND.respondent=PJM/
|
|
16
|
+
│ ├── data.parquet
|
|
17
|
+
│ └── meta.json
|
|
18
|
+
└── monthly/
|
|
19
|
+
└── _all_/
|
|
20
|
+
├── data.parquet
|
|
21
|
+
└── meta.json
|
|
22
|
+
|
|
23
|
+
Unlike ENTSO-E, EIA stores DataFrames in long format (facet columns +
|
|
24
|
+
value column) rather than wide format, because multiple rows per period
|
|
25
|
+
are common (e.g. one row per fuel type per respondent).
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
import json
|
|
31
|
+
import logging
|
|
32
|
+
import os
|
|
33
|
+
import shutil
|
|
34
|
+
import tempfile
|
|
35
|
+
from dataclasses import dataclass, field
|
|
36
|
+
from datetime import datetime
|
|
37
|
+
from pathlib import Path
|
|
38
|
+
|
|
39
|
+
import pandas as pd
|
|
40
|
+
|
|
41
|
+
logger = logging.getLogger("eia")

# Default cache location. Follows the XDG Base Directory convention: use
# $XDG_CACHE_HOME when it is set to a non-empty value, otherwise ~/.cache.
# An empty value is treated like an unset one; otherwise the cache would
# silently land in a relative "eia" directory under the current working dir.
_DEFAULT_CACHE_DIR = (
    Path(os.environ.get("XDG_CACHE_HOME") or Path.home() / ".cache") / "eia"
)

# Data older than this (hours) is considered final and won't be re-fetched
_DEFAULT_RECENT_TTL_HOURS = 48
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _facets_key(facets: dict | None) -> str:
    """Build a deterministic partition string from facet filters.

    Facet names are sorted, and the values of a multi-valued facet are
    stringified and sorted, so logically equal filters always map to the
    same key regardless of dict or sequence order. Lists, tuples, sets and
    frozensets are all treated as multi-valued; any other value is rendered
    with ``str()``.

    Args:
        facets: Mapping of facet name to a single value or a collection of
            values. ``None`` or an empty mapping means "no filter".

    Returns:
        ``"_all_"`` when there is no filter, otherwise ``name=value`` pairs
        joined by ``"."`` with multiple values joined by ``","``.

    Examples:
        None                           → "_all_"
        {"respondent": "CISO"}         → "respondent=CISO"
        {"respondent": "PJM", "fueltype": ["SUN", "WND"]}
                                       → "fueltype=SUN,WND.respondent=PJM"
    """
    if not facets:
        return "_all_"

    def _render(value) -> str:
        # Any plain collection counts as "several values"; rendering a tuple
        # via str() would produce a repr-like, order-dependent key.
        if isinstance(value, (list, tuple, set, frozenset)):
            return ",".join(sorted(str(item) for item in value))
        return str(value)

    return ".".join(f"{name}={_render(facets[name])}" for name in sorted(facets))
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass
class CacheConfig:
    """Cache configuration.

    Attributes:
        enabled: Whether caching is switched on. ``CacheStore`` itself does
            not read this flag — presumably the client checks it before
            using the store (confirm against the caller).
        cache_dir: Root directory of the cache. Strings and other path-like
            values are accepted and normalised to ``Path``.
        recent_ttl_hours: Age in hours below which cached data is treated as
            possibly still changing and is re-fetched.
    """

    enabled: bool = True
    cache_dir: Path = field(default_factory=lambda: _DEFAULT_CACHE_DIR)
    recent_ttl_hours: int = _DEFAULT_RECENT_TTL_HOURS

    def __post_init__(self) -> None:
        # Normalise to Path and expand a leading "~" so that a value such as
        # "~/.cache/eia" (e.g. read from a config file) does not create a
        # literal "~" directory relative to the working directory.
        self.cache_dir = Path(self.cache_dir).expanduser()
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@dataclass(frozen=True)
class DateRange:
    """Immutable closed interval ``[start, end]`` of timestamps.

    Both bounds are inclusive.
    """

    # First timestamp covered by the range.
    start: pd.Timestamp
    # Last timestamp covered by the range.
    end: pd.Timestamp
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class CacheStore:
    """Read, write, and merge parquet files for cached EIA data.

    Every ``(route, frequency, facets_key)`` triple maps to one partition
    directory ``{cache_dir}/{route}/{frequency}/{facets_key}/`` holding
    ``data.parquet`` and ``meta.json``.
    """

    def __init__(self, config: CacheConfig):
        self.config = config

    # -- Path resolution -------------------------------------------------------

    def _partition_dir(self, route: str, frequency: str, facets_key: str) -> Path:
        """Directory of one partition: {cache_dir}/{route}/{frequency}/{facets_key}"""
        return self.config.cache_dir / route / frequency / facets_key

    def _parquet_path(self, route: str, frequency: str, facets_key: str) -> Path:
        """Data file: {cache_dir}/{route}/{frequency}/{facets_key}/data.parquet"""
        return self._partition_dir(route, frequency, facets_key) / "data.parquet"

    def _meta_path(self, route: str, frequency: str, facets_key: str) -> Path:
        """Metadata file: {cache_dir}/{route}/{frequency}/{facets_key}/meta.json"""
        return self._partition_dir(route, frequency, facets_key) / "meta.json"

    # -- Data Read / Write -----------------------------------------------------

    def read(
        self,
        route: str,
        frequency: str,
        facets_key: str,
        start: pd.Timestamp,
        end: pd.Timestamp,
    ) -> pd.DataFrame:
        """Read cached data for a date range.

        Returns the cached rows within ``[start, end]``, indexed by a
        DatetimeIndex. Returns an empty DataFrame on a cache miss, when the
        stored frame is empty or not datetime-indexed, or when the file
        cannot be read (in which case the unreadable file is deleted).
        """
        path = self._parquet_path(route, frequency, facets_key)
        if not path.exists():
            return pd.DataFrame()

        try:
            df = pd.read_parquet(path)
        except Exception as exc:
            # Any read failure is treated as corruption: drop the file so the
            # next request repopulates the partition from the API.
            logger.warning("Corrupted cache file %s: %s — removing.", path, exc)
            path.unlink(missing_ok=True)
            return pd.DataFrame()

        if df.empty or not isinstance(df.index, pd.DatetimeIndex):
            return pd.DataFrame()

        return self._slice(df, start, end)

    def _slice(
        self, df: pd.DataFrame, start: pd.Timestamp, end: pd.Timestamp
    ) -> pd.DataFrame:
        """Slice a DataFrame by [start, end], handling timezone alignment.

        Naive bounds are localized to the index timezone when the index is
        tz-aware; aware bounds are stripped of their timezone when the index
        is naive. An ``end`` that falls exactly on midnight is treated as a
        date and extended to 23:59:59 of that day.
        """
        index_tz = df.index.tz
        if index_tz is not None:
            if start.tz is None:
                start = start.tz_localize(index_tz)
            if end.tz is None:
                end = end.tz_localize(index_tz)
        else:
            if start.tz is not None:
                start = start.tz_localize(None)
            if end.tz is not None:
                end = end.tz_localize(None)

        # When end is a date-level timestamp (midnight), extend to end of day
        if end.hour == 0 and end.minute == 0 and end.second == 0:
            end = end + pd.Timedelta(days=1) - pd.Timedelta(seconds=1)

        return df.loc[start:end]

    @staticmethod
    def _merge_frames(existing: pd.DataFrame, new: pd.DataFrame) -> pd.DataFrame:
        """Combine cached rows with freshly fetched rows; new data wins.

        The frames are long-format: one timestamp usually carries several
        rows (one per facet combination), so the index alone is not a unique
        key. Instead of de-duplicating on the index — which would keep only a
        single row per timestamp — every cached row whose timestamp is
        covered by *new* is discarded and the new rows are appended intact.
        """
        if existing.empty:
            return new.sort_index(kind="stable")
        untouched = existing[~existing.index.isin(new.index)]
        # Stable sort keeps the row order within one timestamp deterministic.
        return pd.concat([untouched, new]).sort_index(kind="stable")

    def write(
        self,
        route: str,
        frequency: str,
        facets_key: str,
        df: pd.DataFrame,
    ) -> None:
        """Merge new data with existing cache and persist.

        *df* should have ``period`` as DatetimeIndex. For every timestamp
        present in *df*, the cached rows are replaced by the rows of *df*;
        cached rows at other timestamps are kept. An empty *df* is a no-op.
        An unreadable existing file is logged and overwritten.
        """
        if df.empty:
            return

        path = self._parquet_path(route, frequency, facets_key)
        path.parent.mkdir(parents=True, exist_ok=True)

        existing = pd.DataFrame()
        if path.exists():
            try:
                existing = pd.read_parquet(path)
            except Exception:
                logger.warning("Corrupted cache at %s — overwriting.", path)

        _atomic_write_parquet(path, self._merge_frames(existing, df))

    def write_meta(
        self,
        route: str,
        frequency: str,
        facets_key: str,
        meta: dict,
    ) -> None:
        """Write metadata for a partition.

        A ``cached_at`` entry holding the current local time (ISO 8601) is
        added to a copy of *meta*; the caller's dict is not modified.
        """
        meta = {**meta, "cached_at": datetime.now().isoformat()}
        path = self._meta_path(route, frequency, facets_key)
        _atomic_write_json(path, meta)

    def read_meta(
        self,
        route: str,
        frequency: str,
        facets_key: str,
    ) -> dict | None:
        """Read cached metadata for a partition.

        Returns ``None`` when the file is missing, unreadable, or not valid
        JSON.
        """
        path = self._meta_path(route, frequency, facets_key)
        if not path.exists():
            return None
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            return None

    # -- Gap detection ---------------------------------------------------------

    def find_gaps(
        self,
        cached_df: pd.DataFrame,
        start: pd.Timestamp,
        end: pd.Timestamp,
        *,
        recent_ttl_hours: int | None = None,
    ) -> list[DateRange]:
        """Find date ranges not covered by cached data.

        Only the outer bounds of the cached index are considered (holes
        inside the cached span are not detected). Also marks data within
        ``recent_ttl_hours`` of now as a gap (needs re-fetch since it may
        have been updated). ``recent_ttl_hours`` defaults to the configured
        value. Naive timestamps are interpreted as UTC for comparison.

        Returns the gaps merged into non-overlapping ranges, expressed in
        the timezone-awareness of the requested *start* / *end*.
        """
        if cached_df.empty:
            return [DateRange(start, end)]

        ttl = recent_ttl_hours if recent_ttl_hours is not None else self.config.recent_ttl_hours
        cutoff = pd.Timestamp.now(tz="UTC") - pd.Timedelta(hours=ttl)

        # Normalize to UTC for comparison
        idx = cached_df.index
        idx = idx.tz_localize("UTC") if idx.tz is None else idx.tz_convert("UTC")

        start_utc = start.tz_localize("UTC") if start.tz is None else start.tz_convert("UTC")
        end_utc = end.tz_localize("UTC") if end.tz is None else end.tz_convert("UTC")

        cached_start = idx.min()
        cached_end = idx.max()

        # The edges of the cached span are stepped by one hour regardless of
        # the partition's frequency.
        step = pd.Timedelta(hours=1)
        gaps: list[DateRange] = []

        # Gap before cached data
        if start_utc < cached_start:
            gap_end = min(cached_start - step, end_utc)
            if gap_end >= start_utc:
                gaps.append(DateRange(start, _to_tz_aware(gap_end, start)))

        # Gap after cached data
        if end_utc > cached_end:
            gap_start = max(cached_end + step, start_utc)
            if gap_start <= end_utc:
                gaps.append(DateRange(_to_tz_aware(gap_start, end), end))

        # Recent data that may still change
        if cached_end > cutoff and end_utc > cutoff:
            recent_start = max(cutoff, start_utc)
            if recent_start <= end_utc:
                gaps.append(DateRange(_to_tz_aware(recent_start, end), end))

        return _merge_overlapping(gaps)

    # -- Maintenance -----------------------------------------------------------

    def clear(
        self,
        route: str | None = None,
        frequency: str | None = None,
    ) -> int:
        """Remove cached files. Returns number of files removed.

        - No args: clear everything
        - route only: clear all data for that route
        - route + frequency: clear one frequency partition

        A *frequency* given without a *route* is ignored (everything is
        cleared). The target directory itself is removed as well.
        """
        if route and frequency:
            target = self.config.cache_dir / route / frequency
        elif route:
            target = self.config.cache_dir / route
        else:
            target = self.config.cache_dir

        if not target.exists():
            return 0

        count = sum(1 for f in target.rglob("*") if f.is_file())
        shutil.rmtree(target)
        return count

    def status(self) -> dict:
        """Return cache statistics.

        The result has the keys ``path`` (cache root as string), ``files``
        (total file count), ``size_mb`` (total size, rounded to two
        decimals) and ``routes`` (file count keyed by the first path
        component below the cache root, e.g. ``"electricity"``).
        """
        cache_dir = self.config.cache_dir
        if not cache_dir.exists():
            return {"path": str(cache_dir), "files": 0, "size_mb": 0.0, "routes": {}}

        all_files = [f for f in cache_dir.rglob("*") if f.is_file()]
        total_size = sum(f.stat().st_size for f in all_files)

        # Per-route breakdown (first path component); files sitting directly
        # in the cache root belong to no route and are not counted here.
        routes: dict[str, int] = {}
        for f in all_files:
            parts = f.relative_to(cache_dir).parts
            if len(parts) > 1:
                routes[parts[0]] = routes.get(parts[0], 0) + 1

        return {
            "path": str(cache_dir),
            "files": len(all_files),
            "size_mb": round(total_size / (1024 * 1024), 2),
            "routes": routes,
        }
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
# -- Helpers -------------------------------------------------------------------
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _to_tz_aware(ts: pd.Timestamp, reference: pd.Timestamp) -> pd.Timestamp:
    """Give *ts* the same timezone-awareness as *reference*.

    - reference naive: any timezone on *ts* is dropped (wall time kept).
    - reference aware: an aware *ts* is converted to the reference timezone,
      a naive *ts* is localized to it.
    """
    target_tz = reference.tz
    ts_is_naive = ts.tz is None

    if target_tz is None:
        return ts if ts_is_naive else ts.tz_localize(None)

    if ts_is_naive:
        return ts.tz_localize(target_tz)
    return ts.tz_convert(target_tz)
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def _merge_overlapping(gaps: list[DateRange]) -> list[DateRange]:
    """Collapse date ranges that overlap or lie within one day of each other.

    The ranges are ordered by start; a range is folded into its predecessor
    when it begins no later than one day after the predecessor ends.
    Returns a new list; an empty input yields an empty list.
    """
    if not gaps:
        return []

    adjacency = pd.Timedelta(days=1)
    ordered = sorted(gaps, key=lambda rng: rng.start)
    result = ordered[:1]

    for current in ordered[1:]:
        last = result[-1]
        if current.start > last.end + adjacency:
            # Far enough apart: keep as a separate range.
            result.append(current)
            continue
        result[-1] = DateRange(last.start, max(last.end, current.end))

    return result
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def _atomic_write_json(path: Path, data: dict) -> None:
    """Write JSON atomically via temp file + replace.

    The payload is written to a temporary file in the target directory and
    then moved over *path* with ``os.replace``, which overwrites an existing
    destination on every platform (``rename`` fails on Windows when the
    target exists). Values that are not JSON-serializable are stringified.

    ``OSError`` during the write is logged and swallowed (best-effort
    cache); other exceptions propagate. The temporary file is removed
    whenever the write does not complete.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = None
    try:
        fd, tmp_path = tempfile.mkstemp(suffix=".json", dir=path.parent)
        os.close(fd)
        Path(tmp_path).write_text(
            json.dumps(data, indent=2, ensure_ascii=False, default=str),
            encoding="utf-8",
        )
        os.replace(tmp_path, path)
        tmp_path = None  # moved into place; nothing left to clean up
    except OSError as exc:
        logger.warning("Failed to write %s: %s", path, exc)
    finally:
        if tmp_path is not None:
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except OSError:
                # Cleanup is best-effort; never mask the original outcome.
                pass
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def _atomic_write_parquet(path: Path, df: pd.DataFrame) -> None:
    """Write parquet atomically via temp file + replace.

    The frame is written to a temporary file in the target directory and
    then moved over *path* with ``os.replace``, which overwrites an existing
    destination on every platform (``rename`` fails on Windows when the
    target exists).

    ``OSError`` during the write is logged and swallowed so the caller
    continues without a cache; other exceptions (e.g. from the parquet
    engine) propagate. The temporary file is removed whenever the write
    does not complete.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = None
    try:
        fd, tmp_path = tempfile.mkstemp(suffix=".parquet", dir=path.parent)
        os.close(fd)
        df.to_parquet(tmp_path)
        os.replace(tmp_path, path)
        tmp_path = None  # moved into place; nothing left to clean up
    except OSError as exc:
        logger.warning("Failed to write cache %s: %s — continuing without cache.", path, exc)
    finally:
        if tmp_path is not None:
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except OSError:
                # Cleanup is best-effort; never mask the original outcome.
                pass
|