PyPI - python-eia - Versions diffs - 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

python-eia 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

eia/.agents/skills/eia/SKILL.md +115 -119
eia/__init__.py +3 -1
eia/cache.py +399 -0
eia/catalog.py +137 -0
eia/catalog_manager.py +464 -0
eia/cli/app.py +4 -0
eia/cli/cache_cmd.py +53 -0
eia/cli/catalog_cmd.py +186 -0
eia/client.py +309 -19
{python_eia-0.2.1.dist-info → python_eia-0.3.0.dist-info}/METADATA +7 -3
python_eia-0.3.0.dist-info/RECORD +22 -0
python_eia-0.2.1.dist-info/RECORD +0 -17
{python_eia-0.2.1.dist-info → python_eia-0.3.0.dist-info}/WHEEL +0 -0
{python_eia-0.2.1.dist-info → python_eia-0.3.0.dist-info}/entry_points.txt +0 -0

eia/cache.py ADDED Viewed

@@ -0,0 +1,399 @@
+"""Local parquet cache for EIA API time-series data.
+Caches query results as parquet files, fetching only missing date ranges
+on subsequent requests. Historical energy data is immutable once
+published (~48h), so caching is safe and enabled by default.
+Storage layout::
+    {cache_dir}/
+    └── electricity/rto/fuel-type-data/
+        ├── hourly/
+        │   ├── respondent=CISO/
+        │   │   ├── data.parquet
+        │   │   └── meta.json
+        │   └── fueltype=SUN,WND.respondent=PJM/
+        │       ├── data.parquet
+        │       └── meta.json
+        └── monthly/
+            └── _all_/
+                ├── data.parquet
+                └── meta.json
+Unlike ENTSO-E, EIA stores DataFrames in long format (facet columns +
+value column) rather than wide format, because multiple rows per period
+are common (e.g. one row per fuel type per respondent).
+"""
+from __future__ import annotations
+import json
+import logging
+import os
+import shutil
+import tempfile
+from dataclasses import dataclass, field
+from datetime import datetime
+from pathlib import Path
+import pandas as pd
+logger = logging.getLogger("eia")
+# Default cache location — respects XDG_CACHE_HOME. The XDG Base Directory
+# spec treats an unset *or empty* variable as "use the default" (~/.cache);
+# the ``or`` keeps an empty value from producing the relative path "eia".
+_DEFAULT_CACHE_DIR = Path(
+    os.environ.get("XDG_CACHE_HOME") or Path.home() / ".cache"
+) / "eia"
+# Data older than this (hours) is considered final and won't be re-fetched
+_DEFAULT_RECENT_TTL_HOURS = 48
+def _facets_key(facets: dict | None) -> str:
+    """Build a deterministic partition string from facet filters.
+    Keys are sorted and the values of multi-valued facets are sorted, so the
+    same logical filter always maps to the same cache directory regardless
+    of the order in which the caller spelled it.
+    Args:
+        facets: Mapping of facet id to a single value or to a collection of
+            values (list, tuple, set or frozenset). ``None`` or an empty
+            mapping means "no filter".
+    Returns:
+        ``"_all_"`` when there is no filter, otherwise ``key=value`` pairs
+        joined with ``"."``.
+    Examples:
+        None  →  "_all_"
+        {"respondent": "CISO"}  →  "respondent=CISO"
+        {"respondent": "PJM", "fueltype": ["SUN", "WND"]}
+            →  "fueltype=SUN,WND.respondent=PJM"
+        {"fueltype": ("WND", "SUN")}  →  "fueltype=SUN,WND"
+    """
+    if not facets:
+        return "_all_"
+    parts = []
+    for key in sorted(facets):
+        value = facets[key]
+        # Treat every multi-valued container like a list. A tuple or set used
+        # to fall through to ``str(value)`` and produce a different key
+        # (e.g. "fueltype=('SUN', 'WND')") than the equivalent list, which
+        # split one logical query across two cache partitions.
+        if isinstance(value, (list, tuple, set, frozenset)):
+            value_str = ",".join(sorted(str(item) for item in value))
+        else:
+            value_str = str(value)
+        parts.append(f"{key}={value_str}")
+    return ".".join(parts)
+@dataclass
+class CacheConfig:
+    """Cache configuration.
+    Attributes:
+        enabled: Whether the cache is switched on (defaults to ``True``).
+        cache_dir: Root directory of the cache; a ``str`` is accepted and
+            converted to ``Path``, with a leading ``~`` expanded.
+        recent_ttl_hours: Age in hours below which data is treated as
+            still changeable (see ``CacheStore.find_gaps``).
+    """
+    enabled: bool = True
+    cache_dir: Path = field(default_factory=lambda: _DEFAULT_CACHE_DIR)
+    recent_ttl_hours: int = _DEFAULT_RECENT_TTL_HOURS
+    def __post_init__(self) -> None:
+        # Normalise str / PathLike input. expanduser() makes "~/.cache/eia"
+        # resolve to the home directory instead of creating a literal "~"
+        # folder relative to the current working directory.
+        self.cache_dir = Path(self.cache_dir).expanduser()
+@dataclass(frozen=True)
+class DateRange:
+    """A contiguous date range [start, end] inclusive.
+    Instances are immutable (``frozen=True``); build a new ``DateRange``
+    instead of modifying one in place.
+    """
+    start: pd.Timestamp  # first timestamp covered by the range
+    end: pd.Timestamp  # last timestamp covered by the range (inclusive)
+class CacheStore:
+    """Read, write, and merge parquet files for cached EIA data."""
+    def __init__(self, config: CacheConfig) -> None:
+        """Keep the configuration; all paths are resolved under ``config.cache_dir``."""
+        self.config = config
+    # -- Path resolution -------------------------------------------------------
+    def _parquet_path(self, route: str, frequency: str, facets_key: str) -> Path:
+        """Return ``{cache_dir}/{route}/{frequency}/{facets_key}/data.parquet``."""
+        root = self.config.cache_dir
+        return root.joinpath(route, frequency, facets_key, "data.parquet")
+    def _meta_path(self, route: str, frequency: str, facets_key: str) -> Path:
+        """Return ``{cache_dir}/{route}/{frequency}/{facets_key}/meta.json``."""
+        root = self.config.cache_dir
+        return root.joinpath(route, frequency, facets_key, "meta.json")
+    # -- Data Read / Write -----------------------------------------------------
+    def read(
+        self,
+        route: str,
+        frequency: str,
+        facets_key: str,
+        start: pd.Timestamp,
+        end: pd.Timestamp,
+    ) -> pd.DataFrame:
+        """Load the cached rows of one partition that fall inside [start, end].
+        Returns:
+            The slice produced by ``_slice``. An empty DataFrame is returned
+            when there is no cache file, when the file cannot be read (it
+            is then deleted), or when the stored frame is empty or not
+            indexed by a ``DatetimeIndex``.
+        """
+        parquet_file = self._parquet_path(route, frequency, facets_key)
+        if not parquet_file.exists():
+            return pd.DataFrame()
+        try:
+            cached = pd.read_parquet(parquet_file)
+        except Exception as exc:
+            # Any read failure is treated as corruption: drop the file so the
+            # next request re-fetches it.
+            logger.warning("Corrupted cache file %s: %s — removing.", parquet_file, exc)
+            parquet_file.unlink(missing_ok=True)
+            return pd.DataFrame()
+        usable = not cached.empty and isinstance(cached.index, pd.DatetimeIndex)
+        return self._slice(cached, start, end) if usable else pd.DataFrame()
+    def _slice(
+        self, df: pd.DataFrame, start: pd.Timestamp, end: pd.Timestamp
+    ) -> pd.DataFrame:
+        """Return the rows of *df* whose index lies in [start, end].
+        The bounds are aligned to the index's timezone-awareness first: for
+        a naive index any timezone on the bounds is dropped, for an aware
+        index naive bounds are localized to the index timezone.
+        """
+        index_tz = df.index.tz
+        if index_tz is None:
+            if start.tz is not None:
+                start = start.tz_localize(None)
+            if end.tz is not None:
+                end = end.tz_localize(None)
+        else:
+            if start.tz is None:
+                start = start.tz_localize(index_tz)
+            if end.tz is None:
+                end = end.tz_localize(index_tz)
+        # A midnight end is treated as a date-level bound and stretched to
+        # 23:59:59 of that day so the whole day is included.
+        if (end.hour, end.minute, end.second) == (0, 0, 0):
+            end = end + pd.Timedelta(hours=23, minutes=59, seconds=59)
+        return df[start:end]
+    def write(
+        self,
+        route: str,
+        frequency: str,
+        facets_key: str,
+        df: pd.DataFrame,
+    ) -> None:
+        """Merge new data with the existing cache and persist it.
+        *df* should have ``period`` as DatetimeIndex. Because the data is in
+        long format, one period usually has several rows (one per facet
+        combination), so the index is not unique. New data replaces cached
+        data period by period: every cached row whose timestamp also occurs
+        in *df* is dropped, then all rows of *df* are appended.
+        Args:
+            route: API route of the partition.
+            frequency: Frequency of the partition.
+            facets_key: Partition key as built by ``_facets_key``.
+            df: Newly fetched rows; an empty frame is a no-op.
+        """
+        if df.empty:
+            return
+        path = self._parquet_path(route, frequency, facets_key)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        # Read existing and merge
+        existing = pd.DataFrame()
+        if path.exists():
+            try:
+                existing = pd.read_parquet(path)
+            except Exception:
+                logger.warning("Corrupted cache at %s — overwriting.", path)
+        if not existing.empty:
+            # Deduplicating on the index alone (the previous behavior) kept a
+            # single row per period and silently discarded the other facet
+            # rows on every merge. Replace whole periods instead so new data
+            # wins for overlapping timestamps without losing rows.
+            stale = existing.index.isin(df.index)
+            merged = pd.concat([existing[~stale], df])
+        else:
+            merged = df
+        # A stable sort keeps rows that share a period in their incoming order.
+        merged = merged.sort_index(kind="mergesort")
+        _atomic_write_parquet(path, merged)
+    def write_meta(
+        self,
+        route: str,
+        frequency: str,
+        facets_key: str,
+        meta: dict,
+    ) -> None:
+        """Persist *meta* for a partition, stamped with a ``cached_at`` time.
+        The caller's dict is not modified; ``cached_at`` is set to the
+        current local time in ISO format on a copy.
+        """
+        stamped = dict(meta)
+        stamped["cached_at"] = datetime.now().isoformat()
+        _atomic_write_json(self._meta_path(route, frequency, facets_key), stamped)
+    def read_meta(
+        self,
+        route: str,
+        frequency: str,
+        facets_key: str,
+    ) -> dict | None:
+        """Read cached metadata for a partition.
+        Returns:
+            The decoded JSON object, or ``None`` when the file is missing,
+            unreadable, not valid UTF-8 / JSON, or does not contain a JSON
+            object.
+        """
+        path = self._meta_path(route, frequency, facets_key)
+        try:
+            meta = json.loads(path.read_text(encoding="utf-8"))
+        except (OSError, ValueError):
+            # OSError covers a missing or unreadable file. ValueError covers
+            # json.JSONDecodeError and also UnicodeDecodeError, which a
+            # corrupt file raises and which previously escaped.
+            return None
+        # Honour the declared return type: a stray list or scalar is not metadata.
+        return meta if isinstance(meta, dict) else None
+    # -- Gap detection ---------------------------------------------------------
+    def find_gaps(
+        self,
+        cached_df: pd.DataFrame,
+        start: pd.Timestamp,
+        end: pd.Timestamp,
+        *,
+        recent_ttl_hours: int | None = None,
+    ) -> list[DateRange]:
+        """Find date ranges not covered by cached data.
+        Also marks data within ``recent_ttl_hours`` of now as a gap
+        (needs re-fetch since it may have been updated).
+        Args:
+            cached_df: Cached rows indexed by timestamp; an empty frame
+                makes the whole request a single gap.
+            start: Start of the requested range.
+            end: End of the requested range.
+            recent_ttl_hours: Overrides ``config.recent_ttl_hours`` when
+                not ``None``.
+        Returns:
+            Gap ranges after merging via ``_merge_overlapping``. Gap bounds
+            are converted back to the timezone-awareness of ``start`` /
+            ``end``.
+        NOTE(review): coverage is judged only from the minimum and maximum
+        of the cached index; holes inside that span are not reported.
+        """
+        ttl = recent_ttl_hours if recent_ttl_hours is not None else self.config.recent_ttl_hours
+        now = pd.Timestamp.now(tz="UTC")
+        cutoff = now - pd.Timedelta(hours=ttl)
+        if cached_df.empty:
+            return [DateRange(start, end)]
+        # Normalize to UTC for comparison (naive values are taken to be UTC)
+        idx = cached_df.index
+        if idx.tz is None:
+            idx = idx.tz_localize("UTC")
+        else:
+            idx = idx.tz_convert("UTC")
+        start_utc = start.tz_localize("UTC") if start.tz is None else start.tz_convert("UTC")
+        end_utc = end.tz_localize("UTC") if end.tz is None else end.tz_convert("UTC")
+        cached_start = idx.min()
+        cached_end = idx.max()
+        gaps: list[DateRange] = []
+        # Gap before cached data
+        # NOTE(review): the one-hour step away from the cached edge is
+        # hard-coded and does not depend on the partition's frequency.
+        if start_utc < cached_start:
+            gap_end = min(cached_start - pd.Timedelta(hours=1), end_utc)
+            if gap_end >= start_utc:
+                gaps.append(DateRange(start, _to_tz_aware(gap_end, start)))
+        # Gap after cached data
+        if end_utc > cached_end:
+            gap_start = max(cached_end + pd.Timedelta(hours=1), start_utc)
+            if gap_start <= end_utc:
+                gaps.append(DateRange(_to_tz_aware(gap_start, end), end))
+        # Recent data that may still change: re-fetch from the TTL cutoff
+        # (or the requested start, if later) up to the requested end.
+        if cached_end > cutoff and end_utc > cutoff:
+            recent_start = max(cutoff, start_utc)
+            if recent_start <= end_utc:
+                gaps.append(DateRange(_to_tz_aware(recent_start, end), end))
+        return _merge_overlapping(gaps)
+    # -- Maintenance -----------------------------------------------------------
+    def clear(
+        self,
+        route: str | None = None,
+        frequency: str | None = None,
+    ) -> int:
+        """Delete cached files and report how many files were removed.
+        - No args: clear everything
+        - route only: clear all data for that route
+        - route + frequency: clear one frequency partition
+        ``frequency`` is only honoured together with ``route``; without a
+        route the whole cache directory is removed.
+        """
+        root = self.config.cache_dir
+        if not route:
+            target = root
+        elif frequency:
+            target = root / route / frequency
+        else:
+            target = root / route
+        if not target.exists():
+            return 0
+        # Count before deleting; directories themselves are not counted.
+        removed = len([entry for entry in target.rglob("*") if entry.is_file()])
+        shutil.rmtree(target)
+        return removed
+    def status(self) -> dict:
+        """Summarise the cache: location, file count, size in MB, files per route.
+        The ``routes`` breakdown is keyed by the first path component below
+        the cache directory; files directly in the cache directory are
+        counted in ``files`` but not in ``routes``.
+        """
+        root = self.config.cache_dir
+        report = {"path": str(root), "files": 0, "size_mb": 0.0, "routes": {}}
+        if not root.exists():
+            return report
+        files = [entry for entry in root.rglob("*") if entry.is_file()]
+        size_bytes = sum(entry.stat().st_size for entry in files)
+        per_route: dict[str, int] = {}
+        for entry in files:
+            try:
+                parts = entry.relative_to(root).parts
+            except ValueError:
+                continue
+            if len(parts) > 1:
+                top = parts[0]
+                per_route[top] = per_route.get(top, 0) + 1
+        report["files"] = len(files)
+        report["size_mb"] = round(size_bytes / (1024 * 1024), 2)
+        report["routes"] = per_route
+        return report
+# -- Helpers -------------------------------------------------------------------
+def _to_tz_aware(ts: pd.Timestamp, reference: pd.Timestamp) -> pd.Timestamp:
+    """Give *ts* the same timezone-awareness as *reference*.
+    With an aware reference, an aware *ts* is converted to the reference
+    timezone and a naive *ts* is localized to it. With a naive reference,
+    any timezone on *ts* is dropped.
+    """
+    target_tz = reference.tz
+    ts_is_aware = ts.tz is not None
+    if target_tz is None:
+        return ts.tz_localize(None) if ts_is_aware else ts
+    if ts_is_aware:
+        return ts.tz_convert(target_tz)
+    return ts.tz_localize(target_tz)
+def _merge_overlapping(gaps: list[DateRange]) -> list[DateRange]:
+    """Merge overlapping or adjacent date ranges.
+    Ranges are processed in order of their start; a range is folded into
+    the previous one when it starts no later than one day after the
+    previous range ends.
+    """
+    merged: list[DateRange] = []
+    for gap in sorted(gaps, key=lambda g: g.start):
+        if merged and gap.start <= merged[-1].end + pd.Timedelta(days=1):
+            prev = merged[-1]
+            merged[-1] = DateRange(prev.start, max(prev.end, gap.end))
+        else:
+            merged.append(gap)
+    return merged
+def _atomic_write_json(path: Path, data: dict) -> None:
+    """Write JSON atomically via temp file + replace.
+    The temp file is created next to *path* so the final move stays on one
+    filesystem. ``OSError`` while writing is logged and swallowed
+    (best-effort cache); serialization errors propagate to the caller.
+    Args:
+        path: Destination file; parent directories are created.
+        data: JSON-serializable mapping; unknown types are stringified
+            through ``default=str``.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    # Serialize before touching the disk so bad data cannot leave a temp file.
+    payload = json.dumps(data, indent=2, ensure_ascii=False, default=str)
+    tmp_path = None
+    try:
+        fd, tmp_name = tempfile.mkstemp(suffix=".json", dir=path.parent)
+        tmp_path = Path(tmp_name)
+        with os.fdopen(fd, "w", encoding="utf-8") as handle:
+            handle.write(payload)
+        # os.replace overwrites an existing target on every platform;
+        # Path.rename raises FileExistsError on Windows in that case.
+        os.replace(tmp_path, path)
+    except OSError as exc:
+        logger.warning("Failed to write %s: %s", path, exc)
+    finally:
+        # No-op after a successful replace; otherwise removes the leftover
+        # temp file, including on non-OSError failures.
+        if tmp_path is not None:
+            tmp_path.unlink(missing_ok=True)
+def _atomic_write_parquet(path: Path, df: pd.DataFrame) -> None:
+    """Write parquet atomically via temp file + replace.
+    The temp file is created next to *path* so the final move stays on one
+    filesystem. ``OSError`` is logged and swallowed so a failing cache
+    never breaks the caller; other exceptions (e.g. from the parquet
+    engine) propagate, but the temp file is still cleaned up.
+    Args:
+        path: Destination file; parent directories are created.
+        df: Frame to persist with ``DataFrame.to_parquet``.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp_path = None
+    try:
+        fd, tmp_name = tempfile.mkstemp(suffix=".parquet", dir=path.parent)
+        tmp_path = Path(tmp_name)
+        os.close(fd)
+        df.to_parquet(tmp_path)
+        # os.replace overwrites an existing target on every platform;
+        # Path.rename raises FileExistsError on Windows in that case.
+        os.replace(tmp_path, path)
+    except OSError as exc:
+        logger.warning("Failed to write cache %s: %s — continuing without cache.", path, exc)
+    finally:
+        # No-op after a successful replace; otherwise removes the leftover
+        # temp file, including when the parquet engine raised a non-OSError.
+        if tmp_path is not None:
+            tmp_path.unlink(missing_ok=True)

eia/catalog.py ADDED Viewed

@@ -0,0 +1,137 @@
+"""Built-in data catalog and recipes for the EIA API v2.
+The EIA API is a tree of routes. This module provides:
+- Curated route metadata with descriptions and key facets
+- Named "recipes" — pre-configured queries for common use cases
+- Facet cheat-sheets so users don't have to discover facet values every time
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+@dataclass(frozen=True)
+class DataColumn:
+    """Metadata for a data column from the API schema.
+    Immutable; every field except ``id`` defaults to an empty string.
+    """
+    id: str  # column identifier
+    units: str = ""  # unit label; empty when not provided
+    aggregation_method: str = ""  # presumably how the API aggregates the column — TODO confirm
+    alias: str = ""  # alternative display name; empty when not provided
+@dataclass(frozen=True)
+class Frequency:
+    """Metadata for a frequency option from the API schema.
+    Immutable; every field except ``id`` defaults to an empty string.
+    """
+    id: str  # frequency identifier
+    description: str = ""  # human-readable description
+    query: str = ""  # NOTE(review): meaning not visible here — presumably the value sent in requests; confirm
+    format: str = ""  # presumably the period/date format used at this frequency — TODO confirm
+@dataclass(frozen=True)
+class FacetHint:
+    """Documents a facet's key values without requiring an API call.
+    NOTE: although the dataclass is frozen, the two dict fields are still
+    mutable objects and make instances unhashable.
+    """
+    id: str  # facet identifier
+    description: str  # human-readable description of the facet
+    common_values: dict[str, str]  # hand-curated subset (value_id → human label)
+    values: dict[str, str] = field(default_factory=dict)  # full API values (value_id → name)
+@dataclass(frozen=True)
+class RouteInfo:
+    """Curated metadata for a data route.
+    The first group of fields is hand-curated; the second group holds
+    schema details fetched from the API and stays at its empty defaults
+    until a refresh fills it in.
+    """
+    route: str  # route path; used as the key of ``ROUTES``
+    name: str  # short display name
+    description: str  # longer description shown in ``summary()``
+    frequency: str  # default frequency
+    facets: tuple[FacetHint, ...]  # facet hints documented for this route
+    notes: str = ""  # optional free-text note shown in ``summary()``
+    # --- API-fetched schema (optional, populated by refresh) ---
+    data_columns: tuple[DataColumn, ...] = ()
+    frequencies: tuple[Frequency, ...] = ()
+    start_period: str = ""
+    end_period: str = ""
+    default_date_format: str = ""
+    api_hash: str = ""
+    last_refreshed: str = ""
+@dataclass(frozen=True)
+class Recipe:
+    """A named, pre-configured query for a common use case.
+    NOTE: although the dataclass is frozen, ``facets`` is a plain dict and
+    therefore still mutable; it also makes instances unhashable.
+    """
+    id: str  # recipe identifier; used as the key of ``RECIPES``
+    name: str  # short display name
+    description: str  # what the recipe returns
+    route: str  # route path the recipe queries
+    facets: dict[str, str | list[str]]  # facet id → single value or list of values
+    frequency: str  # frequency used by the recipe
+    notes: str = ""  # optional free-text note
+    cli_example: str = ""  # optional example command line
+    python_example: str = ""  # optional example Python snippet
+# ── Route & Recipe Catalog (loaded from YAML) ─────────────────────────
+# NOTE(review): this import sits below the dataclass definitions rather than
+# at the top of the module — presumably because eia.catalog_manager imports
+# those classes from here (circular import); confirm before moving it.
+from eia.catalog_manager import EIACatalogManager as _EIACatalogManager
+# Import-time side effect: the manager is instantiated and both loaders run
+# as soon as this module is imported.
+_mgr = _EIACatalogManager()
+# Index routes by path and recipes by id; a later entry with the same key
+# replaces an earlier one.
+ROUTES: dict[str, RouteInfo] = {r.route: r for r in _mgr._load_routes()}
+RECIPES: dict[str, Recipe] = {r.id: r for r in _mgr._load_recipes()}
+# ── Convenience functions ──────────────────────────────────────────────
+def get_route(route: str) -> RouteInfo:
+    """Return the catalog entry for *route*.
+    Raises:
+        KeyError: If *route* is not in ``ROUTES``.
+    """
+    if route in ROUTES:
+        return ROUTES[route]
+    raise KeyError(
+        f"Unknown route '{route}'. Use catalog.list_routes() to see available routes."
+    )
+def get_recipe(recipe_id: str) -> Recipe:
+    """Return the recipe registered under *recipe_id*.
+    Raises:
+        KeyError: If *recipe_id* is not in ``RECIPES``; the message lists
+            the available recipe ids.
+    """
+    if recipe_id in RECIPES:
+        return RECIPES[recipe_id]
+    available = ", ".join(RECIPES)
+    raise KeyError(f"Unknown recipe '{recipe_id}'. Available: {available}")
+def list_routes() -> list[str]:
+    """Return the paths of all cataloged routes in sorted order."""
+    return sorted(ROUTES)
+def list_recipes() -> list[str]:
+    """Return the ids of all recipes in sorted order."""
+    return sorted(RECIPES)
+def summary() -> str:
+    """Render the catalog as plain text.
+    Lists every route (name, description, default frequency and optional
+    note) followed by every recipe (name and description), each section
+    sorted by key.
+    """
+    out = ["EIA Data Catalog", "=" * 50, "", "Routes:"]
+    for route_path in sorted(ROUTES):
+        info = ROUTES[route_path]
+        out += [
+            f"  {route_path}",
+            f"    {info.name}: {info.description}",
+            f"    Default frequency: {info.frequency}",
+        ]
+        if info.notes:
+            out.append(f"    Note: {info.notes}")
+    out += ["", "Recipes (pre-configured queries):"]
+    for recipe_id in sorted(RECIPES):
+        recipe = RECIPES[recipe_id]
+        out += [f"  {recipe_id}: {recipe.name}", f"    {recipe.description}"]
+    return "\n".join(out)

python-eia 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

python-eia 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl