agrobr 0.1.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
agrobr/export.py ADDED
@@ -0,0 +1,251 @@
+ """Export to auditable formats."""
+
+ from __future__ import annotations
+
+ import csv
+ import hashlib
+ import json
+ from datetime import datetime
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+
+ import structlog
+
+ if TYPE_CHECKING:
+     import pandas as pd
+
+     from agrobr.models import MetaInfo
+
+ logger = structlog.get_logger()
+
+
+ def export_parquet(
+     df: pd.DataFrame,
+     path: str | Path,
+     meta: MetaInfo | None = None,
+     compression: str = "snappy",
+ ) -> Path:
+     """
+     Export a DataFrame to Parquet with embedded metadata.
+
+     Args:
+         df: DataFrame to export
+         path: Output file path
+         meta: Optional metadata
+         compression: Compression codec (snappy, gzip, zstd)
+
+     Returns:
+         Path of the created file
+     """
+     import pyarrow as pa
+     import pyarrow.parquet as pq
+
+     path = Path(path)
+     path.parent.mkdir(parents=True, exist_ok=True)
+
+     table = pa.Table.from_pandas(df)
+
+     metadata = {
+         b"agrobr_version": _get_version().encode(),
+         b"export_timestamp": datetime.now().isoformat().encode(),
+         b"row_count": str(len(df)).encode(),
+     }
+
+     if meta:
+         metadata[b"source"] = meta.source.encode()
+         metadata[b"source_url"] = meta.source_url.encode()
+         metadata[b"fetched_at"] = meta.fetched_at.isoformat().encode()
+         if meta.raw_content_hash:
+             metadata[b"content_hash"] = meta.raw_content_hash.encode()
+
+     existing_meta = table.schema.metadata or {}
+     table = table.replace_schema_metadata({**existing_meta, **metadata})
+
+     pq.write_table(table, path, compression=compression)
+
+     logger.info("export_parquet", path=str(path), rows=len(df))
+     return path
+
+
+ def export_csv(
+     df: pd.DataFrame,
+     path: str | Path,
+     meta: MetaInfo | None = None,
+     include_header: bool = True,
+     include_sidecar: bool = True,
+ ) -> tuple[Path, Path | None]:
+     """
+     Export a DataFrame to CSV with a metadata sidecar file.
+
+     Args:
+         df: DataFrame to export
+         path: Output file path
+         meta: Optional metadata
+         include_header: Include the header row
+         include_sidecar: Create a .meta.json file
+
+     Returns:
+         Tuple of (csv_path, sidecar_path or None)
+     """
+     path = Path(path)
+     path.parent.mkdir(parents=True, exist_ok=True)
+
+     df.to_csv(path, index=False, header=include_header, quoting=csv.QUOTE_NONNUMERIC)
+
+     sidecar_path = None
+     if include_sidecar:
+         sidecar_path = path.with_suffix(".meta.json")
+         sidecar_data = _create_sidecar(df, meta)
+         with open(sidecar_path, "w") as f:
+             json.dump(sidecar_data, f, indent=2, ensure_ascii=False)
+
+     logger.info("export_csv", path=str(path), rows=len(df))
+     return path, sidecar_path
+
+
+ def export_json(
+     df: pd.DataFrame,
+     path: str | Path,
+     meta: MetaInfo | None = None,
+     orient: str = "records",
+     include_metadata: bool = True,
+ ) -> Path:
+     """
+     Export a DataFrame to JSON with embedded metadata.
+
+     Args:
+         df: DataFrame to export
+         path: Output file path
+         meta: Optional metadata
+         orient: JSON orientation (records, split, index, etc.)
+         include_metadata: Embed metadata in the JSON
+
+     Returns:
+         Path of the created file
+     """
+     path = Path(path)
+     path.parent.mkdir(parents=True, exist_ok=True)
+
+     if include_metadata:
+         output = {
+             "metadata": _create_sidecar(df, meta),
+             "data": json.loads(df.to_json(orient=orient, date_format="iso")),  # type: ignore[call-overload]
+         }
+         with open(path, "w") as f:
+             json.dump(output, f, indent=2, ensure_ascii=False)
+     else:
+         df.to_json(path, orient=orient, date_format="iso", indent=2)  # type: ignore[call-overload]
+
+     logger.info("export_json", path=str(path), rows=len(df))
+     return path
+
+
+ def _create_sidecar(df: pd.DataFrame, meta: MetaInfo | None = None) -> dict[str, Any]:
+     """Build metadata for the sidecar file."""
+     csv_bytes = df.to_csv(index=False).encode("utf-8")
+     content_hash = hashlib.sha256(csv_bytes).hexdigest()
+
+     sidecar: dict[str, Any] = {
+         "agrobr_version": _get_version(),
+         "export_timestamp": datetime.now().isoformat(),
+         "file_info": {
+             "row_count": len(df),
+             "column_count": len(df.columns),
+             "columns": df.columns.tolist(),
+             "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
+             "content_hash": f"sha256:{content_hash}",
+         },
+     }
+
+     if meta:
+         sidecar["provenance"] = {
+             "source": meta.source,
+             "source_url": meta.source_url,
+             "source_method": meta.source_method,
+             "fetched_at": meta.fetched_at.isoformat(),
+             "from_cache": meta.from_cache,
+             "original_hash": meta.raw_content_hash,
+         }
+
+     return sidecar
+
+
+ def verify_export(path: str | Path, expected_hash: str | None = None) -> dict[str, Any]:
+     """
+     Verify the integrity of an exported file.
+
+     Args:
+         path: File path
+         expected_hash: Expected hash (optional)
+
+     Returns:
+         Dict with the verification status
+     """
+     path = Path(path)
+
+     if not path.exists():
+         return {"valid": False, "error": "File not found"}
+
+     result: dict[str, Any] = {
+         "valid": True,
+         "path": str(path),
+         "size_bytes": path.stat().st_size,
+     }
+
+     if path.suffix == ".parquet":
+         import pyarrow.parquet as pq
+
+         try:
+             table = pq.read_table(path)
+             result["row_count"] = table.num_rows
+             result["columns"] = table.schema.names
+
+             metadata = table.schema.metadata or {}
+             if b"content_hash" in metadata:
+                 result["stored_hash"] = metadata[b"content_hash"].decode()
+         except Exception as e:
+             result["valid"] = False
+             result["error"] = str(e)
+
+     elif path.suffix == ".csv":
+         import pandas as pd
+
+         try:
+             df = pd.read_csv(path)
+             result["row_count"] = len(df)
+             result["columns"] = df.columns.tolist()
+
+             csv_bytes = df.to_csv(index=False).encode("utf-8")
+             result["computed_hash"] = f"sha256:{hashlib.sha256(csv_bytes).hexdigest()}"
+
+             sidecar_path = path.with_suffix(".meta.json")
+             if sidecar_path.exists():
+                 with open(sidecar_path) as f:
+                     sidecar = json.load(f)
+                 result["stored_hash"] = sidecar.get("file_info", {}).get("content_hash")
+         except Exception as e:
+             result["valid"] = False
+             result["error"] = str(e)
+
+     if expected_hash and result.get("computed_hash"):
+         result["hash_match"] = result["computed_hash"] == expected_hash
+
+     return result
+
+
+ def _get_version() -> str:
+     """Return the agrobr version."""
+     try:
+         import agrobr
+
+         return getattr(agrobr, "__version__", "unknown")
+     except ImportError:
+         return "unknown"
+
+
+ __all__ = [
+     "export_parquet",
+     "export_csv",
+     "export_json",
+     "verify_export",
+ ]
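
The new module gives export and verification a single round-trip surface. A minimal usage sketch, assuming the 0.5.0 wheel is installed; the DataFrame contents and output paths below are invented for illustration:

    import pandas as pd
    import pyarrow.parquet as pq

    from agrobr.export import export_csv, export_parquet, verify_export

    # Illustrative frame; the column names are made up for this sketch.
    df = pd.DataFrame({"produto": ["soja", "milho"], "preco": [130.50, 62.10]})

    # CSV: writes out/precos.csv plus an out/precos.meta.json sidecar whose
    # file_info.content_hash is a sha256 over the frame's CSV bytes.
    csv_path, sidecar_path = export_csv(df, "out/precos.csv")

    # Parquet: the same provenance keys land in the file's schema metadata.
    pq_path = export_parquet(df, "out/precos.parquet")
    print(pq.read_table(pq_path).schema.metadata[b"agrobr_version"])

    # verify_export re-reads the CSV, recomputes the hash, and surfaces both
    # so a caller can compare computed_hash against the sidecar's stored_hash.
    report = verify_export(csv_path)
    print(report["valid"], report["computed_hash"] == report["stored_hash"])

Because both the sidecar and verify_export hash the frame's default to_csv bytes rather than the file on disk, a hash match confirms the file still parses back to the same data the sidecar described.
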
agrobr/health/__init__.py CHANGED
@@ -8,6 +8,12 @@ from .checker import (
      check_source,
      run_all_checks,
  )
+ from .doctor import (
+     CacheStats,
+     DiagnosticsResult,
+     SourceStatus,
+     run_diagnostics,
+ )
  from .reporter import (
      HealthReport,
      generate_report,
@@ -20,4 +26,8 @@ __all__: list[str] = [
      "run_all_checks",
      "HealthReport",
      "generate_report",
+     "DiagnosticsResult",
+     "SourceStatus",
+     "CacheStats",
+     "run_diagnostics",
  ]
agrobr/health/doctor.py ADDED
@@ -0,0 +1,321 @@
+ """Full diagnostics for the agrobr system."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import time
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any
+
+ import httpx
+ import structlog
+
+ from agrobr import __version__
+ from agrobr.cache.duckdb_store import get_store
+ from agrobr.cache.policies import get_next_update_info
+
+ logger = structlog.get_logger()
+
+
+ @dataclass
+ class SourceStatus:
+     """Connectivity status of a source."""
+
+     name: str
+     url: str
+     status: str
+     latency_ms: int
+     error: str | None = None
+
+
+ @dataclass
+ class CacheStats:
+     """Cache statistics."""
+
+     location: str
+     size_bytes: int
+     total_records: int
+     by_source: dict[str, dict[str, Any]] = field(default_factory=dict)
+
+
+ @dataclass
+ class DiagnosticsResult:
+     """Result of the full diagnostic run."""
+
+     version: str
+     timestamp: datetime
+     sources: list[SourceStatus]
+     cache: CacheStats
+     last_collections: dict[str, datetime | None]
+     cache_expiry: dict[str, dict[str, str]]
+     config: dict[str, Any]
+     overall_status: str
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to a serializable dictionary."""
+         return {
+             "version": self.version,
+             "timestamp": self.timestamp.isoformat(),
+             "sources": [
+                 {
+                     "name": s.name,
+                     "url": s.url,
+                     "status": s.status,
+                     "latency_ms": s.latency_ms,
+                     "error": s.error,
+                 }
+                 for s in self.sources
+             ],
+             "cache": {
+                 "location": self.cache.location,
+                 "size_mb": round(self.cache.size_bytes / 1024 / 1024, 2),
+                 "total_records": self.cache.total_records,
+                 "by_source": self.cache.by_source,
+             },
+             "last_collections": {
+                 k: v.isoformat() if v else None for k, v in self.last_collections.items()
+             },
+             "cache_expiry": self.cache_expiry,
+             "config": self.config,
+             "overall_status": self.overall_status,
+         }
+
+     def to_rich(self) -> str:
+         """Format for terminal output."""
+         lines = [
+             "",
+             f"agrobr diagnostics v{self.version}",
+             "=" * 50,
+             "",
+             "Sources Connectivity",
+         ]
+
+         for s in self.sources:
+             if s.status == "ok":
+                 icon = "[OK]"
+             elif s.status == "slow":
+                 icon = "[SLOW]"
+             else:
+                 icon = "[FAIL]"
+
+             line = f" {icon} {s.name:<35} {s.latency_ms:>5}ms"
+             if s.error:
+                 line += f" ({s.error})"
+             lines.append(line)
+
+         lines.extend(
+             [
+                 "",
+                 "Cache Status",
+                 f" Location: {self.cache.location}",
+                 f" Size: {self.cache.size_bytes / 1024 / 1024:.2f} MB",
+                 f" Total records: {self.cache.total_records:,}",
+                 "",
+                 " By source:",
+             ]
+         )
+
+         for fonte, stats in self.cache.by_source.items():
+             count = stats.get("count", 0)
+             oldest = stats.get("oldest", "-")
+             newest = stats.get("newest", "-")
+             lines.append(f" {fonte.upper()}: {count:,} records ({oldest} to {newest})")
+
+         lines.extend(
+             [
+                 "",
+                 "Cache Expiry",
+             ]
+         )
+
+         for fonte, info in self.cache_expiry.items():
+             exp_type = info.get("type", "unknown")
+             if exp_type == "smart":
+                 lines.append(f" {fonte.upper()}: {info.get('description', '')}")
+             else:
+                 lines.append(f" {fonte.upper()}: TTL {info.get('ttl', 'unknown')}")
+
+         lines.extend(
+             [
+                 "",
+                 "Configuration",
+                 f" Browser fallback: {'enabled' if self.config.get('browser_fallback') else 'disabled'}",
+                 f" Alternative source: {'enabled' if self.config.get('alternative_source') else 'disabled'}",
+                 "",
+             ]
+         )
+
+         if self.overall_status == "healthy":
+             lines.append("[OK] All systems operational")
+         elif self.overall_status == "degraded":
+             lines.append("[WARN] System degraded - some sources unavailable")
+         else:
+             lines.append("[FAIL] System error - check source connectivity")
+
+         lines.append("")
+         return "\n".join(lines)
+
+
+ async def _check_source(name: str, url: str, timeout: float = 10.0) -> SourceStatus:
+     """Check connectivity of a source."""
+     start = time.perf_counter()
+
+     try:
+         async with httpx.AsyncClient(timeout=timeout) as http_client:
+             response = await http_client.head(url, follow_redirects=True)
+             latency_ms = int((time.perf_counter() - start) * 1000)
+
+             if response.status_code < 400:
+                 status = "ok" if latency_ms < 2000 else "slow"
+                 return SourceStatus(name, url, status, latency_ms)
+
+             return SourceStatus(
+                 name,
+                 url,
+                 "error",
+                 latency_ms,
+                 error=f"HTTP {response.status_code}",
+             )
+
+     except httpx.TimeoutException:
+         latency_ms = int((time.perf_counter() - start) * 1000)
+         return SourceStatus(name, url, "error", latency_ms, error="timeout")
+
+     except httpx.ConnectError as e:
+         latency_ms = int((time.perf_counter() - start) * 1000)
+         return SourceStatus(name, url, "error", latency_ms, error=f"connection error: {e}")
+
+     except Exception as e:
+         latency_ms = int((time.perf_counter() - start) * 1000)
+         return SourceStatus(name, url, "error", latency_ms, error=str(e))
+
+
+ def _get_cache_stats() -> CacheStats:
+     """Collect cache statistics."""
+     try:
+         store = get_store()
+         cache_path = Path(store.db_path)
+         size_bytes = cache_path.stat().st_size if cache_path.exists() else 0
+
+         by_source: dict[str, dict[str, Any]] = {}
+         conn = store._get_conn()
+
+         for fonte in ["cepea", "conab", "ibge"]:
+             try:
+                 result = conn.execute(
+                     """
+                     SELECT COUNT(*), MIN(data), MAX(data)
+                     FROM indicadores
+                     WHERE LOWER(fonte) = ?
+                     """,
+                     [fonte],
+                 ).fetchone()
+
+                 if result and result[0] > 0:
+                     by_source[fonte] = {
+                         "count": result[0],
+                         "oldest": str(result[1]) if result[1] else None,
+                         "newest": str(result[2]) if result[2] else None,
+                     }
+             except Exception:
+                 pass
+
+         total_records = sum(s.get("count", 0) for s in by_source.values())
+
+         return CacheStats(
+             location=str(cache_path),
+             size_bytes=size_bytes,
+             total_records=total_records,
+             by_source=by_source,
+         )
+
+     except Exception as e:
+         logger.warning("cache_stats_failed", error=str(e))
+         return CacheStats(
+             location="unknown",
+             size_bytes=0,
+             total_records=0,
+             by_source={},
+         )
+
+
+ def _get_last_collections() -> dict[str, datetime | None]:
+     """Get the timestamp of the last collection per source."""
+     collections: dict[str, datetime | None] = {}
+
+     try:
+         store = get_store()
+         conn = store._get_conn()
+
+         for fonte in ["cepea", "conab", "ibge"]:
+             try:
+                 result = conn.execute(
+                     """
+                     SELECT MAX(collected_at)
+                     FROM indicadores
+                     WHERE LOWER(fonte) = ?
+                     """,
+                     [fonte],
+                 ).fetchone()
+
+                 collections[fonte] = result[0] if result and result[0] else None
+
+             except Exception:
+                 collections[fonte] = None
+
+     except Exception:
+         pass
+
+     return collections
+
+
+ async def run_diagnostics(verbose: bool = False) -> DiagnosticsResult:  # noqa: ARG001
+     """
+     Run a full system diagnostic.
+
+     Args:
+         verbose: If True, include detailed information (reserved for future use)
+
+     Returns:
+         DiagnosticsResult with the complete status
+     """
+     sources_to_check = [
+         ("CEPEA (Noticias Agricolas)", "https://www.noticiasagricolas.com.br"),
+         ("CONAB", "https://www.conab.gov.br"),
+         ("IBGE/SIDRA", "https://sidra.ibge.gov.br"),
+     ]
+
+     source_tasks = [_check_source(name, url) for name, url in sources_to_check]
+     sources = await asyncio.gather(*source_tasks)
+
+     cache = _get_cache_stats()
+
+     cache_expiry = {}
+     for fonte in ["cepea", "conab", "ibge"]:
+         cache_expiry[fonte] = get_next_update_info(fonte)
+
+     last_collections = _get_last_collections()
+
+     error_count = sum(1 for s in sources if s.status == "error")
+     if error_count == len(sources):
+         overall_status = "error"
+     elif error_count > 0:
+         overall_status = "degraded"
+     else:
+         overall_status = "healthy"
+
+     return DiagnosticsResult(
+         version=__version__,
+         timestamp=datetime.now(),
+         sources=list(sources),
+         cache=cache,
+         last_collections=last_collections,
+         cache_expiry=cache_expiry,
+         config={
+             "browser_fallback": False,
+             "alternative_source": True,
+         },
+         overall_status=overall_status,
+     )
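
A minimal sketch of driving the new diagnostics from user code, assuming the 0.5.0 wheel is installed. Note that the checks hit the live source URLs, and the cache stats fall back to zeros when no DuckDB cache exists:

    import asyncio
    import json

    from agrobr.health import run_diagnostics

    # run_diagnostics is async; asyncio.run drives the three HEAD checks concurrently.
    result = asyncio.run(run_diagnostics())

    # Human-readable report: the [OK]/[SLOW]/[FAIL] connectivity table plus cache info.
    print(result.to_rich())

    # JSON-serializable form, e.g. for shipping to a monitoring endpoint.
    print(json.dumps(result.to_dict(), indent=2))
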
agrobr/http/browser.py CHANGED
@@ -15,7 +15,6 @@ from agrobr.http.user_agents import UserAgentRotator

  logger = structlog.get_logger()

- # Singleton so the browser can be reused
  _playwright: Playwright | None = None
  _browser: Browser | None = None
  _lock = asyncio.Lock()
@@ -64,7 +63,6 @@ async def get_page() -> AsyncGenerator[Page, None]:
      """Context manager to obtain a browser page."""
      browser = await _get_browser()

-     # Create a context with a realistic fingerprint
      ua = UserAgentRotator.get_random()
      context = await browser.new_context(
          user_agent=ua,
@@ -78,7 +76,6 @@ async def get_page() -> AsyncGenerator[Page, None]:

      page = await context.new_page()

-     # Hide automation signals
      await page.add_init_script(
          """
          Object.defineProperty(navigator, 'webdriver', {
@@ -124,7 +121,6 @@ async def fetch_with_browser(

      try:
          async with get_page() as page:
-             # Navigate to the URL
              response = await page.goto(
                  url,
                  wait_until="domcontentloaded",
@@ -138,7 +134,6 @@
                  last_error="No response received",
              )

-             # Wait for the specific selector, if one was given
              if wait_selector:
                  try:
                      await page.wait_for_selector(
@@ -152,13 +147,10 @@
                      error=str(e),
                  )

-             # Wait for Cloudflare to resolve and for JS to finish
              await page.wait_for_timeout(5000)

-             # Check whether Cloudflare blocked the request
              if response.status in (403, 503):
                  check_html: str = await page.content()
-                 # Detect a Cloudflare challenge page
                  if "cloudflare" in check_html.lower() or "challenge" in check_html.lower():
                      raise SourceUnavailableError(
                          source=source,
@@ -166,7 +158,6 @@
                          last_error=f"Cloudflare block detected (status {response.status})",
                      )

-             # Get the HTML
              html: str = await page.content()

              logger.info(