agrobr 0.1.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agrobr/__init__.py +3 -2
- agrobr/benchmark/__init__.py +343 -0
- agrobr/cache/policies.py +99 -17
- agrobr/cepea/api.py +87 -30
- agrobr/cepea/client.py +1 -8
- agrobr/cli.py +141 -5
- agrobr/conab/api.py +72 -6
- agrobr/config.py +137 -0
- agrobr/constants.py +1 -2
- agrobr/contracts/__init__.py +186 -0
- agrobr/contracts/cepea.py +80 -0
- agrobr/contracts/conab.py +181 -0
- agrobr/contracts/ibge.py +146 -0
- agrobr/export.py +251 -0
- agrobr/health/__init__.py +10 -0
- agrobr/health/doctor.py +321 -0
- agrobr/http/browser.py +0 -9
- agrobr/ibge/api.py +104 -25
- agrobr/ibge/client.py +5 -20
- agrobr/models.py +100 -1
- agrobr/noticias_agricolas/client.py +0 -7
- agrobr/noticias_agricolas/parser.py +0 -17
- agrobr/plugins/__init__.py +205 -0
- agrobr/quality.py +319 -0
- agrobr/sla.py +249 -0
- agrobr/snapshots.py +321 -0
- agrobr/stability.py +148 -0
- agrobr/validators/semantic.py +447 -0
- {agrobr-0.1.0.dist-info → agrobr-0.5.0.dist-info}/METADATA +12 -12
- {agrobr-0.1.0.dist-info → agrobr-0.5.0.dist-info}/RECORD +33 -19
- {agrobr-0.1.0.dist-info → agrobr-0.5.0.dist-info}/WHEEL +0 -0
- {agrobr-0.1.0.dist-info → agrobr-0.5.0.dist-info}/entry_points.txt +0 -0
- {agrobr-0.1.0.dist-info → agrobr-0.5.0.dist-info}/licenses/LICENSE +0 -0
agrobr/health/doctor.py
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
"""Diagnóstico completo do sistema agrobr."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import time
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import httpx
|
|
13
|
+
import structlog
|
|
14
|
+
|
|
15
|
+
from agrobr import __version__
|
|
16
|
+
from agrobr.cache.duckdb_store import get_store
|
|
17
|
+
from agrobr.cache.policies import get_next_update_info
|
|
18
|
+
|
|
19
|
+
logger = structlog.get_logger()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class SourceStatus:
    """Connectivity status of a single data source."""

    name: str  # human-readable source name, e.g. "CONAB"
    url: str  # base URL that was probed with an HTTP HEAD request
    status: str  # "ok", "slow" or "error" (assigned by _check_source)
    latency_ms: int  # round-trip time of the probe, in milliseconds
    error: str | None = None  # short error description when status == "error"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class CacheStats:
    """Aggregate statistics about the local cache database."""

    location: str  # filesystem path of the cache database file (or "unknown")
    size_bytes: int  # size of the database file on disk, in bytes
    total_records: int  # sum of per-source record counts
    # Per-source stats keyed by source name, each value a dict with
    # "count" (int), "oldest" (str | None) and "newest" (str | None).
    by_source: dict[str, dict[str, Any]] = field(default_factory=dict)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class DiagnosticsResult:
    """Full diagnostics report: source connectivity, cache state and config."""

    version: str  # agrobr package version
    timestamp: datetime  # when the diagnostic was run
    sources: list[SourceStatus]  # connectivity result per probed source
    cache: CacheStats  # local cache statistics
    last_collections: dict[str, datetime | None]  # latest collection per source
    cache_expiry: dict[str, dict[str, str]]  # expiry policy info per source
    config: dict[str, Any]  # relevant runtime configuration flags
    overall_status: str  # "healthy", "degraded" or "error"

    def to_dict(self) -> dict[str, Any]:
        """Serialize the report into plain JSON-compatible types."""

        def _source_entry(src: SourceStatus) -> dict[str, Any]:
            # Flatten one SourceStatus into a plain dict.
            return {
                "name": src.name,
                "url": src.url,
                "status": src.status,
                "latency_ms": src.latency_ms,
                "error": src.error,
            }

        collections = {
            key: (when.isoformat() if when else None)
            for key, when in self.last_collections.items()
        }
        return {
            "version": self.version,
            "timestamp": self.timestamp.isoformat(),
            "sources": [_source_entry(src) for src in self.sources],
            "cache": {
                "location": self.cache.location,
                "size_mb": round(self.cache.size_bytes / 1024 / 1024, 2),
                "total_records": self.cache.total_records,
                "by_source": self.cache.by_source,
            },
            "last_collections": collections,
            "cache_expiry": self.cache_expiry,
            "config": self.config,
            "overall_status": self.overall_status,
        }

    def to_rich(self) -> str:
        """Render the report as plain text for terminal output."""
        out: list[str] = [
            "",
            f"agrobr diagnostics v{self.version}",
            "=" * 50,
            "",
            "Sources Connectivity",
        ]

        # Map known statuses to their display icons; anything else is a failure.
        icons = {"ok": "[OK]", "slow": "[SLOW]"}
        for src in self.sources:
            icon = icons.get(src.status, "[FAIL]")
            entry = f" {icon} {src.name:<35} {src.latency_ms:>5}ms"
            if src.error:
                entry += f" ({src.error})"
            out.append(entry)

        out += [
            "",
            "Cache Status",
            f" Location: {self.cache.location}",
            f" Size: {self.cache.size_bytes / 1024 / 1024:.2f} MB",
            f" Total records: {self.cache.total_records:,}",
            "",
            " By source:",
        ]

        for src_name, stats in self.cache.by_source.items():
            record_count = stats.get("count", 0)
            oldest = stats.get("oldest", "-")
            newest = stats.get("newest", "-")
            out.append(f" {src_name.upper()}: {record_count:,} records ({oldest} to {newest})")

        out += ["", "Cache Expiry"]

        for src_name, info in self.cache_expiry.items():
            if info.get("type", "unknown") == "smart":
                out.append(f" {src_name.upper()}: {info.get('description', '')}")
            else:
                out.append(f" {src_name.upper()}: TTL {info.get('ttl', 'unknown')}")

        browser = "enabled" if self.config.get("browser_fallback") else "disabled"
        alternative = "enabled" if self.config.get("alternative_source") else "disabled"
        out += [
            "",
            "Configuration",
            f" Browser fallback: {browser}",
            f" Alternative source: {alternative}",
            "",
        ]

        summary = {
            "healthy": "[OK] All systems operational",
            "degraded": "[WARN] System degraded - some sources unavailable",
        }
        out.append(
            summary.get(self.overall_status, "[FAIL] System error - check source connectivity")
        )

        out.append("")
        return "\n".join(out)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
async def _check_source(name: str, url: str, timeout: float = 10.0) -> SourceStatus:
    """Probe a source URL with an HTTP HEAD request and report its health.

    Args:
        name: Human-readable source name.
        url: URL to probe.
        timeout: Request timeout in seconds.

    Returns:
        SourceStatus with status "ok" (latency under 2s), "slow" (2s or
        more) or "error" (HTTP >= 400, timeout, or connection failure).
    """
    started = time.perf_counter()

    def _elapsed_ms() -> int:
        # Milliseconds elapsed since the probe started.
        return int((time.perf_counter() - started) * 1000)

    try:
        async with httpx.AsyncClient(timeout=timeout) as probe:
            resp = await probe.head(url, follow_redirects=True)
            elapsed = _elapsed_ms()

            if resp.status_code >= 400:
                return SourceStatus(
                    name,
                    url,
                    "error",
                    elapsed,
                    error=f"HTTP {resp.status_code}",
                )

            return SourceStatus(name, url, "slow" if elapsed >= 2000 else "ok", elapsed)

    except httpx.TimeoutException:
        return SourceStatus(name, url, "error", _elapsed_ms(), error="timeout")

    except httpx.ConnectError as exc:
        return SourceStatus(name, url, "error", _elapsed_ms(), error=f"connection error: {exc}")

    except Exception as exc:
        # Catch-all so a single broken source never crashes the diagnostic.
        return SourceStatus(name, url, "error", _elapsed_ms(), error=str(exc))
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _get_cache_stats() -> CacheStats:
    """Collect size and per-source record statistics from the cache database.

    Returns:
        CacheStats for the local store; falls back to an empty CacheStats
        with location "unknown" if the store cannot be inspected at all.
    """
    try:
        store = get_store()
        cache_path = Path(store.db_path)
        size_bytes = cache_path.stat().st_size if cache_path.exists() else 0

        by_source: dict[str, dict[str, Any]] = {}
        # NOTE(review): reaches into the store's private connection via
        # _get_conn() — assumes the store keeps exposing it; confirm API.
        conn = store._get_conn()

        for fonte in ["cepea", "conab", "ibge"]:
            try:
                result = conn.execute(
                    """
                    SELECT COUNT(*), MIN(data), MAX(data)
                    FROM indicadores
                    WHERE LOWER(fonte) = ?
                    """,
                    [fonte],
                ).fetchone()

                # Only report sources that actually have cached records.
                if result and result[0] > 0:
                    by_source[fonte] = {
                        "count": result[0],
                        "oldest": str(result[1]) if result[1] else None,
                        "newest": str(result[2]) if result[2] else None,
                    }
            except Exception:
                # Best-effort: a failing query for one source must not break
                # the whole diagnostic; that source is simply omitted.
                pass

        total_records = sum(s.get("count", 0) for s in by_source.values())

        return CacheStats(
            location=str(cache_path),
            size_bytes=size_bytes,
            total_records=total_records,
            by_source=by_source,
        )

    except Exception as e:
        # Store unavailable (e.g. no database yet): log and return empty stats.
        logger.warning("cache_stats_failed", error=str(e))
        return CacheStats(
            location="unknown",
            size_bytes=0,
            total_records=0,
            by_source={},
        )
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _get_last_collections() -> dict[str, datetime | None]:
    """Return the most recent collection timestamp per source.

    Returns:
        Mapping of source name ("cepea", "conab", "ibge") to the latest
        collected_at value found in the cache, or None when unavailable.
        May be empty if the store itself cannot be opened.
    """
    collections: dict[str, datetime | None] = {}

    try:
        store = get_store()
        # NOTE(review): uses the store's private _get_conn() — confirm API.
        conn = store._get_conn()

        for fonte in ["cepea", "conab", "ibge"]:
            try:
                result = conn.execute(
                    """
                    SELECT MAX(collected_at)
                    FROM indicadores
                    WHERE LOWER(fonte) = ?
                    """,
                    [fonte],
                ).fetchone()

                collections[fonte] = result[0] if result and result[0] else None

            except Exception:
                # Per-source best-effort: report None instead of failing.
                collections[fonte] = None

    except Exception:
        # Store unavailable: return whatever was gathered (possibly empty).
        pass

    return collections
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
async def run_diagnostics(verbose: bool = False) -> DiagnosticsResult:  # noqa: ARG001
    """
    Run a full system diagnostic.

    Probes every source concurrently, gathers cache statistics, expiry
    policies and last-collection timestamps, and derives an overall status.

    Args:
        verbose: If True, include detailed information (reserved for future use)

    Returns:
        DiagnosticsResult with the complete system status
    """
    targets = [
        ("CEPEA (Noticias Agricolas)", "https://www.noticiasagricolas.com.br"),
        ("CONAB", "https://www.conab.gov.br"),
        ("IBGE/SIDRA", "https://sidra.ibge.gov.br"),
    ]

    # Probe all sources concurrently; _check_source never raises.
    sources = await asyncio.gather(*(_check_source(n, u) for n, u in targets))

    cache = _get_cache_stats()
    cache_expiry = {fonte: get_next_update_info(fonte) for fonte in ("cepea", "conab", "ibge")}
    last_collections = _get_last_collections()

    # Overall health: all failing -> error, any failing -> degraded.
    failures = [s for s in sources if s.status == "error"]
    if len(failures) == len(sources):
        overall = "error"
    elif failures:
        overall = "degraded"
    else:
        overall = "healthy"

    return DiagnosticsResult(
        version=__version__,
        timestamp=datetime.now(),
        sources=list(sources),
        cache=cache,
        last_collections=last_collections,
        cache_expiry=cache_expiry,
        config={
            "browser_fallback": False,
            "alternative_source": True,
        },
        overall_status=overall,
    )
|
agrobr/http/browser.py
CHANGED
|
@@ -15,7 +15,6 @@ from agrobr.http.user_agents import UserAgentRotator
|
|
|
15
15
|
|
|
16
16
|
logger = structlog.get_logger()
|
|
17
17
|
|
|
18
|
-
# Singleton para reutilizar browser
|
|
19
18
|
_playwright: Playwright | None = None
|
|
20
19
|
_browser: Browser | None = None
|
|
21
20
|
_lock = asyncio.Lock()
|
|
@@ -64,7 +63,6 @@ async def get_page() -> AsyncGenerator[Page, None]:
|
|
|
64
63
|
"""Context manager para obter uma página do browser."""
|
|
65
64
|
browser = await _get_browser()
|
|
66
65
|
|
|
67
|
-
# Cria contexto com fingerprint realista
|
|
68
66
|
ua = UserAgentRotator.get_random()
|
|
69
67
|
context = await browser.new_context(
|
|
70
68
|
user_agent=ua,
|
|
@@ -78,7 +76,6 @@ async def get_page() -> AsyncGenerator[Page, None]:
|
|
|
78
76
|
|
|
79
77
|
page = await context.new_page()
|
|
80
78
|
|
|
81
|
-
# Esconde sinais de automação
|
|
82
79
|
await page.add_init_script(
|
|
83
80
|
"""
|
|
84
81
|
Object.defineProperty(navigator, 'webdriver', {
|
|
@@ -124,7 +121,6 @@ async def fetch_with_browser(
|
|
|
124
121
|
|
|
125
122
|
try:
|
|
126
123
|
async with get_page() as page:
|
|
127
|
-
# Navega para a URL
|
|
128
124
|
response = await page.goto(
|
|
129
125
|
url,
|
|
130
126
|
wait_until="domcontentloaded",
|
|
@@ -138,7 +134,6 @@ async def fetch_with_browser(
|
|
|
138
134
|
last_error="No response received",
|
|
139
135
|
)
|
|
140
136
|
|
|
141
|
-
# Aguarda seletor específico se fornecido
|
|
142
137
|
if wait_selector:
|
|
143
138
|
try:
|
|
144
139
|
await page.wait_for_selector(
|
|
@@ -152,13 +147,10 @@ async def fetch_with_browser(
|
|
|
152
147
|
error=str(e),
|
|
153
148
|
)
|
|
154
149
|
|
|
155
|
-
# Aguarda Cloudflare resolver e JS terminar
|
|
156
150
|
await page.wait_for_timeout(5000)
|
|
157
151
|
|
|
158
|
-
# Verifica se foi bloqueado pelo Cloudflare
|
|
159
152
|
if response.status in (403, 503):
|
|
160
153
|
check_html: str = await page.content()
|
|
161
|
-
# Detecta página de challenge do Cloudflare
|
|
162
154
|
if "cloudflare" in check_html.lower() or "challenge" in check_html.lower():
|
|
163
155
|
raise SourceUnavailableError(
|
|
164
156
|
source=source,
|
|
@@ -166,7 +158,6 @@ async def fetch_with_browser(
|
|
|
166
158
|
last_error=f"Cloudflare block detected (status {response.status})",
|
|
167
159
|
)
|
|
168
160
|
|
|
169
|
-
# Obtém HTML
|
|
170
161
|
html: str = await page.content()
|
|
171
162
|
|
|
172
163
|
logger.info(
|
agrobr/ibge/api.py
CHANGED
|
@@ -2,16 +2,22 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
import time
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from typing import Literal, overload
|
|
6
8
|
|
|
7
9
|
import pandas as pd
|
|
8
10
|
import structlog
|
|
9
11
|
|
|
12
|
+
from agrobr import constants
|
|
13
|
+
from agrobr.cache.policies import calculate_expiry
|
|
10
14
|
from agrobr.ibge import client
|
|
15
|
+
from agrobr.models import MetaInfo
|
|
11
16
|
|
|
12
17
|
logger = structlog.get_logger()
|
|
13
18
|
|
|
14
19
|
|
|
20
|
+
@overload
|
|
15
21
|
async def pam(
|
|
16
22
|
produto: str,
|
|
17
23
|
ano: int | str | list[int] | None = None,
|
|
@@ -19,7 +25,33 @@ async def pam(
|
|
|
19
25
|
nivel: Literal["brasil", "uf", "municipio"] = "uf",
|
|
20
26
|
variaveis: list[str] | None = None,
|
|
21
27
|
as_polars: bool = False,
|
|
22
|
-
|
|
28
|
+
*,
|
|
29
|
+
return_meta: Literal[False] = False,
|
|
30
|
+
) -> pd.DataFrame: ...
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@overload
|
|
34
|
+
async def pam(
|
|
35
|
+
produto: str,
|
|
36
|
+
ano: int | str | list[int] | None = None,
|
|
37
|
+
uf: str | None = None,
|
|
38
|
+
nivel: Literal["brasil", "uf", "municipio"] = "uf",
|
|
39
|
+
variaveis: list[str] | None = None,
|
|
40
|
+
as_polars: bool = False,
|
|
41
|
+
*,
|
|
42
|
+
return_meta: Literal[True],
|
|
43
|
+
) -> tuple[pd.DataFrame, MetaInfo]: ...
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
async def pam(
|
|
47
|
+
produto: str,
|
|
48
|
+
ano: int | str | list[int] | None = None,
|
|
49
|
+
uf: str | None = None,
|
|
50
|
+
nivel: Literal["brasil", "uf", "municipio"] = "uf",
|
|
51
|
+
variaveis: list[str] | None = None,
|
|
52
|
+
as_polars: bool = False,
|
|
53
|
+
return_meta: bool = False,
|
|
54
|
+
) -> pd.DataFrame | tuple[pd.DataFrame, MetaInfo]:
|
|
23
55
|
"""
|
|
24
56
|
Obtém dados da Produção Agrícola Municipal (PAM).
|
|
25
57
|
|
|
@@ -30,14 +62,22 @@ async def pam(
|
|
|
30
62
|
nivel: Nível territorial ("brasil", "uf", "municipio")
|
|
31
63
|
variaveis: Lista de variáveis (area_plantada, area_colhida, producao, rendimento)
|
|
32
64
|
as_polars: Se True, retorna polars.DataFrame
|
|
65
|
+
return_meta: Se True, retorna tupla (DataFrame, MetaInfo)
|
|
33
66
|
|
|
34
67
|
Returns:
|
|
35
|
-
DataFrame com dados da PAM
|
|
68
|
+
DataFrame com dados da PAM ou tupla (DataFrame, MetaInfo)
|
|
36
69
|
|
|
37
70
|
Example:
|
|
38
71
|
>>> df = await ibge.pam('soja', ano=2023)
|
|
39
|
-
>>> df = await ibge.pam('milho', ano=[2020, 2021, 2022], uf='MT')
|
|
72
|
+
>>> df, meta = await ibge.pam('milho', ano=[2020, 2021, 2022], uf='MT', return_meta=True)
|
|
40
73
|
"""
|
|
74
|
+
fetch_start = time.perf_counter()
|
|
75
|
+
meta = MetaInfo(
|
|
76
|
+
source="ibge_pam",
|
|
77
|
+
source_url="https://sidra.ibge.gov.br",
|
|
78
|
+
source_method="httpx",
|
|
79
|
+
fetched_at=datetime.now(),
|
|
80
|
+
)
|
|
41
81
|
logger.info(
|
|
42
82
|
"ibge_pam_request",
|
|
43
83
|
produto=produto,
|
|
@@ -46,7 +86,6 @@ async def pam(
|
|
|
46
86
|
nivel=nivel,
|
|
47
87
|
)
|
|
48
88
|
|
|
49
|
-
# Mapeia produto para código SIDRA
|
|
50
89
|
produto_lower = produto.lower()
|
|
51
90
|
if produto_lower not in client.PRODUTOS_PAM:
|
|
52
91
|
raise ValueError(
|
|
@@ -55,7 +94,6 @@ async def pam(
|
|
|
55
94
|
|
|
56
95
|
produto_cod = client.PRODUTOS_PAM[produto_lower]
|
|
57
96
|
|
|
58
|
-
# Mapeia variáveis
|
|
59
97
|
if variaveis is None:
|
|
60
98
|
variaveis = ["area_plantada", "area_colhida", "producao", "rendimento"]
|
|
61
99
|
|
|
@@ -66,7 +104,6 @@ async def pam(
|
|
|
66
104
|
else:
|
|
67
105
|
logger.warning(f"Variável desconhecida: {var}")
|
|
68
106
|
|
|
69
|
-
# Mapeia nível territorial
|
|
70
107
|
nivel_map = {
|
|
71
108
|
"brasil": "1",
|
|
72
109
|
"uf": "3",
|
|
@@ -74,12 +111,10 @@ async def pam(
|
|
|
74
111
|
}
|
|
75
112
|
territorial_level = nivel_map.get(nivel, "3")
|
|
76
113
|
|
|
77
|
-
# Define código territorial
|
|
78
114
|
ibge_code = "all"
|
|
79
115
|
if uf and nivel in ("uf", "municipio"):
|
|
80
116
|
ibge_code = client.uf_to_ibge_code(uf)
|
|
81
117
|
|
|
82
|
-
# Define período
|
|
83
118
|
if ano is None:
|
|
84
119
|
period = "last"
|
|
85
120
|
elif isinstance(ano, list):
|
|
@@ -87,7 +122,6 @@ async def pam(
|
|
|
87
122
|
else:
|
|
88
123
|
period = str(ano)
|
|
89
124
|
|
|
90
|
-
# Busca dados
|
|
91
125
|
df = await client.fetch_sidra(
|
|
92
126
|
table_code=client.TABELAS["pam_nova"],
|
|
93
127
|
territorial_level=territorial_level,
|
|
@@ -97,10 +131,8 @@ async def pam(
|
|
|
97
131
|
classifications={"782": produto_cod},
|
|
98
132
|
)
|
|
99
133
|
|
|
100
|
-
# Processa resposta
|
|
101
134
|
df = client.parse_sidra_response(df)
|
|
102
135
|
|
|
103
|
-
# Pivota para ter variáveis como colunas
|
|
104
136
|
if "variavel" in df.columns and "valor" in df.columns:
|
|
105
137
|
df_pivot = df.pivot_table(
|
|
106
138
|
index=["localidade", "ano"] if "localidade" in df.columns else ["ano"],
|
|
@@ -109,7 +141,6 @@ async def pam(
|
|
|
109
141
|
aggfunc="first",
|
|
110
142
|
).reset_index()
|
|
111
143
|
|
|
112
|
-
# Renomeia colunas para nomes mais simples
|
|
113
144
|
rename_map = {
|
|
114
145
|
"Área plantada": "area_plantada",
|
|
115
146
|
"Área colhida": "area_colhida",
|
|
@@ -123,11 +154,20 @@ async def pam(
|
|
|
123
154
|
df["produto"] = produto_lower
|
|
124
155
|
df["fonte"] = "ibge_pam"
|
|
125
156
|
|
|
157
|
+
meta.fetch_duration_ms = int((time.perf_counter() - fetch_start) * 1000)
|
|
158
|
+
meta.records_count = len(df)
|
|
159
|
+
meta.columns = df.columns.tolist()
|
|
160
|
+
meta.cache_key = f"ibge:pam:{produto}:{ano}"
|
|
161
|
+
meta.cache_expires_at = calculate_expiry(constants.Fonte.IBGE, "pam")
|
|
162
|
+
|
|
126
163
|
if as_polars:
|
|
127
164
|
try:
|
|
128
165
|
import polars as pl
|
|
129
166
|
|
|
130
|
-
|
|
167
|
+
result_df = pl.from_pandas(df)
|
|
168
|
+
if return_meta:
|
|
169
|
+
return result_df, meta # type: ignore[return-value,no-any-return]
|
|
170
|
+
return result_df # type: ignore[return-value,no-any-return]
|
|
131
171
|
except ImportError:
|
|
132
172
|
logger.warning("polars_not_installed", fallback="pandas")
|
|
133
173
|
|
|
@@ -137,16 +177,43 @@ async def pam(
|
|
|
137
177
|
records=len(df),
|
|
138
178
|
)
|
|
139
179
|
|
|
180
|
+
if return_meta:
|
|
181
|
+
return df, meta
|
|
140
182
|
return df
|
|
141
183
|
|
|
142
184
|
|
|
185
|
+
@overload
|
|
186
|
+
async def lspa(
|
|
187
|
+
produto: str,
|
|
188
|
+
ano: int | str | None = None,
|
|
189
|
+
mes: int | str | None = None,
|
|
190
|
+
uf: str | None = None,
|
|
191
|
+
as_polars: bool = False,
|
|
192
|
+
*,
|
|
193
|
+
return_meta: Literal[False] = False,
|
|
194
|
+
) -> pd.DataFrame: ...
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
@overload
|
|
143
198
|
async def lspa(
|
|
144
199
|
produto: str,
|
|
145
200
|
ano: int | str | None = None,
|
|
146
201
|
mes: int | str | None = None,
|
|
147
202
|
uf: str | None = None,
|
|
148
203
|
as_polars: bool = False,
|
|
149
|
-
|
|
204
|
+
*,
|
|
205
|
+
return_meta: Literal[True],
|
|
206
|
+
) -> tuple[pd.DataFrame, MetaInfo]: ...
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
async def lspa(
|
|
210
|
+
produto: str,
|
|
211
|
+
ano: int | str | None = None,
|
|
212
|
+
mes: int | str | None = None,
|
|
213
|
+
uf: str | None = None,
|
|
214
|
+
as_polars: bool = False,
|
|
215
|
+
return_meta: bool = False,
|
|
216
|
+
) -> pd.DataFrame | tuple[pd.DataFrame, MetaInfo]:
|
|
150
217
|
"""
|
|
151
218
|
Obtém dados do Levantamento Sistemático da Produção Agrícola (LSPA).
|
|
152
219
|
|
|
@@ -158,14 +225,22 @@ async def lspa(
|
|
|
158
225
|
mes: Mês de referência (1-12). Se None, retorna todos os meses do ano.
|
|
159
226
|
uf: Filtrar por UF (ex: "MT", "PR")
|
|
160
227
|
as_polars: Se True, retorna polars.DataFrame
|
|
228
|
+
return_meta: Se True, retorna tupla (DataFrame, MetaInfo)
|
|
161
229
|
|
|
162
230
|
Returns:
|
|
163
|
-
DataFrame com estimativas LSPA
|
|
231
|
+
DataFrame com estimativas LSPA ou tupla (DataFrame, MetaInfo)
|
|
164
232
|
|
|
165
233
|
Example:
|
|
166
234
|
>>> df = await ibge.lspa('soja', ano=2024)
|
|
167
|
-
>>> df = await ibge.lspa('milho_1', ano=2024, mes=6, uf='PR')
|
|
235
|
+
>>> df, meta = await ibge.lspa('milho_1', ano=2024, mes=6, uf='PR', return_meta=True)
|
|
168
236
|
"""
|
|
237
|
+
fetch_start = time.perf_counter()
|
|
238
|
+
meta = MetaInfo(
|
|
239
|
+
source="ibge_lspa",
|
|
240
|
+
source_url="https://sidra.ibge.gov.br",
|
|
241
|
+
source_method="httpx",
|
|
242
|
+
fetched_at=datetime.now(),
|
|
243
|
+
)
|
|
169
244
|
logger.info(
|
|
170
245
|
"ibge_lspa_request",
|
|
171
246
|
produto=produto,
|
|
@@ -174,7 +249,6 @@ async def lspa(
|
|
|
174
249
|
uf=uf,
|
|
175
250
|
)
|
|
176
251
|
|
|
177
|
-
# Mapeia produto para código SIDRA
|
|
178
252
|
produto_lower = produto.lower()
|
|
179
253
|
if produto_lower not in client.PRODUTOS_LSPA:
|
|
180
254
|
raise ValueError(
|
|
@@ -183,20 +257,16 @@ async def lspa(
|
|
|
183
257
|
|
|
184
258
|
produto_cod = client.PRODUTOS_LSPA[produto_lower]
|
|
185
259
|
|
|
186
|
-
# Define período
|
|
187
260
|
if ano is None:
|
|
188
261
|
from datetime import date
|
|
189
262
|
|
|
190
263
|
ano = date.today().year
|
|
191
264
|
|
|
192
|
-
# Define período
|
|
193
265
|
period = f"{ano}{int(mes):02d}" if mes else ",".join(f"{ano}{m:02d}" for m in range(1, 13))
|
|
194
266
|
|
|
195
|
-
# Define nível territorial
|
|
196
267
|
territorial_level = "3" if uf else "1"
|
|
197
268
|
ibge_code = client.uf_to_ibge_code(uf) if uf else "all"
|
|
198
269
|
|
|
199
|
-
# Busca dados (não especifica variáveis - retorna todas)
|
|
200
270
|
df = await client.fetch_sidra(
|
|
201
271
|
table_code=client.TABELAS["lspa"],
|
|
202
272
|
territorial_level=territorial_level,
|
|
@@ -205,10 +275,8 @@ async def lspa(
|
|
|
205
275
|
classifications={"48": produto_cod},
|
|
206
276
|
)
|
|
207
277
|
|
|
208
|
-
# Processa resposta
|
|
209
278
|
df = client.parse_sidra_response(df)
|
|
210
279
|
|
|
211
|
-
# Adiciona período da consulta
|
|
212
280
|
df["ano"] = ano
|
|
213
281
|
if mes:
|
|
214
282
|
df["mes"] = mes
|
|
@@ -216,11 +284,20 @@ async def lspa(
|
|
|
216
284
|
df["produto"] = produto_lower
|
|
217
285
|
df["fonte"] = "ibge_lspa"
|
|
218
286
|
|
|
287
|
+
meta.fetch_duration_ms = int((time.perf_counter() - fetch_start) * 1000)
|
|
288
|
+
meta.records_count = len(df)
|
|
289
|
+
meta.columns = df.columns.tolist()
|
|
290
|
+
meta.cache_key = f"ibge:lspa:{produto}:{ano}:{mes}"
|
|
291
|
+
meta.cache_expires_at = calculate_expiry(constants.Fonte.IBGE, "lspa")
|
|
292
|
+
|
|
219
293
|
if as_polars:
|
|
220
294
|
try:
|
|
221
295
|
import polars as pl
|
|
222
296
|
|
|
223
|
-
|
|
297
|
+
result_df = pl.from_pandas(df)
|
|
298
|
+
if return_meta:
|
|
299
|
+
return result_df, meta # type: ignore[return-value,no-any-return]
|
|
300
|
+
return result_df # type: ignore[return-value,no-any-return]
|
|
224
301
|
except ImportError:
|
|
225
302
|
logger.warning("polars_not_installed", fallback="pandas")
|
|
226
303
|
|
|
@@ -230,6 +307,8 @@ async def lspa(
|
|
|
230
307
|
records=len(df),
|
|
231
308
|
)
|
|
232
309
|
|
|
310
|
+
if return_meta:
|
|
311
|
+
return df, meta
|
|
233
312
|
return df
|
|
234
313
|
|
|
235
314
|
|