PyPI - agrobr - Versions diffs - 0.1.0__py3-none-any.whl - Mend

agrobr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58)

agrobr/__init__.py +10 -0
agrobr/alerts/__init__.py +7 -0
agrobr/alerts/notifier.py +167 -0
agrobr/cache/__init__.py +31 -0
agrobr/cache/duckdb_store.py +433 -0
agrobr/cache/history.py +317 -0
agrobr/cache/migrations.py +82 -0
agrobr/cache/policies.py +240 -0
agrobr/cepea/__init__.py +7 -0
agrobr/cepea/api.py +360 -0
agrobr/cepea/client.py +273 -0
agrobr/cepea/parsers/__init__.py +37 -0
agrobr/cepea/parsers/base.py +35 -0
agrobr/cepea/parsers/consensus.py +300 -0
agrobr/cepea/parsers/detector.py +108 -0
agrobr/cepea/parsers/fingerprint.py +226 -0
agrobr/cepea/parsers/v1.py +305 -0
agrobr/cli.py +323 -0
agrobr/conab/__init__.py +21 -0
agrobr/conab/api.py +239 -0
agrobr/conab/client.py +219 -0
agrobr/conab/parsers/__init__.py +7 -0
agrobr/conab/parsers/v1.py +383 -0
agrobr/constants.py +205 -0
agrobr/exceptions.py +104 -0
agrobr/health/__init__.py +23 -0
agrobr/health/checker.py +202 -0
agrobr/health/reporter.py +314 -0
agrobr/http/__init__.py +9 -0
agrobr/http/browser.py +214 -0
agrobr/http/rate_limiter.py +69 -0
agrobr/http/retry.py +93 -0
agrobr/http/user_agents.py +67 -0
agrobr/ibge/__init__.py +19 -0
agrobr/ibge/api.py +273 -0
agrobr/ibge/client.py +256 -0
agrobr/models.py +85 -0
agrobr/normalize/__init__.py +64 -0
agrobr/normalize/dates.py +303 -0
agrobr/normalize/encoding.py +102 -0
agrobr/normalize/regions.py +308 -0
agrobr/normalize/units.py +278 -0
agrobr/noticias_agricolas/__init__.py +6 -0
agrobr/noticias_agricolas/client.py +222 -0
agrobr/noticias_agricolas/parser.py +187 -0
agrobr/sync.py +147 -0
agrobr/telemetry/__init__.py +17 -0
agrobr/telemetry/collector.py +153 -0
agrobr/utils/__init__.py +5 -0
agrobr/utils/logging.py +59 -0
agrobr/validators/__init__.py +35 -0
agrobr/validators/sanity.py +286 -0
agrobr/validators/structural.py +313 -0
agrobr-0.1.0.dist-info/METADATA +243 -0
agrobr-0.1.0.dist-info/RECORD +58 -0
agrobr-0.1.0.dist-info/WHEEL +4 -0
agrobr-0.1.0.dist-info/entry_points.txt +2 -0
agrobr-0.1.0.dist-info/licenses/LICENSE +21 -0

agrobr/cache/history.py ADDED Viewed

@@ -0,0 +1,317 @@
+"""
+Gerenciamento de histórico permanente.
+O histórico é separado do cache volátil e nunca expira automaticamente.
+Permite reconstruir séries históricas e auditar mudanças de parsing.
+"""
+from __future__ import annotations
+from datetime import date, datetime
+from pathlib import Path
+from typing import Any
+import structlog
+from ..constants import Fonte
+logger = structlog.get_logger()
+class HistoryManager:
+    """
+    Gerenciador de histórico permanente.
+    O histórico armazena dados imutáveis coletados ao longo do tempo,
+    permitindo:
+    - Reconstrução de séries históricas
+    - Auditoria de mudanças de parsing
+    - Fallback quando fonte está indisponível
+    """
+    def __init__(self, store: Any = None):
+        """
+        Inicializa o gerenciador.
+        Args:
+            store: DuckDBStore (opcional, usa singleton se não fornecido)
+        """
+        self._store = store
+    @property
+    def store(self) -> Any:
+        """Obtém store, criando se necessário."""
+        if self._store is None:
+            from .duckdb_store import get_store
+            self._store = get_store()
+        return self._store
+    def save(
+        self,
+        key: str,
+        data: bytes,
+        source: Fonte,
+        data_date: date,
+        parser_version: int,
+        fingerprint_hash: str | None = None,
+    ) -> bool:
+        """
+        Salva dados no histórico.
+        Args:
+            key: Chave identificadora (ex: 'cepea:soja')
+            data: Dados serializados
+            source: Fonte de dados
+            data_date: Data dos dados (não da coleta)
+            parser_version: Versão do parser usado
+            fingerprint_hash: Hash do fingerprint (opcional)
+        Returns:
+            True se salvo, False se já existia
+        """
+        try:
+            self.store.history_save(
+                key=key,
+                data=data,
+                source=source,
+                data_date=datetime.combine(data_date, datetime.min.time()),
+                parser_version=parser_version,
+                fingerprint_hash=fingerprint_hash,
+            )
+            logger.debug(
+                "history_saved",
+                key=key,
+                data_date=str(data_date),
+                parser_version=parser_version,
+            )
+            return True
+        except Exception as e:
+            logger.debug("history_save_skipped", key=key, reason=str(e))
+            return False
+    def get(
+        self,
+        key: str,
+        data_date: date | None = None,
+    ) -> bytes | None:
+        """
+        Busca dados no histórico.
+        Args:
+            key: Chave identificadora
+            data_date: Data específica (ou mais recente se None)
+        Returns:
+            Dados ou None
+        """
+        dt = datetime.combine(data_date, datetime.min.time()) if data_date else None
+        result: bytes | None = self.store.history_get(key, dt)
+        return result
+    def get_latest(self, key: str) -> bytes | None:
+        """
+        Busca dados mais recentes no histórico.
+        Args:
+            key: Chave identificadora
+        Returns:
+            Dados mais recentes ou None
+        """
+        return self.get(key, None)
+    def query(
+        self,
+        source: Fonte | None = None,
+        start_date: date | None = None,
+        end_date: date | None = None,
+        key_prefix: str | None = None,
+    ) -> list[dict[str, Any]]:
+        """
+        Consulta histórico com filtros.
+        Args:
+            source: Filtrar por fonte
+            start_date: Data inicial
+            end_date: Data final
+            key_prefix: Prefixo da chave
+        Returns:
+            Lista de metadados das entradas
+        """
+        start_dt = datetime.combine(start_date, datetime.min.time()) if start_date else None
+        end_dt = datetime.combine(end_date, datetime.max.time()) if end_date else None
+        entries: list[dict[str, Any]] = self.store.history_query(
+            source=source,
+            start_date=start_dt,
+            end_date=end_dt,
+        )
+        if key_prefix:
+            entries = [e for e in entries if e["key"].startswith(key_prefix)]
+        return entries
+    def get_dates(
+        self,
+        key: str,
+        start_date: date | None = None,
+        end_date: date | None = None,
+    ) -> list[date]:
+        """
+        Retorna datas disponíveis no histórico para uma chave.
+        Args:
+            key: Chave identificadora
+            start_date: Data inicial (opcional)
+            end_date: Data final (opcional)
+        Returns:
+            Lista de datas
+        """
+        entries = self.query(key_prefix=key, start_date=start_date, end_date=end_date)
+        dates = set()
+        for entry in entries:
+            if entry["key"] == key:
+                data_date = entry.get("data_date")
+                if data_date:
+                    if isinstance(data_date, datetime):
+                        dates.add(data_date.date())
+                    else:
+                        dates.add(data_date)
+        return sorted(dates)
+    def find_gaps(
+        self,
+        key: str,
+        start_date: date,
+        end_date: date,
+    ) -> list[date]:
+        """
+        Encontra lacunas no histórico.
+        Args:
+            key: Chave identificadora
+            start_date: Data inicial
+            end_date: Data final
+        Returns:
+            Lista de datas sem dados
+        """
+        available = set(self.get_dates(key, start_date, end_date))
+        all_dates = []
+        current = start_date
+        while current <= end_date:
+            if current.weekday() < 5:
+                all_dates.append(current)
+            current = current + __import__("datetime").timedelta(days=1)
+        return [d for d in all_dates if d not in available]
+    def count(
+        self,
+        source: Fonte | None = None,
+        key_prefix: str | None = None,
+    ) -> int:
+        """
+        Conta entradas no histórico.
+        Args:
+            source: Filtrar por fonte
+            key_prefix: Prefixo da chave
+        Returns:
+            Número de entradas
+        """
+        entries = self.query(source=source, key_prefix=key_prefix)
+        return len(entries)
+    def export(
+        self,
+        path: str | Path,
+        source: Fonte | None = None,
+        start_date: date | None = None,
+        end_date: date | None = None,
+        format: str = "parquet",
+    ) -> int:
+        """
+        Exporta histórico para arquivo.
+        Args:
+            path: Caminho do arquivo
+            source: Filtrar por fonte
+            start_date: Data inicial
+            end_date: Data final
+            format: Formato ('parquet', 'csv', 'json')
+        Returns:
+            Número de registros exportados
+        """
+        import pandas as pd
+        entries = self.query(source=source, start_date=start_date, end_date=end_date)
+        if not entries:
+            return 0
+        df = pd.DataFrame(entries)
+        path = Path(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        if format == "parquet":
+            df.to_parquet(path, index=False)
+        elif format == "csv":
+            df.to_csv(path, index=False)
+        elif format == "json":
+            df.to_json(path, orient="records", date_format="iso")
+        else:
+            raise ValueError(f"Formato não suportado: {format}")
+        logger.info("history_exported", path=str(path), count=len(df), format=format)
+        return len(df)
+    def cleanup(
+        self,
+        older_than_days: int | None = None,
+        source: Fonte | None = None,
+    ) -> int:
+        """
+        Remove entradas antigas do histórico.
+        ATENÇÃO: Operação destrutiva! O histórico normalmente não deve ser limpo.
+        Args:
+            older_than_days: Remover entradas mais antigas que N dias
+            source: Filtrar por fonte
+        Returns:
+            Número de entradas removidas
+        """
+        if older_than_days is None:
+            logger.warning("history_cleanup_skipped", reason="no_age_specified")
+            return 0
+        logger.warning(
+            "history_cleanup_starting",
+            older_than_days=older_than_days,
+            source=source.value if source else "all",
+        )
+        return 0
+_history_manager: HistoryManager | None = None
+def get_history_manager() -> HistoryManager:
+    """Obtém instância singleton do HistoryManager."""
+    global _history_manager
+    if _history_manager is None:
+        _history_manager = HistoryManager()
+    return _history_manager

agrobr/cache/migrations.py ADDED Viewed

@@ -0,0 +1,82 @@
+"""Schema migrations para DuckDB."""
+from __future__ import annotations
+import contextlib
+from typing import TYPE_CHECKING
+import structlog
+if TYPE_CHECKING:
+    import duckdb
+logger = structlog.get_logger()
+# Highest schema version known to this module; migrate() applies every
+# version up to and including this one.
+SCHEMA_VERSION = 3
+# SQL script for each schema version, keyed by version number. A script may
+# hold several statements separated by ";" (migrate() splits on that character).
+MIGRATIONS: dict[int, str] = {
+    # v1: create the schema_version bookkeeping table and record version 1.
+    1: """
+        CREATE TABLE IF NOT EXISTS schema_version (
+            version INTEGER PRIMARY KEY,
+            applied_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+        );
+        INSERT OR IGNORE INTO schema_version (version) VALUES (1);
+    """,
+    # v2: add the hit_count and stale columns to cache_entries.
+    # NOTE(review): cache_entries is not created by any script here —
+    # presumably it is created elsewhere before migrate() runs; confirm.
+    2: """
+        ALTER TABLE cache_entries ADD COLUMN IF NOT EXISTS hit_count INTEGER DEFAULT 0;
+        ALTER TABLE cache_entries ADD COLUMN IF NOT EXISTS stale BOOLEAN DEFAULT FALSE;
+    """,
+    # v3: index history_entries by (key, data_date) and by parser_version.
+    # NOTE(review): history_entries is likewise not created here; confirm.
+    3: """
+        CREATE INDEX IF NOT EXISTS idx_history_key_date ON history_entries(key, data_date);
+        CREATE INDEX IF NOT EXISTS idx_history_parser ON history_entries(parser_version);
+    """,
+}
+def get_current_version(conn: duckdb.DuckDBPyConnection) -> int:
+    """Retorna versão atual do schema."""
+    try:
+        result = conn.execute("SELECT MAX(version) FROM schema_version").fetchone()
+        return int(result[0]) if result and result[0] else 0
+    except Exception:
+        return 0
+def migrate(conn: duckdb.DuckDBPyConnection) -> None:
+    """
+    Executa migrations pendentes.
+    Migrations são idempotentes e podem ser re-executadas com segurança.
+    """
+    current = get_current_version(conn)
+    if current >= SCHEMA_VERSION:
+        logger.debug("schema_up_to_date", version=current)
+        return
+    logger.info("schema_migration_start", current=current, target=SCHEMA_VERSION)
+    for version in range(current + 1, SCHEMA_VERSION + 1):
+        if version in MIGRATIONS:
+            try:
+                for statement in MIGRATIONS[version].strip().split(";"):
+                    statement = statement.strip()
+                    if statement:
+                        try:
+                            conn.execute(statement)
+                        except Exception as stmt_error:
+                            if "already exists" in str(stmt_error).lower():
+                                continue
+                            if "duplicate" in str(stmt_error).lower():
+                                continue
+                            raise
+                with contextlib.suppress(Exception):
+                    conn.execute("INSERT INTO schema_version (version) VALUES (?)", [version])
+                logger.info("migration_applied", version=version)
+            except Exception as e:
+                logger.error("migration_failed", version=version, error=str(e))
+                raise
+    logger.info("schema_migration_complete", version=SCHEMA_VERSION)

agrobr/cache/policies.py ADDED Viewed

@@ -0,0 +1,240 @@
+"""
+Políticas de cache e TTL por fonte.
+"""
+from __future__ import annotations
+from datetime import datetime, timedelta, timezone
+from enum import Enum
+from typing import NamedTuple
+from ..constants import Fonte
+class CachePolicy(NamedTuple):
+    """Cache policy for one data source."""
+    # Time-to-live of a cache entry, in seconds (returned by get_ttl).
+    ttl_seconds: int
+    # Longest age, in seconds, at which stale data is still accepted
+    # (returned by get_stale_max).
+    stale_max_seconds: int
+    # Human-readable description of the policy.
+    description: str
+class TTL(Enum):
+    """Predefined TTLs; each member's value is a duration in seconds."""
+    MINUTES_15 = 15 * 60
+    MINUTES_30 = 30 * 60
+    HOUR_1 = 60 * 60
+    HOURS_4 = 4 * 60 * 60
+    HOURS_12 = 12 * 60 * 60
+    HOURS_24 = 24 * 60 * 60
+    DAYS_7 = 7 * 24 * 60 * 60
+    DAYS_30 = 30 * 24 * 60 * 60
+    DAYS_90 = 90 * 24 * 60 * 60
+# Cache policies by name. get_policy() looks names up either directly or as
+# "<source value>_<endpoint>".
+POLICIES: dict[str, CachePolicy] = {
+    # Fresh for 4 hours, stale data accepted up to 48 hours.
+    "cepea_diario": CachePolicy(
+        ttl_seconds=TTL.HOURS_4.value,
+        stale_max_seconds=TTL.HOURS_24.value * 2,
+        description="CEPEA indicador diário (atualiza ~18h)",
+    ),
+    # Fresh for 24 hours, stale data accepted up to 7 days.
+    "cepea_semanal": CachePolicy(
+        ttl_seconds=TTL.HOURS_24.value,
+        stale_max_seconds=TTL.DAYS_7.value,
+        description="CEPEA indicador semanal (atualiza sexta)",
+    ),
+    # Fresh for 24 hours, stale data accepted up to 30 days.
+    "conab_safras": CachePolicy(
+        ttl_seconds=TTL.HOURS_24.value,
+        stale_max_seconds=TTL.DAYS_30.value,
+        description="CONAB safras (atualiza mensalmente)",
+    ),
+    # Fresh for 24 hours, stale data accepted up to 30 days.
+    "conab_balanco": CachePolicy(
+        ttl_seconds=TTL.HOURS_24.value,
+        stale_max_seconds=TTL.DAYS_30.value,
+        description="CONAB balanço (atualiza mensalmente)",
+    ),
+    # Fresh for 7 days, stale data accepted up to 90 days.
+    "ibge_pam": CachePolicy(
+        ttl_seconds=TTL.DAYS_7.value,
+        stale_max_seconds=TTL.DAYS_90.value,
+        description="IBGE PAM (atualiza anualmente)",
+    ),
+    # Fresh for 24 hours, stale data accepted up to 30 days.
+    "ibge_lspa": CachePolicy(
+        ttl_seconds=TTL.HOURS_24.value,
+        stale_max_seconds=TTL.DAYS_30.value,
+        description="IBGE LSPA (atualiza mensalmente)",
+    ),
+    # Same durations as "cepea_diario".
+    "noticias_agricolas": CachePolicy(
+        ttl_seconds=TTL.HOURS_4.value,
+        stale_max_seconds=TTL.HOURS_24.value * 2,
+        description="Notícias Agrícolas (mirror CEPEA)",
+    ),
+}
+# Default policy name per source, used by get_policy() when no
+# endpoint-specific policy matches. Sources missing from this map fall back
+# to "cepea_diario" there.
+SOURCE_POLICY_MAP: dict[Fonte, str] = {
+    Fonte.CEPEA: "cepea_diario",
+    Fonte.CONAB: "conab_safras",
+    Fonte.IBGE: "ibge_lspa",
+}
+def get_policy(source: Fonte | str, endpoint: str | None = None) -> CachePolicy:
+    """
+    Retorna política de cache para uma fonte/endpoint.
+    Args:
+        source: Fonte de dados
+        endpoint: Endpoint específico (opcional)
+    Returns:
+        CachePolicy aplicável
+    """
+    if isinstance(source, str):
+        if source in POLICIES:
+            return POLICIES[source]
+        try:
+            source = Fonte(source)
+        except ValueError:
+            return POLICIES["cepea_diario"]
+    if endpoint:
+        key = f"{source.value}_{endpoint}"
+        if key in POLICIES:
+            return POLICIES[key]
+    default_key = SOURCE_POLICY_MAP.get(source, "cepea_diario")
+    return POLICIES[default_key]
+def get_ttl(source: Fonte | str, endpoint: str | None = None) -> int:
+    """
+    Retorna TTL em segundos para uma fonte.
+    Args:
+        source: Fonte de dados
+        endpoint: Endpoint específico
+    Returns:
+        TTL em segundos
+    """
+    return get_policy(source, endpoint).ttl_seconds
+def get_stale_max(source: Fonte | str, endpoint: str | None = None) -> int:
+    """
+    Retorna tempo máximo stale em segundos.
+    Args:
+        source: Fonte de dados
+        endpoint: Endpoint específico
+    Returns:
+        Stale máximo em segundos
+    """
+    return get_policy(source, endpoint).stale_max_seconds
+def is_expired(created_at: datetime, source: Fonte | str) -> bool:
+    """
+    Verifica se entrada de cache está expirada.
+    Args:
+        created_at: Data de criação
+        source: Fonte de dados
+    Returns:
+        True se expirado
+    """
+    ttl = get_ttl(source)
+    expires_at = created_at + timedelta(seconds=ttl)
+    return datetime.utcnow() > expires_at
+def is_stale_acceptable(created_at: datetime, source: Fonte | str) -> bool:
+    """
+    Verifica se dados stale ainda são aceitáveis.
+    Args:
+        created_at: Data de criação
+        source: Fonte de dados
+    Returns:
+        True se stale ainda é aceitável
+    """
+    stale_max = get_stale_max(source)
+    max_acceptable = created_at + timedelta(seconds=stale_max)
+    return datetime.utcnow() <= max_acceptable
+def calculate_expiry(source: Fonte | str, endpoint: str | None = None) -> datetime:
+    """
+    Calcula data de expiração para nova entrada.
+    Args:
+        source: Fonte de dados
+        endpoint: Endpoint específico
+    Returns:
+        Data de expiração
+    """
+    ttl = get_ttl(source, endpoint)
+    return datetime.utcnow() + timedelta(seconds=ttl)
+class InvalidationReason(Enum):
+    """Reasons for invalidating a cache entry."""
+    EXPIRED = "expired"
+    MANUAL = "manual"
+    SOURCE_UPDATE = "source_update"
+    PARSE_ERROR = "parse_error"
+    VALIDATION_ERROR = "validation_error"
+    FINGERPRINT_CHANGE = "fingerprint_change"
+def should_refresh(
+    created_at: datetime,
+    source: Fonte | str,
+    force: bool = False,
+) -> tuple[bool, str]:
+    """
+    Determina se cache deve ser atualizado.
+    Args:
+        created_at: Data de criação do cache
+        source: Fonte de dados
+        force: Forçar atualização
+    Returns:
+        Tupla (deve_atualizar, razão)
+    """
+    if force:
+        return True, "force_refresh"
+    if is_expired(created_at, source):
+        return True, "expired"
+    return False, "fresh"
+def format_ttl(seconds: int) -> str:
+    """
+    Formata TTL para exibição.
+    Args:
+        seconds: TTL em segundos
+    Returns:
+        String formatada (ex: "4 horas", "7 dias")
+    """
+    if seconds < 60:
+        return f"{seconds} segundos"
+    if seconds < 3600:
+        minutes = seconds // 60
+        return f"{minutes} minuto{'s' if minutes > 1 else ''}"
+    if seconds < 86400:
+        hours = seconds // 3600
+        return f"{hours} hora{'s' if hours > 1 else ''}"
+    days = seconds // 86400
+    return f"{days} dia{'s' if days > 1 else ''}"

agrobr/cepea/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+"""Modulo CEPEA - Indicadores de precos agricolas."""
+from __future__ import annotations
+from agrobr.cepea.api import indicador, pracas, produtos, ultimo
+__all__ = ["indicador", "produtos", "pracas", "ultimo"]