agrobr 0.1.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
agrobr/quality.py ADDED
@@ -0,0 +1,319 @@
+ """Data quality certification."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from enum import StrEnum
+ from typing import TYPE_CHECKING, Any
+
+ import structlog
+
+ if TYPE_CHECKING:
+     import pandas as pd
+
+ logger = structlog.get_logger()
+
+
+ class QualityLevel(StrEnum):
+     GOLD = "gold"
+     SILVER = "silver"
+     BRONZE = "bronze"
+     UNCERTIFIED = "uncertified"
+
+
+ class CheckStatus(StrEnum):
+     PASSED = "passed"
+     FAILED = "failed"
+     SKIPPED = "skipped"
+     WARNING = "warning"
+
+
+ @dataclass
+ class QualityCheck:
+     name: str
+     status: CheckStatus
+     message: str = ""
+     details: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class QualityCertificate:
+     level: QualityLevel
+     checks: list[QualityCheck]
+     issued_at: datetime
+     valid_until: datetime | None = None
+     source: str = ""
+     dataset: str = ""
+     row_count: int = 0
+     column_count: int = 0
+     score: float = 0.0
+
+     def to_dict(self) -> dict[str, Any]:
+         return {
+             "level": self.level.value,
+             "score": round(self.score, 2),
+             "issued_at": self.issued_at.isoformat(),
+             "valid_until": self.valid_until.isoformat() if self.valid_until else None,
+             "source": self.source,
+             "dataset": self.dataset,
+             "row_count": self.row_count,
+             "column_count": self.column_count,
+             "checks": [
+                 {
+                     "name": c.name,
+                     "status": c.status.value,
+                     "message": c.message,
+                 }
+                 for c in self.checks
+             ],
+             "summary": {
+                 "passed": sum(1 for c in self.checks if c.status == CheckStatus.PASSED),
+                 "failed": sum(1 for c in self.checks if c.status == CheckStatus.FAILED),
+                 "warnings": sum(1 for c in self.checks if c.status == CheckStatus.WARNING),
+                 "skipped": sum(1 for c in self.checks if c.status == CheckStatus.SKIPPED),
+             },
+         }
+
+     def is_valid(self) -> bool:
+         if self.valid_until is None:
+             return True
+         return datetime.now() < self.valid_until
+
+
+ def _check_completeness(df: pd.DataFrame, threshold: float = 0.95) -> QualityCheck:
+     total_cells = df.size
+     non_null_cells = df.count().sum()
+     completeness = non_null_cells / total_cells if total_cells > 0 else 0
+
+     if completeness >= threshold:
+         return QualityCheck(
+             name="completeness",
+             status=CheckStatus.PASSED,
+             message=f"Completeness: {completeness:.1%}",
+             details={"completeness": completeness, "threshold": threshold},
+         )
+     elif completeness >= threshold * 0.9:
+         return QualityCheck(
+             name="completeness",
+             status=CheckStatus.WARNING,
+             message=f"Completeness below threshold: {completeness:.1%}",
+             details={"completeness": completeness, "threshold": threshold},
+         )
+     else:
+         return QualityCheck(
+             name="completeness",
+             status=CheckStatus.FAILED,
+             message=f"Low completeness: {completeness:.1%}",
+             details={"completeness": completeness, "threshold": threshold},
+         )
+
+
+ def _check_duplicates(df: pd.DataFrame, max_dup_pct: float = 0.01) -> QualityCheck:
+     dup_count = df.duplicated().sum()
+     dup_pct = dup_count / len(df) if len(df) > 0 else 0
+
+     if dup_pct <= max_dup_pct:
+         return QualityCheck(
+             name="duplicates",
+             status=CheckStatus.PASSED,
+             message=f"Duplicates: {dup_count} ({dup_pct:.1%})",
+             details={"duplicate_count": int(dup_count), "duplicate_pct": dup_pct},
+         )
+     else:
+         return QualityCheck(
+             name="duplicates",
+             status=CheckStatus.FAILED,
+             message=f"Too many duplicates: {dup_count} ({dup_pct:.1%})",
+             details={"duplicate_count": int(dup_count), "duplicate_pct": dup_pct},
+         )
+
+
+ def _check_schema(df: pd.DataFrame, expected_columns: list[str] | None = None) -> QualityCheck:
+     if expected_columns is None:
+         return QualityCheck(
+             name="schema",
+             status=CheckStatus.SKIPPED,
+             message="No expected schema provided",
+         )
+
+     actual_columns = set(df.columns)
+     expected_set = set(expected_columns)
+     missing = expected_set - actual_columns
+     extra = actual_columns - expected_set
+
+     if not missing and not extra:
+         return QualityCheck(
+             name="schema",
+             status=CheckStatus.PASSED,
+             message="Schema matches expected columns",
+             details={"columns": list(actual_columns)},
+         )
+     elif not missing:
+         return QualityCheck(
+             name="schema",
+             status=CheckStatus.WARNING,
+             message=f"Extra columns found: {extra}",
+             details={"missing": list(missing), "extra": list(extra)},
+         )
+     else:
+         return QualityCheck(
+             name="schema",
+             status=CheckStatus.FAILED,
+             message=f"Missing columns: {missing}",
+             details={"missing": list(missing), "extra": list(extra)},
+         )
+
+
+ def _check_freshness(
+     df: pd.DataFrame,
+     date_column: str = "data",
+     max_age_days: int = 7,
+ ) -> QualityCheck:
+     if date_column not in df.columns:
+         return QualityCheck(
+             name="freshness",
+             status=CheckStatus.SKIPPED,
+             message=f"Date column '{date_column}' not found",
+         )
+
+     import pandas
+
+     dates = pandas.to_datetime(df[date_column])  # parse into a local Series; avoid mutating the caller's frame
+     max_date = dates.max()
+     age_days = (datetime.now() - max_date).days
+
+     if age_days <= max_age_days:
+         return QualityCheck(
+             name="freshness",
+             status=CheckStatus.PASSED,
+             message=f"Data age: {age_days} days",
+             details={"max_date": max_date.isoformat(), "age_days": age_days},
+         )
+     elif age_days <= max_age_days * 2:
+         return QualityCheck(
+             name="freshness",
+             status=CheckStatus.WARNING,
+             message=f"Data slightly stale: {age_days} days",
+             details={"max_date": max_date.isoformat(), "age_days": age_days},
+         )
+     else:
+         return QualityCheck(
+             name="freshness",
+             status=CheckStatus.FAILED,
+             message=f"Data too old: {age_days} days",
+             details={"max_date": max_date.isoformat(), "age_days": age_days},
+         )
+
+
+ def _check_value_ranges(
+     df: pd.DataFrame,
+     column: str,
+     min_val: float | None = None,
+     max_val: float | None = None,
+ ) -> QualityCheck:
+     if column not in df.columns:
+         return QualityCheck(
+             name=f"range_{column}",
+             status=CheckStatus.SKIPPED,
+             message=f"Column '{column}' not found",
+         )
+
+     values = df[column].dropna()
+     if len(values) == 0:
+         return QualityCheck(
+             name=f"range_{column}",
+             status=CheckStatus.WARNING,
+             message=f"Column '{column}' is empty",
+         )
+
+     actual_min = values.min()
+     actual_max = values.max()
+     violations = 0
+
+     if min_val is not None:
+         violations += (values < min_val).sum()
+     if max_val is not None:
+         violations += (values > max_val).sum()
+
+     if violations == 0:
+         return QualityCheck(
+             name=f"range_{column}",
+             status=CheckStatus.PASSED,
+             message=f"All values in range [{min_val}, {max_val}]",
+             details={"min": float(actual_min), "max": float(actual_max)},
+         )
+     else:
+         return QualityCheck(
+             name=f"range_{column}",
+             status=CheckStatus.FAILED,
+             message=f"{violations} values out of range",
+             details={
+                 "min": float(actual_min),
+                 "max": float(actual_max),
+                 "violations": int(violations),
+             },
+         )
+
+
+ def certify(
+     df: pd.DataFrame,
+     source: str = "",
+     dataset: str = "",
+     expected_columns: list[str] | None = None,
+     date_column: str = "data",
+     value_column: str = "valor",
+     min_value: float | None = 0,
+     max_value: float | None = None,
+ ) -> QualityCertificate:
+     checks = []
+
+     checks.append(_check_completeness(df))
+     checks.append(_check_duplicates(df))
+     checks.append(_check_schema(df, expected_columns))
+     checks.append(_check_freshness(df, date_column))
+
+     if value_column in df.columns:
+         checks.append(_check_value_ranges(df, value_column, min_value, max_value))
+
+     passed = sum(1 for c in checks if c.status == CheckStatus.PASSED)
+     failed = sum(1 for c in checks if c.status == CheckStatus.FAILED)
+     warnings = sum(1 for c in checks if c.status == CheckStatus.WARNING)
+     total = passed + failed + warnings
+
+     score = (passed + warnings * 0.5) / total if total > 0 else 0
+
+     if score >= 0.9 and failed == 0:
+         level = QualityLevel.GOLD
+     elif score >= 0.7 and failed <= 1:
+         level = QualityLevel.SILVER
+     elif score >= 0.5:
+         level = QualityLevel.BRONZE
+     else:
+         level = QualityLevel.UNCERTIFIED
+
+     return QualityCertificate(
+         level=level,
+         checks=checks,
+         issued_at=datetime.now(),
+         source=source,
+         dataset=dataset,
+         row_count=len(df),
+         column_count=len(df.columns),
+         score=score,
+     )
+
+
+ def quick_check(df: pd.DataFrame) -> tuple[QualityLevel, float]:
+     cert = certify(df)
+     return cert.level, cert.score
+
+
+ __all__ = [
+     "QualityLevel",
+     "CheckStatus",
+     "QualityCheck",
+     "QualityCertificate",
+     "certify",
+     "quick_check",
+ ]
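
For orientation, a minimal usage sketch of the new module. The column names match certify()'s defaults ("data" and "valor"); the sample values and the source/dataset labels are illustrative assumptions, not taken from the package:

import pandas as pd

from agrobr.quality import certify, quick_check

# Hypothetical frame in the layout certify() expects by default:
# a "data" date column and a "valor" numeric column.
df = pd.DataFrame(
    {
        "data": pd.date_range("2024-01-01", periods=5, freq="D"),
        "valor": [100.0, 101.5, 99.8, 102.3, 100.9],
    }
)

cert = certify(df, source="cepea", dataset="soja")
print(cert.level, cert.to_dict()["summary"])

# Scoring recap: skipped checks are excluded from the total, so with e.g.
# 3 passed, 1 warning, 0 failed the score is (3 + 0.5) / 4 = 0.875 -> SILVER
# (GOLD requires score >= 0.9 with no failures).

level, score = quick_check(df)  # shorthand for certify() with defaults
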
agrobr/sla.py ADDED
@@ -0,0 +1,249 @@
+ """Service Level Agreement definitions per source."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from datetime import time
+ from enum import StrEnum
+ from typing import Any
+
+ from agrobr.constants import Fonte
+
+
+ class Tier(StrEnum):
+     CRITICAL = "critical"
+     STANDARD = "standard"
+     BEST_EFFORT = "best_effort"
+
+
+ @dataclass
+ class FreshnessPolicy:
+     update_frequency: str
+     update_time: time | None = None
+     timezone: str = "America/Sao_Paulo"
+     weekends: bool = False
+     holidays: bool = False
+
+
+ @dataclass
+ class LatencyTarget:
+     p50_ms: int
+     p95_ms: int
+     p99_ms: int
+     timeout_ms: int
+
+
+ @dataclass
+ class AvailabilityTarget:
+     uptime_pct: float
+     planned_maintenance_window: str | None = None
+     degraded_mode_available: bool = True
+
+
+ @dataclass
+ class DataQualityTarget:
+     completeness_pct: float = 99.0
+     accuracy_checks: bool = True
+     schema_validation: bool = True
+     anomaly_detection: bool = True
+
+
+ @dataclass
+ class SourceSLA:
+     source: Fonte
+     tier: Tier
+     freshness: FreshnessPolicy
+     latency: LatencyTarget
+     availability: AvailabilityTarget
+     data_quality: DataQualityTarget
+     fallback_sources: list[Fonte] = field(default_factory=list)
+     notes: str = ""
+
+     def to_dict(self) -> dict[str, Any]:
+         return {
+             "source": self.source.value,
+             "tier": self.tier.value,
+             "freshness": {
+                 "update_frequency": self.freshness.update_frequency,
+                 "update_time": self.freshness.update_time.isoformat()
+                 if self.freshness.update_time
+                 else None,
+                 "timezone": self.freshness.timezone,
+                 "weekends": self.freshness.weekends,
+                 "holidays": self.freshness.holidays,
+             },
+             "latency": {
+                 "p50_ms": self.latency.p50_ms,
+                 "p95_ms": self.latency.p95_ms,
+                 "p99_ms": self.latency.p99_ms,
+                 "timeout_ms": self.latency.timeout_ms,
+             },
+             "availability": {
+                 "uptime_pct": self.availability.uptime_pct,
+                 "planned_maintenance_window": self.availability.planned_maintenance_window,
+                 "degraded_mode_available": self.availability.degraded_mode_available,
+             },
+             "data_quality": {
+                 "completeness_pct": self.data_quality.completeness_pct,
+                 "accuracy_checks": self.data_quality.accuracy_checks,
+                 "schema_validation": self.data_quality.schema_validation,
+                 "anomaly_detection": self.data_quality.anomaly_detection,
+             },
+             "fallback_sources": [f.value for f in self.fallback_sources],
+             "notes": self.notes,
+         }
+
+
+ CEPEA_SLA = SourceSLA(
+     source=Fonte.CEPEA,
+     tier=Tier.CRITICAL,
+     freshness=FreshnessPolicy(
+         update_frequency="daily",
+         update_time=time(18, 0),
+         weekends=False,
+         holidays=False,
+     ),
+     latency=LatencyTarget(
+         p50_ms=500,
+         p95_ms=2000,
+         p99_ms=5000,
+         timeout_ms=30000,
+     ),
+     availability=AvailabilityTarget(
+         uptime_pct=99.0,
+         degraded_mode_available=True,
+     ),
+     data_quality=DataQualityTarget(
+         completeness_pct=99.0,
+         accuracy_checks=True,
+         schema_validation=True,
+         anomaly_detection=True,
+     ),
+     fallback_sources=[Fonte.NOTICIAS_AGRICOLAS],
+     notes="CEPEA publishes daily indicators at 18:00. Cache expires at 18:00 the following day.",
+ )
+
+ CONAB_SLA = SourceSLA(
+     source=Fonte.CONAB,
+     tier=Tier.STANDARD,
+     freshness=FreshnessPolicy(
+         update_frequency="monthly",
+         weekends=False,
+         holidays=False,
+     ),
+     latency=LatencyTarget(
+         p50_ms=1000,
+         p95_ms=3000,
+         p99_ms=10000,
+         timeout_ms=60000,
+     ),
+     availability=AvailabilityTarget(
+         uptime_pct=95.0,
+         degraded_mode_available=True,
+     ),
+     data_quality=DataQualityTarget(
+         completeness_pct=95.0,
+         accuracy_checks=True,
+         schema_validation=True,
+         anomaly_detection=False,
+     ),
+     notes="CONAB publishes monthly bulletins. Harvest data is updated monthly.",
+ )
+
+ IBGE_SLA = SourceSLA(
+     source=Fonte.IBGE,
+     tier=Tier.STANDARD,
+     freshness=FreshnessPolicy(
+         update_frequency="monthly",
+         weekends=False,
+         holidays=False,
+     ),
+     latency=LatencyTarget(
+         p50_ms=800,
+         p95_ms=2500,
+         p99_ms=8000,
+         timeout_ms=45000,
+     ),
+     availability=AvailabilityTarget(
+         uptime_pct=98.0,
+         degraded_mode_available=True,
+     ),
+     data_quality=DataQualityTarget(
+         completeness_pct=98.0,
+         accuracy_checks=True,
+         schema_validation=True,
+         anomaly_detection=False,
+     ),
+     notes="IBGE SIDRA API. PAM is annual, LSPA is monthly.",
+ )
+
+ NOTICIAS_AGRICOLAS_SLA = SourceSLA(
+     source=Fonte.NOTICIAS_AGRICOLAS,
+     tier=Tier.BEST_EFFORT,
+     freshness=FreshnessPolicy(
+         update_frequency="daily",
+         update_time=time(19, 0),
+         weekends=False,
+         holidays=False,
+     ),
+     latency=LatencyTarget(
+         p50_ms=1500,
+         p95_ms=5000,
+         p99_ms=15000,
+         timeout_ms=45000,
+     ),
+     availability=AvailabilityTarget(
+         uptime_pct=90.0,
+         degraded_mode_available=False,
+     ),
+     data_quality=DataQualityTarget(
+         completeness_pct=90.0,
+         accuracy_checks=False,
+         schema_validation=True,
+         anomaly_detection=False,
+     ),
+     notes="Alternative source for CEPEA. Used as a fallback.",
+ )
+
+ _SLA_REGISTRY: dict[Fonte, SourceSLA] = {
+     Fonte.CEPEA: CEPEA_SLA,
+     Fonte.CONAB: CONAB_SLA,
+     Fonte.IBGE: IBGE_SLA,
+     Fonte.NOTICIAS_AGRICOLAS: NOTICIAS_AGRICOLAS_SLA,
+ }
+
+
+ def get_sla(source: Fonte) -> SourceSLA | None:
+     return _SLA_REGISTRY.get(source)
+
+
+ def list_slas() -> list[SourceSLA]:
+     return list(_SLA_REGISTRY.values())
+
+
+ def get_sla_summary() -> dict[str, Any]:
+     return {
+         "sources": [sla.to_dict() for sla in _SLA_REGISTRY.values()],
+         "tiers": {
+             "critical": "99%+ uptime, daily freshness, full validation",
+             "standard": "95%+ uptime, monthly freshness, schema validation",
+             "best_effort": "90%+ uptime, fallback source, basic validation",
+         },
+     }
+
+
+ __all__ = [
+     "Tier",
+     "FreshnessPolicy",
+     "LatencyTarget",
+     "AvailabilityTarget",
+     "DataQualityTarget",
+     "SourceSLA",
+     "CEPEA_SLA",
+     "CONAB_SLA",
+     "IBGE_SLA",
+     "NOTICIAS_AGRICOLAS_SLA",
+     "get_sla",
+     "list_slas",
+     "get_sla_summary",
+ ]
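
Likewise, a short sketch of how the SLA registry might be consumed. The registry is a plain dict keyed by Fonte, so lookups are O(1) and unknown sources return None rather than raising; driving an HTTP client timeout from the SLA is an illustrative assumption, and only the functions and fields shown above come from the module:

from agrobr.constants import Fonte
from agrobr.sla import get_sla, get_sla_summary, list_slas

sla = get_sla(Fonte.CEPEA)
if sla is not None:  # unknown sources return None rather than raising
    timeout_s = sla.latency.timeout_ms / 1000  # e.g. derive a client timeout from the SLA
    fallbacks = sla.fallback_sources  # [Fonte.NOTICIAS_AGRICOLAS]

# Everything is JSON-serializable via to_dict(), e.g. for a status dashboard:
summary = get_sla_summary()
print([s.source.value for s in list_slas()], summary["tiers"]["critical"])
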