agrobr 0.1.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
agrobr/quality.py ADDED
@@ -0,0 +1,319 @@
+ """Data quality certification."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from enum import StrEnum
+ from typing import TYPE_CHECKING, Any
+
+ import structlog
+
+ if TYPE_CHECKING:
+     import pandas as pd
+
+ logger = structlog.get_logger()
+
+
+ class QualityLevel(StrEnum):
+     GOLD = "gold"
+     SILVER = "silver"
+     BRONZE = "bronze"
+     UNCERTIFIED = "uncertified"
+
+
+ class CheckStatus(StrEnum):
+     PASSED = "passed"
+     FAILED = "failed"
+     SKIPPED = "skipped"
+     WARNING = "warning"
+
+
+ @dataclass
+ class QualityCheck:
+     name: str
+     status: CheckStatus
+     message: str = ""
+     details: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class QualityCertificate:
+     level: QualityLevel
+     checks: list[QualityCheck]
+     issued_at: datetime
+     valid_until: datetime | None = None
+     source: str = ""
+     dataset: str = ""
+     row_count: int = 0
+     column_count: int = 0
+     score: float = 0.0
+
+     def to_dict(self) -> dict[str, Any]:
+         return {
+             "level": self.level.value,
+             "score": round(self.score, 2),
+             "issued_at": self.issued_at.isoformat(),
+             "valid_until": self.valid_until.isoformat() if self.valid_until else None,
+             "source": self.source,
+             "dataset": self.dataset,
+             "row_count": self.row_count,
+             "column_count": self.column_count,
+             "checks": [
+                 {
+                     "name": c.name,
+                     "status": c.status.value,
+                     "message": c.message,
+                 }
+                 for c in self.checks
+             ],
+             "summary": {
+                 "passed": sum(1 for c in self.checks if c.status == CheckStatus.PASSED),
+                 "failed": sum(1 for c in self.checks if c.status == CheckStatus.FAILED),
+                 "warnings": sum(1 for c in self.checks if c.status == CheckStatus.WARNING),
+                 "skipped": sum(1 for c in self.checks if c.status == CheckStatus.SKIPPED),
+             },
+         }
+
+     def is_valid(self) -> bool:
+         if self.valid_until is None:
+             return True
+         return datetime.now() < self.valid_until
+
+
+ def _check_completeness(df: pd.DataFrame, threshold: float = 0.95) -> QualityCheck:
+     total_cells = df.size
+     non_null_cells = df.count().sum()
+     completeness = non_null_cells / total_cells if total_cells > 0 else 0
+
+     if completeness >= threshold:
+         return QualityCheck(
+             name="completeness",
+             status=CheckStatus.PASSED,
+             message=f"Completeness: {completeness:.1%}",
+             details={"completeness": completeness, "threshold": threshold},
+         )
+     elif completeness >= threshold * 0.9:
+         return QualityCheck(
+             name="completeness",
+             status=CheckStatus.WARNING,
+             message=f"Completeness below threshold: {completeness:.1%}",
+             details={"completeness": completeness, "threshold": threshold},
+         )
+     else:
+         return QualityCheck(
+             name="completeness",
+             status=CheckStatus.FAILED,
+             message=f"Low completeness: {completeness:.1%}",
+             details={"completeness": completeness, "threshold": threshold},
+         )
+
+
+ def _check_duplicates(df: pd.DataFrame, max_dup_pct: float = 0.01) -> QualityCheck:
+     dup_count = df.duplicated().sum()
+     dup_pct = dup_count / len(df) if len(df) > 0 else 0
+
+     if dup_pct <= max_dup_pct:
+         return QualityCheck(
+             name="duplicates",
+             status=CheckStatus.PASSED,
+             message=f"Duplicates: {dup_count} ({dup_pct:.1%})",
+             details={"duplicate_count": int(dup_count), "duplicate_pct": dup_pct},
+         )
+     else:
+         return QualityCheck(
+             name="duplicates",
+             status=CheckStatus.FAILED,
+             message=f"Too many duplicates: {dup_count} ({dup_pct:.1%})",
+             details={"duplicate_count": int(dup_count), "duplicate_pct": dup_pct},
+         )
+
+
+ def _check_schema(df: pd.DataFrame, expected_columns: list[str] | None = None) -> QualityCheck:
+     if expected_columns is None:
+         return QualityCheck(
+             name="schema",
+             status=CheckStatus.SKIPPED,
+             message="No expected schema provided",
+         )
+
+     actual_columns = set(df.columns)
+     expected_set = set(expected_columns)
+     missing = expected_set - actual_columns
+     extra = actual_columns - expected_set
+
+     if not missing and not extra:
+         return QualityCheck(
+             name="schema",
+             status=CheckStatus.PASSED,
+             message="Schema matches expected columns",
+             details={"columns": list(actual_columns)},
+         )
+     elif not missing:
+         return QualityCheck(
+             name="schema",
+             status=CheckStatus.WARNING,
+             message=f"Extra columns found: {extra}",
+             details={"missing": list(missing), "extra": list(extra)},
+         )
+     else:
+         return QualityCheck(
+             name="schema",
+             status=CheckStatus.FAILED,
+             message=f"Missing columns: {missing}",
+             details={"missing": list(missing), "extra": list(extra)},
+         )
+
+
+ def _check_freshness(
+     df: pd.DataFrame,
+     date_column: str = "data",
+     max_age_days: int = 7,
+ ) -> QualityCheck:
+     if date_column not in df.columns:
+         return QualityCheck(
+             name="freshness",
+             status=CheckStatus.SKIPPED,
+             message=f"Date column '{date_column}' not found",
+         )
+
+     import pandas
+
+     dates = pandas.to_datetime(df[date_column])  # parse into a local Series; avoid mutating the caller's frame
+     max_date = dates.max()
+     age_days = (datetime.now() - max_date).days
+
+     if age_days <= max_age_days:
+         return QualityCheck(
+             name="freshness",
+             status=CheckStatus.PASSED,
+             message=f"Data age: {age_days} days",
+             details={"max_date": max_date.isoformat(), "age_days": age_days},
+         )
+     elif age_days <= max_age_days * 2:
+         return QualityCheck(
+             name="freshness",
+             status=CheckStatus.WARNING,
+             message=f"Data slightly stale: {age_days} days",
+             details={"max_date": max_date.isoformat(), "age_days": age_days},
+         )
+     else:
+         return QualityCheck(
+             name="freshness",
+             status=CheckStatus.FAILED,
+             message=f"Data too old: {age_days} days",
+             details={"max_date": max_date.isoformat(), "age_days": age_days},
+         )
+
+
+ def _check_value_ranges(
+     df: pd.DataFrame,
+     column: str,
+     min_val: float | None = None,
+     max_val: float | None = None,
+ ) -> QualityCheck:
+     if column not in df.columns:
+         return QualityCheck(
+             name=f"range_{column}",
+             status=CheckStatus.SKIPPED,
+             message=f"Column '{column}' not found",
+         )
+
+     values = df[column].dropna()
+     if len(values) == 0:
+         return QualityCheck(
+             name=f"range_{column}",
+             status=CheckStatus.WARNING,
+             message=f"Column '{column}' is empty",
+         )
+
+     actual_min = values.min()
+     actual_max = values.max()
+     violations = 0
+
+     if min_val is not None:
+         violations += (values < min_val).sum()
+     if max_val is not None:
+         violations += (values > max_val).sum()
+
+     if violations == 0:
+         return QualityCheck(
+             name=f"range_{column}",
+             status=CheckStatus.PASSED,
+             message=f"All values in range [{min_val}, {max_val}]",
+             details={"min": float(actual_min), "max": float(actual_max)},
+         )
+     else:
+         return QualityCheck(
+             name=f"range_{column}",
+             status=CheckStatus.FAILED,
+             message=f"{violations} values out of range",
+             details={
+                 "min": float(actual_min),
+                 "max": float(actual_max),
+                 "violations": int(violations),
+             },
+         )
+
+
+ def certify(
+     df: pd.DataFrame,
+     source: str = "",
+     dataset: str = "",
+     expected_columns: list[str] | None = None,
+     date_column: str = "data",
+     value_column: str = "valor",
+     min_value: float | None = 0,
+     max_value: float | None = None,
+ ) -> QualityCertificate:
+     checks = []
+
+     checks.append(_check_completeness(df))
+     checks.append(_check_duplicates(df))
+     checks.append(_check_schema(df, expected_columns))
+     checks.append(_check_freshness(df, date_column))
+
+     if value_column in df.columns:
+         checks.append(_check_value_ranges(df, value_column, min_value, max_value))
+
+     passed = sum(1 for c in checks if c.status == CheckStatus.PASSED)
+     failed = sum(1 for c in checks if c.status == CheckStatus.FAILED)
+     warnings = sum(1 for c in checks if c.status == CheckStatus.WARNING)
+     total = passed + failed + warnings
+
+     score = (passed + warnings * 0.5) / total if total > 0 else 0
+
+     if score >= 0.9 and failed == 0:
+         level = QualityLevel.GOLD
+     elif score >= 0.7 and failed <= 1:
+         level = QualityLevel.SILVER
+     elif score >= 0.5:
+         level = QualityLevel.BRONZE
+     else:
+         level = QualityLevel.UNCERTIFIED
+
+     return QualityCertificate(
+         level=level,
+         checks=checks,
+         issued_at=datetime.now(),
+         source=source,
+         dataset=dataset,
+         row_count=len(df),
+         column_count=len(df.columns),
+         score=score,
+     )
+
+
+ def quick_check(df: pd.DataFrame) -> tuple[QualityLevel, float]:
+     cert = certify(df)
+     return cert.level, cert.score
+
+
+ __all__ = [
+     "QualityLevel",
+     "CheckStatus",
+     "QualityCheck",
+     "QualityCertificate",
+     "certify",
+     "quick_check",
+ ]
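
For orientation, a minimal usage sketch of the new module. The column names match certify()'s defaults ("data" and "valor"); the sample values and the source/dataset labels are illustrative assumptions, not taken from the package:

import pandas as pd

from agrobr.quality import certify, quick_check

# Hypothetical frame in the layout certify() expects by default:
# a "data" date column and a "valor" numeric column.
df = pd.DataFrame(
    {
        "data": pd.date_range("2024-01-01", periods=5, freq="D"),
        "valor": [100.0, 101.5, 99.8, 102.3, 100.9],
    }
)

cert = certify(df, source="cepea", dataset="soja")
print(cert.level, cert.to_dict()["summary"])

# Scoring recap: skipped checks are excluded from the total, so with e.g.
# 3 passed, 1 warning, 0 failed the score is (3 + 0.5) / 4 = 0.875 -> SILVER
# (GOLD requires score >= 0.9 with no failures).

level, score = quick_check(df)  # shorthand for certify() with defaults
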
agrobr/sla.py ADDED
@@ -0,0 +1,249 @@
+ """Service Level Agreement definitions per source."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from datetime import time
+ from enum import StrEnum
+ from typing import Any
+
+ from agrobr.constants import Fonte
+
+
+ class Tier(StrEnum):
+     CRITICAL = "critical"
+     STANDARD = "standard"
+     BEST_EFFORT = "best_effort"
+
+
+ @dataclass
+ class FreshnessPolicy:
+     update_frequency: str
+     update_time: time | None = None
+     timezone: str = "America/Sao_Paulo"
+     weekends: bool = False
+     holidays: bool = False
+
+
+ @dataclass
+ class LatencyTarget:
+     p50_ms: int
+     p95_ms: int
+     p99_ms: int
+     timeout_ms: int
+
+
+ @dataclass
+ class AvailabilityTarget:
+     uptime_pct: float
+     planned_maintenance_window: str | None = None
+     degraded_mode_available: bool = True
+
+
+ @dataclass
+ class DataQualityTarget:
+     completeness_pct: float = 99.0
+     accuracy_checks: bool = True
+     schema_validation: bool = True
+     anomaly_detection: bool = True
+
+
+ @dataclass
+ class SourceSLA:
+     source: Fonte
+     tier: Tier
+     freshness: FreshnessPolicy
+     latency: LatencyTarget
+     availability: AvailabilityTarget
+     data_quality: DataQualityTarget
+     fallback_sources: list[Fonte] = field(default_factory=list)
+     notes: str = ""
+
+     def to_dict(self) -> dict[str, Any]:
+         return {
+             "source": self.source.value,
+             "tier": self.tier.value,
+             "freshness": {
+                 "update_frequency": self.freshness.update_frequency,
+                 "update_time": self.freshness.update_time.isoformat()
+                 if self.freshness.update_time
+                 else None,
+                 "timezone": self.freshness.timezone,
+                 "weekends": self.freshness.weekends,
+                 "holidays": self.freshness.holidays,
+             },
+             "latency": {
+                 "p50_ms": self.latency.p50_ms,
+                 "p95_ms": self.latency.p95_ms,
+                 "p99_ms": self.latency.p99_ms,
+                 "timeout_ms": self.latency.timeout_ms,
+             },
+             "availability": {
+                 "uptime_pct": self.availability.uptime_pct,
+                 "planned_maintenance_window": self.availability.planned_maintenance_window,
+                 "degraded_mode_available": self.availability.degraded_mode_available,
+             },
+             "data_quality": {
+                 "completeness_pct": self.data_quality.completeness_pct,
+                 "accuracy_checks": self.data_quality.accuracy_checks,
+                 "schema_validation": self.data_quality.schema_validation,
+                 "anomaly_detection": self.data_quality.anomaly_detection,
+             },
+             "fallback_sources": [f.value for f in self.fallback_sources],
+             "notes": self.notes,
+         }
+
+
+ CEPEA_SLA = SourceSLA(
+     source=Fonte.CEPEA,
+     tier=Tier.CRITICAL,
+     freshness=FreshnessPolicy(
+         update_frequency="daily",
+         update_time=time(18, 0),
+         weekends=False,
+         holidays=False,
+     ),
+     latency=LatencyTarget(
+         p50_ms=500,
+         p95_ms=2000,
+         p99_ms=5000,
+         timeout_ms=30000,
+     ),
+     availability=AvailabilityTarget(
+         uptime_pct=99.0,
+         degraded_mode_available=True,
+     ),
+     data_quality=DataQualityTarget(
+         completeness_pct=99.0,
+         accuracy_checks=True,
+         schema_validation=True,
+         anomaly_detection=True,
+     ),
+     fallback_sources=[Fonte.NOTICIAS_AGRICOLAS],
+     notes="CEPEA publishes daily indicators at 18:00. Cache expires at 18:00 the following day.",
+ )
+
+ CONAB_SLA = SourceSLA(
+     source=Fonte.CONAB,
+     tier=Tier.STANDARD,
+     freshness=FreshnessPolicy(
+         update_frequency="monthly",
+         weekends=False,
+         holidays=False,
+     ),
+     latency=LatencyTarget(
+         p50_ms=1000,
+         p95_ms=3000,
+         p99_ms=10000,
+         timeout_ms=60000,
+     ),
+     availability=AvailabilityTarget(
+         uptime_pct=95.0,
+         degraded_mode_available=True,
+     ),
+     data_quality=DataQualityTarget(
+         completeness_pct=95.0,
+         accuracy_checks=True,
+         schema_validation=True,
+         anomaly_detection=False,
+     ),
+     notes="CONAB publishes monthly bulletins. Harvest data is updated monthly.",
+ )
+
+ IBGE_SLA = SourceSLA(
+     source=Fonte.IBGE,
+     tier=Tier.STANDARD,
+     freshness=FreshnessPolicy(
+         update_frequency="monthly",
+         weekends=False,
+         holidays=False,
+     ),
+     latency=LatencyTarget(
+         p50_ms=800,
+         p95_ms=2500,
+         p99_ms=8000,
+         timeout_ms=45000,
+     ),
+     availability=AvailabilityTarget(
+         uptime_pct=98.0,
+         degraded_mode_available=True,
+     ),
+     data_quality=DataQualityTarget(
+         completeness_pct=98.0,
+         accuracy_checks=True,
+         schema_validation=True,
+         anomaly_detection=False,
+     ),
+     notes="IBGE SIDRA API. PAM is annual, LSPA is monthly.",
+ )
+
+ NOTICIAS_AGRICOLAS_SLA = SourceSLA(
+     source=Fonte.NOTICIAS_AGRICOLAS,
+     tier=Tier.BEST_EFFORT,
+     freshness=FreshnessPolicy(
+         update_frequency="daily",
+         update_time=time(19, 0),
+         weekends=False,
+         holidays=False,
+     ),
+     latency=LatencyTarget(
+         p50_ms=1500,
+         p95_ms=5000,
+         p99_ms=15000,
+         timeout_ms=45000,
+     ),
+     availability=AvailabilityTarget(
+         uptime_pct=90.0,
+         degraded_mode_available=False,
+     ),
+     data_quality=DataQualityTarget(
+         completeness_pct=90.0,
+         accuracy_checks=False,
+         schema_validation=True,
+         anomaly_detection=False,
+     ),
+     notes="Alternative source for CEPEA. Used as a fallback.",
+ )
+
+ _SLA_REGISTRY: dict[Fonte, SourceSLA] = {
+     Fonte.CEPEA: CEPEA_SLA,
+     Fonte.CONAB: CONAB_SLA,
+     Fonte.IBGE: IBGE_SLA,
+     Fonte.NOTICIAS_AGRICOLAS: NOTICIAS_AGRICOLAS_SLA,
+ }
+
+
+ def get_sla(source: Fonte) -> SourceSLA | None:
+     return _SLA_REGISTRY.get(source)
+
+
+ def list_slas() -> list[SourceSLA]:
+     return list(_SLA_REGISTRY.values())
+
+
+ def get_sla_summary() -> dict[str, Any]:
+     return {
+         "sources": [sla.to_dict() for sla in _SLA_REGISTRY.values()],
+         "tiers": {
+             "critical": "99%+ uptime, daily freshness, full validation",
+             "standard": "95%+ uptime, monthly freshness, schema validation",
+             "best_effort": "90%+ uptime, fallback source, basic validation",
+         },
+     }
+
+
+ __all__ = [
+     "Tier",
+     "FreshnessPolicy",
+     "LatencyTarget",
+     "AvailabilityTarget",
+     "DataQualityTarget",
+     "SourceSLA",
+     "CEPEA_SLA",
+     "CONAB_SLA",
+     "IBGE_SLA",
+     "NOTICIAS_AGRICOLAS_SLA",
+     "get_sla",
+     "list_slas",
+     "get_sla_summary",
+ ]
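
Likewise, a short sketch of how the SLA registry might be consumed. The registry is a plain dict keyed by Fonte, so lookups are O(1) and unknown sources return None rather than raising; driving an HTTP client timeout from the SLA is an illustrative assumption, and only the functions and fields shown above come from the module:

from agrobr.constants import Fonte
from agrobr.sla import get_sla, get_sla_summary, list_slas

sla = get_sla(Fonte.CEPEA)
if sla is not None:  # unknown sources return None rather than raising
    timeout_s = sla.latency.timeout_ms / 1000  # e.g. derive a client timeout from the SLA
    fallbacks = sla.fallback_sources  # [Fonte.NOTICIAS_AGRICOLAS]

# Everything is JSON-serializable via to_dict(), e.g. for a status dashboard:
summary = get_sla_summary()
print([s.source.value for s in list_slas()], summary["tiers"]["critical"])
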