agrobr 0.1.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,205 @@
1
+ """Sistema de plugins para extensibilidade do agrobr."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import importlib
6
+ import importlib.util
7
+ from abc import ABC, abstractmethod
8
+ from dataclasses import dataclass, field
9
+ from pathlib import Path
10
+ from typing import Any, TypeVar
11
+
12
+ import structlog
13
+
14
# Shared structlog logger for this module.
logger = structlog.get_logger()

# Type variable bound to Plugin so register() can return the decorated class itself.
T = TypeVar("T", bound="Plugin")

# Global registry of plugin classes, keyed by meta.name.
_registry: dict[str, type[Plugin]] = {}
# Cache of instantiated (already set-up) plugins, keyed by meta.name.
_instances: dict[str, Plugin] = {}
20
+
21
+
22
@dataclass
class PluginMeta:
    """Metadata describing a plugin: identity, version and declared relations."""

    name: str  # unique registry key; re-registering the same name logs an override warning
    version: str
    description: str
    author: str = ""
    requires: list[str] = field(default_factory=list)  # NOTE(review): presumably names of required plugins — not read anywhere in this module, confirm
    provides: list[str] = field(default_factory=list)  # NOTE(review): presumably exposed capabilities — not read anywhere in this module, confirm
30
+
31
+
32
class Plugin(ABC):
    """Abstract base class for all plugins.

    Concrete subclasses must provide a class-level ``meta`` (a PluginMeta)
    — ``register()`` raises ValueError without it — and implement
    ``setup``/``teardown``.
    """

    # Concrete subclasses are expected to assign a PluginMeta instance here.
    meta: PluginMeta

    @abstractmethod
    def setup(self) -> None:
        """Initialize the plugin; called once by get_plugin() on first use."""
        pass

    @abstractmethod
    def teardown(self) -> None:
        """Release the plugin's resources; called when the plugin is unloaded."""
        pass

    def is_enabled(self) -> bool:
        """Whether the plugin is active. Defaults to True; subclasses may override."""
        return True
45
+
46
+
47
class SourcePlugin(Plugin):
    """Plugin that fetches raw content from a data source and parses it."""

    @abstractmethod
    async def fetch(self, **kwargs: Any) -> Any:
        """Retrieve raw content from the source."""
        pass

    @abstractmethod
    async def parse(self, content: Any, **kwargs: Any) -> Any:
        """Parse previously fetched raw *content* into structured data."""
        pass

    def get_source_name(self) -> str:
        """Return the source identifier (the plugin's ``meta.name``)."""
        return self.meta.name
58
+
59
+
60
class ParserPlugin(Plugin):
    """Plugin that parses textual content it recognizes."""

    @abstractmethod
    def can_parse(self, content: str) -> bool:
        """Return True when this parser can handle *content*."""
        pass

    @abstractmethod
    def parse(self, content: str, **kwargs: Any) -> Any:
        """Parse *content* into structured data."""
        pass

    @property
    def priority(self) -> int:
        """Parser priority; defaults to 0.

        NOTE(review): presumably used to order competing parsers (higher
        first?) — not consumed anywhere in this module, confirm with callers.
        """
        return 0
72
+
73
+
74
class ExporterPlugin(Plugin):
    """Plugin that writes data out to a file in some format."""

    @abstractmethod
    def export(self, data: Any, path: Path, **kwargs: Any) -> Path:
        """Write *data* to *path*; return the path actually written."""
        pass

    @abstractmethod
    def get_extension(self) -> str:
        """Return the file extension this exporter produces."""
        pass
82
+
83
+
84
class ValidatorPlugin(Plugin):
    """Plugin that validates data."""

    @abstractmethod
    def validate(self, data: Any, **kwargs: Any) -> tuple[bool, list[str]]:
        """Validate *data*; return ``(is_valid, error_messages)``."""
        pass
88
+
89
+
90
def register(plugin_class: type[T]) -> type[T]:
    """Register *plugin_class* in the global plugin registry.

    Usable as a class decorator: the class must carry a ``meta``
    attribute (a PluginMeta); ``meta.name`` becomes the registry key.
    Returns the class unchanged so decorator usage is transparent.

    Raises:
        ValueError: when the class has no ``meta`` attribute.
    """
    if not hasattr(plugin_class, "meta"):
        raise ValueError(f"Plugin {plugin_class.__name__} must have 'meta' attribute")

    name = plugin_class.meta.name
    if name in _registry:
        logger.warning(
            "plugin_override", name=name, old=_registry[name].__name__, new=plugin_class.__name__
        )
        # Bug fix: without evicting the cached instance, get_plugin(name)
        # kept returning an instance of the *old* class after override.
        stale = _instances.pop(name, None)
        if stale is not None:
            try:
                stale.teardown()
            except Exception:
                # Best effort: a failing teardown must not block re-registration.
                logger.exception("plugin_teardown_failed", name=name)

    _registry[name] = plugin_class
    logger.info("plugin_registered", name=name, version=plugin_class.meta.version)
    return plugin_class
103
+
104
+
105
def get_plugin(name: str) -> Plugin | None:
    """Return a ready (set-up) instance of the plugin *name*, or None.

    Instances are created lazily and cached, so repeated calls return
    the same object. Unknown names yield None.
    """
    cached = _instances.get(name)
    if cached is not None:
        return cached

    plugin_cls = _registry.get(name)
    if plugin_cls is None:
        return None

    plugin = plugin_cls()
    plugin.setup()
    _instances[name] = plugin
    return plugin
117
+
118
+
119
def list_plugins() -> list[PluginMeta]:
    """Return the metadata of every registered plugin class."""
    return [registered.meta for registered in _registry.values()]
121
+
122
+
123
def list_plugins_by_type(plugin_type: type[Plugin]) -> list[PluginMeta]:
    """Return metadata of registered plugins that subclass *plugin_type*."""
    return [
        candidate.meta
        for candidate in _registry.values()
        if issubclass(candidate, plugin_type)
    ]
125
+
126
+
127
def load_plugin_from_file(path: Path) -> type[Plugin] | None:
    """Dynamically load and register a plugin class from a Python file.

    Imports *path* as a standalone module, then registers and returns
    the first Plugin subclass (with a ``meta`` attribute) found in it.

    Returns None — after logging — when the file is missing, cannot be
    imported, or contains no plugin class.
    """
    if not path.exists():
        logger.error("plugin_file_not_found", path=str(path))
        return None

    spec = importlib.util.spec_from_file_location(path.stem, path)
    if spec is None or spec.loader is None:
        logger.error("plugin_spec_failed", path=str(path))
        return None

    module = importlib.util.module_from_spec(spec)
    try:
        # Bug fix: exec_module runs arbitrary plugin code; a syntax or
        # runtime error there used to propagate and abort the whole
        # directory scan in load_plugins_from_dir.
        spec.loader.exec_module(module)
    except Exception:
        logger.exception("plugin_exec_failed", path=str(path))
        return None

    for attr_name in dir(module):
        attr = getattr(module, attr_name)
        if (
            isinstance(attr, type)
            and issubclass(attr, Plugin)
            and attr is not Plugin
            and hasattr(attr, "meta")  # abstract bases have no meta, so they are skipped
        ):
            register(attr)
            return attr

    logger.warning("no_plugin_found", path=str(path))
    return None
153
+
154
+
155
def load_plugins_from_dir(directory: Path) -> list[type[Plugin]]:
    """Load every non-underscore ``*.py`` file in *directory* as a plugin.

    Returns the plugin classes that loaded successfully; a missing
    directory yields an empty list.
    """
    if not directory.exists():
        return []

    # Skip private/dunder modules such as __init__.py.
    candidates = (p for p in directory.glob("*.py") if not p.name.startswith("_"))
    maybe_loaded = (load_plugin_from_file(p) for p in candidates)
    return [plugin_cls for plugin_cls in maybe_loaded if plugin_cls]
168
+
169
+
170
def unload_plugin(name: str) -> bool:
    """Unload the plugin *name*: tear down its cached instance (if any)
    and remove it from the registry.

    Returns True when the name was registered, False otherwise.

    Robustness fix: a raising ``teardown()`` used to abort the unload
    and leave the stale instance cached; teardown is now best-effort
    (logged) and the instance is always evicted.
    """
    instance = _instances.pop(name, None)
    if instance is not None:
        try:
            instance.teardown()
        except Exception:
            logger.exception("plugin_teardown_failed", name=name)

    if name in _registry:
        del _registry[name]
        logger.info("plugin_unloaded", name=name)
        return True

    return False
181
+
182
+
183
def unload_all() -> None:
    """Tear down every cached plugin instance and clear registry and cache.

    Robustness fix: each teardown is isolated, so one failing plugin no
    longer prevents the remaining plugins from being torn down.
    """
    for name, instance in list(_instances.items()):
        try:
            instance.teardown()
        except Exception:
            logger.exception("plugin_teardown_failed", name=name)
    _instances.clear()
    _registry.clear()
188
+
189
+
190
# Explicit public API of the plugin system.
__all__ = [
    "Plugin",
    "PluginMeta",
    "SourcePlugin",
    "ParserPlugin",
    "ExporterPlugin",
    "ValidatorPlugin",
    "register",
    "get_plugin",
    "list_plugins",
    "list_plugins_by_type",
    "load_plugin_from_file",
    "load_plugins_from_dir",
    "unload_plugin",
    "unload_all",
]
agrobr/quality.py ADDED
@@ -0,0 +1,319 @@
1
+ """Certificacao de qualidade de dados."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from datetime import datetime
7
+ from enum import StrEnum
8
+ from typing import TYPE_CHECKING, Any
9
+
10
+ import structlog
11
+
12
if TYPE_CHECKING:
    # Imported for type hints only; the runtime import is local to _check_freshness.
    import pandas as pd

# Shared structlog logger for this module.
logger = structlog.get_logger()
16
+
17
+
18
class QualityLevel(StrEnum):
    """Certification tier awarded by certify(), from best to worst."""

    GOLD = "gold"  # score >= 0.9 and no failed checks
    SILVER = "silver"  # score >= 0.7 and at most one failed check
    BRONZE = "bronze"  # score >= 0.5
    UNCERTIFIED = "uncertified"  # everything below
23
+
24
+
25
class CheckStatus(StrEnum):
    """Outcome of a single quality check."""

    PASSED = "passed"
    FAILED = "failed"
    SKIPPED = "skipped"  # check not applicable (e.g. required column absent)
    WARNING = "warning"  # borderline result; counts half in the score
30
+
31
+
32
@dataclass
class QualityCheck:
    """Result of one data-quality check."""

    name: str  # check identifier, e.g. "completeness" or "range_<column>"
    status: CheckStatus
    message: str = ""  # human-readable summary
    details: dict[str, Any] = field(default_factory=dict)  # machine-readable metrics
38
+
39
+
40
@dataclass
class QualityCertificate:
    """Aggregated result of a data-quality certification run."""

    level: QualityLevel
    checks: list[QualityCheck]
    issued_at: datetime
    valid_until: datetime | None = None
    source: str = ""
    dataset: str = ""
    row_count: int = 0
    column_count: int = 0
    score: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Serialize the certificate, plus a per-status summary, to plain types."""
        # Count every status in one pass instead of four scans.
        status_counts = {status: 0 for status in CheckStatus}
        for check in self.checks:
            status_counts[check.status] += 1

        expiry = self.valid_until.isoformat() if self.valid_until else None
        return {
            "level": self.level.value,
            "score": round(self.score, 2),
            "issued_at": self.issued_at.isoformat(),
            "valid_until": expiry,
            "source": self.source,
            "dataset": self.dataset,
            "row_count": self.row_count,
            "column_count": self.column_count,
            "checks": [
                {"name": check.name, "status": check.status.value, "message": check.message}
                for check in self.checks
            ],
            "summary": {
                "passed": status_counts[CheckStatus.PASSED],
                "failed": status_counts[CheckStatus.FAILED],
                "warnings": status_counts[CheckStatus.WARNING],
                "skipped": status_counts[CheckStatus.SKIPPED],
            },
        }

    def is_valid(self) -> bool:
        """True while the certificate has not expired (no expiry = always valid)."""
        return self.valid_until is None or datetime.now() < self.valid_until
82
+
83
+
84
def _check_completeness(df: pd.DataFrame, threshold: float = 0.95) -> QualityCheck:
    """Check the fraction of non-null cells against *threshold*.

    PASSED at or above the threshold, WARNING within 90% of it,
    FAILED below that. An empty frame counts as 0% complete.
    """
    cell_total = df.size
    completeness = df.count().sum() / cell_total if cell_total > 0 else 0

    if completeness >= threshold:
        status = CheckStatus.PASSED
        message = f"Completeness: {completeness:.1%}"
    elif completeness >= threshold * 0.9:
        status = CheckStatus.WARNING
        message = f"Completeness below threshold: {completeness:.1%}"
    else:
        status = CheckStatus.FAILED
        message = f"Low completeness: {completeness:.1%}"

    return QualityCheck(
        name="completeness",
        status=status,
        message=message,
        details={"completeness": completeness, "threshold": threshold},
    )
110
+
111
+
112
def _check_duplicates(df: pd.DataFrame, max_dup_pct: float = 0.01) -> QualityCheck:
    """Fail when the share of fully duplicated rows exceeds *max_dup_pct*."""
    dup_total = df.duplicated().sum()
    dup_ratio = dup_total / len(df) if len(df) > 0 else 0

    if dup_ratio <= max_dup_pct:
        status = CheckStatus.PASSED
        message = f"Duplicates: {dup_total} ({dup_ratio:.1%})"
    else:
        status = CheckStatus.FAILED
        message = f"Too many duplicates: {dup_total} ({dup_ratio:.1%})"

    return QualityCheck(
        name="duplicates",
        status=status,
        message=message,
        details={"duplicate_count": int(dup_total), "duplicate_pct": dup_ratio},
    )
130
+
131
+
132
def _check_schema(df: pd.DataFrame, expected_columns: list[str] | None = None) -> QualityCheck:
    """Compare the frame's columns to *expected_columns*.

    SKIPPED when no expectation is given; missing columns fail the
    check, extra columns only warn.
    """
    if expected_columns is None:
        return QualityCheck(
            name="schema",
            status=CheckStatus.SKIPPED,
            message="No expected schema provided",
        )

    present = set(df.columns)
    wanted = set(expected_columns)
    missing = wanted - present
    extra = present - wanted

    if missing:
        status = CheckStatus.FAILED
        message = f"Missing columns: {missing}"
        details = {"missing": list(missing), "extra": list(extra)}
    elif extra:
        status = CheckStatus.WARNING
        message = f"Extra columns found: {extra}"
        details = {"missing": list(missing), "extra": list(extra)}
    else:
        status = CheckStatus.PASSED
        message = "Schema matches expected columns"
        details = {"columns": list(present)}

    return QualityCheck(name="schema", status=status, message=message, details=details)
166
+
167
+
168
def _check_freshness(
    df: pd.DataFrame,
    date_column: str = "data",
    max_age_days: int = 7,
) -> QualityCheck:
    """Check how old the newest value in *date_column* is.

    PASSED up to *max_age_days*, WARNING up to twice that, FAILED
    beyond. SKIPPED when the column is absent or holds no parseable
    dates.

    Bug fix: the date conversion is now done on a local Series —
    previously the converted column was assigned back into the caller's
    DataFrame, mutating the input as a side effect.
    """
    if date_column not in df.columns:
        return QualityCheck(
            name="freshness",
            status=CheckStatus.SKIPPED,
            message=f"Date column '{date_column}' not found",
        )

    import pandas

    # Convert on a local Series; never write back into df.
    dates = pandas.to_datetime(df[date_column])
    max_date = dates.max()
    if pandas.isna(max_date):
        # Empty or all-NaT column: age arithmetic below would not yield a usable int.
        return QualityCheck(
            name="freshness",
            status=CheckStatus.SKIPPED,
            message=f"Date column '{date_column}' has no valid dates",
        )

    age_days = (datetime.now() - max_date).days

    if age_days <= max_age_days:
        status = CheckStatus.PASSED
        message = f"Data age: {age_days} days"
    elif age_days <= max_age_days * 2:
        status = CheckStatus.WARNING
        message = f"Data slightly stale: {age_days} days"
    else:
        status = CheckStatus.FAILED
        message = f"Data too old: {age_days} days"

    return QualityCheck(
        name="freshness",
        status=status,
        message=message,
        details={"max_date": max_date.isoformat(), "age_days": age_days},
    )
207
+
208
+
209
def _check_value_ranges(
    df: pd.DataFrame,
    column: str,
    min_val: float | None = None,
    max_val: float | None = None,
) -> QualityCheck:
    """Count values of *column* outside ``[min_val, max_val]``; any violation fails.

    SKIPPED when the column is missing, WARNING when it has no non-null
    values. A None bound is simply not enforced.
    """
    check_name = f"range_{column}"

    if column not in df.columns:
        return QualityCheck(
            name=check_name,
            status=CheckStatus.SKIPPED,
            message=f"Column '{column}' not found",
        )

    series = df[column].dropna()
    if len(series) == 0:
        return QualityCheck(
            name=check_name,
            status=CheckStatus.WARNING,
            message=f"Column '{column}' is empty",
        )

    observed_min = series.min()
    observed_max = series.max()
    out_of_range = 0
    if min_val is not None:
        out_of_range += (series < min_val).sum()
    if max_val is not None:
        out_of_range += (series > max_val).sum()

    if out_of_range == 0:
        return QualityCheck(
            name=check_name,
            status=CheckStatus.PASSED,
            message=f"All values in range [{min_val}, {max_val}]",
            details={"min": float(observed_min), "max": float(observed_max)},
        )
    return QualityCheck(
        name=check_name,
        status=CheckStatus.FAILED,
        message=f"{out_of_range} values out of range",
        details={
            "min": float(observed_min),
            "max": float(observed_max),
            "violations": int(out_of_range),
        },
    )
257
+
258
+
259
def certify(
    df: pd.DataFrame,
    source: str = "",
    dataset: str = "",
    expected_columns: list[str] | None = None,
    date_column: str = "data",
    value_column: str = "valor",
    min_value: float | None = 0,
    max_value: float | None = None,
) -> QualityCertificate:
    """Run all quality checks on *df* and issue a QualityCertificate.

    The score is the pass rate with warnings counting half; SKIPPED
    checks are excluded from the denominator. Levels: GOLD (>= 0.9, no
    failures), SILVER (>= 0.7, at most one failure), BRONZE (>= 0.5),
    otherwise UNCERTIFIED.
    """
    checks = [
        _check_completeness(df),
        _check_duplicates(df),
        _check_schema(df, expected_columns),
        _check_freshness(df, date_column),
    ]
    # The value-range check only applies when the value column exists.
    if value_column in df.columns:
        checks.append(_check_value_ranges(df, value_column, min_value, max_value))

    statuses = [check.status for check in checks]
    passed = statuses.count(CheckStatus.PASSED)
    failed = statuses.count(CheckStatus.FAILED)
    warnings = statuses.count(CheckStatus.WARNING)
    scored = passed + failed + warnings  # skipped checks do not count

    score = (passed + warnings * 0.5) / scored if scored > 0 else 0

    if score >= 0.9 and failed == 0:
        level = QualityLevel.GOLD
    elif score >= 0.7 and failed <= 1:
        level = QualityLevel.SILVER
    elif score >= 0.5:
        level = QualityLevel.BRONZE
    else:
        level = QualityLevel.UNCERTIFIED

    return QualityCertificate(
        level=level,
        checks=checks,
        issued_at=datetime.now(),
        source=source,
        dataset=dataset,
        row_count=len(df),
        column_count=len(df.columns),
        score=score,
    )
305
+
306
+
307
def quick_check(df: pd.DataFrame) -> tuple[QualityLevel, float]:
    """Certify *df* with default settings and return only ``(level, score)``."""
    certificate = certify(df)
    return certificate.level, certificate.score
310
+
311
+
312
# Explicit public API of the quality-certification module.
__all__ = [
    "QualityLevel",
    "CheckStatus",
    "QualityCheck",
    "QualityCertificate",
    "certify",
    "quick_check",
]