agrobr 0.1.2__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agrobr/__init__.py +3 -2
- agrobr/benchmark/__init__.py +343 -0
- agrobr/cache/policies.py +3 -8
- agrobr/cepea/api.py +87 -30
- agrobr/cepea/client.py +0 -7
- agrobr/cli.py +141 -5
- agrobr/conab/api.py +72 -6
- agrobr/config.py +137 -0
- agrobr/constants.py +1 -2
- agrobr/contracts/__init__.py +186 -0
- agrobr/contracts/cepea.py +80 -0
- agrobr/contracts/conab.py +181 -0
- agrobr/contracts/ibge.py +146 -0
- agrobr/export.py +251 -0
- agrobr/health/__init__.py +10 -0
- agrobr/health/doctor.py +321 -0
- agrobr/http/browser.py +0 -9
- agrobr/ibge/api.py +104 -25
- agrobr/ibge/client.py +5 -20
- agrobr/models.py +100 -1
- agrobr/noticias_agricolas/client.py +0 -7
- agrobr/noticias_agricolas/parser.py +0 -17
- agrobr/plugins/__init__.py +205 -0
- agrobr/quality.py +319 -0
- agrobr/sla.py +249 -0
- agrobr/snapshots.py +321 -0
- agrobr/stability.py +148 -0
- agrobr/validators/semantic.py +447 -0
- {agrobr-0.1.2.dist-info → agrobr-0.5.0.dist-info}/METADATA +12 -12
- {agrobr-0.1.2.dist-info → agrobr-0.5.0.dist-info}/RECORD +33 -19
- {agrobr-0.1.2.dist-info → agrobr-0.5.0.dist-info}/WHEEL +0 -0
- {agrobr-0.1.2.dist-info → agrobr-0.5.0.dist-info}/entry_points.txt +0 -0
- {agrobr-0.1.2.dist-info → agrobr-0.5.0.dist-info}/licenses/LICENSE +0 -0

agrobr/plugins/__init__.py
ADDED

@@ -0,0 +1,205 @@

"""Plugin system for agrobr extensibility."""

from __future__ import annotations

import importlib
import importlib.util
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, TypeVar

import structlog

logger = structlog.get_logger()

T = TypeVar("T", bound="Plugin")

_registry: dict[str, type[Plugin]] = {}
_instances: dict[str, Plugin] = {}


@dataclass
class PluginMeta:
    name: str
    version: str
    description: str
    author: str = ""
    requires: list[str] = field(default_factory=list)
    provides: list[str] = field(default_factory=list)


class Plugin(ABC):
    meta: PluginMeta

    @abstractmethod
    def setup(self) -> None:
        pass

    @abstractmethod
    def teardown(self) -> None:
        pass

    def is_enabled(self) -> bool:
        return True


class SourcePlugin(Plugin):
    @abstractmethod
    async def fetch(self, **kwargs: Any) -> Any:
        pass

    @abstractmethod
    async def parse(self, content: Any, **kwargs: Any) -> Any:
        pass

    def get_source_name(self) -> str:
        return self.meta.name


class ParserPlugin(Plugin):
    @abstractmethod
    def can_parse(self, content: str) -> bool:
        pass

    @abstractmethod
    def parse(self, content: str, **kwargs: Any) -> Any:
        pass

    @property
    def priority(self) -> int:
        return 0


class ExporterPlugin(Plugin):
    @abstractmethod
    def export(self, data: Any, path: Path, **kwargs: Any) -> Path:
        pass

    @abstractmethod
    def get_extension(self) -> str:
        pass


class ValidatorPlugin(Plugin):
    @abstractmethod
    def validate(self, data: Any, **kwargs: Any) -> tuple[bool, list[str]]:
        pass


def register(plugin_class: type[T]) -> type[T]:
    if not hasattr(plugin_class, "meta"):
        raise ValueError(f"Plugin {plugin_class.__name__} must have 'meta' attribute")

    name = plugin_class.meta.name
    if name in _registry:
        logger.warning(
            "plugin_override", name=name, old=_registry[name].__name__, new=plugin_class.__name__
        )

    _registry[name] = plugin_class
    logger.info("plugin_registered", name=name, version=plugin_class.meta.version)
    return plugin_class


def get_plugin(name: str) -> Plugin | None:
    if name in _instances:
        return _instances[name]

    if name not in _registry:
        return None

    plugin_class = _registry[name]
    instance = plugin_class()
    instance.setup()
    _instances[name] = instance
    return instance

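
Together, register and get_plugin give the registry a decorator entry point and one lazily constructed, cached instance per plugin name. A minimal sketch of how a third-party exporter might hook in (the JsonExporter class and its metadata are hypothetical, not part of agrobr):

    # Hypothetical plugin; agrobr supplies only the base class and registry.
    import json
    from pathlib import Path
    from typing import Any

    from agrobr.plugins import ExporterPlugin, PluginMeta, get_plugin, register

    @register
    class JsonExporter(ExporterPlugin):
        meta = PluginMeta(name="json_exporter", version="0.1.0", description="Export to JSON")

        def setup(self) -> None:
            pass

        def teardown(self) -> None:
            pass

        def export(self, data: Any, path: Path, **kwargs: Any) -> Path:
            path.write_text(json.dumps(data))
            return path

        def get_extension(self) -> str:
            return ".json"

    exporter = get_plugin("json_exporter")  # instantiates, runs setup(), caches
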
def list_plugins() -> list[PluginMeta]:
    return [cls.meta for cls in _registry.values()]


def list_plugins_by_type(plugin_type: type[Plugin]) -> list[PluginMeta]:
    return [cls.meta for cls in _registry.values() if issubclass(cls, plugin_type)]


def load_plugin_from_file(path: Path) -> type[Plugin] | None:
    if not path.exists():
        logger.error("plugin_file_not_found", path=str(path))
        return None

    spec = importlib.util.spec_from_file_location(path.stem, path)
    if spec is None or spec.loader is None:
        logger.error("plugin_spec_failed", path=str(path))
        return None

    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    for attr_name in dir(module):
        attr = getattr(module, attr_name)
        if (
            isinstance(attr, type)
            and issubclass(attr, Plugin)
            and attr is not Plugin
            and hasattr(attr, "meta")
        ):
            register(attr)
            return attr

    logger.warning("no_plugin_found", path=str(path))
    return None


def load_plugins_from_dir(directory: Path) -> list[type[Plugin]]:
    loaded: list[type[Plugin]] = []
    if not directory.exists():
        return loaded

    for path in directory.glob("*.py"):
        if path.name.startswith("_"):
            continue
        plugin_class = load_plugin_from_file(path)
        if plugin_class:
            loaded.append(plugin_class)

    return loaded

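
load_plugin_from_file imports a module by path and registers the first Plugin subclass it finds that carries a meta attribute; load_plugins_from_dir applies that to every non-underscore .py file. Assuming an application keeps user plugins as loose .py files, discovery might look like this (the directory location is illustrative, not one agrobr defines):

    from pathlib import Path

    from agrobr.plugins import list_plugins, load_plugins_from_dir

    loaded = load_plugins_from_dir(Path.home() / ".agrobr" / "plugins")  # assumed location
    for meta in list_plugins():
        print(meta.name, meta.version)  # everything registered so far
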
def unload_plugin(name: str) -> bool:
    if name in _instances:
        _instances[name].teardown()
        del _instances[name]

    if name in _registry:
        del _registry[name]
        logger.info("plugin_unloaded", name=name)
        return True

    return False


def unload_all() -> None:
    for name in list(_instances.keys()):
        _instances[name].teardown()
    _instances.clear()
    _registry.clear()


__all__ = [
    "Plugin",
    "PluginMeta",
    "SourcePlugin",
    "ParserPlugin",
    "ExporterPlugin",
    "ValidatorPlugin",
    "register",
    "get_plugin",
    "list_plugins",
    "list_plugins_by_type",
    "load_plugin_from_file",
    "load_plugins_from_dir",
    "unload_plugin",
    "unload_all",
]
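
Note the asymmetry in the lifecycle: unload_plugin tears down a live instance if one exists, but returns True only when the name was actually registered. A short sketch, reusing the hypothetical json_exporter from above:

    from agrobr.plugins import get_plugin, unload_all, unload_plugin

    first = get_plugin("json_exporter")   # constructed, setup() runs, cached
    again = get_plugin("json_exporter")   # same cached instance, no second setup()
    unload_plugin("json_exporter")        # teardown() runs, registry entry removed
    unload_all()                          # teardown remaining instances, clear both tables
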
agrobr/quality.py
ADDED
@@ -0,0 +1,319 @@

"""Data quality certification."""

from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime
from enum import StrEnum
from typing import TYPE_CHECKING, Any

import structlog

if TYPE_CHECKING:
    import pandas as pd

logger = structlog.get_logger()


class QualityLevel(StrEnum):
    GOLD = "gold"
    SILVER = "silver"
    BRONZE = "bronze"
    UNCERTIFIED = "uncertified"


class CheckStatus(StrEnum):
    PASSED = "passed"
    FAILED = "failed"
    SKIPPED = "skipped"
    WARNING = "warning"


@dataclass
class QualityCheck:
    name: str
    status: CheckStatus
    message: str = ""
    details: dict[str, Any] = field(default_factory=dict)


@dataclass
class QualityCertificate:
    level: QualityLevel
    checks: list[QualityCheck]
    issued_at: datetime
    valid_until: datetime | None = None
    source: str = ""
    dataset: str = ""
    row_count: int = 0
    column_count: int = 0
    score: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        return {
            "level": self.level.value,
            "score": round(self.score, 2),
            "issued_at": self.issued_at.isoformat(),
            "valid_until": self.valid_until.isoformat() if self.valid_until else None,
            "source": self.source,
            "dataset": self.dataset,
            "row_count": self.row_count,
            "column_count": self.column_count,
            "checks": [
                {
                    "name": c.name,
                    "status": c.status.value,
                    "message": c.message,
                }
                for c in self.checks
            ],
            "summary": {
                "passed": sum(1 for c in self.checks if c.status == CheckStatus.PASSED),
                "failed": sum(1 for c in self.checks if c.status == CheckStatus.FAILED),
                "warnings": sum(1 for c in self.checks if c.status == CheckStatus.WARNING),
                "skipped": sum(1 for c in self.checks if c.status == CheckStatus.SKIPPED),
            },
        }

    def is_valid(self) -> bool:
        if self.valid_until is None:
            return True
        return datetime.now() < self.valid_until


def _check_completeness(df: pd.DataFrame, threshold: float = 0.95) -> QualityCheck:
    total_cells = df.size
    non_null_cells = df.count().sum()
    completeness = non_null_cells / total_cells if total_cells > 0 else 0

    if completeness >= threshold:
        return QualityCheck(
            name="completeness",
            status=CheckStatus.PASSED,
            message=f"Completeness: {completeness:.1%}",
            details={"completeness": completeness, "threshold": threshold},
        )
    elif completeness >= threshold * 0.9:
        return QualityCheck(
            name="completeness",
            status=CheckStatus.WARNING,
            message=f"Completeness below threshold: {completeness:.1%}",
            details={"completeness": completeness, "threshold": threshold},
        )
    else:
        return QualityCheck(
            name="completeness",
            status=CheckStatus.FAILED,
            message=f"Low completeness: {completeness:.1%}",
            details={"completeness": completeness, "threshold": threshold},
        )


def _check_duplicates(df: pd.DataFrame, max_dup_pct: float = 0.01) -> QualityCheck:
    dup_count = df.duplicated().sum()
    dup_pct = dup_count / len(df) if len(df) > 0 else 0

    if dup_pct <= max_dup_pct:
        return QualityCheck(
            name="duplicates",
            status=CheckStatus.PASSED,
            message=f"Duplicates: {dup_count} ({dup_pct:.1%})",
            details={"duplicate_count": int(dup_count), "duplicate_pct": dup_pct},
        )
    else:
        return QualityCheck(
            name="duplicates",
            status=CheckStatus.FAILED,
            message=f"Too many duplicates: {dup_count} ({dup_pct:.1%})",
            details={"duplicate_count": int(dup_count), "duplicate_pct": dup_pct},
        )


def _check_schema(df: pd.DataFrame, expected_columns: list[str] | None = None) -> QualityCheck:
    if expected_columns is None:
        return QualityCheck(
            name="schema",
            status=CheckStatus.SKIPPED,
            message="No expected schema provided",
        )

    actual_columns = set(df.columns)
    expected_set = set(expected_columns)
    missing = expected_set - actual_columns
    extra = actual_columns - expected_set

    if not missing and not extra:
        return QualityCheck(
            name="schema",
            status=CheckStatus.PASSED,
            message="Schema matches expected columns",
            details={"columns": list(actual_columns)},
        )
    elif not missing:
        return QualityCheck(
            name="schema",
            status=CheckStatus.WARNING,
            message=f"Extra columns found: {extra}",
            details={"missing": list(missing), "extra": list(extra)},
        )
    else:
        return QualityCheck(
            name="schema",
            status=CheckStatus.FAILED,
            message=f"Missing columns: {missing}",
            details={"missing": list(missing), "extra": list(extra)},
        )


def _check_freshness(
    df: pd.DataFrame,
    date_column: str = "data",
    max_age_days: int = 7,
) -> QualityCheck:
    if date_column not in df.columns:
        return QualityCheck(
            name="freshness",
            status=CheckStatus.SKIPPED,
            message=f"Date column '{date_column}' not found",
        )

    import pandas

    df[date_column] = pandas.to_datetime(df[date_column])
    max_date = df[date_column].max()
    age_days = (datetime.now() - max_date).days

    if age_days <= max_age_days:
        return QualityCheck(
            name="freshness",
            status=CheckStatus.PASSED,
            message=f"Data age: {age_days} days",
            details={"max_date": max_date.isoformat(), "age_days": age_days},
        )
    elif age_days <= max_age_days * 2:
        return QualityCheck(
            name="freshness",
            status=CheckStatus.WARNING,
            message=f"Data slightly stale: {age_days} days",
            details={"max_date": max_date.isoformat(), "age_days": age_days},
        )
    else:
        return QualityCheck(
            name="freshness",
            status=CheckStatus.FAILED,
            message=f"Data too old: {age_days} days",
            details={"max_date": max_date.isoformat(), "age_days": age_days},
        )


def _check_value_ranges(
    df: pd.DataFrame,
    column: str,
    min_val: float | None = None,
    max_val: float | None = None,
) -> QualityCheck:
    if column not in df.columns:
        return QualityCheck(
            name=f"range_{column}",
            status=CheckStatus.SKIPPED,
            message=f"Column '{column}' not found",
        )

    values = df[column].dropna()
    if len(values) == 0:
        return QualityCheck(
            name=f"range_{column}",
            status=CheckStatus.WARNING,
            message=f"Column '{column}' is empty",
        )

    actual_min = values.min()
    actual_max = values.max()
    violations = 0

    if min_val is not None:
        violations += (values < min_val).sum()
    if max_val is not None:
        violations += (values > max_val).sum()

    if violations == 0:
        return QualityCheck(
            name=f"range_{column}",
            status=CheckStatus.PASSED,
            message=f"All values in range [{min_val}, {max_val}]",
            details={"min": float(actual_min), "max": float(actual_max)},
        )
    else:
        return QualityCheck(
            name=f"range_{column}",
            status=CheckStatus.FAILED,
            message=f"{violations} values out of range",
            details={
                "min": float(actual_min),
                "max": float(actual_max),
                "violations": int(violations),
            },
        )

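
Each helper grades one dimension and degrades in steps rather than failing outright: _check_completeness, for instance, passes at or above the 0.95 default threshold, warns down to 90% of it (0.855), and only fails below that. A worked example (assumes pandas is installed; the helper is private, imported here purely for illustration):

    import pandas as pd

    from agrobr.quality import _check_completeness  # private helper, shown for illustration

    df = pd.DataFrame({"a": [1, 2, 3, None], "b": [1, 2, 3, 4], "c": [1, 2, 3, 4]})
    check = _check_completeness(df)  # 11 of 12 cells non-null -> 91.7%
    print(check.status)              # WARNING, since 0.855 <= 0.917 < 0.95
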
def certify(
    df: pd.DataFrame,
    source: str = "",
    dataset: str = "",
    expected_columns: list[str] | None = None,
    date_column: str = "data",
    value_column: str = "valor",
    min_value: float | None = 0,
    max_value: float | None = None,
) -> QualityCertificate:
    checks = []

    checks.append(_check_completeness(df))
    checks.append(_check_duplicates(df))
    checks.append(_check_schema(df, expected_columns))
    checks.append(_check_freshness(df, date_column))

    if value_column in df.columns:
        checks.append(_check_value_ranges(df, value_column, min_value, max_value))

    passed = sum(1 for c in checks if c.status == CheckStatus.PASSED)
    failed = sum(1 for c in checks if c.status == CheckStatus.FAILED)
    warnings = sum(1 for c in checks if c.status == CheckStatus.WARNING)
    total = passed + failed + warnings

    score = (passed + warnings * 0.5) / total if total > 0 else 0

    if score >= 0.9 and failed == 0:
        level = QualityLevel.GOLD
    elif score >= 0.7 and failed <= 1:
        level = QualityLevel.SILVER
    elif score >= 0.5:
        level = QualityLevel.BRONZE
    else:
        level = QualityLevel.UNCERTIFIED

    return QualityCertificate(
        level=level,
        checks=checks,
        issued_at=datetime.now(),
        source=source,
        dataset=dataset,
        row_count=len(df),
        column_count=len(df.columns),
        score=score,
    )

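
certify runs the whole battery and folds the outcomes into score = (passed + 0.5 * warnings) / total, where skipped checks drop out of the denominator; GOLD then requires score >= 0.9 with no failures. An end-to-end sketch with made-up values (the Portuguese column names "data" and "valor" are the function's defaults):

    import pandas as pd

    from agrobr.quality import certify

    df = pd.DataFrame({
        "data": pd.date_range(end=pd.Timestamp.today(), periods=5, freq="D"),
        "valor": [101.5, 102.0, 99.8, 100.2, 100.9],
    })
    cert = certify(df, source="example", dataset="example")
    # completeness, duplicates, freshness and range all pass; schema is skipped
    # (no expected_columns), so score = 4 / 4 = 1.0 and the level is GOLD.
    print(cert.level, cert.score, cert.to_dict()["summary"])
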
def quick_check(df: pd.DataFrame) -> tuple[QualityLevel, float]:
    cert = certify(df)
    return cert.level, cert.score


__all__ = [
    "QualityLevel",
    "CheckStatus",
    "QualityCheck",
    "QualityCertificate",
    "certify",
    "quick_check",
]