agrobr-0.1.0-py3-none-any.whl → agrobr-0.5.0-py3-none-any.whl
This diff compares the publicly released contents of the two package versions as published to their public registry, and is provided for informational purposes only.
- agrobr/__init__.py +3 -2
- agrobr/benchmark/__init__.py +343 -0
- agrobr/cache/policies.py +99 -17
- agrobr/cepea/api.py +87 -30
- agrobr/cepea/client.py +1 -8
- agrobr/cli.py +141 -5
- agrobr/conab/api.py +72 -6
- agrobr/config.py +137 -0
- agrobr/constants.py +1 -2
- agrobr/contracts/__init__.py +186 -0
- agrobr/contracts/cepea.py +80 -0
- agrobr/contracts/conab.py +181 -0
- agrobr/contracts/ibge.py +146 -0
- agrobr/export.py +251 -0
- agrobr/health/__init__.py +10 -0
- agrobr/health/doctor.py +321 -0
- agrobr/http/browser.py +0 -9
- agrobr/ibge/api.py +104 -25
- agrobr/ibge/client.py +5 -20
- agrobr/models.py +100 -1
- agrobr/noticias_agricolas/client.py +0 -7
- agrobr/noticias_agricolas/parser.py +0 -17
- agrobr/plugins/__init__.py +205 -0
- agrobr/quality.py +319 -0
- agrobr/sla.py +249 -0
- agrobr/snapshots.py +321 -0
- agrobr/stability.py +148 -0
- agrobr/validators/semantic.py +447 -0
- {agrobr-0.1.0.dist-info → agrobr-0.5.0.dist-info}/METADATA +12 -12
- {agrobr-0.1.0.dist-info → agrobr-0.5.0.dist-info}/RECORD +33 -19
- {agrobr-0.1.0.dist-info → agrobr-0.5.0.dist-info}/WHEEL +0 -0
- {agrobr-0.1.0.dist-info → agrobr-0.5.0.dist-info}/entry_points.txt +0 -0
- {agrobr-0.1.0.dist-info → agrobr-0.5.0.dist-info}/licenses/LICENSE +0 -0
agrobr/contracts/ibge.py
ADDED
@@ -0,0 +1,146 @@
"""Stability contracts for IBGE data."""

from agrobr.contracts import BreakingChangePolicy, Column, ColumnType, Contract

IBGE_PAM_V1 = Contract(
    name="ibge.pam",
    version="1.0",
    effective_from="0.3.0",
    columns=[
        Column(
            name="ano",
            type=ColumnType.INTEGER,
            nullable=False,
            stable=True,
            description="Ano de referencia",
        ),
        Column(
            name="localidade",
            type=ColumnType.STRING,
            nullable=True,
            stable=True,
            description="Nome da localidade (UF ou municipio)",
        ),
        Column(
            name="produto",
            type=ColumnType.STRING,
            nullable=False,
            stable=True,
            description="Nome do produto",
        ),
        Column(
            name="area_plantada",
            type=ColumnType.FLOAT,
            nullable=True,
            unit="ha",
            stable=True,
            description="Area plantada em hectares",
        ),
        Column(
            name="area_colhida",
            type=ColumnType.FLOAT,
            nullable=True,
            unit="ha",
            stable=True,
            description="Area colhida em hectares",
        ),
        Column(
            name="producao",
            type=ColumnType.FLOAT,
            nullable=True,
            unit="ton",
            stable=True,
            description="Quantidade produzida em toneladas",
        ),
        Column(
            name="rendimento",
            type=ColumnType.FLOAT,
            nullable=True,
            unit="kg/ha",
            stable=True,
            description="Rendimento medio em kg/ha",
        ),
        Column(
            name="valor_producao",
            type=ColumnType.FLOAT,
            nullable=True,
            unit="mil_reais",
            stable=True,
            description="Valor da producao em mil reais",
        ),
        Column(
            name="fonte",
            type=ColumnType.STRING,
            nullable=False,
            stable=True,
            description="Fonte dos dados (ibge_pam)",
        ),
    ],
    guarantees=[
        "Column names never change (additions only)",
        "'ano' is always a valid year (>= 1974)",
        "Numeric values are always >= 0",
        "'fonte' is always 'ibge_pam'",
    ],
    breaking_policy=BreakingChangePolicy.MAJOR_VERSION,
)

IBGE_LSPA_V1 = Contract(
    name="ibge.lspa",
    version="1.0",
    effective_from="0.3.0",
    columns=[
        Column(
            name="ano",
            type=ColumnType.INTEGER,
            nullable=False,
            stable=True,
            description="Ano de referencia",
        ),
        Column(
            name="mes",
            type=ColumnType.INTEGER,
            nullable=True,
            stable=True,
            description="Mes de referencia (1-12)",
        ),
        Column(
            name="produto",
            type=ColumnType.STRING,
            nullable=False,
            stable=True,
            description="Nome do produto",
        ),
        Column(
            name="variavel",
            type=ColumnType.STRING,
            nullable=True,
            stable=False,
            description="Nome da variavel",
        ),
        Column(
            name="valor",
            type=ColumnType.FLOAT,
            nullable=True,
            stable=False,
            description="Valor da variavel",
        ),
        Column(
            name="fonte",
            type=ColumnType.STRING,
            nullable=False,
            stable=True,
            description="Fonte dos dados (ibge_lspa)",
        ),
    ],
    guarantees=[
        "Column names never change (additions only)",
        "'ano' is always a valid year",
        "'mes' is between 1 and 12 when present",
        "'fonte' is always 'ibge_lspa'",
    ],
    breaking_policy=BreakingChangePolicy.MAJOR_VERSION,
)


__all__ = ["IBGE_PAM_V1", "IBGE_LSPA_V1"]
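Downstream code can treat these contract objects as machine-readable schema documentation. A minimal sketch of one way to consume them, assuming Contract and Column expose their constructor arguments as attributes, as a dataclass-style model would (the Contract definition in agrobr/contracts/__init__.py is not reproduced in this section); missing_stable_columns is a hypothetical helper, not part of the package:

import pandas as pd

from agrobr.contracts.ibge import IBGE_PAM_V1

def missing_stable_columns(df: pd.DataFrame) -> list[str]:
    # Columns the contract marks stable=True are guaranteed to stay present
    # (additions only); attribute access on Contract/Column is assumed here.
    stable = [col.name for col in IBGE_PAM_V1.columns if col.stable]
    return [name for name in stable if name not in df.columns]

An empty return value means the stable part of the PAM schema still holds for the frame at hand.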
agrobr/export.py
ADDED
@@ -0,0 +1,251 @@
"""Export to auditable formats."""

from __future__ import annotations

import csv
import hashlib
import json
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING, Any

import structlog

if TYPE_CHECKING:
    import pandas as pd

    from agrobr.models import MetaInfo

logger = structlog.get_logger()


def export_parquet(
    df: pd.DataFrame,
    path: str | Path,
    meta: MetaInfo | None = None,
    compression: str = "snappy",
) -> Path:
    """
    Export a DataFrame to Parquet with metadata.

    Args:
        df: DataFrame to export
        path: Output file path
        meta: Optional metadata
        compression: Compression codec (snappy, gzip, zstd)

    Returns:
        Path of the created file
    """
    import pyarrow as pa
    import pyarrow.parquet as pq

    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    table = pa.Table.from_pandas(df)

    metadata = {
        b"agrobr_version": _get_version().encode(),
        b"export_timestamp": datetime.now().isoformat().encode(),
        b"row_count": str(len(df)).encode(),
    }

    if meta:
        metadata[b"source"] = meta.source.encode()
        metadata[b"source_url"] = meta.source_url.encode()
        metadata[b"fetched_at"] = meta.fetched_at.isoformat().encode()
        if meta.raw_content_hash:
            metadata[b"content_hash"] = meta.raw_content_hash.encode()

    existing_meta = table.schema.metadata or {}
    table = table.replace_schema_metadata({**existing_meta, **metadata})

    pq.write_table(table, path, compression=compression)

    logger.info("export_parquet", path=str(path), rows=len(df))
    return path


def export_csv(
    df: pd.DataFrame,
    path: str | Path,
    meta: MetaInfo | None = None,
    include_header: bool = True,
    include_sidecar: bool = True,
) -> tuple[Path, Path | None]:
    """
    Export a DataFrame to CSV with a metadata sidecar file.

    Args:
        df: DataFrame to export
        path: Output file path
        meta: Optional metadata
        include_header: Include the header row
        include_sidecar: Create a .meta.json file

    Returns:
        Tuple (csv_path, sidecar_path or None)
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    df.to_csv(path, index=False, header=include_header, quoting=csv.QUOTE_NONNUMERIC)

    sidecar_path = None
    if include_sidecar:
        sidecar_path = path.with_suffix(".meta.json")
        sidecar_data = _create_sidecar(df, meta)
        with open(sidecar_path, "w") as f:
            json.dump(sidecar_data, f, indent=2, ensure_ascii=False)

    logger.info("export_csv", path=str(path), rows=len(df))
    return path, sidecar_path


def export_json(
    df: pd.DataFrame,
    path: str | Path,
    meta: MetaInfo | None = None,
    orient: str = "records",
    include_metadata: bool = True,
) -> Path:
    """
    Export a DataFrame to JSON with embedded metadata.

    Args:
        df: DataFrame to export
        path: Output file path
        meta: Optional metadata
        orient: JSON orientation (records, split, index, etc.)
        include_metadata: Embed metadata in the JSON

    Returns:
        Path of the created file
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    if include_metadata:
        output = {
            "metadata": _create_sidecar(df, meta),
            "data": json.loads(df.to_json(orient=orient, date_format="iso")),  # type: ignore[call-overload]
        }
        with open(path, "w") as f:
            json.dump(output, f, indent=2, ensure_ascii=False)
    else:
        df.to_json(path, orient=orient, date_format="iso", indent=2)  # type: ignore[call-overload]

    logger.info("export_json", path=str(path), rows=len(df))
    return path


def _create_sidecar(df: pd.DataFrame, meta: MetaInfo | None = None) -> dict[str, Any]:
    """Build metadata for a sidecar file."""
    csv_bytes = df.to_csv(index=False).encode("utf-8")
    content_hash = hashlib.sha256(csv_bytes).hexdigest()

    sidecar: dict[str, Any] = {
        "agrobr_version": _get_version(),
        "export_timestamp": datetime.now().isoformat(),
        "file_info": {
            "row_count": len(df),
            "column_count": len(df.columns),
            "columns": df.columns.tolist(),
            "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
            "content_hash": f"sha256:{content_hash}",
        },
    }

    if meta:
        sidecar["provenance"] = {
            "source": meta.source,
            "source_url": meta.source_url,
            "source_method": meta.source_method,
            "fetched_at": meta.fetched_at.isoformat(),
            "from_cache": meta.from_cache,
            "original_hash": meta.raw_content_hash,
        }

    return sidecar


def verify_export(path: str | Path, expected_hash: str | None = None) -> dict[str, Any]:
    """
    Verify the integrity of an exported file.

    Args:
        path: File path
        expected_hash: Expected hash (optional)

    Returns:
        Dict with the verification status
    """
    path = Path(path)

    if not path.exists():
        return {"valid": False, "error": "File not found"}

    result: dict[str, Any] = {
        "valid": True,
        "path": str(path),
        "size_bytes": path.stat().st_size,
    }

    if path.suffix == ".parquet":
        import pyarrow.parquet as pq

        try:
            table = pq.read_table(path)
            result["row_count"] = table.num_rows
            result["columns"] = table.schema.names

            metadata = table.schema.metadata or {}
            if b"content_hash" in metadata:
                result["stored_hash"] = metadata[b"content_hash"].decode()
        except Exception as e:
            result["valid"] = False
            result["error"] = str(e)

    elif path.suffix == ".csv":
        import pandas as pd

        try:
            df = pd.read_csv(path)
            result["row_count"] = len(df)
            result["columns"] = df.columns.tolist()

            csv_bytes = df.to_csv(index=False).encode("utf-8")
            result["computed_hash"] = f"sha256:{hashlib.sha256(csv_bytes).hexdigest()}"

            sidecar_path = path.with_suffix(".meta.json")
            if sidecar_path.exists():
                with open(sidecar_path) as f:
                    sidecar = json.load(f)
                result["stored_hash"] = sidecar.get("file_info", {}).get("content_hash")
        except Exception as e:
            result["valid"] = False
            result["error"] = str(e)

    if expected_hash and result.get("computed_hash"):
        result["hash_match"] = result["computed_hash"] == expected_hash

    return result


def _get_version() -> str:
    """Return the installed agrobr version."""
    try:
        import agrobr

        return getattr(agrobr, "__version__", "unknown")
    except ImportError:
        return "unknown"


__all__ = [
    "export_parquet",
    "export_csv",
    "export_json",
    "verify_export",
]
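A usage sketch built only from the signatures above; the DataFrame, paths, and values are illustrative:

import pandas as pd

from agrobr.export import export_csv, export_parquet, verify_export

df = pd.DataFrame({"ano": [2023], "produto": ["soja"], "producao": [154.6]})

# Writes out/soja.csv plus the out/soja.meta.json sidecar.
csv_path, sidecar_path = export_csv(df, "out/soja.csv")

# Parquet embeds the same provenance keys in the Arrow schema metadata.
export_parquet(df, "out/soja.parquet", compression="zstd")

# Recomputes sha256 over a canonical re-serialization of the parsed CSV
# and surfaces the hash recorded in the sidecar for comparison.
report = verify_export(csv_path)
if report["valid"] and report.get("computed_hash") == report.get("stored_hash"):
    print("export verified")

Note that the CSV hash is computed over df.to_csv(index=False) rather than the raw file bytes, so verification tolerates formatting differences such as quoting, as long as the parsed values round-trip unchanged.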
agrobr/health/__init__.py
CHANGED
@@ -8,6 +8,12 @@ from .checker import (
     check_source,
     run_all_checks,
 )
+from .doctor import (
+    CacheStats,
+    DiagnosticsResult,
+    SourceStatus,
+    run_diagnostics,
+)
 from .reporter import (
     HealthReport,
     generate_report,
@@ -20,4 +26,8 @@ __all__: list[str] = [
     "run_all_checks",
     "HealthReport",
     "generate_report",
+    "DiagnosticsResult",
+    "SourceStatus",
+    "CacheStats",
+    "run_diagnostics",
 ]
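The doctor module itself (agrobr/health/doctor.py, +321 lines) is not reproduced in this section; the hunks above show only its re-exports. A hedged sketch of the presumed entry point, assuming run_diagnostics takes no required arguments and returns a DiagnosticsResult:

from agrobr.health import run_diagnostics

result = run_diagnostics()  # assumed zero-argument call; the actual signature lives in doctor.py
print(result)               # DiagnosticsResult presumably aggregates SourceStatus and CacheStats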