agrobr 0.1.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,146 @@
1
+ """Contratos de estabilidade para dados IBGE."""
2
+
3
+ from agrobr.contracts import BreakingChangePolicy, Column, ColumnType, Contract
4
+
5
# Stability contract for IBGE PAM (Producao Agricola Municipal) output.
# The column set is append-only; removing or renaming a column requires a
# major version bump (see breaking_policy below).
IBGE_PAM_V1 = Contract(
    name="ibge.pam",
    version="1.0",
    effective_from="0.3.0",  # first agrobr release that honors this contract
    columns=[
        # Reference year of the observation; always present.
        Column(
            name="ano",
            type=ColumnType.INTEGER,
            nullable=False,
            stable=True,
            description="Ano de referencia",
        ),
        # Locality name (state or municipality); may be absent for aggregates.
        Column(
            name="localidade",
            type=ColumnType.STRING,
            nullable=True,
            stable=True,
            description="Nome da localidade (UF ou municipio)",
        ),
        Column(
            name="produto",
            type=ColumnType.STRING,
            nullable=False,
            stable=True,
            description="Nome do produto",
        ),
        # Measured quantities; nullable because IBGE suppresses some cells.
        Column(
            name="area_plantada",
            type=ColumnType.FLOAT,
            nullable=True,
            unit="ha",
            stable=True,
            description="Area plantada em hectares",
        ),
        Column(
            name="area_colhida",
            type=ColumnType.FLOAT,
            nullable=True,
            unit="ha",
            stable=True,
            description="Area colhida em hectares",
        ),
        Column(
            name="producao",
            type=ColumnType.FLOAT,
            nullable=True,
            unit="ton",
            stable=True,
            description="Quantidade produzida em toneladas",
        ),
        Column(
            name="rendimento",
            type=ColumnType.FLOAT,
            nullable=True,
            unit="kg/ha",
            stable=True,
            description="Rendimento medio em kg/ha",
        ),
        Column(
            name="valor_producao",
            type=ColumnType.FLOAT,
            nullable=True,
            unit="mil_reais",
            stable=True,
            description="Valor da producao em mil reais",
        ),
        # Constant provenance marker; always the literal "ibge_pam".
        Column(
            name="fonte",
            type=ColumnType.STRING,
            nullable=False,
            stable=True,
            description="Fonte dos dados (ibge_pam)",
        ),
    ],
    guarantees=[
        "Column names never change (additions only)",
        "'ano' is always a valid year (>= 1974)",
        "Numeric values are always >= 0",
        "'fonte' is always 'ibge_pam'",
    ],
    breaking_policy=BreakingChangePolicy.MAJOR_VERSION,
)
87
+
88
# Stability contract for IBGE LSPA (Levantamento Sistematico da Producao
# Agricola) output. Unlike PAM, the 'variavel'/'valor' pair is marked
# stable=False: LSPA variable names may change between IBGE releases.
IBGE_LSPA_V1 = Contract(
    name="ibge.lspa",
    version="1.0",
    effective_from="0.3.0",  # first agrobr release that honors this contract
    columns=[
        Column(
            name="ano",
            type=ColumnType.INTEGER,
            nullable=False,
            stable=True,
            description="Ano de referencia",
        ),
        # Reference month (1-12); nullable for yearly aggregates.
        Column(
            name="mes",
            type=ColumnType.INTEGER,
            nullable=True,
            stable=True,
            description="Mes de referencia (1-12)",
        ),
        Column(
            name="produto",
            type=ColumnType.STRING,
            nullable=False,
            stable=True,
            description="Nome do produto",
        ),
        # Long-format variable name/value; not covered by stability guarantees.
        Column(
            name="variavel",
            type=ColumnType.STRING,
            nullable=True,
            stable=False,
            description="Nome da variavel",
        ),
        Column(
            name="valor",
            type=ColumnType.FLOAT,
            nullable=True,
            stable=False,
            description="Valor da variavel",
        ),
        # Constant provenance marker; always the literal "ibge_lspa".
        Column(
            name="fonte",
            type=ColumnType.STRING,
            nullable=False,
            stable=True,
            description="Fonte dos dados (ibge_lspa)",
        ),
    ],
    guarantees=[
        "Column names never change (additions only)",
        "'ano' is always a valid year",
        "'mes' is between 1 and 12 when present",
        "'fonte' is always 'ibge_lspa'",
    ],
    breaking_policy=BreakingChangePolicy.MAJOR_VERSION,
)


# Public API of this module.
__all__ = ["IBGE_PAM_V1", "IBGE_LSPA_V1"]
agrobr/export.py ADDED
@@ -0,0 +1,251 @@
1
+ """Export para formatos auditaveis."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import hashlib
7
+ import json
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, Any
11
+
12
+ import structlog
13
+
14
+ if TYPE_CHECKING:
15
+ import pandas as pd
16
+
17
+ from agrobr.models import MetaInfo
18
+
19
+ logger = structlog.get_logger()
20
+
21
+
22
def export_parquet(
    df: pd.DataFrame,
    path: str | Path,
    meta: MetaInfo | None = None,
    compression: str = "snappy",
) -> Path:
    """
    Export a DataFrame to Parquet, embedding provenance metadata in the schema.

    Args:
        df: DataFrame to export
        path: Destination file path
        meta: Optional provenance metadata
        compression: Parquet compression codec (snappy, gzip, zstd)

    Returns:
        Path of the written file
    """
    # Imported lazily so pyarrow stays an optional dependency.
    import pyarrow as pa
    import pyarrow.parquet as pq

    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)

    table = pa.Table.from_pandas(df)

    # Key/value pairs stored in the Parquet schema metadata (bytes -> bytes).
    extra = {
        b"agrobr_version": _get_version().encode(),
        b"export_timestamp": datetime.now().isoformat().encode(),
        b"row_count": str(len(df)).encode(),
    }
    if meta:
        extra[b"source"] = meta.source.encode()
        extra[b"source_url"] = meta.source_url.encode()
        extra[b"fetched_at"] = meta.fetched_at.isoformat().encode()
        if meta.raw_content_hash:
            extra[b"content_hash"] = meta.raw_content_hash.encode()

    # Merge with whatever metadata pyarrow already attached (e.g. the pandas
    # schema blob) instead of overwriting it.
    merged = dict(table.schema.metadata or {})
    merged.update(extra)
    table = table.replace_schema_metadata(merged)

    pq.write_table(table, target, compression=compression)

    logger.info("export_parquet", path=str(target), rows=len(df))
    return target
68
+
69
+
70
def export_csv(
    df: pd.DataFrame,
    path: str | Path,
    meta: MetaInfo | None = None,
    include_header: bool = True,
    include_sidecar: bool = True,
) -> tuple[Path, Path | None]:
    """
    Export a DataFrame to CSV with an optional metadata sidecar file.

    Args:
        df: DataFrame to export
        path: Destination file path
        meta: Optional provenance metadata
        include_header: Write the header row
        include_sidecar: Also write a .meta.json sidecar next to the CSV

    Returns:
        Tuple (csv_path, sidecar_path or None)
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    df.to_csv(path, index=False, header=include_header, quoting=csv.QUOTE_NONNUMERIC)

    sidecar_path = None
    if include_sidecar:
        # with_suffix replaces only the last suffix: data.csv -> data.meta.json
        sidecar_path = path.with_suffix(".meta.json")
        sidecar_data = _create_sidecar(df, meta)
        # encoding="utf-8" is required: ensure_ascii=False can emit non-ASCII
        # characters, which would raise UnicodeEncodeError on platforms whose
        # default text encoding is not UTF-8 (e.g. Windows cp1252).
        with open(sidecar_path, "w", encoding="utf-8") as f:
            json.dump(sidecar_data, f, indent=2, ensure_ascii=False)

    logger.info("export_csv", path=str(path), rows=len(df))
    return path, sidecar_path
104
+
105
+
106
def export_json(
    df: pd.DataFrame,
    path: str | Path,
    meta: MetaInfo | None = None,
    orient: str = "records",
    include_metadata: bool = True,
) -> Path:
    """
    Export a DataFrame to JSON, optionally embedding metadata alongside the data.

    Args:
        df: DataFrame to export
        path: Destination file path
        meta: Optional provenance metadata
        orient: JSON orientation (records, split, index, etc)
        include_metadata: Wrap the data together with a metadata section

    Returns:
        Path of the written file
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    if include_metadata:
        output = {
            "metadata": _create_sidecar(df, meta),
            # Round-trip through to_json so dates/NaN are serialized exactly
            # as pandas would serialize them in the plain-export branch.
            "data": json.loads(df.to_json(orient=orient, date_format="iso")),  # type: ignore[call-overload]
        }
        # encoding="utf-8" is required: ensure_ascii=False can emit non-ASCII
        # characters, which would raise UnicodeEncodeError on platforms whose
        # default text encoding is not UTF-8 (e.g. Windows cp1252).
        with open(path, "w", encoding="utf-8") as f:
            json.dump(output, f, indent=2, ensure_ascii=False)
    else:
        # pandas handles the file encoding itself in this branch.
        df.to_json(path, orient=orient, date_format="iso", indent=2)  # type: ignore[call-overload]

    logger.info("export_json", path=str(path), rows=len(df))
    return path
141
+
142
+
143
def _create_sidecar(df: pd.DataFrame, meta: MetaInfo | None = None) -> dict[str, Any]:
    """Build the sidecar metadata dict (schema summary, content hash, provenance)."""
    # Hash the canonical CSV serialization so the value can be recomputed
    # later by verify_export() and compared against this stored hash.
    serialized = df.to_csv(index=False).encode("utf-8")
    digest = hashlib.sha256(serialized).hexdigest()

    sidecar: dict[str, Any] = {
        "agrobr_version": _get_version(),
        "export_timestamp": datetime.now().isoformat(),
        "file_info": {
            "row_count": len(df),
            "column_count": len(df.columns),
            "columns": list(df.columns),
            "dtypes": {name: str(kind) for name, kind in df.dtypes.items()},
            "content_hash": f"sha256:{digest}",
        },
    }

    if meta:
        sidecar["provenance"] = {
            "source": meta.source,
            "source_url": meta.source_url,
            "source_method": meta.source_method,
            "fetched_at": meta.fetched_at.isoformat(),
            "from_cache": meta.from_cache,
            "original_hash": meta.raw_content_hash,
        }

    return sidecar
171
+
172
+
173
def verify_export(path: str | Path, expected_hash: str | None = None) -> dict[str, Any]:
    """
    Verify the integrity of an exported file.

    Args:
        path: File to inspect
        expected_hash: Optional expected content hash to compare against

    Returns:
        Dict describing the verification outcome
    """
    target = Path(path)

    if not target.exists():
        return {"valid": False, "error": "File not found"}

    report: dict[str, Any] = {
        "valid": True,
        "path": str(target),
        "size_bytes": target.stat().st_size,
    }

    if target.suffix == ".parquet":
        import pyarrow.parquet as pq

        try:
            table = pq.read_table(target)
            report["row_count"] = table.num_rows
            report["columns"] = table.schema.names
            # Parquet files carry the hash inside the schema metadata.
            schema_meta = table.schema.metadata or {}
            if b"content_hash" in schema_meta:
                report["stored_hash"] = schema_meta[b"content_hash"].decode()
        except Exception as e:
            report["valid"] = False
            report["error"] = str(e)

    elif target.suffix == ".csv":
        import pandas as pd

        try:
            frame = pd.read_csv(target)
            report["row_count"] = len(frame)
            report["columns"] = frame.columns.tolist()

            # Recompute the hash over the canonical CSV serialization so it
            # is comparable with the hash written by _create_sidecar().
            payload = frame.to_csv(index=False).encode("utf-8")
            report["computed_hash"] = f"sha256:{hashlib.sha256(payload).hexdigest()}"

            # CSV files carry the hash in the .meta.json sidecar, if present.
            sidecar_file = target.with_suffix(".meta.json")
            if sidecar_file.exists():
                with open(sidecar_file) as f:
                    sidecar = json.load(f)
                report["stored_hash"] = sidecar.get("file_info", {}).get("content_hash")
        except Exception as e:
            report["valid"] = False
            report["error"] = str(e)

    # Only CSV verification produces computed_hash, so hash_match is
    # CSV-only by construction.
    if expected_hash and report.get("computed_hash"):
        report["hash_match"] = report["computed_hash"] == expected_hash

    return report
234
+
235
+
236
+ def _get_version() -> str:
237
+ """Retorna versao do agrobr."""
238
+ try:
239
+ import agrobr
240
+
241
+ return getattr(agrobr, "__version__", "unknown")
242
+ except ImportError:
243
+ return "unknown"
244
+
245
+
246
# Public API of this module.
__all__ = [
    "export_parquet",
    "export_csv",
    "export_json",
    "verify_export",
]
agrobr/health/__init__.py CHANGED
@@ -8,6 +8,12 @@ from .checker import (
8
8
  check_source,
9
9
  run_all_checks,
10
10
  )
11
+ from .doctor import (
12
+ CacheStats,
13
+ DiagnosticsResult,
14
+ SourceStatus,
15
+ run_diagnostics,
16
+ )
11
17
  from .reporter import (
12
18
  HealthReport,
13
19
  generate_report,
@@ -20,4 +26,8 @@ __all__: list[str] = [
20
26
  "run_all_checks",
21
27
  "HealthReport",
22
28
  "generate_report",
29
+ "DiagnosticsResult",
30
+ "SourceStatus",
31
+ "CacheStats",
32
+ "run_diagnostics",
23
33
  ]