agrobr 0.1.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
agrobr/export.py ADDED
@@ -0,0 +1,251 @@
+ """Export to auditable formats."""
+
+ from __future__ import annotations
+
+ import csv
+ import hashlib
+ import json
+ from datetime import datetime
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+
+ import structlog
+
+ if TYPE_CHECKING:
+     import pandas as pd
+
+     from agrobr.models import MetaInfo
+
+ logger = structlog.get_logger()
+
+
+ def export_parquet(
+     df: pd.DataFrame,
+     path: str | Path,
+     meta: MetaInfo | None = None,
+     compression: str = "snappy",
+ ) -> Path:
+     """
+     Export a DataFrame to Parquet with embedded metadata.
+
+     Args:
+         df: DataFrame to export
+         path: Output file path
+         meta: Optional metadata
+         compression: Compression codec (snappy, gzip, zstd)
+
+     Returns:
+         Path of the created file
+     """
+     import pyarrow as pa
+     import pyarrow.parquet as pq
+
+     path = Path(path)
+     path.parent.mkdir(parents=True, exist_ok=True)
+
+     table = pa.Table.from_pandas(df)
+
+     metadata = {
+         b"agrobr_version": _get_version().encode(),
+         b"export_timestamp": datetime.now().isoformat().encode(),
+         b"row_count": str(len(df)).encode(),
+     }
+
+     if meta:
+         metadata[b"source"] = meta.source.encode()
+         metadata[b"source_url"] = meta.source_url.encode()
+         metadata[b"fetched_at"] = meta.fetched_at.isoformat().encode()
+         if meta.raw_content_hash:
+             metadata[b"content_hash"] = meta.raw_content_hash.encode()
+
+     existing_meta = table.schema.metadata or {}
+     table = table.replace_schema_metadata({**existing_meta, **metadata})
+
+     pq.write_table(table, path, compression=compression)
+
+     logger.info("export_parquet", path=str(path), rows=len(df))
+     return path
+
+
+ def export_csv(
+     df: pd.DataFrame,
+     path: str | Path,
+     meta: MetaInfo | None = None,
+     include_header: bool = True,
+     include_sidecar: bool = True,
+ ) -> tuple[Path, Path | None]:
+     """
+     Export a DataFrame to CSV with a metadata sidecar file.
+
+     Args:
+         df: DataFrame to export
+         path: Output file path
+         meta: Optional metadata
+         include_header: Include the header row
+         include_sidecar: Create a .meta.json file
+
+     Returns:
+         Tuple of (csv_path, sidecar_path or None)
+     """
+     path = Path(path)
+     path.parent.mkdir(parents=True, exist_ok=True)
+
+     df.to_csv(path, index=False, header=include_header, quoting=csv.QUOTE_NONNUMERIC)
+
+     sidecar_path = None
+     if include_sidecar:
+         sidecar_path = path.with_suffix(".meta.json")
+         sidecar_data = _create_sidecar(df, meta)
+         with open(sidecar_path, "w") as f:
+             json.dump(sidecar_data, f, indent=2, ensure_ascii=False)
+
+     logger.info("export_csv", path=str(path), rows=len(df))
+     return path, sidecar_path
+
+
+ def export_json(
+     df: pd.DataFrame,
+     path: str | Path,
+     meta: MetaInfo | None = None,
+     orient: str = "records",
+     include_metadata: bool = True,
+ ) -> Path:
+     """
+     Export a DataFrame to JSON with embedded metadata.
+
+     Args:
+         df: DataFrame to export
+         path: Output file path
+         meta: Optional metadata
+         orient: JSON orientation (records, split, index, etc.)
+         include_metadata: Embed metadata in the JSON
+
+     Returns:
+         Path of the created file
+     """
+     path = Path(path)
+     path.parent.mkdir(parents=True, exist_ok=True)
+
+     if include_metadata:
+         output = {
+             "metadata": _create_sidecar(df, meta),
+             "data": json.loads(df.to_json(orient=orient, date_format="iso")),  # type: ignore[call-overload]
+         }
+         with open(path, "w") as f:
+             json.dump(output, f, indent=2, ensure_ascii=False)
+     else:
+         df.to_json(path, orient=orient, date_format="iso", indent=2)  # type: ignore[call-overload]
+
+     logger.info("export_json", path=str(path), rows=len(df))
+     return path
+
+
+ def _create_sidecar(df: pd.DataFrame, meta: MetaInfo | None = None) -> dict[str, Any]:
+     """Build metadata for the sidecar file."""
+     csv_bytes = df.to_csv(index=False).encode("utf-8")
+     content_hash = hashlib.sha256(csv_bytes).hexdigest()
+
+     sidecar: dict[str, Any] = {
+         "agrobr_version": _get_version(),
+         "export_timestamp": datetime.now().isoformat(),
+         "file_info": {
+             "row_count": len(df),
+             "column_count": len(df.columns),
+             "columns": df.columns.tolist(),
+             "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
+             "content_hash": f"sha256:{content_hash}",
+         },
+     }
+
+     if meta:
+         sidecar["provenance"] = {
+             "source": meta.source,
+             "source_url": meta.source_url,
+             "source_method": meta.source_method,
+             "fetched_at": meta.fetched_at.isoformat(),
+             "from_cache": meta.from_cache,
+             "original_hash": meta.raw_content_hash,
+         }
+
+     return sidecar
+
+
+ def verify_export(path: str | Path, expected_hash: str | None = None) -> dict[str, Any]:
+     """
+     Verify the integrity of an exported file.
+
+     Args:
+         path: File path
+         expected_hash: Expected hash (optional)
+
+     Returns:
+         Dict with the verification status
+     """
+     path = Path(path)
+
+     if not path.exists():
+         return {"valid": False, "error": "File not found"}
+
+     result: dict[str, Any] = {
+         "valid": True,
+         "path": str(path),
+         "size_bytes": path.stat().st_size,
+     }
+
+     if path.suffix == ".parquet":
+         import pyarrow.parquet as pq
+
+         try:
+             table = pq.read_table(path)
+             result["row_count"] = table.num_rows
+             result["columns"] = table.schema.names
+
+             metadata = table.schema.metadata or {}
+             if b"content_hash" in metadata:
+                 result["stored_hash"] = metadata[b"content_hash"].decode()
+         except Exception as e:
+             result["valid"] = False
+             result["error"] = str(e)
+
+     elif path.suffix == ".csv":
+         import pandas as pd
+
+         try:
+             df = pd.read_csv(path)
+             result["row_count"] = len(df)
+             result["columns"] = df.columns.tolist()
+
+             csv_bytes = df.to_csv(index=False).encode("utf-8")
+             result["computed_hash"] = f"sha256:{hashlib.sha256(csv_bytes).hexdigest()}"
+
+             sidecar_path = path.with_suffix(".meta.json")
+             if sidecar_path.exists():
+                 with open(sidecar_path) as f:
+                     sidecar = json.load(f)
+                 result["stored_hash"] = sidecar.get("file_info", {}).get("content_hash")
+         except Exception as e:
+             result["valid"] = False
+             result["error"] = str(e)
+
+     if expected_hash and result.get("computed_hash"):
+         result["hash_match"] = result["computed_hash"] == expected_hash
+
+     return result
+
+
+ def _get_version() -> str:
+     """Return the agrobr version."""
+     try:
+         import agrobr
+
+         return getattr(agrobr, "__version__", "unknown")
+     except ImportError:
+         return "unknown"
+
+
+ __all__ = [
+     "export_parquet",
+     "export_csv",
+     "export_json",
+     "verify_export",
+ ]
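
The new module gives export and verification a single round-trip surface. A minimal usage sketch, assuming the 0.5.0 wheel is installed; the DataFrame contents and output paths below are invented for illustration:

    import pandas as pd
    import pyarrow.parquet as pq

    from agrobr.export import export_csv, export_parquet, verify_export

    # Illustrative frame; the column names are made up for this sketch.
    df = pd.DataFrame({"produto": ["soja", "milho"], "preco": [130.50, 62.10]})

    # CSV: writes out/precos.csv plus an out/precos.meta.json sidecar whose
    # file_info.content_hash is a sha256 over the frame's CSV bytes.
    csv_path, sidecar_path = export_csv(df, "out/precos.csv")

    # Parquet: the same provenance keys land in the file's schema metadata.
    pq_path = export_parquet(df, "out/precos.parquet")
    print(pq.read_table(pq_path).schema.metadata[b"agrobr_version"])

    # verify_export re-reads the CSV, recomputes the hash, and surfaces both
    # so a caller can compare computed_hash against the sidecar's stored_hash.
    report = verify_export(csv_path)
    print(report["valid"], report["computed_hash"] == report["stored_hash"])

Because both the sidecar and verify_export hash the frame's default to_csv bytes rather than the file on disk, a hash match confirms the file still parses back to the same data the sidecar described.
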
agrobr/health/__init__.py CHANGED
@@ -8,6 +8,12 @@ from .checker import (
      check_source,
      run_all_checks,
  )
+ from .doctor import (
+     CacheStats,
+     DiagnosticsResult,
+     SourceStatus,
+     run_diagnostics,
+ )
  from .reporter import (
      HealthReport,
      generate_report,
@@ -20,4 +26,8 @@ __all__: list[str] = [
      "run_all_checks",
      "HealthReport",
      "generate_report",
+     "DiagnosticsResult",
+     "SourceStatus",
+     "CacheStats",
+     "run_diagnostics",
  ]
agrobr/health/doctor.py ADDED
@@ -0,0 +1,321 @@
+ """Full diagnostics for the agrobr system."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import time
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any
+
+ import httpx
+ import structlog
+
+ from agrobr import __version__
+ from agrobr.cache.duckdb_store import get_store
+ from agrobr.cache.policies import get_next_update_info
+
+ logger = structlog.get_logger()
+
+
+ @dataclass
+ class SourceStatus:
+     """Connectivity status of a source."""
+
+     name: str
+     url: str
+     status: str
+     latency_ms: int
+     error: str | None = None
+
+
+ @dataclass
+ class CacheStats:
+     """Cache statistics."""
+
+     location: str
+     size_bytes: int
+     total_records: int
+     by_source: dict[str, dict[str, Any]] = field(default_factory=dict)
+
+
+ @dataclass
+ class DiagnosticsResult:
+     """Result of the full diagnostic run."""
+
+     version: str
+     timestamp: datetime
+     sources: list[SourceStatus]
+     cache: CacheStats
+     last_collections: dict[str, datetime | None]
+     cache_expiry: dict[str, dict[str, str]]
+     config: dict[str, Any]
+     overall_status: str
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to a serializable dictionary."""
+         return {
+             "version": self.version,
+             "timestamp": self.timestamp.isoformat(),
+             "sources": [
+                 {
+                     "name": s.name,
+                     "url": s.url,
+                     "status": s.status,
+                     "latency_ms": s.latency_ms,
+                     "error": s.error,
+                 }
+                 for s in self.sources
+             ],
+             "cache": {
+                 "location": self.cache.location,
+                 "size_mb": round(self.cache.size_bytes / 1024 / 1024, 2),
+                 "total_records": self.cache.total_records,
+                 "by_source": self.cache.by_source,
+             },
+             "last_collections": {
+                 k: v.isoformat() if v else None for k, v in self.last_collections.items()
+             },
+             "cache_expiry": self.cache_expiry,
+             "config": self.config,
+             "overall_status": self.overall_status,
+         }
+
+     def to_rich(self) -> str:
+         """Format for terminal output."""
+         lines = [
+             "",
+             f"agrobr diagnostics v{self.version}",
+             "=" * 50,
+             "",
+             "Sources Connectivity",
+         ]
+
+         for s in self.sources:
+             if s.status == "ok":
+                 icon = "[OK]"
+             elif s.status == "slow":
+                 icon = "[SLOW]"
+             else:
+                 icon = "[FAIL]"
+
+             line = f" {icon} {s.name:<35} {s.latency_ms:>5}ms"
+             if s.error:
+                 line += f" ({s.error})"
+             lines.append(line)
+
+         lines.extend(
+             [
+                 "",
+                 "Cache Status",
+                 f" Location: {self.cache.location}",
+                 f" Size: {self.cache.size_bytes / 1024 / 1024:.2f} MB",
+                 f" Total records: {self.cache.total_records:,}",
+                 "",
+                 " By source:",
+             ]
+         )
+
+         for fonte, stats in self.cache.by_source.items():
+             count = stats.get("count", 0)
+             oldest = stats.get("oldest", "-")
+             newest = stats.get("newest", "-")
+             lines.append(f" {fonte.upper()}: {count:,} records ({oldest} to {newest})")
+
+         lines.extend(
+             [
+                 "",
+                 "Cache Expiry",
+             ]
+         )
+
+         for fonte, info in self.cache_expiry.items():
+             exp_type = info.get("type", "unknown")
+             if exp_type == "smart":
+                 lines.append(f" {fonte.upper()}: {info.get('description', '')}")
+             else:
+                 lines.append(f" {fonte.upper()}: TTL {info.get('ttl', 'unknown')}")
+
+         lines.extend(
+             [
+                 "",
+                 "Configuration",
+                 f" Browser fallback: {'enabled' if self.config.get('browser_fallback') else 'disabled'}",
+                 f" Alternative source: {'enabled' if self.config.get('alternative_source') else 'disabled'}",
+                 "",
+             ]
+         )
+
+         if self.overall_status == "healthy":
+             lines.append("[OK] All systems operational")
+         elif self.overall_status == "degraded":
+             lines.append("[WARN] System degraded - some sources unavailable")
+         else:
+             lines.append("[FAIL] System error - check source connectivity")
+
+         lines.append("")
+         return "\n".join(lines)
+
+
+ async def _check_source(name: str, url: str, timeout: float = 10.0) -> SourceStatus:
+     """Check connectivity of a source."""
+     start = time.perf_counter()
+
+     try:
+         async with httpx.AsyncClient(timeout=timeout) as http_client:
+             response = await http_client.head(url, follow_redirects=True)
+             latency_ms = int((time.perf_counter() - start) * 1000)
+
+             if response.status_code < 400:
+                 status = "ok" if latency_ms < 2000 else "slow"
+                 return SourceStatus(name, url, status, latency_ms)
+
+             return SourceStatus(
+                 name,
+                 url,
+                 "error",
+                 latency_ms,
+                 error=f"HTTP {response.status_code}",
+             )
+
+     except httpx.TimeoutException:
+         latency_ms = int((time.perf_counter() - start) * 1000)
+         return SourceStatus(name, url, "error", latency_ms, error="timeout")
+
+     except httpx.ConnectError as e:
+         latency_ms = int((time.perf_counter() - start) * 1000)
+         return SourceStatus(name, url, "error", latency_ms, error=f"connection error: {e}")
+
+     except Exception as e:
+         latency_ms = int((time.perf_counter() - start) * 1000)
+         return SourceStatus(name, url, "error", latency_ms, error=str(e))
+
+
+ def _get_cache_stats() -> CacheStats:
+     """Collect cache statistics."""
+     try:
+         store = get_store()
+         cache_path = Path(store.db_path)
+         size_bytes = cache_path.stat().st_size if cache_path.exists() else 0
+
+         by_source: dict[str, dict[str, Any]] = {}
+         conn = store._get_conn()
+
+         for fonte in ["cepea", "conab", "ibge"]:
+             try:
+                 result = conn.execute(
+                     """
+                     SELECT COUNT(*), MIN(data), MAX(data)
+                     FROM indicadores
+                     WHERE LOWER(fonte) = ?
+                     """,
+                     [fonte],
+                 ).fetchone()
+
+                 if result and result[0] > 0:
+                     by_source[fonte] = {
+                         "count": result[0],
+                         "oldest": str(result[1]) if result[1] else None,
+                         "newest": str(result[2]) if result[2] else None,
+                     }
+             except Exception:
+                 pass
+
+         total_records = sum(s.get("count", 0) for s in by_source.values())
+
+         return CacheStats(
+             location=str(cache_path),
+             size_bytes=size_bytes,
+             total_records=total_records,
+             by_source=by_source,
+         )
+
+     except Exception as e:
+         logger.warning("cache_stats_failed", error=str(e))
+         return CacheStats(
+             location="unknown",
+             size_bytes=0,
+             total_records=0,
+             by_source={},
+         )
+
+
+ def _get_last_collections() -> dict[str, datetime | None]:
+     """Get the timestamp of the last collection per source."""
+     collections: dict[str, datetime | None] = {}
+
+     try:
+         store = get_store()
+         conn = store._get_conn()
+
+         for fonte in ["cepea", "conab", "ibge"]:
+             try:
+                 result = conn.execute(
+                     """
+                     SELECT MAX(collected_at)
+                     FROM indicadores
+                     WHERE LOWER(fonte) = ?
+                     """,
+                     [fonte],
+                 ).fetchone()
+
+                 collections[fonte] = result[0] if result and result[0] else None
+
+             except Exception:
+                 collections[fonte] = None
+
+     except Exception:
+         pass
+
+     return collections
+
+
+ async def run_diagnostics(verbose: bool = False) -> DiagnosticsResult:  # noqa: ARG001
+     """
+     Run a full system diagnostic.
+
+     Args:
+         verbose: If True, include detailed information (reserved for future use)
+
+     Returns:
+         DiagnosticsResult with the complete status
+     """
+     sources_to_check = [
+         ("CEPEA (Noticias Agricolas)", "https://www.noticiasagricolas.com.br"),
+         ("CONAB", "https://www.conab.gov.br"),
+         ("IBGE/SIDRA", "https://sidra.ibge.gov.br"),
+     ]
+
+     source_tasks = [_check_source(name, url) for name, url in sources_to_check]
+     sources = await asyncio.gather(*source_tasks)
+
+     cache = _get_cache_stats()
+
+     cache_expiry = {}
+     for fonte in ["cepea", "conab", "ibge"]:
+         cache_expiry[fonte] = get_next_update_info(fonte)
+
+     last_collections = _get_last_collections()
+
+     error_count = sum(1 for s in sources if s.status == "error")
+     if error_count == len(sources):
+         overall_status = "error"
+     elif error_count > 0:
+         overall_status = "degraded"
+     else:
+         overall_status = "healthy"
+
+     return DiagnosticsResult(
+         version=__version__,
+         timestamp=datetime.now(),
+         sources=list(sources),
+         cache=cache,
+         last_collections=last_collections,
+         cache_expiry=cache_expiry,
+         config={
+             "browser_fallback": False,
+             "alternative_source": True,
+         },
+         overall_status=overall_status,
+     )
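
A minimal sketch of driving the new diagnostics from user code, assuming the 0.5.0 wheel is installed. Note that the checks hit the live source URLs, and the cache stats fall back to zeros when no DuckDB cache exists:

    import asyncio
    import json

    from agrobr.health import run_diagnostics

    # run_diagnostics is async; asyncio.run drives the three HEAD checks concurrently.
    result = asyncio.run(run_diagnostics())

    # Human-readable report: the [OK]/[SLOW]/[FAIL] connectivity table plus cache info.
    print(result.to_rich())

    # JSON-serializable form, e.g. for shipping to a monitoring endpoint.
    print(json.dumps(result.to_dict(), indent=2))
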
agrobr/http/browser.py CHANGED
@@ -15,7 +15,6 @@ from agrobr.http.user_agents import UserAgentRotator

  logger = structlog.get_logger()

- # Singleton so the browser can be reused
  _playwright: Playwright | None = None
  _browser: Browser | None = None
  _lock = asyncio.Lock()
@@ -64,7 +63,6 @@ async def get_page() -> AsyncGenerator[Page, None]:
      """Context manager to obtain a browser page."""
      browser = await _get_browser()

-     # Create a context with a realistic fingerprint
      ua = UserAgentRotator.get_random()
      context = await browser.new_context(
          user_agent=ua,
@@ -78,7 +76,6 @@ async def get_page() -> AsyncGenerator[Page, None]:

      page = await context.new_page()

-     # Hide automation signals
      await page.add_init_script(
          """
          Object.defineProperty(navigator, 'webdriver', {
@@ -124,7 +121,6 @@ async def fetch_with_browser(

      try:
          async with get_page() as page:
-             # Navigate to the URL
              response = await page.goto(
                  url,
                  wait_until="domcontentloaded",
@@ -138,7 +134,6 @@
                  last_error="No response received",
              )

-             # Wait for the specific selector, if one was given
              if wait_selector:
                  try:
                      await page.wait_for_selector(
@@ -152,13 +147,10 @@
                      error=str(e),
                  )

-             # Wait for Cloudflare to resolve and for JS to finish
              await page.wait_for_timeout(5000)

-             # Check whether Cloudflare blocked the request
              if response.status in (403, 503):
                  check_html: str = await page.content()
-                 # Detect a Cloudflare challenge page
                  if "cloudflare" in check_html.lower() or "challenge" in check_html.lower():
                      raise SourceUnavailableError(
                          source=source,
@@ -166,7 +158,6 @@
                          last_error=f"Cloudflare block detected (status {response.status})",
                      )

-             # Get the HTML
              html: str = await page.content()

              logger.info(