agrobr 0.1.2__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agrobr/__init__.py +3 -2
- agrobr/benchmark/__init__.py +343 -0
- agrobr/cache/policies.py +3 -8
- agrobr/cepea/api.py +87 -30
- agrobr/cepea/client.py +0 -7
- agrobr/cli.py +141 -5
- agrobr/conab/api.py +72 -6
- agrobr/config.py +137 -0
- agrobr/constants.py +1 -2
- agrobr/contracts/__init__.py +186 -0
- agrobr/contracts/cepea.py +80 -0
- agrobr/contracts/conab.py +181 -0
- agrobr/contracts/ibge.py +146 -0
- agrobr/export.py +251 -0
- agrobr/health/__init__.py +10 -0
- agrobr/health/doctor.py +321 -0
- agrobr/http/browser.py +0 -9
- agrobr/ibge/api.py +104 -25
- agrobr/ibge/client.py +5 -20
- agrobr/models.py +100 -1
- agrobr/noticias_agricolas/client.py +0 -7
- agrobr/noticias_agricolas/parser.py +0 -17
- agrobr/plugins/__init__.py +205 -0
- agrobr/quality.py +319 -0
- agrobr/sla.py +249 -0
- agrobr/snapshots.py +321 -0
- agrobr/stability.py +148 -0
- agrobr/validators/semantic.py +447 -0
- {agrobr-0.1.2.dist-info → agrobr-0.5.0.dist-info}/METADATA +12 -12
- {agrobr-0.1.2.dist-info → agrobr-0.5.0.dist-info}/RECORD +33 -19
- {agrobr-0.1.2.dist-info → agrobr-0.5.0.dist-info}/WHEEL +0 -0
- {agrobr-0.1.2.dist-info → agrobr-0.5.0.dist-info}/entry_points.txt +0 -0
- {agrobr-0.1.2.dist-info → agrobr-0.5.0.dist-info}/licenses/LICENSE +0 -0
agrobr/export.py
ADDED
@@ -0,0 +1,251 @@
+"""Export para formatos auditaveis."""
+
+from __future__ import annotations
+
+import csv
+import hashlib
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+import structlog
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+    from agrobr.models import MetaInfo
+
+logger = structlog.get_logger()
+
+
+def export_parquet(
+    df: pd.DataFrame,
+    path: str | Path,
+    meta: MetaInfo | None = None,
+    compression: str = "snappy",
+) -> Path:
+    """
+    Exporta DataFrame para Parquet com metadados.
+
+    Args:
+        df: DataFrame a exportar
+        path: Caminho do arquivo
+        meta: Metadados opcionais
+        compression: Compressao (snappy, gzip, zstd)
+
+    Returns:
+        Path do arquivo criado
+    """
+    import pyarrow as pa
+    import pyarrow.parquet as pq
+
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+    table = pa.Table.from_pandas(df)
+
+    metadata = {
+        b"agrobr_version": _get_version().encode(),
+        b"export_timestamp": datetime.now().isoformat().encode(),
+        b"row_count": str(len(df)).encode(),
+    }
+
+    if meta:
+        metadata[b"source"] = meta.source.encode()
+        metadata[b"source_url"] = meta.source_url.encode()
+        metadata[b"fetched_at"] = meta.fetched_at.isoformat().encode()
+        if meta.raw_content_hash:
+            metadata[b"content_hash"] = meta.raw_content_hash.encode()
+
+    existing_meta = table.schema.metadata or {}
+    table = table.replace_schema_metadata({**existing_meta, **metadata})
+
+    pq.write_table(table, path, compression=compression)
+
+    logger.info("export_parquet", path=str(path), rows=len(df))
+    return path
+
+
+def export_csv(
+    df: pd.DataFrame,
+    path: str | Path,
+    meta: MetaInfo | None = None,
+    include_header: bool = True,
+    include_sidecar: bool = True,
+) -> tuple[Path, Path | None]:
+    """
+    Exporta DataFrame para CSV com arquivo sidecar de metadados.
+
+    Args:
+        df: DataFrame a exportar
+        path: Caminho do arquivo
+        meta: Metadados opcionais
+        include_header: Incluir linha de cabecalho
+        include_sidecar: Criar arquivo .meta.json
+
+    Returns:
+        Tupla (path_csv, path_sidecar ou None)
+    """
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+    df.to_csv(path, index=False, header=include_header, quoting=csv.QUOTE_NONNUMERIC)
+
+    sidecar_path = None
+    if include_sidecar:
+        sidecar_path = path.with_suffix(".meta.json")
+        sidecar_data = _create_sidecar(df, meta)
+        with open(sidecar_path, "w") as f:
+            json.dump(sidecar_data, f, indent=2, ensure_ascii=False)
+
+    logger.info("export_csv", path=str(path), rows=len(df))
+    return path, sidecar_path
+
+
+def export_json(
+    df: pd.DataFrame,
+    path: str | Path,
+    meta: MetaInfo | None = None,
+    orient: str = "records",
+    include_metadata: bool = True,
+) -> Path:
+    """
+    Exporta DataFrame para JSON com metadados embutidos.
+
+    Args:
+        df: DataFrame a exportar
+        path: Caminho do arquivo
+        meta: Metadados opcionais
+        orient: Orientacao do JSON (records, split, index, etc)
+        include_metadata: Incluir metadados no JSON
+
+    Returns:
+        Path do arquivo criado
+    """
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+    if include_metadata:
+        output = {
+            "metadata": _create_sidecar(df, meta),
+            "data": json.loads(df.to_json(orient=orient, date_format="iso")),  # type: ignore[call-overload]
+        }
+        with open(path, "w") as f:
+            json.dump(output, f, indent=2, ensure_ascii=False)
+    else:
+        df.to_json(path, orient=orient, date_format="iso", indent=2)  # type: ignore[call-overload]
+
+    logger.info("export_json", path=str(path), rows=len(df))
+    return path
+
+
+def _create_sidecar(df: pd.DataFrame, meta: MetaInfo | None = None) -> dict[str, Any]:
+    """Cria metadados para arquivo sidecar."""
+    csv_bytes = df.to_csv(index=False).encode("utf-8")
+    content_hash = hashlib.sha256(csv_bytes).hexdigest()
+
+    sidecar: dict[str, Any] = {
+        "agrobr_version": _get_version(),
+        "export_timestamp": datetime.now().isoformat(),
+        "file_info": {
+            "row_count": len(df),
+            "column_count": len(df.columns),
+            "columns": df.columns.tolist(),
+            "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
+            "content_hash": f"sha256:{content_hash}",
+        },
+    }
+
+    if meta:
+        sidecar["provenance"] = {
+            "source": meta.source,
+            "source_url": meta.source_url,
+            "source_method": meta.source_method,
+            "fetched_at": meta.fetched_at.isoformat(),
+            "from_cache": meta.from_cache,
+            "original_hash": meta.raw_content_hash,
+        }
+
+    return sidecar
+
+
+def verify_export(path: str | Path, expected_hash: str | None = None) -> dict[str, Any]:
+    """
+    Verifica integridade de um arquivo exportado.
+
+    Args:
+        path: Caminho do arquivo
+        expected_hash: Hash esperado (opcional)
+
+    Returns:
+        Dict com status da verificacao
+    """
+    path = Path(path)
+
+    if not path.exists():
+        return {"valid": False, "error": "File not found"}
+
+    result: dict[str, Any] = {
+        "valid": True,
+        "path": str(path),
+        "size_bytes": path.stat().st_size,
+    }
+
+    if path.suffix == ".parquet":
+        import pyarrow.parquet as pq
+
+        try:
+            table = pq.read_table(path)
+            result["row_count"] = table.num_rows
+            result["columns"] = table.schema.names
+
+            metadata = table.schema.metadata or {}
+            if b"content_hash" in metadata:
+                result["stored_hash"] = metadata[b"content_hash"].decode()
+        except Exception as e:
+            result["valid"] = False
+            result["error"] = str(e)
+
+    elif path.suffix == ".csv":
+        import pandas as pd
+
+        try:
+            df = pd.read_csv(path)
+            result["row_count"] = len(df)
+            result["columns"] = df.columns.tolist()
+
+            csv_bytes = df.to_csv(index=False).encode("utf-8")
+            result["computed_hash"] = f"sha256:{hashlib.sha256(csv_bytes).hexdigest()}"
+
+            sidecar_path = path.with_suffix(".meta.json")
+            if sidecar_path.exists():
+                with open(sidecar_path) as f:
+                    sidecar = json.load(f)
+                result["stored_hash"] = sidecar.get("file_info", {}).get("content_hash")
+        except Exception as e:
+            result["valid"] = False
+            result["error"] = str(e)
+
+    if expected_hash and result.get("computed_hash"):
+        result["hash_match"] = result["computed_hash"] == expected_hash
+
+    return result
+
+
+def _get_version() -> str:
+    """Retorna versao do agrobr."""
+    try:
+        import agrobr
+
+        return getattr(agrobr, "__version__", "unknown")
+    except ImportError:
+        return "unknown"
+
+
+__all__ = [
+    "export_parquet",
+    "export_csv",
+    "export_json",
+    "verify_export",
+]
agrobr/health/__init__.py
CHANGED
@@ -8,6 +8,12 @@ from .checker import (
     check_source,
     run_all_checks,
 )
+from .doctor import (
+    CacheStats,
+    DiagnosticsResult,
+    SourceStatus,
+    run_diagnostics,
+)
 from .reporter import (
     HealthReport,
     generate_report,
@@ -20,4 +26,8 @@ __all__: list[str] = [
     "run_all_checks",
     "HealthReport",
     "generate_report",
+    "DiagnosticsResult",
+    "SourceStatus",
+    "CacheStats",
+    "run_diagnostics",
 ]
agrobr/health/doctor.py
ADDED
@@ -0,0 +1,321 @@
+"""Diagnóstico completo do sistema agrobr."""
+
+from __future__ import annotations
+
+import asyncio
+import time
+from dataclasses import dataclass, field
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+import httpx
+import structlog
+
+from agrobr import __version__
+from agrobr.cache.duckdb_store import get_store
+from agrobr.cache.policies import get_next_update_info
+
+logger = structlog.get_logger()
+
+
+@dataclass
+class SourceStatus:
+    """Status de conectividade de uma fonte."""
+
+    name: str
+    url: str
+    status: str
+    latency_ms: int
+    error: str | None = None
+
+
+@dataclass
+class CacheStats:
+    """Estatísticas do cache."""
+
+    location: str
+    size_bytes: int
+    total_records: int
+    by_source: dict[str, dict[str, Any]] = field(default_factory=dict)
+
+
+@dataclass
+class DiagnosticsResult:
+    """Resultado do diagnóstico completo."""
+
+    version: str
+    timestamp: datetime
+    sources: list[SourceStatus]
+    cache: CacheStats
+    last_collections: dict[str, datetime | None]
+    cache_expiry: dict[str, dict[str, str]]
+    config: dict[str, Any]
+    overall_status: str
+
+    def to_dict(self) -> dict[str, Any]:
+        """Converte para dicionário serializável."""
+        return {
+            "version": self.version,
+            "timestamp": self.timestamp.isoformat(),
+            "sources": [
+                {
+                    "name": s.name,
+                    "url": s.url,
+                    "status": s.status,
+                    "latency_ms": s.latency_ms,
+                    "error": s.error,
+                }
+                for s in self.sources
+            ],
+            "cache": {
+                "location": self.cache.location,
+                "size_mb": round(self.cache.size_bytes / 1024 / 1024, 2),
+                "total_records": self.cache.total_records,
+                "by_source": self.cache.by_source,
+            },
+            "last_collections": {
+                k: v.isoformat() if v else None for k, v in self.last_collections.items()
+            },
+            "cache_expiry": self.cache_expiry,
+            "config": self.config,
+            "overall_status": self.overall_status,
+        }
+
+    def to_rich(self) -> str:
+        """Formata para output no terminal."""
+        lines = [
+            "",
+            f"agrobr diagnostics v{self.version}",
+            "=" * 50,
+            "",
+            "Sources Connectivity",
+        ]
+
+        for s in self.sources:
+            if s.status == "ok":
+                icon = "[OK]"
+            elif s.status == "slow":
+                icon = "[SLOW]"
+            else:
+                icon = "[FAIL]"
+
+            line = f" {icon} {s.name:<35} {s.latency_ms:>5}ms"
+            if s.error:
+                line += f" ({s.error})"
+            lines.append(line)
+
+        lines.extend(
+            [
+                "",
+                "Cache Status",
+                f" Location: {self.cache.location}",
+                f" Size: {self.cache.size_bytes / 1024 / 1024:.2f} MB",
+                f" Total records: {self.cache.total_records:,}",
+                "",
+                " By source:",
+            ]
+        )
+
+        for fonte, stats in self.cache.by_source.items():
+            count = stats.get("count", 0)
+            oldest = stats.get("oldest", "-")
+            newest = stats.get("newest", "-")
+            lines.append(f" {fonte.upper()}: {count:,} records ({oldest} to {newest})")
+
+        lines.extend(
+            [
+                "",
+                "Cache Expiry",
+            ]
+        )
+
+        for fonte, info in self.cache_expiry.items():
+            exp_type = info.get("type", "unknown")
+            if exp_type == "smart":
+                lines.append(f" {fonte.upper()}: {info.get('description', '')}")
+            else:
+                lines.append(f" {fonte.upper()}: TTL {info.get('ttl', 'unknown')}")
+
+        lines.extend(
+            [
+                "",
+                "Configuration",
+                f" Browser fallback: {'enabled' if self.config.get('browser_fallback') else 'disabled'}",
+                f" Alternative source: {'enabled' if self.config.get('alternative_source') else 'disabled'}",
+                "",
+            ]
+        )
+
+        if self.overall_status == "healthy":
+            lines.append("[OK] All systems operational")
+        elif self.overall_status == "degraded":
+            lines.append("[WARN] System degraded - some sources unavailable")
+        else:
+            lines.append("[FAIL] System error - check source connectivity")
+
+        lines.append("")
+        return "\n".join(lines)
+
+
+async def _check_source(name: str, url: str, timeout: float = 10.0) -> SourceStatus:
+    """Verifica conectividade de uma fonte."""
+    start = time.perf_counter()
+
+    try:
+        async with httpx.AsyncClient(timeout=timeout) as http_client:
+            response = await http_client.head(url, follow_redirects=True)
+            latency_ms = int((time.perf_counter() - start) * 1000)
+
+            if response.status_code < 400:
+                status = "ok" if latency_ms < 2000 else "slow"
+                return SourceStatus(name, url, status, latency_ms)
+
+            return SourceStatus(
+                name,
+                url,
+                "error",
+                latency_ms,
+                error=f"HTTP {response.status_code}",
+            )
+
+    except httpx.TimeoutException:
+        latency_ms = int((time.perf_counter() - start) * 1000)
+        return SourceStatus(name, url, "error", latency_ms, error="timeout")
+
+    except httpx.ConnectError as e:
+        latency_ms = int((time.perf_counter() - start) * 1000)
+        return SourceStatus(name, url, "error", latency_ms, error=f"connection error: {e}")
+
+    except Exception as e:
+        latency_ms = int((time.perf_counter() - start) * 1000)
+        return SourceStatus(name, url, "error", latency_ms, error=str(e))
+
+
+def _get_cache_stats() -> CacheStats:
+    """Obtém estatísticas do cache."""
+    try:
+        store = get_store()
+        cache_path = Path(store.db_path)
+        size_bytes = cache_path.stat().st_size if cache_path.exists() else 0
+
+        by_source: dict[str, dict[str, Any]] = {}
+        conn = store._get_conn()
+
+        for fonte in ["cepea", "conab", "ibge"]:
+            try:
+                result = conn.execute(
+                    """
+                    SELECT COUNT(*), MIN(data), MAX(data)
+                    FROM indicadores
+                    WHERE LOWER(fonte) = ?
+                    """,
+                    [fonte],
+                ).fetchone()
+
+                if result and result[0] > 0:
+                    by_source[fonte] = {
+                        "count": result[0],
+                        "oldest": str(result[1]) if result[1] else None,
+                        "newest": str(result[2]) if result[2] else None,
+                    }
+            except Exception:
+                pass
+
+        total_records = sum(s.get("count", 0) for s in by_source.values())
+
+        return CacheStats(
+            location=str(cache_path),
+            size_bytes=size_bytes,
+            total_records=total_records,
+            by_source=by_source,
+        )
+
+    except Exception as e:
+        logger.warning("cache_stats_failed", error=str(e))
+        return CacheStats(
+            location="unknown",
+            size_bytes=0,
+            total_records=0,
+            by_source={},
+        )
+
+
+def _get_last_collections() -> dict[str, datetime | None]:
+    """Obtém data da última coleta por fonte."""
+    collections: dict[str, datetime | None] = {}
+
+    try:
+        store = get_store()
+        conn = store._get_conn()
+
+        for fonte in ["cepea", "conab", "ibge"]:
+            try:
+                result = conn.execute(
+                    """
+                    SELECT MAX(collected_at)
+                    FROM indicadores
+                    WHERE LOWER(fonte) = ?
+                    """,
+                    [fonte],
+                ).fetchone()
+
+                collections[fonte] = result[0] if result and result[0] else None
+
+            except Exception:
+                collections[fonte] = None
+
+    except Exception:
+        pass
+
+    return collections
+
+
+async def run_diagnostics(verbose: bool = False) -> DiagnosticsResult:  # noqa: ARG001
+    """
+    Executa diagnóstico completo do sistema.
+
+    Args:
+        verbose: Se True, inclui informações detalhadas (reservado para uso futuro)
+
+    Returns:
+        DiagnosticsResult com status completo
+    """
+    sources_to_check = [
+        ("CEPEA (Noticias Agricolas)", "https://www.noticiasagricolas.com.br"),
+        ("CONAB", "https://www.conab.gov.br"),
+        ("IBGE/SIDRA", "https://sidra.ibge.gov.br"),
+    ]
+
+    source_tasks = [_check_source(name, url) for name, url in sources_to_check]
+    sources = await asyncio.gather(*source_tasks)
+
+    cache = _get_cache_stats()
+
+    cache_expiry = {}
+    for fonte in ["cepea", "conab", "ibge"]:
+        cache_expiry[fonte] = get_next_update_info(fonte)
+
+    last_collections = _get_last_collections()
+
+    error_count = sum(1 for s in sources if s.status == "error")
+    if error_count == len(sources):
+        overall_status = "error"
+    elif error_count > 0:
+        overall_status = "degraded"
+    else:
+        overall_status = "healthy"
+
+    return DiagnosticsResult(
+        version=__version__,
+        timestamp=datetime.now(),
+        sources=list(sources),
+        cache=cache,
+        last_collections=last_collections,
+        cache_expiry=cache_expiry,
+        config={
+            "browser_fallback": False,
+            "alternative_source": True,
+        },
+        overall_status=overall_status,
+    )
agrobr/http/browser.py
CHANGED
@@ -15,7 +15,6 @@ from agrobr.http.user_agents import UserAgentRotator
 
 logger = structlog.get_logger()
 
-# Singleton para reutilizar browser
 _playwright: Playwright | None = None
 _browser: Browser | None = None
 _lock = asyncio.Lock()
@@ -64,7 +63,6 @@ async def get_page() -> AsyncGenerator[Page, None]:
     """Context manager para obter uma página do browser."""
     browser = await _get_browser()
 
-    # Cria contexto com fingerprint realista
     ua = UserAgentRotator.get_random()
     context = await browser.new_context(
         user_agent=ua,
@@ -78,7 +76,6 @@ async def get_page() -> AsyncGenerator[Page, None]:
 
     page = await context.new_page()
 
-    # Esconde sinais de automação
     await page.add_init_script(
         """
         Object.defineProperty(navigator, 'webdriver', {
@@ -124,7 +121,6 @@ async def fetch_with_browser(
 
     try:
         async with get_page() as page:
-            # Navega para a URL
            response = await page.goto(
                url,
                wait_until="domcontentloaded",
@@ -138,7 +134,6 @@ async def fetch_with_browser(
                    last_error="No response received",
                )
 
-            # Aguarda seletor específico se fornecido
            if wait_selector:
                try:
                    await page.wait_for_selector(
@@ -152,13 +147,10 @@ async def fetch_with_browser(
                        error=str(e),
                    )
 
-            # Aguarda Cloudflare resolver e JS terminar
            await page.wait_for_timeout(5000)
 
-            # Verifica se foi bloqueado pelo Cloudflare
            if response.status in (403, 503):
                check_html: str = await page.content()
-                # Detecta página de challenge do Cloudflare
                if "cloudflare" in check_html.lower() or "challenge" in check_html.lower():
                    raise SourceUnavailableError(
                        source=source,
@@ -166,7 +158,6 @@ async def fetch_with_browser(
                        last_error=f"Cloudflare block detected (status {response.status})",
                    )
 
-            # Obtém HTML
            html: str = await page.content()
 
            logger.info(