agrobr-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agrobr/__init__.py +10 -0
- agrobr/alerts/__init__.py +7 -0
- agrobr/alerts/notifier.py +167 -0
- agrobr/cache/__init__.py +31 -0
- agrobr/cache/duckdb_store.py +433 -0
- agrobr/cache/history.py +317 -0
- agrobr/cache/migrations.py +82 -0
- agrobr/cache/policies.py +240 -0
- agrobr/cepea/__init__.py +7 -0
- agrobr/cepea/api.py +360 -0
- agrobr/cepea/client.py +273 -0
- agrobr/cepea/parsers/__init__.py +37 -0
- agrobr/cepea/parsers/base.py +35 -0
- agrobr/cepea/parsers/consensus.py +300 -0
- agrobr/cepea/parsers/detector.py +108 -0
- agrobr/cepea/parsers/fingerprint.py +226 -0
- agrobr/cepea/parsers/v1.py +305 -0
- agrobr/cli.py +323 -0
- agrobr/conab/__init__.py +21 -0
- agrobr/conab/api.py +239 -0
- agrobr/conab/client.py +219 -0
- agrobr/conab/parsers/__init__.py +7 -0
- agrobr/conab/parsers/v1.py +383 -0
- agrobr/constants.py +205 -0
- agrobr/exceptions.py +104 -0
- agrobr/health/__init__.py +23 -0
- agrobr/health/checker.py +202 -0
- agrobr/health/reporter.py +314 -0
- agrobr/http/__init__.py +9 -0
- agrobr/http/browser.py +214 -0
- agrobr/http/rate_limiter.py +69 -0
- agrobr/http/retry.py +93 -0
- agrobr/http/user_agents.py +67 -0
- agrobr/ibge/__init__.py +19 -0
- agrobr/ibge/api.py +273 -0
- agrobr/ibge/client.py +256 -0
- agrobr/models.py +85 -0
- agrobr/normalize/__init__.py +64 -0
- agrobr/normalize/dates.py +303 -0
- agrobr/normalize/encoding.py +102 -0
- agrobr/normalize/regions.py +308 -0
- agrobr/normalize/units.py +278 -0
- agrobr/noticias_agricolas/__init__.py +6 -0
- agrobr/noticias_agricolas/client.py +222 -0
- agrobr/noticias_agricolas/parser.py +187 -0
- agrobr/sync.py +147 -0
- agrobr/telemetry/__init__.py +17 -0
- agrobr/telemetry/collector.py +153 -0
- agrobr/utils/__init__.py +5 -0
- agrobr/utils/logging.py +59 -0
- agrobr/validators/__init__.py +35 -0
- agrobr/validators/sanity.py +286 -0
- agrobr/validators/structural.py +313 -0
- agrobr-0.1.0.dist-info/METADATA +243 -0
- agrobr-0.1.0.dist-info/RECORD +58 -0
- agrobr-0.1.0.dist-info/WHEEL +4 -0
- agrobr-0.1.0.dist-info/entry_points.txt +2 -0
- agrobr-0.1.0.dist-info/licenses/LICENSE +21 -0
agrobr/cepea/parsers/consensus.py
@@ -0,0 +1,300 @@
"""
Multi-parser consensus for cross-validation.

Runs multiple parsers and compares their results to detect
parsing problems.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any

import structlog

from ...alerts.notifier import AlertLevel, send_alert
from ...exceptions import ParseError
from ...models import Indicador
from .base import BaseParser
from .v1 import CepeaParserV1

logger = structlog.get_logger()

CONSENSUS_PARSERS: list[type[BaseParser]] = [
    CepeaParserV1,
]

DIVERGENCE_THRESHOLD_COUNT = 0.1
DIVERGENCE_THRESHOLD_VALUE = 0.01


@dataclass
class ConsensusResult:
    """Result of parsing with consensus."""

    indicadores: list[Indicador]
    parser_used: BaseParser
    all_results: dict[int, list[Indicador]]
    has_consensus: bool
    divergences: list[dict[str, Any]]
    report: dict[str, Any]


@dataclass
class ParserDivergence:
    """Divergence between parsers."""

    divergence_type: str
    versions: list[int]
    details: dict[str, Any]


async def parse_with_consensus(
    html: str,
    produto: str,
    require_consensus: bool = False,
) -> ConsensusResult:
    """
    Run multiple parsers and compare their results.

    Args:
        html: HTML content
        produto: Product to parse
        require_consensus: If True, raise an error when parsers diverge

    Returns:
        ConsensusResult

    Raises:
        ParseError: If require_consensus is set and parsers diverge
    """
    results: dict[int, list[Indicador]] = {}
    errors: dict[int, str] = {}

    for parser_cls in CONSENSUS_PARSERS:
        parser = parser_cls()
        try:
            can_parse, confidence = parser.can_parse(html)
            if can_parse and confidence > 0.5:
                parsed = parser.parse(html, produto)
                results[parser.version] = parsed
                logger.debug(
                    "consensus_parser_success",
                    version=parser.version,
                    count=len(parsed),
                )
        except Exception as e:
            errors[parser.version] = str(e)
            logger.warning(
                "consensus_parser_failed",
                version=parser.version,
                error=str(e),
            )

    divergences, report = analyze_consensus(results, errors)

    has_consensus = len(divergences) == 0

    if not has_consensus:
        logger.warning(
            "consensus_divergence_detected",
            divergence_count=len(divergences),
        )

        if require_consensus:
            await send_alert(
                level=AlertLevel.WARNING,
                title="Parser consensus failed",
                details=report,
            )
            raise ParseError(
                source="cepea",
                parser_version=0,
                reason=f"Parsers diverged: {len(divergences)} differences",
            )

    latest_version = max(results.keys()) if results else 0
    best_results = results.get(latest_version, [])

    parser_used: BaseParser = CepeaParserV1()
    for parser_cls in CONSENSUS_PARSERS:
        if parser_cls().version == latest_version:
            parser_used = parser_cls()
            break

    return ConsensusResult(
        indicadores=best_results,
        parser_used=parser_used,
        all_results=results,
        has_consensus=has_consensus,
        divergences=divergences,
        report=report,
    )


def analyze_consensus(
    results: dict[int, list[Indicador]],
    errors: dict[int, str],
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """
    Analyze the results of multiple parsers.

    Args:
        results: Results per parser version
        errors: Errors per parser version

    Returns:
        Tuple (list of divergences, full report)
    """
    report = {
        "parser_count": len(CONSENSUS_PARSERS),
        "successful": list(results.keys()),
        "failed": list(errors.keys()),
        "errors": errors,
    }

    divergences: list[dict[str, Any]] = []

    if len(results) < 2:
        return divergences, report

    counts = {v: len(r) for v, r in results.items()}
    unique_counts = set(counts.values())

    if len(unique_counts) > 1:
        divergences.append(
            {
                "type": "count_mismatch",
                "versions": list(counts.keys()),
                "counts": counts,
                "description": f"Different record counts: {counts}",
            }
        )

    versions = list(results.keys())
    base_version = versions[0]
    base_results = results[base_version]

    for other_version in versions[1:]:
        other_results = results[other_version]

        if base_results and other_results:
            if base_results[0].data != other_results[0].data:
                divergences.append(
                    {
                        "type": "first_date_mismatch",
                        "versions": [base_version, other_version],
                        "values": [str(base_results[0].data), str(other_results[0].data)],
                    }
                )

            first_diff = abs(float(base_results[0].valor) - float(other_results[0].valor))
            if first_diff > DIVERGENCE_THRESHOLD_VALUE:
                divergences.append(
                    {
                        "type": "first_value_mismatch",
                        "versions": [base_version, other_version],
                        "values": [str(base_results[0].valor), str(other_results[0].valor)],
                        "difference": first_diff,
                    }
                )

            if base_results[-1].data != other_results[-1].data:
                divergences.append(
                    {
                        "type": "last_date_mismatch",
                        "versions": [base_version, other_version],
                        "values": [str(base_results[-1].data), str(other_results[-1].data)],
                    }
                )

            last_diff = abs(float(base_results[-1].valor) - float(other_results[-1].valor))
            if last_diff > DIVERGENCE_THRESHOLD_VALUE:
                divergences.append(
                    {
                        "type": "last_value_mismatch",
                        "versions": [base_version, other_version],
                        "values": [str(base_results[-1].valor), str(other_results[-1].valor)],
                        "difference": last_diff,
                    }
                )

    report["divergences"] = divergences
    report["has_divergence"] = len(divergences) > 0

    return divergences, report


def select_best_result(
    results: dict[int, list[Indicador]],
    divergences: list[dict[str, Any]],
) -> tuple[int, list[Indicador]]:
    """
    Select the best result when there is divergence.

    Strategy:
    1. Prefer the most recent parser
    2. If record counts differ, prefer the result with more data
    3. As a last resort, use the most recent

    Args:
        results: Results per version
        divergences: Detected divergences

    Returns:
        Tuple (selected version, indicators)
    """
    if not results:
        return 0, []

    has_count_mismatch = any(d["type"] == "count_mismatch" for d in divergences)

    if has_count_mismatch:
        best_version = max(results.keys(), key=lambda v: len(results[v]))
    else:
        best_version = max(results.keys())

    return best_version, results[best_version]


class ConsensusValidator:
    """Consensus validator for continuous use."""

    def __init__(self) -> None:
        self.history: list[ConsensusResult] = []
        self.divergence_count = 0

    async def validate(self, html: str, produto: str) -> ConsensusResult:
        """
        Run validation with history tracking.

        Args:
            html: HTML content
            produto: Product

        Returns:
            ConsensusResult
        """
        result = await parse_with_consensus(html, produto, require_consensus=False)

        self.history.append(result)
        if not result.has_consensus:
            self.divergence_count += 1

        return result

    @property
    def divergence_rate(self) -> float:
        """Divergence rate over the history."""
        if not self.history:
            return 0.0
        return self.divergence_count / len(self.history)

    def get_statistics(self) -> dict[str, Any]:
        """Return validator statistics."""
        return {
            "total_validations": len(self.history),
            "divergence_count": self.divergence_count,
            "divergence_rate": self.divergence_rate,
            "consensus_rate": 1 - self.divergence_rate,
        }
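
The consensus module above is the package's mechanism for cross-checking a page with every registered parser. A minimal usage sketch follows, assuming the caller has already fetched the CEPEA page HTML; the product slug "boi_gordo" and the snapshot file name are illustrative placeholders, not values confirmed by this diff.

import asyncio

from agrobr.cepea.parsers.consensus import ConsensusValidator, parse_with_consensus


async def check_page(html: str) -> None:
    # One-off consensus run; with require_consensus=False it reports divergences
    # instead of raising ParseError.
    result = await parse_with_consensus(html, produto="boi_gordo", require_consensus=False)
    print(result.has_consensus, len(result.indicadores), result.report)

    # Continuous validation keeps a history so the divergence rate can be monitored.
    validator = ConsensusValidator()
    await validator.validate(html, produto="boi_gordo")
    print(validator.get_statistics())


if __name__ == "__main__":
    with open("cepea_snapshot.html", encoding="utf-8") as f:  # placeholder snapshot file
        asyncio.run(check_page(f.read()))
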
agrobr/cepea/parsers/detector.py
@@ -0,0 +1,108 @@
"""Parser detector and selector with cascading fallback."""

from __future__ import annotations

from datetime import date
from typing import TYPE_CHECKING

import structlog

from agrobr import constants, exceptions
from agrobr.cepea.parsers import base
from agrobr.cepea.parsers.v1 import CepeaParserV1

if TYPE_CHECKING:
    from agrobr import models

logger = structlog.get_logger()

PARSERS: list[type[base.BaseParser]] = [
    CepeaParserV1,
]


async def get_parser_with_fallback(
    html: str,
    produto: str,
    data_referencia: date | None = None,
    strict: bool = False,
) -> tuple[base.BaseParser, list[models.Indicador]]:
    """Select a parser and run it with cascading fallback."""
    if not PARSERS:
        raise exceptions.ParseError(
            source="cepea",
            parser_version=0,
            reason="No parsers registered. CEPEA parser will be implemented in WEEK 3.",
            html_snippet=html[:200],
        )

    errors: list[tuple[str, str]] = []
    warnings: list[str] = []

    for parser_cls in reversed(PARSERS):
        parser = parser_cls()

        if data_referencia:
            if parser.valid_from > data_referencia:
                continue
            if parser.valid_until and data_referencia > parser.valid_until:
                continue

        can_parse, confidence = parser.can_parse(html)

        logger.debug(
            "parser_check",
            parser_version=parser.version,
            can_parse=can_parse,
            confidence=confidence,
        )

        if not can_parse:
            continue

        if confidence < constants.CONFIDENCE_LOW and strict:
            raise exceptions.FingerprintMismatchError(
                source=parser.source,
                similarity=confidence,
                threshold=constants.CONFIDENCE_LOW,
            )

        if confidence < constants.CONFIDENCE_HIGH:
            warnings.append(
                f"Parser v{parser.version} confidence {confidence:.1%} "
                f"(expected >= {constants.CONFIDENCE_HIGH:.1%})"
            )

        try:
            result = parser.parse(html, produto)

            if not result:
                errors.append((f"v{parser.version}", "No data extracted"))
                continue

            if warnings:
                logger.warning(
                    "parser_low_confidence",
                    parser_version=parser.version,
                    confidence=confidence,
                    warnings=warnings,
                )

            return parser, result

        except Exception as e:
            errors.append((f"v{parser.version}", str(e)))
            logger.warning(
                "parser_failed",
                parser_version=parser.version,
                error=str(e),
            )
            continue

    error_summary = "; ".join(f"{v}: {e}" for v, e in errors)
    raise exceptions.ParseError(
        source=PARSERS[0]().source if PARSERS else "cepea",
        parser_version=0,
        reason=f"All parsers failed: {error_summary}",
        html_snippet=html[:500],
    )
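
For routine parsing the detector above is the entry point: it walks the registered parsers from newest to oldest and returns the first one that succeeds. A minimal sketch of how it might be called, with the product slug, reference date, and html variable as assumed placeholders rather than values from this diff:

import asyncio
from datetime import date

from agrobr.cepea.parsers.detector import get_parser_with_fallback


async def parse_snapshot(html: str) -> None:
    # data_referencia narrows the candidates to parsers whose validity window
    # covers that date; strict=True would raise FingerprintMismatchError on
    # low-confidence matches instead of only logging a warning.
    parser, indicadores = await get_parser_with_fallback(
        html,
        produto="milho",  # placeholder product slug
        data_referencia=date(2024, 1, 15),
        strict=False,
    )
    print(f"parsed with v{parser.version}: {len(indicadores)} records")
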
agrobr/cepea/parsers/fingerprint.py
@@ -0,0 +1,226 @@
"""Layout fingerprinting for change detection."""

from __future__ import annotations

import hashlib
import json
from datetime import datetime
from pathlib import Path
from typing import Any

import structlog
from bs4 import BeautifulSoup

from agrobr.constants import Fonte
from agrobr.models import Fingerprint

logger = structlog.get_logger()


def extract_fingerprint(
    html: str,
    source: Fonte,
    url: str,
) -> Fingerprint:
    """
    Extract a structural signature from the HTML.

    Args:
        html: HTML content
        source: Data source
        url: Original URL

    Returns:
        Fingerprint with the structural signature
    """
    soup = BeautifulSoup(html, "lxml")

    table_classes: list[list[str]] = []
    for table in soup.find_all("table")[:10]:
        classes_raw = table.get("class")
        if classes_raw is None:
            classes: list[str] = []
        elif isinstance(classes_raw, str):
            classes = [classes_raw]
        else:
            classes = list(classes_raw)
        table_classes.append(sorted(classes))

    keywords = ["preco", "indicador", "cotacao", "valor", "tabela", "dados"]
    key_ids: list[str] = []
    for elem in soup.find_all(id=True):
        elem_id_raw = elem.get("id")
        if elem_id_raw is None or not isinstance(elem_id_raw, str):
            continue
        elem_id = elem_id_raw.lower()
        if any(kw in elem_id for kw in keywords):
            key_ids.append(elem_id_raw)
    key_ids = sorted(set(key_ids))[:20]

    table_headers: list[list[str]] = []
    for table in soup.find_all("table")[:5]:
        headers: list[str] = []
        for th in table.find_all("th"):
            text = th.get_text(strip=True)[:50]
            if text:
                headers.append(text)
        if headers:
            table_headers.append(headers)

    element_counts = {
        "tables": len(soup.find_all("table")),
        "forms": len(soup.find_all("form")),
        "divs_with_id": len(soup.find_all("div", id=True)),
        "inputs": len(soup.find_all("input")),
        "selects": len(soup.find_all("select")),
        "links": len(soup.find_all("a")),
        "scripts": len(soup.find_all("script")),
    }

    structure_elements: list[tuple[str, int, tuple[str, ...]]] = []
    for tag in soup.find_all(["table", "div", "form", "section", "article"])[:30]:
        tag_classes_raw = tag.get("class")
        if tag_classes_raw is None:
            tag_classes: list[str] = []
        elif isinstance(tag_classes_raw, str):
            tag_classes = [tag_classes_raw]
        else:
            tag_classes = list(tag_classes_raw)
        structure_elements.append(
            (
                tag.name or "",
                len(tag.find_all(recursive=False)),
                tuple(sorted(tag_classes))[:3] if tag_classes else (),
            )
        )

    structure_hash = hashlib.md5(str(structure_elements).encode()).hexdigest()[:12]

    return Fingerprint(
        source=source,
        url=url,
        collected_at=datetime.utcnow(),
        table_classes=table_classes,
        key_ids=key_ids,
        structure_hash=structure_hash,
        table_headers=table_headers,
        element_counts=element_counts,
    )


def compare_fingerprints(
    current: Fingerprint,
    reference: Fingerprint,
) -> tuple[float, dict[str, Any]]:
    """
    Compare two fingerprints and return their similarity.

    Args:
        current: Current fingerprint
        reference: Reference fingerprint (baseline)

    Returns:
        tuple[float, dict]: (similarity 0-1, details of the differences)
    """
    scores: dict[str, float] = {}
    details: dict[str, Any] = {}

    scores["structure"] = 1.0 if current.structure_hash == reference.structure_hash else 0.0
    if scores["structure"] == 0:
        details["structure_changed"] = {
            "current": current.structure_hash,
            "reference": reference.structure_hash,
        }

    if reference.table_classes:
        matches = sum(1 for tc in current.table_classes if tc in reference.table_classes)
        scores["table_classes"] = matches / len(reference.table_classes)
        if scores["table_classes"] < 1.0:
            details["table_classes_diff"] = {
                "missing": [
                    tc for tc in reference.table_classes if tc not in current.table_classes
                ],
                "new": [tc for tc in current.table_classes if tc not in reference.table_classes],
            }
    else:
        scores["table_classes"] = 1.0

    if reference.key_ids:
        matches = sum(1 for kid in reference.key_ids if kid in current.key_ids)
        scores["key_ids"] = matches / len(reference.key_ids)
        if scores["key_ids"] < 1.0:
            details["key_ids_diff"] = {
                "missing": [kid for kid in reference.key_ids if kid not in current.key_ids],
                "new": [kid for kid in current.key_ids if kid not in reference.key_ids],
            }
    else:
        scores["key_ids"] = 1.0

    if reference.table_headers:
        header_score = 0.0
        for ref_headers in reference.table_headers:
            for cur_headers in current.table_headers:
                ref_set = set(ref_headers)
                cur_set = set(cur_headers)
                if ref_set or cur_set:
                    jaccard = len(ref_set & cur_set) / len(ref_set | cur_set)
                    header_score = max(header_score, jaccard)
        scores["table_headers"] = header_score
        if scores["table_headers"] < 0.9:
            details["table_headers_diff"] = {
                "reference": reference.table_headers,
                "current": current.table_headers,
            }
    else:
        scores["table_headers"] = 1.0

    count_diffs: dict[str, dict[str, int]] = {}
    for key in reference.element_counts:
        ref_count = reference.element_counts.get(key, 0)
        cur_count = current.element_counts.get(key, 0)
        if ref_count > 0:
            diff_ratio = abs(cur_count - ref_count) / ref_count
            if diff_ratio > 0.5:
                count_diffs[key] = {"reference": ref_count, "current": cur_count}

    if count_diffs:
        scores["element_counts"] = max(0, 1 - len(count_diffs) * 0.2)
        details["element_counts_diff"] = count_diffs
    else:
        scores["element_counts"] = 1.0

    weights = {
        "structure": 0.25,
        "table_classes": 0.20,
        "key_ids": 0.15,
        "table_headers": 0.30,
        "element_counts": 0.10,
    }

    final_score = sum(scores[k] * weights[k] for k in weights)

    logger.debug(
        "fingerprint_comparison",
        scores=scores,
        final_score=final_score,
        has_changes=bool(details),
    )

    return final_score, details


def save_baseline_fingerprint(fingerprint: Fingerprint, path: str) -> None:
    """Save a fingerprint as the reference baseline."""
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(fingerprint.model_dump(mode="json"), f, indent=2, default=str)


def load_baseline_fingerprint(path: str) -> Fingerprint | None:
    """Load the reference fingerprint."""
    if not Path(path).exists():
        return None

    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    return Fingerprint.model_validate(data)