pdf2md-tool 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdf2md/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ """pdf2md — conversor PDF→Markdown CPU-first com roteamento por intent.
2
+
3
+ Núcleo CPU (pdftotext/PyMuPDF) offline e determinístico; capacidades pesadas
4
+ (marker/GPU, pix2tex, OCR, VLM) são opcionais e detectadas em runtime via
5
+ `pdf2md doctor`. CLI em `pdf2md.cli:app`.
6
+ """
7
+
8
+ __version__ = "0.8.0"
@@ -0,0 +1,45 @@
1
+ """Runner standalone de pix2tex — executado pelo python de um venv EXTERNO (torch).
2
+
3
+ NÃO importa pdf2md (o venv externo não tem o pacote). Só pix2tex + PIL + stdlib.
4
+ O executor (pdf2md.executor) cropa as fórmulas no venv geral e chama:
5
+
6
+ <pix2tex_python> _pix2tex_runner.py <crop_dir> <out_json>
7
+
8
+ Lê todos os PNGs de <crop_dir>, roda LatexOCR e escreve {filename: latex} em
9
+ <out_json>. Mantém o torch fora do venv geral (ver feedback_venv_efemero_para_labs).
10
+ """
11
+ import json
12
+ import sys
13
+ from pathlib import Path
14
+
15
+
16
def main(argv) -> int:
    """Run pix2tex over every PNG in a crop directory and dump the results as JSON.

    Args:
        argv: Exactly two items: the directory holding the formula crops
            (``*.png``) and the path of the JSON file to write.

    Returns:
        0 on success (an empty crop directory writes ``{}`` and also returns 0),
        2 on a usage error, 3 when PIL or pix2tex cannot be imported.
    """
    if len(argv) != 2:
        print("uso: _pix2tex_runner.py <crop_dir> <out_json>", file=sys.stderr)
        return 2
    crop_dir, out_json = Path(argv[0]), Path(argv[1])
    crops = sorted(crop_dir.glob("*.png"))
    if not crops:
        # Nothing to recognise: answer before paying for the heavy imports.
        out_json.write_text("{}", encoding="utf-8")
        return 0
    try:
        from PIL import Image
        from pix2tex.cli import LatexOCR
    except ImportError as e:
        print(f"[ERRO] runtime pix2tex ausente: {e}", file=sys.stderr)
        return 3

    model = LatexOCR()
    out = {}
    for png in crops:
        try:
            # Context manager closes the image file handle after each crop;
            # without it a large batch keeps every PNG open until exit.
            with Image.open(png) as img:
                out[png.name] = model(img)
        except Exception as e:  # one bad formula must not take down the batch
            out[png.name] = ""
            print(f"[warn] {png.name}: {e}", file=sys.stderr)
    out_json.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")
    return 0
42
+
43
+
44
+ if __name__ == "__main__":
45
+ raise SystemExit(main(sys.argv[1:]))
pdf2md/_profiles.py ADDED
@@ -0,0 +1,130 @@
1
+ """Índice machine-readable dos perfis ativos — subconjunto *route-relevant*.
2
+
3
+ Espelha o padrão de `corpus/registry.py` (dict Python puro, sem dependência):
4
+ a fonte-de-verdade HUMANA são os YAML em `docs/profiles/ativo/`; este módulo é o
5
+ recorte que o roteador (`pdf2md/routing.py`) consulta. Decisão registrada em
6
+ T090 (decisão #1) — pyyaml não é dependência declarada, então não parseamos YAML
7
+ em runtime.
8
+
9
+ Campos por perfil:
10
+ - role: PRIMARY (extrator full-doc) | REFINER (recorte) | OPTIMIZER
11
+ - hardware: "cpu" | "gpu-required" | "gpu-optional"
12
+ - needs: requisitos externos além de hardware (ex. "marker_bin", "ollama")
13
+ - granularity: "documento" | "recorte"
14
+ - unit_speed_s + unit: custo medido por unidade (pg / formula / img)
15
+ - vram_mb: pico de VRAM medido (0 = CPU)
16
+ - scan: capacidade em scan — "none" | "printed" (Tesseract CPU) | "ocr-gpu" (Surya)
17
+ - wins: dimensões vencidas (do perfil)
18
+ - measured_in: labs de origem
19
+
20
+ Mantido em sincronia manual com docs/profiles/ativo/*.yaml. Se divergir, o YAML
21
+ manda (é a fonte humana); atualizar aqui ao promover/remensurar um perfil.
22
+ """
23
+ from __future__ import annotations
24
+
25
# Role tags stored under each profile's "role" key.
PRIMARY = "PRIMARY"
REFINER = "REFINER"
OPTIMIZER = "OPTIMIZER"

# Profile table keyed by profile name; each value is a plain dict of
# route-relevant fields (see the module docstring for the field list).
PROFILES: dict[str, dict] = {
    # ---- PRIMARY (full-document extractor; pick one) -----------------------
    "marker": {
        "role": PRIMARY,
        "hardware": "gpu-required",
        "needs": ["marker_bin"],
        "granularity": "documento",
        "unit_speed_s": 12.9, "unit": "pg",
        "vram_mb": 3400, "ram_mb": 1500, "cold_s": 30,
        "scan": "ocr-gpu",  # Surya OCR (the only full-document OCR; GPU)
        "math": "nativo",  # Texify: strong math_display/inline
        "wins": ["prose", "math", "livro", "paper", "riqueza", "estabilidade"],
        "measured_in": ["e00", "e14"],
    },
    "pdftotext": {
        "role": PRIMARY,
        "hardware": "cpu",
        "needs": [],
        "granularity": "documento",
        "unit_speed_s": 0.0205, "unit": "pg",
        "vram_mb": 0, "ram_mb": 63, "cold_s": 0.1,
        "scan": "none",  # text layer only; scans go to tesseract
        "math": "cru",  # raw Unicode, no LaTeX
        "wins": ["velocidade", "ram", "offline", "alucinacao", "determinismo", "estabilidade"],
        "measured_in": ["e19"],
    },
    "tesseract": {
        "role": PRIMARY,
        "hardware": "cpu",
        "needs": ["tesseract_bin"],
        "granularity": "documento",
        "unit_speed_s": 2.74, "unit": "pg",  # @300dpi, e20
        "vram_mb": 0, "ram_mb": 124, "cold_s": 0.5,
        "scan": "printed",  # CPU OCR for printed scans (e20 WER 0.052)
        "math": "cru",
        "wins": ["scan_cpu", "alucinacao", "determinismo", "offline"],
        "measured_in": ["e20"],
        "note": "scan impresso forte; manuscrito falha (honesto, sem alucinar).",
    },

    # ---- REFINER (crop-level, composable) ----------------------------------
    "pix2tex": {
        "role": REFINER,
        "hardware": "cpu",
        "needs": ["pix2tex_runtime"],  # the cropper is now built in (formula_cropper, e21);
        # only the torch runtime is still required (external, own venv)
        "granularity": "recorte",
        "unit_speed_s": 6.5, "unit": "formula",
        "vram_mb": 0, "ram_mb": 800, "cold_s": 11.9,
        "refines": "math_display",
        "math_quality": {"display_linha_unica": 0.80, "matriz": 0.50},  # e21 (N=6 / N=3)
        "wins": ["math_semantico", "alucinacao", "offline", "cpu"],
        "measured_in": ["e18", "e21"],
        "note": "cropper CPU resolvido (e21); fronteira: linha-única 0.80, matriz 0.50.",
    },
    "gemma3-4b-small-image": {
        "role": REFINER,
        "hardware": "gpu-optional",
        "needs": ["ollama"],
        "granularity": "recorte",
        "unit_speed_s": 45.9, "unit": "img",
        "vram_mb": 3500, "ram_mb": 600, "cold_s": 5,
        "refines": "logo",
        "wins": ["logo"],
        "measured_in": ["e16"],
    },
    "qwen3-vl-8b-small-image": {
        "role": REFINER,
        "hardware": "gpu-optional",
        "needs": ["ollama"],
        "granularity": "recorte",
        "unit_speed_s": 112.0, "unit": "img",
        "vram_mb": 5500, "ram_mb": 700, "cold_s": 5,
        "refines": "logo",
        "wins": ["logo"],
        "measured_in": ["e16"],
        "note": "fallback Apache-2.0 do gemma3-4b p/ logo.",
    },

    # ---- OPTIMIZER (post-extraction, universal) ----------------------------
    "pdf2md-optimize": {
        "role": OPTIMIZER,
        "hardware": "cpu",
        "needs": [],
        "granularity": "recorte",
        "unit_speed_s": 0.12, "unit": "img",
        "vram_mb": 0, "ram_mb": 100, "cold_s": 0.1,
        "wins": ["bytes", "determinismo", "universal"],
        "measured_in": ["e04"],
    },
}
120
+
121
+
122
def load_active_profiles() -> dict[str, dict]:
    """Return the active (route-relevant) profiles.

    Exists as a function for parity with the T090 API and so tests can
    inject/override the profile set. The module-level ``PROFILES`` dict is
    returned as-is, not a copy.
    """
    return PROFILES
126
+
127
+
128
def by_role(role: str, profiles: dict | None = None) -> dict[str, dict]:
    """Return the subset of profiles whose ``"role"`` equals ``role``.

    Args:
        role: Role tag to match (e.g. ``"PRIMARY"``).
        profiles: Profile table to filter. Only ``None`` falls back to the
            module-level ``PROFILES``; an explicitly passed empty dict is
            honoured and yields an empty result.

    Returns:
        A new dict mapping profile name to profile for the matching entries.
    """
    # `profiles or PROFILES` would treat an injected `{}` as "not given".
    table = PROFILES if profiles is None else profiles
    return {name: spec for name, spec in table.items() if spec["role"] == role}
pdf2md/aggregate.py ADDED
@@ -0,0 +1,407 @@
1
+ """Agrega múltiplos `_stats.json` (recursivos) em `_OVERVIEW.{md,json}`.
2
+
3
+ Varre uma raiz, classifica cada doc (livro/paper/material/outro), e produz:
4
+ - `_OVERVIEW.md`: relatório consolidado humano (resumo + tabelas por kind +
5
+ análise round-trip + multi-iteration + outliers + versões).
6
+ - `_OVERVIEW.json`: dump agregado para máquina.
7
+
8
+ Pode ser usado como módulo (`from pdf2md.aggregate import aggregate`) ou
9
+ script standalone (compat com `python src/aggregate_stats.py ROOT --out DIR`).
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ import sys
15
+ from collections import Counter
16
+ from datetime import datetime
17
+ from pathlib import Path
18
+
19
+
20
def fmt_bytes(n: int) -> str:
    """Render a byte count for humans, e.g. ``1024`` -> ``1.0 KB``.

    Values below 1024 are shown as a whole number of bytes; larger values are
    scaled by 1024 per step and shown with one decimal, topping out at TB.
    """
    size = float(n)
    if size < 1024:
        return f"{int(size)} B"
    for suffix in ("KB", "MB", "GB"):
        size /= 1024
        if size < 1024:
            return f"{size:.1f} {suffix}"
    return f"{size / 1024:.1f} TB"
28
+
29
+
30
def classify_doc(stats: dict, path: Path) -> str:
    """Classify a document purely from its path components.

    Returns ``"livro"``, ``"paper"`` or ``"material"`` when the path contains
    a ``livros``, ``papers`` or ``material_aulas`` component (case-insensitive,
    checked in that order), otherwise ``"outro"``. ``stats`` is not consulted.
    """
    components = {part.lower() for part in path.parts}
    folder_to_kind = (
        ("livros", "livro"),
        ("papers", "paper"),
        ("material_aulas", "material"),
    )
    for folder, kind in folder_to_kind:
        if folder in components:
            return kind
    return "outro"
40
+
41
+
42
def load_multi_roundtrip(folder: Path) -> dict | None:
    """Load ``_multi_roundtrip.json`` from ``folder`` when it is present.

    Returns the parsed JSON, or ``None`` when the file does not exist or
    cannot be read or parsed (best-effort: failures are not reported).
    """
    candidate = folder / "_multi_roundtrip.json"
    if candidate.exists():
        try:
            raw = candidate.read_text(encoding="utf-8")
            return json.loads(raw)
        except Exception:
            return None
    return None
51
+
52
+
53
def collect_docs(root: Path) -> list[dict]:
    """Scan ``root`` recursively and flatten each ``_stats.json`` into one dict.

    Files that cannot be read or parsed, or whose top-level JSON value is not
    an object, are reported on stderr and skipped. JSON ``null`` in a section
    (``output``, ``totals``, ``source``, ``roundtrip``, ``tools``,
    ``divergence_categories``) is treated as an empty object, and ``null``
    counters become ``0``, so downstream sums and ``.get`` calls stay safe.

    Args:
        root: Directory to scan.

    Returns:
        One flat dict per usable ``_stats.json``, in sorted path order.
    """

    def _section(value) -> dict:
        # A missing key and an explicit JSON null both count as "no data".
        return value if isinstance(value, dict) else {}

    docs: list[dict] = []
    for stats_path in sorted(root.rglob("_stats.json")):
        try:
            data = json.loads(stats_path.read_text(encoding="utf-8"))
        except Exception as e:
            print(f"[WARN] {stats_path}: {e}", file=sys.stderr)
            continue
        if not isinstance(data, dict):
            print(f"[WARN] {stats_path}: JSON de topo não é um objeto", file=sys.stderr)
            continue

        rel = stats_path.parent.relative_to(root)
        kind = classify_doc(data, stats_path)
        tot = _section(_section(data.get("output")).get("totals"))
        src = _section(data.get("source"))
        rt = _section(data.get("roundtrip"))
        mrt = load_multi_roundtrip(stats_path.parent)
        docs.append({
            "kind": kind,
            "name": rel.name or str(rel),
            "rel_path": str(rel).replace("\\", "/"),
            "stats_path": str(stats_path),
            "pages": src.get("pages"),
            "pdf_size": src.get("size_bytes") or 0,
            "chapter_count": tot.get("chapter_count") or 0,
            "tokens": tot.get("tokens") or 0,
            "words": tot.get("words") or 0,
            "size_bytes": tot.get("size_bytes") or 0,
            "math_display": tot.get("math_display") or 0,
            "math_inline": tot.get("math_inline") or 0,
            "headers_total": tot.get("headers_total") or 0,
            "tables_rough": tot.get("tables_rough") or 0,
            "images_count": tot.get("images_count") or 0,
            "images_total_bytes": tot.get("images_total_bytes") or 0,
            "ligature_artifacts": tot.get("ligature_artifacts") or 0,
            "extraction_seconds": data.get("extraction_time_seconds"),
            "similarity": rt.get("similarity"),
            "divergences": _section(rt.get("divergence_categories")),
            "tools": _section(data.get("tools")),
            "generated_at": data.get("generated_at", "?"),
            "multi_roundtrip": mrt,
        })
    return docs
95
+
96
+
97
def render_overview(root: Path, docs: list[dict]) -> str:
    """Format the collected docs as a Markdown report with tables and verdicts.

    Args:
        root: Directory the docs were collected from; only used in the title.
        docs: Flat per-document dicts with the keys read below (``kind``,
            ``rel_path``, ``pages``, ``tokens``, ``similarity``, ...).

    Returns:
        The full Markdown text, ending with a newline. With no docs, a short
        report saying that no ``_stats.json`` was found.
    """
    if not docs:
        return f"# OVERVIEW — {root}\n\nNenhum `_stats.json` encontrado.\n"

    # Group by kind; "outro" is collected but has no table of its own below.
    by_kind = {"livro": [], "paper": [], "material": [], "outro": []}
    for d in docs:
        by_kind[d["kind"]].append(d)

    lines = [
        f"# OVERVIEW — `{root}`",
        "",
        f"*Gerado em: {datetime.now().isoformat(timespec='seconds')}*",
        "",
        "Visão consolidada de todas as extrações do conversor neste diretório.",
        "Cada linha vem de um `_stats.json` produzido por `pdf2md stats`.",
        "",
    ]

    # Corpus-wide totals for the executive summary.
    total_pages = sum(d["pages"] or 0 for d in docs)
    total_tokens = sum(d["tokens"] for d in docs)
    total_words = sum(d["words"] for d in docs)
    total_math = sum(d["math_display"] + d["math_inline"] for d in docs)
    total_imgs = sum(d["images_count"] for d in docs)
    total_md_bytes = sum(d["size_bytes"] for d in docs)
    total_pdf_bytes = sum(d["pdf_size"] for d in docs)
    total_img_bytes = sum(d["images_total_bytes"] for d in docs)

    # Round-trip similarity statistics, only over docs that have a measurement.
    sims = [d["similarity"] for d in docs if d["similarity"] is not None]
    sim_avg = sum(sims) / len(sims) if sims else None
    sim_min = min(sims) if sims else None
    sim_max = max(sims) if sims else None

    # Divergence counts summed per category across all docs.
    agg_div: Counter = Counter()
    for d in docs:
        agg_div.update(d["divergences"])

    lines += [
        "## Resumo executivo",
        "",
        "| Métrica | Total |",
        "|---|---:|",
        f"| Documentos extraídos | **{len(docs)}** |",
        f"| Livros | {len(by_kind['livro'])} |",
        f"| Papers | {len(by_kind['paper'])} |",
        f"| Materiais de aula | {len(by_kind['material'])} |",
        f"| Páginas (PDF originais) | {total_pages:,} |",
        f"| Tokens (MD output) | {total_tokens:,} |",
        f"| Palavras | {total_words:,} |",
        f"| Fórmulas | {total_math:,} |",
        f"| Imagens | {total_imgs:,} |",
        f"| Tamanho dos PDFs | {fmt_bytes(total_pdf_bytes)} |",
        f"| Tamanho dos MDs | {fmt_bytes(total_md_bytes)} |",
        f"| Tamanho das imagens | {fmt_bytes(total_img_bytes)} |",
    ]
    if total_pdf_bytes:
        # Size ratios are only defined when some PDF size was recorded.
        ratio_md = total_md_bytes / total_pdf_bytes * 100
        ratio_full = (total_md_bytes + total_img_bytes) / total_pdf_bytes * 100
        lines += [
            f"| MD/PDF (apenas texto) | {ratio_md:.1f}% |",
            f"| (MD+img)/PDF (transporte) | {ratio_full:.1f}% |",
        ]
    if sim_avg is not None:
        lines += [
            f"| Round-trip — média | **{sim_avg*100:.2f}%** |",
            f"| Round-trip — min | {sim_min*100:.2f}% |",
            f"| Round-trip — max | {sim_max*100:.2f}% |",
        ]
    lines.append("")

    # Per-kind tables
    for kind, label in [("livro", "Livros"), ("paper", "Papers"), ("material", "Materiais de aula")]:
        items = by_kind[kind]
        if not items:
            continue
        lines += [
            f"## {label}",
            "",
            "| Doc | Pgs | Seções | Tokens | Palavras | Fórmulas | Imgs | MD | Round-trip | Tempo |",
            "|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|",
        ]
        for d in sorted(items, key=lambda x: x["rel_path"]):
            sim_str = f"{d['similarity']*100:.1f}%" if d["similarity"] is not None else "—"
            # Truthiness test: a missing or zero extraction time both render as "—".
            t_str = f"{d['extraction_seconds']:.0f}s" if d["extraction_seconds"] else "—"
            formula_total = d["math_display"] + d["math_inline"]
            lines.append(
                f"| [{d['rel_path']}](./{d['rel_path']}/_stats.md) "
                f"| {d['pages'] or '?'} "
                f"| {d['chapter_count']} "
                f"| {d['tokens']:,} "
                f"| {d['words']:,} "
                f"| {formula_total} "
                f"| {d['images_count']} "
                f"| {fmt_bytes(d['size_bytes'])} "
                f"| {sim_str} "
                f"| {t_str} |"
            )
        lines.append("")

    if sims:
        lines += [
            "## Análise round-trip",
            "",
            f"Documentos com medição de round-trip: **{len(sims)}/{len(docs)}**.",
            "",
        ]
        # Histogram of similarity percentages in four fixed ranges.
        buckets = {"≥95%": 0, "90-95%": 0, "80-90%": 0, "<80%": 0}
        for s in sims:
            p = s * 100
            if p >= 95:
                buckets["≥95%"] += 1
            elif p >= 90:
                buckets["90-95%"] += 1
            elif p >= 80:
                buckets["80-90%"] += 1
            else:
                buckets["<80%"] += 1
        lines += ["### Distribuição", "", "| Faixa | Documentos |", "|---|---:|"]
        for k, v in buckets.items():
            lines.append(f"| {k} | {v} |")
        lines.append("")

        # NOTE(review): the two `if agg_div:` sections are assumed to sit inside
        # `if sims:` because their headings are sub-sections of the round-trip
        # analysis — confirm against the original indentation.
        if agg_div:
            total_div = sum(agg_div.values())
            lines += [
                "### Divergências (agregadas em todos os docs com round-trip)",
                "",
                "| Categoria | Tokens divergentes | % |",
                "|---|---:|---:|",
            ]
            for cat, n in agg_div.most_common():
                pct = n / max(total_div, 1) * 100
                lines.append(f"| `{cat}` | {n:,} | {pct:.1f}% |")
            lines.append("")

        if agg_div:
            # Share of the "math" category among all aggregated divergences.
            math_pct = agg_div.get("math", 0) / max(sum(agg_div.values()), 1) * 100
            lines += [
                "### Interpretação à luz da hierarquia de prioridades",
                "",
                f"`math` representa **{math_pct:.1f}%** das divergências agregadas. Isso é",
                "consistente com **drift de notação LaTeX** (4ª prioridade — formatação),",
                "não perda de conteúdo (1ª prioridade). O round-trip mede similaridade",
                "byte-a-byte; uma fórmula re-renderizada com `\\rm` em vez de `\\mathrm`",
                "conta como divergência mas preserva o conteúdo matemático.",
                "",
                "Para validar **preservação de conteúdo** (1ª prioridade), seria preciso",
                "comparar AST math e contagem de fórmulas em vez de tokens — ver T410.",
                "",
            ]

    multi_docs = [d for d in docs if d.get("multi_roundtrip")]
    if multi_docs:
        lines += [
            "## Multi-iteration round-trip",
            "",
            "Documentos com teste de iteração MD → PDF → MD' → PDF → ... medindo",
            "convergência ou drift do pipeline. Curva achatada = pipeline idempotente.",
            "",
        ]
        for d in multi_docs:
            mrt = d["multi_roundtrip"]
            iters = mrt.get("iterations", [])
            if not iters:
                continue
            lines += [
                f"### `{d['rel_path']}` — {len(iters)} iterações",
                "",
                "| i | Tokens | Sim(MDᵢ, MD₀) | Sim(MDᵢ, MDᵢ₋₁) | Tempo |",
                "|---:|---:|---:|---:|---:|",
            ]
            for it in iters:
                if it.get("error"):
                    # A failed iteration gets a placeholder row.
                    lines.append(f"| {it['i']} | — | — | — | falha |")
                    continue
                sim_to_0 = f"{it['sim_to_md0']*100:.2f}%" if it.get("sim_to_md0") is not None else "—"
                sim_to_prev = f"{it['sim_to_prev']*100:.2f}%" if it.get("sim_to_prev") is not None else "—"
                lines.append(
                    f"| {it['i']} | {it.get('tokens', 0):,} | "
                    f"{sim_to_0} | {sim_to_prev} | {it.get('seconds', 0):.0f}s |"
                )
            # A verdict needs at least two measured similarities to MD0.
            sims_to_0 = [it["sim_to_md0"] for it in iters if it.get("sim_to_md0") is not None]
            if len(sims_to_0) >= 2:
                first, last = sims_to_0[0], sims_to_0[-1]
                drift = (first - last) * 100
                last_two_diff = abs(sims_to_0[-1] - sims_to_0[-2]) * 100
                lines.append("")
                if abs(drift) < 1.0:
                    lines.append(f"**Veredito:** pipeline estável (drift {drift:.2f}% < 1%).")
                elif drift > 0 and last_two_diff < 0.5:
                    lines.append(f"**Veredito:** convergência logarítmica — perdeu {drift:.1f}% mas estabilizou (Δ últimas duas iterações: {last_two_diff:.2f}%).")
                else:
                    lines.append(f"**Veredito:** drift contínuo — perdeu {drift:.1f}% e ainda variando.")
            lines.append("")
            lines.append(f"Detalhes: [`{d['rel_path']}/_multi_roundtrip.md`](./{d['rel_path']}/_multi_roundtrip.md)")
            lines.append("")

    # Outliers
    lines += ["## Outliers e atenção", ""]
    critical: list[tuple[str, list[str]]] = []
    notable: list[tuple[str, list[str]]] = []
    for d in docs:
        notes: list[str] = []
        bucket: str | None = None
        if d["similarity"] is not None:
            sim_pct = d["similarity"] * 100
            if sim_pct < 50:
                notes.append(f"round-trip crítico ({sim_pct:.1f}%)")
                bucket = "critical"
            elif sim_pct < 70:
                notes.append(f"round-trip moderado ({sim_pct:.1f}%) — provavelmente drift LaTeX")
                bucket = "notable"
        if d["ligature_artifacts"] > 0:
            notes.append(f"{d['ligature_artifacts']} ligaduras quebradas")
            # Keeps a bucket already set above; defaults to critical otherwise.
            bucket = bucket or "critical"
        if d["tokens"] == 0:
            notes.append("zero tokens (extração falhou?)")
            # Zero tokens always escalates to critical.
            bucket = "critical"
        if not notes:
            continue
        if bucket == "critical":
            critical.append((d["rel_path"], notes))
        else:
            notable.append((d["rel_path"], notes))

    if critical:
        lines += ["### Crítico (investigar)", ""]
        for path, notes in critical:
            lines.append(f"- `{path}` — {'; '.join(notes)}")
        lines.append("")
    if notable:
        lines += ["### Notável (drift esperado, monitorar)", ""]
        for path, notes in notable:
            lines.append(f"- `{path}` — {'; '.join(notes)}")
        lines.append("")
    if not critical and not notable:
        lines.append("Nenhum documento com flags de atenção.")
        lines.append("")

    # Versions: count docs per (marker, torch, device) combination.
    tool_versions: Counter = Counter()
    for d in docs:
        t = d["tools"]
        sig = (
            t.get("marker", "?"),
            t.get("torch", "?"),
            t.get("cuda_device", "CPU"),
        )
        tool_versions[sig] += 1
    if tool_versions:
        lines += [
            "## Pipeline / versões",
            "",
            "| marker-pdf | torch | device | docs |",
            "|---|---|---|---:|",
        ]
        for (mk, tc, dev), n in tool_versions.most_common():
            lines.append(f"| `{mk}` | `{tc}` | `{dev}` | {n} |")
        lines.append("")

    return "\n".join(lines) + "\n"
358
+
359
+
360
def aggregate(root: Path, out_dir: Path | None = None) -> tuple[Path, Path]:
    """Scan ``root``, aggregate the stats and write ``_OVERVIEW.md`` / ``.json``.

    The output directory defaults to ``root`` and is created when missing.

    Returns:
        ``(md_path, json_path)`` for the two files written.
    """
    target = out_dir or root
    target.mkdir(parents=True, exist_ok=True)

    docs = collect_docs(root)
    md_path = target / "_OVERVIEW.md"
    json_path = target / "_OVERVIEW.json"

    # Human-readable report first, then the machine-readable dump.
    md_path.write_text(render_overview(root, docs), encoding="utf-8")

    payload = {
        "generated_at": datetime.now().isoformat(timespec="seconds"),
        "root": str(root),
        "docs": docs,
    }
    json_path.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    return md_path, json_path
383
+
384
+
385
def _cli() -> int:
    """Standalone CLI (compatible with `python src/aggregate_stats.py ROOT --out DIR`).

    Parses ``ROOT`` and the optional ``--out`` directory from the command
    line, runs ``aggregate`` and prints the document count and output paths.

    Returns:
        0 on success, 1 when ``ROOT`` is not a directory.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Agrega _stats.json em _OVERVIEW.md")
    parser.add_argument("root", type=Path)
    parser.add_argument("--out", type=Path, default=None,
                        help="Diretório de saída (default: <root>)")
    args = parser.parse_args()

    if not args.root.is_dir():
        print(f"[ERRO] Diretório não encontrado: {args.root}", file=sys.stderr)
        return 1

    md_path, json_path = aggregate(args.root, args.out)
    # Count from the dump just written instead of rescanning the tree: a second
    # collect_docs() would re-parse every file, repeat each [WARN] on stderr,
    # and could disagree with what was actually written.
    doc_count = len(json.loads(json_path.read_text(encoding="utf-8"))["docs"])
    print(f"[INFO] {doc_count} doc(s) com _stats.json em {args.root}")
    print(f"[OK] {md_path}")
    print(f"[OK] {json_path}")
    return 0
404
+
405
+
406
+ if __name__ == "__main__":
407
+ raise SystemExit(_cli())