own-rag-cli 0.0.1-snapshot

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1426 @@
1
#!/usr/bin/env python3
"""
indexer_full.py — Standalone indexing script for the local RAG.

Usage:
    python indexer_full.py [project_path]

If no path is given, the current directory is used.
ChromaDB must be running via Docker on localhost:8000.
"""
11
+
12
+ import os
13
+ import sys
14
+ import hashlib
15
+ import argparse
16
+ import shutil
17
+ import logging
18
+ import gc
19
+ import json
20
+ from time import perf_counter, time
21
+ from collections.abc import Iterator
22
+ from pathlib import Path
23
+ from dataclasses import dataclass
24
+ from datetime import datetime
25
+
26
# Suppress noisy "advisory" warnings from transformers in the interactive flow.
os.environ.setdefault("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1")
28
+
29
+
30
+ class _TorchDtypeWarningFilter(logging.Filter):
31
+ def filter(self, record: logging.LogRecord) -> bool:
32
+ return "`torch_dtype` is deprecated! Use `dtype` instead!" not in record.getMessage()
33
+
34
+
35
# Attach the filter to the two transformers loggers that emit the warning.
for _logger_name in ("transformers.configuration_utils", "transformers.modeling_utils"):
    logging.getLogger(_logger_name).addFilter(_TorchDtypeWarningFilter())
37
+
38
+ import chromadb
39
+ from sentence_transformers import SentenceTransformer
40
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
41
+ from tqdm import tqdm
42
+ from download_model_from_hugginface import download_model_with_fallback
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # Configurações globais
46
+ # ---------------------------------------------------------------------------
47
+
48
+
49
+ def _env_int(name: str, default: int, *, min_value: int = 1) -> int:
50
+ raw = os.environ.get(name)
51
+ if raw is None:
52
+ return max(min_value, default)
53
+ try:
54
+ return max(min_value, int(raw))
55
+ except ValueError:
56
+ return max(min_value, default)
57
+
58
# ChromaDB endpoint (Docker on localhost) and collection names.
CHROMA_HOST = "localhost"
CHROMA_PORT = _env_int("MCP_CHROMA_PORT", 8000, min_value=1)
COLLECTION_CODE_JINA = "code_vectors_jina"
COLLECTION_DOC_BGE = "doc_vectors_bge"

# Directories skipped during the project scan.
IGNORED_DIRS = {
    ".git", "node_modules", "__pycache__", ".venv", "venv", "env",
    "dist", "build", "out", ".next", ".nuxt", ".cache", "coverage",
    ".pytest_cache", ".mypy_cache", ".ruff_cache", "target", "bin", "obj",
    ".idea", ".vscode", ".DS_Store", "vendor", "tmp", "temp", "logs",
    ".rag_db",
}

# File extensions that are never indexed.
IGNORED_EXTENSIONS = {
    # Binaries and images
    ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".webp", ".bmp",
    ".mp4", ".mp3", ".wav", ".ogg", ".avi", ".mov",
    # Archives and compiled artifacts
    ".zip", ".tar", ".gz", ".rar", ".7z", ".jar", ".war", ".ear",
    ".pyc", ".pyo", ".so", ".dll", ".exe", ".bin",
    # Lockfiles and generated files
    ".lock", ".sum",
    # Databases
    ".sqlite", ".db", ".sqlite3",
    # Fonts
    ".ttf", ".woff", ".woff2", ".eot",
    # PDF / binary document formats
    ".pdf", ".docx", ".xlsx", ".pptx",
}

# Extensions routed to the code-embedding branch (used by hybrid mode).
CODE_EXTENSIONS = {
    ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".c", ".h", ".cpp", ".hpp",
    ".go", ".rs", ".rb", ".php", ".cs", ".swift", ".kt", ".kts", ".scala", ".sql",
    ".sh", ".bash", ".zsh", ".ps1", ".yaml", ".yml", ".toml", ".ini", ".conf",
    ".json", ".xml", ".html", ".css", ".scss", ".sass", ".vue", ".svelte", ".dart",
    ".lua", ".r", ".m", ".mm",
}

# Extensions routed to the documentation-embedding branch (used by hybrid mode).
DOC_EXTENSIONS = {
    ".md", ".mdx", ".rst", ".txt", ".adoc", ".org", ".tex", ".csv",
}

# Maximum file size (avoids indexing huge generated files).
MAX_FILE_SIZE_BYTES = 500 * 1024  # 500 KB

# Splitter and batch parameters (low-memory profile by default).
CHUNK_SIZE = _env_int("MCP_CHUNK_SIZE", 3000, min_value=256)
CHUNK_OVERLAP = min(CHUNK_SIZE - 1, _env_int("MCP_CHUNK_OVERLAP", 400, min_value=0))
EMBEDDING_BATCH_SIZE = _env_int("MCP_EMBEDDING_BATCH_SIZE", 4, min_value=1)
DEFAULT_PERF_PROFILE = "autotune"
# Where the tuning config is persisted: primary path plus a fallback location.
INDEXER_CONFIG_PATH = Path(
    os.environ.get("MCP_INDEXER_CONFIG_FILE", str(Path.home() / ".rag_db" / "indexer_tuning.json"))
).expanduser()
INDEXER_CONFIG_FALLBACK_PATH = Path.home() / ".cache" / "my-custom-rag-python" / "indexer_tuning.json"

# Embedding models (run on CPU).
JINA_V3_EMBEDDING_MODEL = "jinaai/jina-embeddings-v3"
JINA_V2_EMBEDDING_MODEL = "jinaai/jina-embeddings-v2-base-code"
BGE_EMBEDDING_MODEL = "BAAI/bge-m3"
DEFAULT_EMBEDDING_MODEL_CHOICE = "jina"
DEFAULT_JINA_QUANTIZATION = "dynamic-int8"
MODEL_CACHE_BASE_DIR = Path(
    os.environ.get("MCP_MODEL_DIR", str(Path.home() / ".cache" / "my-custom-rag-python" / "models"))
).expanduser()
# RAM/swap thresholds (GiB) used by the Jina OOM-risk warning.
JINA_RECOMMENDED_RAM_GB_DEFAULT = 64
JINA_RECOMMENDED_RAM_GB_DYNAMIC_INT8 = 48
JINA_RECOMMENDED_SWAP_GB = 16
JINA_MIN_AVAILABLE_RAM_GB_HINT = 12
127
+
128
+
129
+ def _env_bool(name: str, default: bool = False) -> bool:
130
+ raw = os.environ.get(name)
131
+ if raw is None:
132
+ return default
133
+ return raw.strip().lower() in {"1", "true", "yes", "on"}
134
+
135
+
136
+ def _clamp(value: float, low: float, high: float) -> float:
137
+ return max(low, min(high, value))
138
+
139
+
140
+ def _is_memory_related_error(exc: Exception) -> bool:
141
+ if isinstance(exc, MemoryError):
142
+ return True
143
+ msg = str(exc).lower()
144
+ memory_markers = (
145
+ "out of memory",
146
+ "oom",
147
+ "cannot allocate memory",
148
+ "std::bad_alloc",
149
+ "bad alloc",
150
+ "insufficient memory",
151
+ )
152
+ return any(marker in msg for marker in memory_markers)
153
+
154
+
155
+ def _is_dimension_mismatch_error(exc: Exception) -> bool:
156
+ msg = str(exc).lower()
157
+ return (
158
+ "expecting embedding with dimension" in msg
159
+ or ("embedding" in msg and "dimension" in msg and "got" in msg)
160
+ )
161
+
162
+
163
+ def _format_exception(exc: Exception) -> str:
164
+ message = str(exc).strip()
165
+ if message:
166
+ return message
167
+ return repr(exc)
168
+
169
+
170
@dataclass(frozen=True)
class IndexTarget:
    """One indexing destination: which embedding model feeds which Chroma collection."""

    # Embedding model key: "jina", "jina-v2" or "bge".
    model_choice: str
    # Name of the target ChromaDB collection.
    collection_name: str
    # Human-readable label used in progress/log output.
    label: str
175
+
176
+
177
def _resolve_model_id(model_choice: str) -> str:
    """Map a model-choice key to its Hugging Face model id (raises ValueError otherwise)."""
    model_ids = {
        "jina": JINA_V3_EMBEDDING_MODEL,
        "jina-v2": JINA_V2_EMBEDDING_MODEL,
        "bge": BGE_EMBEDDING_MODEL,
    }
    if model_choice not in model_ids:
        raise ValueError(f"Modelo não suportado: {model_choice}")
    return model_ids[model_choice]
185
+
186
+
187
def _resolve_fallback_model_id(model_choice: str) -> str:
    """Return the fallback model id; BGE regardless of *model_choice*.

    The parameter is intentionally unused — it is kept for signature
    symmetry with `_resolve_model_id`.
    """
    del model_choice
    return BGE_EMBEDDING_MODEL
189
+
190
+
191
def _describe_embedding_choice(model_choice: str) -> str:
    """Return a human-readable description of an embedding-model choice.

    Unknown choices are returned unchanged. Fix: the valid "jina-v2" key
    previously fell through to the bare string instead of getting the same
    "name (model-id)" description as every other supported choice.
    """
    if model_choice == "jina":
        return f"jina ({JINA_V3_EMBEDDING_MODEL})"
    if model_choice == "jina-v2":
        return f"jina-v2 ({JINA_V2_EMBEDDING_MODEL})"
    if model_choice == "bge":
        return f"bge ({BGE_EMBEDDING_MODEL})"
    if model_choice == "hybrid":
        return f"hybrid ({JINA_V2_EMBEDDING_MODEL} + {BGE_EMBEDDING_MODEL})"
    return model_choice
199
+
200
+
201
def _resolve_index_targets(model_choice: str) -> list[IndexTarget]:
    """Expand a model choice into its list of (model, collection, label) targets.

    "jina" and "bge" each map to a single target; "hybrid" indexes into
    both collections. Raises ValueError for any other choice.
    """
    target_specs = {
        "jina": [("jina", COLLECTION_CODE_JINA, "Code/Jina")],
        "bge": [("bge", COLLECTION_DOC_BGE, "Doc/BGE")],
        "hybrid": [
            ("jina-v2", COLLECTION_CODE_JINA, "Code/Jina v2"),
            ("bge", COLLECTION_DOC_BGE, "Doc/BGE"),
        ],
    }
    if model_choice not in target_specs:
        raise ValueError(f"Modelo não suportado: {model_choice}")
    return [
        IndexTarget(model_choice=choice, collection_name=collection, label=label)
        for choice, collection, label in target_specs[model_choice]
    ]
232
+
233
+
234
+ def _classify_file_targets(filepath: Path, model_choice: str) -> set[str]:
235
+ if model_choice != "hybrid":
236
+ return {model_choice}
237
+
238
+ suffix = filepath.suffix.lower()
239
+ is_code = suffix in CODE_EXTENSIONS
240
+ is_doc = suffix in DOC_EXTENSIONS
241
+
242
+ if is_code and not is_doc:
243
+ return {"jina-v2"}
244
+ if is_doc and not is_code:
245
+ return {"bge"}
246
+
247
+ # Extensão desconhecida/ambígua: indexa nos dois ramos para manter recall.
248
+ return {"jina-v2", "bge"}
249
+
250
+
251
+ def _model_cache_dir(base_dir: Path, model_id: str) -> Path:
252
+ safe_name = model_id.replace("/", "__").replace(":", "_")
253
+ return base_dir / safe_name
254
+
255
+
256
+ def _pick_with_prompt(
257
+ *,
258
+ current_value: str | None,
259
+ default_value: str,
260
+ title: str,
261
+ options: list[tuple[str, str]],
262
+ ) -> str:
263
+ if current_value:
264
+ return current_value
265
+ if not sys.stdin.isatty():
266
+ return default_value
267
+
268
+ print(f"\n[CONFIG] {title}")
269
+ for index, (_, description) in enumerate(options, start=1):
270
+ print(f" {index}) {description}")
271
+ print(f" Enter = padrão ({default_value})")
272
+
273
+ answer = input("> Escolha: ").strip()
274
+ if not answer:
275
+ return default_value
276
+ if answer.isdigit():
277
+ idx = int(answer) - 1
278
+ if 0 <= idx < len(options):
279
+ return options[idx][0]
280
+ lowered = answer.lower()
281
+ valid_keys = {k for k, _ in options}
282
+ if lowered in valid_keys:
283
+ return lowered
284
+ print(f"[AVISO] Opção inválida '{answer}'. Usando padrão: {default_value}")
285
+ return default_value
286
+
287
+
288
def resolve_embedding_config(
    model_choice_arg: str | None,
    jina_quantization_arg: str | None,
    persisted_config: dict[str, object] | None = None,
) -> tuple[str, str]:
    """Resolve the (model_choice, jina_quantization) pair.

    Precedence for both values: explicit CLI argument > environment
    variable > persisted config > interactive prompt (or default without a
    TTY). Invalid values fall back to the defaults with a printed warning.
    The quantization question only applies to the "jina" choice; every
    other choice forces "default".
    """
    persisted_config = persisted_config or {}
    model_choice_from_config = persisted_config.get("embedding_model")
    model_choice = model_choice_arg or os.environ.get("MCP_EMBEDDING_MODEL")
    if not model_choice and isinstance(model_choice_from_config, str):
        model_choice = model_choice_from_config
    if model_choice:
        model_choice = model_choice.strip().lower()
    # Prompt only fires when model_choice is still unset and stdin is a TTY.
    model_choice = _pick_with_prompt(
        current_value=model_choice,
        default_value=DEFAULT_EMBEDDING_MODEL_CHOICE,
        title="Escolha do modelo de embeddings",
        options=[
            (
                "jina",
                f"jina ({JINA_V3_EMBEDDING_MODEL}) - foco em código.",
            ),
            (
                "bge",
                f"bge ({BGE_EMBEDDING_MODEL}) - conteúdo misto.",
            ),
            (
                "hybrid",
                f"hybrid (Jina v2 {JINA_V2_EMBEDDING_MODEL} + BGE) - duas coleções.",
            ),
        ],
    )
    if model_choice not in {"jina", "bge", "hybrid"}:
        print(f"[AVISO] MCP_EMBEDDING_MODEL inválido '{model_choice}'. Usando '{DEFAULT_EMBEDDING_MODEL_CHOICE}'.")
        model_choice = DEFAULT_EMBEDDING_MODEL_CHOICE

    quantization_from_config = persisted_config.get("jina_quantization")
    jina_quantization = jina_quantization_arg or os.environ.get("MCP_JINA_QUANTIZATION")
    if not jina_quantization and isinstance(quantization_from_config, str):
        jina_quantization = quantization_from_config
    if jina_quantization:
        # Normalize e.g. "DYNAMIC_INT8" -> "dynamic-int8".
        jina_quantization = jina_quantization.strip().lower().replace("_", "-")

    if model_choice == "jina":
        jina_quantization = _pick_with_prompt(
            current_value=jina_quantization,
            default_value=DEFAULT_JINA_QUANTIZATION,
            title="Quantizacao do Jina (apenas para CPU)",
            options=[
                ("default", "default (sem quantizacao) - maior qualidade, indexacao mais lenta."),
                ("dynamic-int8", "dynamic-int8 - indexacao mais rapida e menor uso de RAM, com pequena perda de qualidade."),
            ],
        )
        if jina_quantization not in {"default", "dynamic-int8"}:
            print(
                f"[AVISO] MCP_JINA_QUANTIZATION inválido '{jina_quantization}'. "
                f"Usando '{DEFAULT_JINA_QUANTIZATION}'."
            )
            jina_quantization = DEFAULT_JINA_QUANTIZATION
    else:
        # Quantization only applies to the Jina v3 model.
        jina_quantization = "default"

    return model_choice, jina_quantization
350
+
351
+
352
def _indexer_config_candidates() -> list[Path]:
    """Return candidate config paths: the primary first, then the fallback (deduplicated)."""
    paths = [INDEXER_CONFIG_PATH]
    if INDEXER_CONFIG_FALLBACK_PATH not in paths:
        paths.append(INDEXER_CONFIG_FALLBACK_PATH)
    return paths
357
+
358
+
359
def load_indexer_tuning_config(force_reconfigure: bool) -> dict[str, object]:
    """Load the persisted tuning config from the first readable candidate.

    Returns an empty dict when reconfiguration is forced, when no candidate
    file exists, or when every candidate is unreadable or not a JSON object
    (best-effort: a broken config must never abort indexing).
    """
    if force_reconfigure:
        return {}
    for path in _indexer_config_candidates():
        try:
            if path.exists():
                parsed = json.loads(path.read_text(encoding="utf-8"))
                if isinstance(parsed, dict):
                    return parsed
        except Exception:
            # Deliberate best-effort: try the next candidate.
            pass
    return {}
372
+
373
+
374
def save_indexer_tuning_config(config: dict[str, object]) -> None:
    """Persist *config* plus an `updated_at` timestamp to the first writable candidate.

    Tries the primary path first, then the fallback; if neither can be
    written, prints a warning listing every failure instead of raising.
    """
    payload = {
        **config,
        "updated_at": int(time()),
    }
    failures: list[tuple[Path, Exception]] = []

    for path in _indexer_config_candidates():
        try:
            path.parent.mkdir(parents=True, exist_ok=True)
            path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
            if path == INDEXER_CONFIG_PATH:
                print(f"[CONFIG] Configuração persistida em: {path}")
            else:
                print(
                    f"[CONFIG] Configuração persistida em fallback: {path} "
                    f"(destino primário sem permissão: {INDEXER_CONFIG_PATH})"
                )
            return
        except Exception as err:
            failures.append((path, err))

    details = " | ".join(f"{path}: {_format_exception(err)}" for path, err in failures)
    print(f"[AVISO] Não foi possível persistir configuração: {details}")
398
+
399
+
400
def resolve_perf_profile(perf_profile_arg: str | None, persisted_config: dict[str, object]) -> str:
    """Resolve the indexing performance profile.

    Precedence: explicit argument > MCP_PERF_PROFILE env var > persisted
    config > interactive prompt (or the default without a TTY). Invalid
    values fall back to the default with a printed warning.
    """
    profile = perf_profile_arg or os.environ.get("MCP_PERF_PROFILE")
    if not profile:
        persisted = persisted_config.get("perf_profile")
        if isinstance(persisted, str):
            profile = persisted
    if profile:
        profile = profile.strip().lower()

    profile = _pick_with_prompt(
        current_value=profile,
        default_value=DEFAULT_PERF_PROFILE,
        title="Perfil de performance da indexação",
        options=[
            (
                "autotune",
                "autotune - equilíbrio (recomendado).",
            ),
            (
                "max-performance",
                "max-performance - máximo throughput (mais RAM).",
            ),
        ],
    )
    if profile not in {"autotune", "max-performance"}:
        print(f"[AVISO] Perfil inválido '{profile}'. Usando '{DEFAULT_PERF_PROFILE}'.")
        profile = DEFAULT_PERF_PROFILE
    return profile
427
+
428
+
429
+ def _parse_config_int(config: dict[str, object], key: str) -> int | None:
430
+ raw = config.get(key)
431
+ if isinstance(raw, int):
432
+ return raw
433
+ if isinstance(raw, str) and raw.isdigit():
434
+ return int(raw)
435
+ return None
436
+
437
+
438
+ def _read_meminfo_gib() -> tuple[float | None, float | None, float | None]:
439
+ """Retorna (mem_total, mem_available, swap_total) em GiB, quando disponível."""
440
+ mem_total_kib: int | None = None
441
+ mem_available_kib: int | None = None
442
+ swap_total_kib: int | None = None
443
+
444
+ try:
445
+ for line in Path("/proc/meminfo").read_text(encoding="utf-8").splitlines():
446
+ if line.startswith("MemTotal:"):
447
+ mem_total_kib = int(line.split()[1])
448
+ elif line.startswith("MemAvailable:"):
449
+ mem_available_kib = int(line.split()[1])
450
+ elif line.startswith("SwapTotal:"):
451
+ swap_total_kib = int(line.split()[1])
452
+ except (OSError, ValueError, IndexError):
453
+ return None, None, None
454
+
455
+ to_gib = lambda kib: (kib / (1024 * 1024)) if kib is not None else None
456
+ return to_gib(mem_total_kib), to_gib(mem_available_kib), to_gib(swap_total_kib)
457
+
458
+
459
def warn_if_jina_memory_risk(model_choice: str, jina_quantization: str) -> None:
    """Print an OOM-risk warning when running Jina on a memory-constrained host."""
    if model_choice not in {"jina", "hybrid"}:
        return

    total_gib, available_gib, swap_gib = _read_meminfo_gib()
    if total_gib is None:
        # No /proc/meminfo available: nothing to assess.
        return

    if jina_quantization == "default":
        recommended_gib = JINA_RECOMMENDED_RAM_GB_DEFAULT
    else:
        recommended_gib = JINA_RECOMMENDED_RAM_GB_DYNAMIC_INT8

    findings: list[str] = []
    if total_gib < recommended_gib:
        findings.append(
            f"RAM total detectada: {total_gib:.1f} GiB (recomendado >= {recommended_gib} GiB para Jina/{jina_quantization})."
        )
    if swap_gib is not None and swap_gib < JINA_RECOMMENDED_SWAP_GB:
        findings.append(
            f"Swap detectada: {swap_gib:.1f} GiB (recomendado >= {JINA_RECOMMENDED_SWAP_GB} GiB)."
        )
    if available_gib is not None and available_gib < JINA_MIN_AVAILABLE_RAM_GB_HINT:
        findings.append(
            f"RAM livre atual: {available_gib:.1f} GiB (baixo para a carga inicial do Jina)."
        )

    if not findings:
        return

    print("[AVISO] Alto risco de OOM com Jina nesta máquina/carga.")
    for finding in findings:
        print(f" - {finding}")
    print(" - Se ocorrer 'Killed' (exit 137), use BGE: --embedding-model bge")
    print(" - Ou rode o Jina em máquina com mais RAM/swap e menos processos concorrentes.")
496
+
497
+
498
@dataclass(frozen=True)
class RuntimeIndexingParams:
    """Resolved runtime indexing parameters plus the reasoning behind each value."""

    # Splitter chunk size, in characters.
    chunk_size: int
    # Overlap between consecutive chunks, in characters.
    chunk_overlap: int
    # Batch size used when encoding embeddings.
    embedding_batch_size: int
    # Human-readable notes explaining how each value was chosen.
    # NOTE(review): a mutable list on a frozen dataclass — instances are not
    # hashable in practice; confirm hashing is never relied upon.
    reasons: list[str]
504
+
505
+
506
def _resolve_max_performance_params(
    *,
    chunk_size_locked: bool,
    chunk_overlap_locked: bool,
    batch_size_locked: bool,
    chunk_size: int,
    chunk_overlap: int,
    embedding_batch_size: int,
) -> RuntimeIndexingParams:
    """Apply the max-performance profile, honoring any user-locked parameter.

    Locked parameters keep their incoming values; the rest are raised based
    on total/available RAM read from /proc/meminfo.
    """
    total_gib, available_gib, _ = _read_meminfo_gib()
    notes = [
        "Perfil selecionado: max-performance.",
        "Modo pode elevar consideravelmente o consumo de memória e causar encerramento por OOM (exit 137).",
    ]
    # "Roomy" hosts (>= 64 GiB total, >= 16 GiB free) get the most aggressive settings.
    roomy = total_gib is not None and total_gib >= 64 and (available_gib or 0) >= 16

    new_chunk_size = chunk_size
    if not chunk_size_locked:
        new_chunk_size = 7000 if roomy else 6000
        notes.append(f"chunk_size ajustado para {new_chunk_size} no perfil max-performance.")

    new_overlap = chunk_overlap
    if not chunk_overlap_locked:
        new_overlap = min(new_chunk_size - 1, max(300, int(new_chunk_size * 0.15)))
        notes.append(f"chunk_overlap ajustado para {new_overlap}.")

    new_batch = embedding_batch_size
    if not batch_size_locked:
        if roomy:
            new_batch = 24
        elif total_gib is not None and total_gib >= 32:
            new_batch = 16
        else:
            new_batch = 12
        notes.append(f"embedding_batch_size ajustado para {new_batch}.")

    return RuntimeIndexingParams(
        chunk_size=new_chunk_size,
        chunk_overlap=new_overlap,
        embedding_batch_size=max(1, new_batch),
        reasons=notes,
    )
551
+
552
+
553
def _resolve_autotuned_params(
    *,
    model: SentenceTransformer,
    chunk_size_locked: bool,
    chunk_overlap_locked: bool,
    batch_size_locked: bool,
    chunk_size: int,
    chunk_overlap: int,
    embedding_batch_size: int,
) -> RuntimeIndexingParams:
    """Autotune indexing parameters for the current host.

    Picks chunk_size/chunk_overlap from RAM tiers and selects the
    embedding batch size with a short micro-benchmark against *model*,
    constrained by a target RAM-usage percentage. Locked parameters keep
    their incoming values. Without psutil, the incoming values are
    returned unchanged.
    """
    reasons: list[str] = ["Perfil selecionado: autotune (custo-benefício)."]
    verbose_autotune = _env_bool("MCP_AUTOTUNE_VERBOSE", default=False)

    try:
        import psutil  # type: ignore
    except Exception:
        # No psutil: no memory readings, so keep the current parameters.
        reasons.append("psutil indisponível; mantendo parâmetros atuais sem benchmark.")
        return RuntimeIndexingParams(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            embedding_batch_size=embedding_batch_size,
            reasons=reasons,
        )

    vm = psutil.virtual_memory()
    swap = psutil.swap_memory()
    mem_total_gib = vm.total / (1024**3)
    mem_available_gib = vm.available / (1024**3)
    swap_total_gib = swap.total / (1024**3)

    # Target RAM usage (percent), clamped to a sane band; tightened further
    # on hosts with little free RAM or swap.
    target_ram_percent = _clamp(
        float(os.environ.get("MCP_AUTOTUNE_TARGET_RAM_PERCENT", "68")),
        60.0,
        75.0,
    )
    if mem_available_gib < 6 or swap_total_gib < 4:
        target_ram_percent = min(target_ram_percent, 63.0)
    reasons.append(
        f"Memória detectada: total={mem_total_gib:.1f} GiB, livre={mem_available_gib:.1f} GiB, "
        f"swap={swap_total_gib:.1f} GiB, alvo={target_ram_percent:.1f}%."
    )

    tuned_chunk_size = chunk_size
    tuned_chunk_overlap = chunk_overlap
    tuned_batch = embedding_batch_size

    # Chunk size tiers by total/available RAM.
    if not chunk_size_locked:
        if mem_total_gib < 8 or mem_available_gib < 3:
            tuned_chunk_size = 1800
        elif mem_total_gib < 16 or mem_available_gib < 6:
            tuned_chunk_size = 2400
        elif mem_total_gib < 32 or mem_available_gib < 12:
            tuned_chunk_size = 3200
        else:
            tuned_chunk_size = 4200
        reasons.append(f"chunk_size autotunado para {tuned_chunk_size}.")

    # Overlap: ~15% of the chunk, at least 120, always below the chunk size.
    if not chunk_overlap_locked:
        tuned_chunk_overlap = min(tuned_chunk_size - 1, max(120, int(tuned_chunk_size * 0.15)))
        reasons.append(f"chunk_overlap autotunado para {tuned_chunk_overlap}.")

    if not batch_size_locked:
        # Cap the candidate batch sizes by the host's memory tier.
        max_candidate = 16
        if mem_total_gib < 8 or mem_available_gib < 3 or swap_total_gib < 2:
            max_candidate = 2
        elif mem_total_gib < 16 or mem_available_gib < 6:
            max_candidate = 4
        elif mem_total_gib < 32 or mem_available_gib < 10:
            max_candidate = 8

        candidates = [2, 4, 6, 8, 12, 16]
        candidates = [c for c in candidates if c <= max_candidate]
        if not candidates:
            candidates = [2]

        process = psutil.Process()
        sample_size = min(max(512, tuned_chunk_size), 3000)
        sample_text = ("# autotune-sample\n" + ("x" * sample_size))

        best_batch = candidates[0]
        best_score = -1.0
        best_memory_pct = 100.0
        selected_benchmark_line: str | None = None
        benchmark_lines: list[str] = []

        # Short warmup to stabilize the model's internal caches.
        try:
            _ = model.encode([sample_text], show_progress_bar=False, batch_size=1)
        except Exception:
            pass

        for candidate in candidates:
            docs = [sample_text] * candidate
            gc.collect()
            before_vm = psutil.virtual_memory().percent
            before_rss = process.memory_info().rss / (1024**2)
            started = perf_counter()
            try:
                embeddings = model.encode(
                    docs,
                    show_progress_bar=False,
                    batch_size=candidate,
                )
            except Exception as e:
                # A failing candidate is recorded and skipped, not fatal.
                benchmark_lines.append(f"batch={candidate}: erro ({e})")
                continue

            elapsed = max(perf_counter() - started, 1e-6)
            after_vm = psutil.virtual_memory().percent
            after_rss = process.memory_info().rss / (1024**2)
            del embeddings
            gc.collect()

            throughput = candidate / elapsed
            # "Safe" = memory stayed within a small tolerance of the target.
            safe = after_vm <= (target_ram_percent + 3.0)
            benchmark_lines.append(
                f"batch={candidate}: {throughput:.2f} itens/s, vm={after_vm:.1f}%, rss_delta={after_rss - before_rss:+.1f} MiB"
            )

            if safe and throughput > best_score:
                best_score = throughput
                best_batch = candidate
                best_memory_pct = after_vm
                selected_benchmark_line = benchmark_lines[-1]
            elif best_score < 0 and after_vm < best_memory_pct:
                # If no candidate was "safe" yet, pick the least memory-hungry one.
                best_batch = candidate
                best_memory_pct = after_vm
                selected_benchmark_line = benchmark_lines[-1]

            # Far past the limit already: do not try larger batches.
            if after_vm > target_ram_percent + 8.0:
                break

            # Avoid choosing a candidate that started above the limit.
            if before_vm > target_ram_percent + 5.0:
                break

        tuned_batch = max(1, best_batch)
        if verbose_autotune:
            reasons.extend(benchmark_lines)
        elif selected_benchmark_line:
            reasons.append(f"Micro-benchmark: {selected_benchmark_line}")
        reasons.append(
            f"embedding_batch_size autotunado para {tuned_batch} (alvo de memória: {target_ram_percent:.1f}%)."
        )

    return RuntimeIndexingParams(
        chunk_size=tuned_chunk_size,
        chunk_overlap=tuned_chunk_overlap,
        embedding_batch_size=max(1, tuned_batch),
        reasons=reasons,
    )
706
+
707
+
708
+ # ---------------------------------------------------------------------------
709
+ # Funções auxiliares
710
+ # ---------------------------------------------------------------------------
711
+
712
def get_text_splitter(chunk_size: int, chunk_overlap: int) -> RecursiveCharacterTextSplitter:
    """Build the project's shared recursive splitter (paragraph > line > word > char)."""
    separators = ["\n\n", "\n", " ", ""]
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=separators,
    )
720
+
721
+
722
+ def load_embedding_model(model_choice: str, jina_quantization: str) -> SentenceTransformer:
723
+ """Carrega o modelo de embeddings forçando uso de CPU."""
724
+ embedding_model_id = _resolve_model_id(model_choice)
725
+ fallback_model_id = _resolve_fallback_model_id(model_choice)
726
+
727
+ model_base_dir = MODEL_CACHE_BASE_DIR
728
+ model_base_dir.mkdir(parents=True, exist_ok=True)
729
+ preferred_model_cache_dir = _model_cache_dir(model_base_dir, embedding_model_id)
730
+
731
+ print(f"[+] Baixando modelo preferido: {embedding_model_id}")
732
+ print(f"[+] Diretório de download/cache do modelo: {preferred_model_cache_dir}")
733
+ selection = download_model_with_fallback(
734
+ preferred_model_id=embedding_model_id,
735
+ fallback_model_id=fallback_model_id,
736
+ local_dir=model_base_dir,
737
+ )
738
+ selected_model_dir = selection.local_dir
739
+ print(
740
+ f"[+] Modelo selecionado: {selection.model_id} "
741
+ f"(provider={selection.provider}, path={selected_model_dir})"
742
+ )
743
+
744
+ def _clear_hf_dynamic_modules_cache() -> None:
745
+ cache_dir = Path.home() / ".cache" / "huggingface" / "modules" / "transformers_modules"
746
+ if cache_dir.exists():
747
+ print(f"[!] Limpando cache de módulos dinâmicos do Hugging Face: {cache_dir}")
748
+ shutil.rmtree(cache_dir, ignore_errors=True)
749
+
750
+ def _load_from_local_dir(model_id: str) -> SentenceTransformer:
751
+ # O modelo da Jina depende de código remoto; fallback normalmente não.
752
+ trust_remote_code = model_id.startswith("jinaai/")
753
+ tokenizer_kwargs = {"fix_mistral_regex": True}
754
+
755
+ def _instantiate_model() -> SentenceTransformer:
756
+ return SentenceTransformer(
757
+ str(selected_model_dir),
758
+ device="cpu",
759
+ trust_remote_code=trust_remote_code,
760
+ tokenizer_kwargs=tokenizer_kwargs,
761
+ )
762
+
763
+ def _load_with_mistral_regex_patch() -> SentenceTransformer:
764
+ # O código remoto da Jina instancia um tokenizer interno sem repassar tokenizer_kwargs.
765
+ if not trust_remote_code:
766
+ return _instantiate_model()
767
+
768
+ from transformers import AutoModel, AutoTokenizer
769
+ from transformers.modeling_utils import PreTrainedModel
770
+
771
+ original_from_pretrained = AutoTokenizer.from_pretrained
772
+ original_model_from_pretrained = AutoModel.from_pretrained
773
+ original_pretrained_model_from_pretrained = PreTrainedModel.from_pretrained
774
+ original_pretrained_model_from_config = PreTrainedModel._from_config
775
+ model_refs = {str(selected_model_dir), str(selected_model_dir.resolve())}
776
+
777
+ def _patched_from_pretrained(*args, **kwargs):
778
+ model_ref = args[0] if args else kwargs.get("pretrained_model_name_or_path")
779
+ if model_ref is not None and str(model_ref) in model_refs:
780
+ kwargs.setdefault("fix_mistral_regex", True)
781
+ return original_from_pretrained(*args, **kwargs)
782
+
783
+ def _patched_model_from_pretrained(*args, **kwargs):
784
+ model_ref = args[0] if args else kwargs.get("pretrained_model_name_or_path")
785
+ if model_ref is not None and str(model_ref) in model_refs and "torch_dtype" in kwargs:
786
+ kwargs = dict(kwargs)
787
+ if "dtype" not in kwargs:
788
+ kwargs["dtype"] = kwargs["torch_dtype"]
789
+ kwargs.pop("torch_dtype", None)
790
+ return original_model_from_pretrained(*args, **kwargs)
791
+
792
+ original_pretrained_model_from_pretrained_fn = original_pretrained_model_from_pretrained.__func__
793
+
794
+ @classmethod
795
+ def _patched_pretrained_model_from_pretrained(cls, *args, **kwargs):
796
+ if "torch_dtype" in kwargs:
797
+ kwargs = dict(kwargs)
798
+ if "dtype" not in kwargs:
799
+ kwargs["dtype"] = kwargs["torch_dtype"]
800
+ kwargs.pop("torch_dtype", None)
801
+ return original_pretrained_model_from_pretrained_fn(cls, *args, **kwargs)
802
+
803
+ original_pretrained_model_from_config_fn = original_pretrained_model_from_config.__func__
804
+
805
+ @classmethod
806
+ def _patched_pretrained_model_from_config(cls, *args, **kwargs):
807
+ if "torch_dtype" in kwargs:
808
+ kwargs = dict(kwargs)
809
+ if "dtype" not in kwargs:
810
+ kwargs["dtype"] = kwargs["torch_dtype"]
811
+ kwargs.pop("torch_dtype", None)
812
+ return original_pretrained_model_from_config_fn(cls, *args, **kwargs)
813
+
814
+ AutoTokenizer.from_pretrained = _patched_from_pretrained
815
+ AutoModel.from_pretrained = _patched_model_from_pretrained
816
+ PreTrainedModel.from_pretrained = _patched_pretrained_model_from_pretrained
817
+ PreTrainedModel._from_config = _patched_pretrained_model_from_config
818
+ try:
819
+ return _instantiate_model()
820
+ finally:
821
+ AutoTokenizer.from_pretrained = original_from_pretrained
822
+ AutoModel.from_pretrained = original_model_from_pretrained
823
+ PreTrainedModel.from_pretrained = original_pretrained_model_from_pretrained
824
+ PreTrainedModel._from_config = original_pretrained_model_from_config
825
+
826
+ print(f"[+] Carregando modelo de embeddings a partir de: {selected_model_dir} (CPU)...")
827
+ try:
828
+ return _load_with_mistral_regex_patch()
829
+ except FileNotFoundError as e:
830
+ # Corrige corrupção/incompletude no cache dinâmico do transformers.
831
+ if trust_remote_code and "transformers_modules" in str(e):
832
+ print(f"[!] Cache dinâmico inconsistente detectado: {e}")
833
+ _clear_hf_dynamic_modules_cache()
834
+ return _load_with_mistral_regex_patch()
835
+ raise
836
+
837
+ def _apply_jina_quantization_if_needed(model: SentenceTransformer, model_id: str) -> SentenceTransformer:
838
+ if model_id != JINA_V3_EMBEDDING_MODEL or jina_quantization == "default":
839
+ return model
840
+ try:
841
+ import torch
842
+ import warnings
843
+
844
+ quantized_layers = 0
845
+ for module in model.modules():
846
+ if type(module).__name__ != "ParametrizedLinear":
847
+ continue
848
+
849
+ float_linear = torch.nn.Linear(
850
+ module.in_features,
851
+ module.out_features,
852
+ bias=module.bias is not None,
853
+ )
854
+ with torch.no_grad():
855
+ float_linear.weight.copy_(module.weight.detach().to(torch.float32))
856
+ if module.bias is not None:
857
+ float_linear.bias.copy_(module.bias.detach().to(torch.float32))
858
+
859
+ with warnings.catch_warnings():
860
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
861
+ quantized_linear = torch.quantization.quantize_dynamic(
862
+ torch.nn.Sequential(float_linear),
863
+ {torch.nn.Linear},
864
+ dtype=torch.qint8,
865
+ )[0]
866
+
867
+ module._dynamic_int8_linear = quantized_linear
868
+
869
+ def _forward_dynamic_int8(self, input, task_id=None, residual=False):
870
+ out = self._dynamic_int8_linear(input)
871
+ if residual:
872
+ return out, input
873
+ return out
874
+
875
+ module.forward = _forward_dynamic_int8.__get__(module, module.__class__)
876
+ quantized_layers += 1
877
+
878
+ if quantized_layers == 0:
879
+ print(
880
+ "[AVISO] Nenhuma camada ParametrizedLinear encontrada para dynamic-int8; usando modelo padrao."
881
+ )
882
+ return model
883
+
884
+ print(f"[+] Quantizacao Jina aplicada: dynamic-int8 (CPU, {quantized_layers} camadas).")
885
+ return model
886
+ except Exception as quant_error:
887
+ print(f"[AVISO] Falha ao aplicar dynamic-int8 ({quant_error}); usando modelo padrao.")
888
+ return model
889
+
890
+ try:
891
+ model = _load_from_local_dir(selection.model_id)
892
+ model = _apply_jina_quantization_if_needed(model, selection.model_id)
893
+ print("[+] Modelo carregado com sucesso.")
894
+ return model
895
+ except Exception as first_error:
896
+ if selection.model_id == fallback_model_id:
897
+ raise RuntimeError(
898
+ f"Falha ao carregar o modelo fallback '{fallback_model_id}': {first_error}"
899
+ ) from first_error
900
+
901
+ print(
902
+ f"[!] Falha ao carregar '{selection.model_id}': {_format_exception(first_error)}\n"
903
+ f" Tentando fallback de carregamento: {fallback_model_id}"
904
+ )
905
+ fallback_selection = download_model_with_fallback(
906
+ preferred_model_id=fallback_model_id,
907
+ fallback_model_id=fallback_model_id,
908
+ local_dir=model_base_dir,
909
+ )
910
+ selected_model_dir = fallback_selection.local_dir
911
+ print(
912
+ f"[+] Modelo selecionado: {fallback_selection.model_id} "
913
+ f"(provider={fallback_selection.provider}, path={selected_model_dir})"
914
+ )
915
+ model = _load_from_local_dir(fallback_selection.model_id)
916
+ model = _apply_jina_quantization_if_needed(model, fallback_selection.model_id)
917
+ print("[+] Modelo fallback carregado com sucesso.")
918
+ return model
919
+
920
+
921
def connect_to_chroma() -> chromadb.HttpClient:
    """Open an HTTP connection to the ChromaDB server and verify it is alive.

    Exits the process with status 1 (after printing Docker hints) when the
    server cannot be reached.
    """
    try:
        chroma_client = chromadb.HttpClient(host=CHROMA_HOST, port=CHROMA_PORT)
        # A heartbeat round-trip confirms the server is actually up,
        # not just that the client object was constructed.
        chroma_client.heartbeat()
        print(f"[+] Conectado ao ChromaDB em {CHROMA_HOST}:{CHROMA_PORT}")
        return chroma_client
    except Exception as e:
        print(f"[ERRO] Não foi possível conectar ao ChromaDB: {e}")
        print(" Verifique se o container Docker está rodando:")
        print(" docker compose up -d")
        sys.exit(1)
934
+
935
+
936
def scan_files(root_path: Path) -> Iterator[Path]:
    """Walk *root_path* recursively, yielding indexable text files as a stream.

    Ignored directories, ignored extensions, oversized files, and files whose
    size cannot be determined are all skipped.
    """
    for dirpath, dirnames, filenames in os.walk(root_path):
        # Prune ignored and hidden directories in place so os.walk
        # never descends into them; sort for deterministic order.
        kept_dirs = [
            name for name in dirnames
            if name not in IGNORED_DIRS and not name.startswith(".")
        ]
        kept_dirs.sort()
        dirnames[:] = kept_dirs

        for filename in sorted(filenames):
            candidate = Path(dirpath) / filename

            # Skip by extension.
            if candidate.suffix.lower() in IGNORED_EXTENSIONS:
                continue

            # Skip files that are too large or whose metadata is unreadable.
            try:
                size_ok = candidate.stat().st_size <= MAX_FILE_SIZE_BYTES
            except OSError:
                continue
            if size_ok:
                yield candidate
964
+
965
+
966
def make_chunk_id(file_path: str, chunk_index: int) -> str:
    """Derive a deterministic chunk ID from the file path plus chunk index."""
    # MD5 here is an ID generator, not a security primitive.
    key = f"{file_path}::chunk::{chunk_index}".encode()
    return hashlib.md5(key).hexdigest()
970
+
971
+
972
+ def read_file_safe(filepath: Path) -> str | None:
973
+ """Lê um arquivo de texto, tentando múltiplos encodings."""
974
+ for encoding in ("utf-8", "latin-1", "cp1252"):
975
+ try:
976
+ return filepath.read_text(encoding=encoding)
977
+ except UnicodeDecodeError:
978
+ continue
979
+ except OSError as e:
980
+ print(f" [AVISO] Não foi possível ler {filepath}: {e}")
981
+ return None
982
+ # Se nenhum encoding funcionou, é provavelmente binário disfarçado
983
+ return None
984
+
985
+
986
def delete_file_chunks(collection: chromadb.Collection, file_path: str) -> None:
    """Remove every chunk belonging to one file from the collection.

    Best-effort: failures are reported as warnings, never raised.
    """
    try:
        # Request IDs only, so docs/metadata are never materialized in memory.
        existing = collection.get(where={"file_path": file_path}, include=[])
        chunk_ids = existing["ids"] if existing else None
        if chunk_ids:
            collection.delete(ids=chunk_ids)
    except Exception as e:
        print(f" [AVISO] Erro ao deletar chunks de {file_path}: {_format_exception(e)}")
995
+
996
+
997
+ # ---------------------------------------------------------------------------
998
+ # Indexação de um único arquivo
999
+ # ---------------------------------------------------------------------------
1000
+
1001
def index_file(
    filepath: Path,
    collection: chromadb.Collection,
    model: SentenceTransformer,
    splitter: RecursiveCharacterTextSplitter,
    root_path: Path,
    embedding_batch_size: int,
) -> int:
    """
    Index a single file: read it, split into chunks, embed them and upsert.
    Returns the number of chunks indexed (0 for empty/unreadable files).
    """
    content = read_file_safe(filepath)
    if not content or not content.strip():
        return 0

    # The absolute path identifies this file's chunks in metadata.
    abs_path = str(filepath.resolve())

    # Drop any stale chunks first so re-indexing stays idempotent.
    delete_file_chunks(collection, abs_path)

    chunks = splitter.split_text(content)
    if not chunks:
        return 0

    relative_path = str(filepath.relative_to(root_path))
    total_inserted = 0

    # Process the chunks in fixed-size slices so at most one batch of
    # embeddings is alive in memory at any time.
    for start in range(0, len(chunks), embedding_batch_size):
        batch_docs = chunks[start:start + embedding_batch_size]
        batch_ids = [
            make_chunk_id(abs_path, start + offset)
            for offset in range(len(batch_docs))
        ]
        batch_metadatas: list[dict[str, object]] = [
            {
                "file_path": abs_path,
                "chunk_index": start + offset,
                "file_name": filepath.name,
                # Path relative to the project root, for compact display.
                "relative_path": relative_path,
            }
            for offset in range(len(batch_docs))
        ]

        embeddings = model.encode(
            batch_docs,
            show_progress_bar=False,
            batch_size=embedding_batch_size,
        ).tolist()
        collection.upsert(
            ids=batch_ids,
            embeddings=embeddings,
            documents=batch_docs,
            metadatas=batch_metadatas,
        )
        total_inserted += len(batch_ids)

        # Free the embedding buffer eagerly to keep peak RSS down.
        del embeddings
        gc.collect()

    return total_inserted
1073
+
1074
+
1075
+ # ---------------------------------------------------------------------------
1076
+ # Ponto de entrada principal
1077
+ # ---------------------------------------------------------------------------
1078
+
1079
def main() -> None:
    """CLI entry point: parse arguments, resolve tuning config, and index the
    project into one or more ChromaDB collections.

    Fix applied: in the dimension-mismatch retry path the collection was
    deleted and recreated in the `collections` dict, but the local
    `collection` variable still pointed at the deleted collection, so the
    retry upserted into a stale handle. The local is now rebound after the
    recreation.
    """
    parser = argparse.ArgumentParser(
        description="Indexa um projeto de código no ChromaDB para RAG local."
    )
    parser.add_argument(
        "project_path",
        nargs="?",
        default=".",
        help="Caminho raiz do projeto a indexar (padrão: diretório atual)",
    )
    parser.add_argument(
        "--clear",
        action="store_true",
        help="Limpa toda a coleção antes de reindexar",
    )
    parser.add_argument(
        "--embedding-model",
        choices=["jina", "bge", "hybrid"],
        help=(
            "Modelo de embeddings: 'jina' (codigo), "
            "'bge' (conteudo misto) ou 'hybrid' (duas colecoes: Jina v2 + BGE)."
        ),
    )
    parser.add_argument(
        "--jina-quantization",
        choices=["default", "dynamic-int8"],
        help="Quantizacao para Jina: 'default' (mais qualidade) ou 'dynamic-int8' (mais velocidade).",
    )
    parser.add_argument(
        "--perf-profile",
        choices=["autotune", "max-performance"],
        help=(
            "Perfil de performance da indexação: "
            "'autotune' (custo-benefício) ou 'max-performance' (mais throughput, maior uso de RAM)."
        ),
    )
    args = parser.parse_args()

    root_path = Path(args.project_path).resolve()
    if not root_path.is_dir():
        print(f"[ERRO] Caminho não existe ou não é um diretório: {root_path}")
        sys.exit(1)

    print(f"\n{'='*60}")
    print(f" RAG Indexer — Projeto: {root_path}")
    print(f"{'='*60}\n")
    index_started_at = datetime.now()
    print(f"[INFO] Início: {index_started_at.strftime('%Y-%m-%d %H:%M:%S')}")

    # Resolve embedding model / quantization / perf profile from CLI args,
    # environment, and the persisted tuning config (CLI wins).
    force_model_reconfigure = _env_bool("MCP_FORCE_MODEL_RECONFIG", default=False)
    persisted_config = load_indexer_tuning_config(force_model_reconfigure)
    model_choice, jina_quantization = resolve_embedding_config(
        args.embedding_model,
        args.jina_quantization,
        persisted_config=persisted_config,
    )
    perf_profile = resolve_perf_profile(args.perf_profile, persisted_config)

    # Environment variables "lock" a parameter: persisted/tuned values
    # must not override an explicit env setting.
    chunk_size_locked = "MCP_CHUNK_SIZE" in os.environ
    chunk_overlap_locked = "MCP_CHUNK_OVERLAP" in os.environ
    batch_size_locked = "MCP_EMBEDDING_BATCH_SIZE" in os.environ

    persisted_chunk_size = _parse_config_int(persisted_config, "chunk_size")
    persisted_chunk_overlap = _parse_config_int(persisted_config, "chunk_overlap")
    persisted_batch_size = _parse_config_int(persisted_config, "embedding_batch_size")

    # Start from the module defaults, then apply persisted values when the
    # parameter is not locked by the environment (clamped to sane ranges).
    effective_chunk_size = CHUNK_SIZE
    if not chunk_size_locked and persisted_chunk_size is not None:
        effective_chunk_size = max(256, persisted_chunk_size)

    effective_chunk_overlap = CHUNK_OVERLAP
    if not chunk_overlap_locked and persisted_chunk_overlap is not None:
        # Overlap must stay strictly below the chunk size.
        effective_chunk_overlap = max(0, min(effective_chunk_size - 1, persisted_chunk_overlap))

    effective_batch_size = EMBEDDING_BATCH_SIZE
    if not batch_size_locked and persisted_batch_size is not None:
        effective_batch_size = max(1, persisted_batch_size)

    print(
        f"[CONFIG] Modelo escolhido: {model_choice} "
        f"({_describe_embedding_choice(model_choice)})"
    )
    if model_choice == "jina":
        print(f"[CONFIG] Quantizacao Jina: {jina_quantization}")
    elif model_choice == "hybrid":
        print("[CONFIG] Quantizacao Jina: nao aplicavel no hybrid (Jina v2 + BGE)")
    else:
        print("[CONFIG] Quantizacao Jina: nao aplicavel (modelo BGE selecionado)")
    print(f"[CONFIG] Perfil de performance: {perf_profile}")
    if perf_profile == "max-performance":
        print(
            "[AVISO] Este modo pode elevar consideravelmente o consumo de memória "
            "e causar encerramento por OOM (exit 137)."
        )
    warn_if_jina_memory_risk(model_choice, jina_quantization)

    # Initialize components.
    client = connect_to_chroma()
    targets = _resolve_index_targets(model_choice)

    # Get or recreate every collection involved.
    collections: dict[str, chromadb.Collection] = {}
    collection_dimension_reset_done: dict[str, bool] = {}
    for target in targets:
        if args.clear:
            try:
                client.delete_collection(target.collection_name)
                print(f"[!] Coleção '{target.collection_name}' removida para reindexação limpa.")
            except Exception:
                # Collection may simply not exist yet; nothing to clear.
                pass
        collections[target.collection_name] = client.get_or_create_collection(
            name=target.collection_name,
            metadata={"hnsw:space": "cosine"},
        )
        collection_dimension_reset_done[target.collection_name] = False

    # Models are loaded lazily and reused per target.
    loaded_models: dict[str, SentenceTransformer] = {}
    total_chunks = 0
    errors = 0
    files_scanned = 0
    files_processed_total = 0
    chunks_by_collection = {target.collection_name: 0 for target in targets}
    files_by_collection = {target.collection_name: 0 for target in targets}
    files_eligible_by_collection = {target.collection_name: 0 for target in targets}
    errors_by_collection = {target.collection_name: 0 for target in targets}
    error_samples_by_collection: dict[str, list[str]] = {target.collection_name: [] for target in targets}
    target_by_model = {target.model_choice: target for target in targets}

    # Load the first model eagerly so autotune can micro-benchmark it.
    primary_target = targets[0]
    primary_quantization = jina_quantization if primary_target.model_choice == "jina" else "default"
    loaded_models[primary_target.model_choice] = load_embedding_model(primary_target.model_choice, primary_quantization)
    primary_model = loaded_models[primary_target.model_choice]

    if perf_profile == "autotune":
        tuned = _resolve_autotuned_params(
            model=primary_model,
            chunk_size_locked=chunk_size_locked,
            chunk_overlap_locked=chunk_overlap_locked,
            batch_size_locked=batch_size_locked,
            chunk_size=effective_chunk_size,
            chunk_overlap=effective_chunk_overlap,
            embedding_batch_size=effective_batch_size,
        )
    else:
        tuned = _resolve_max_performance_params(
            chunk_size_locked=chunk_size_locked,
            chunk_overlap_locked=chunk_overlap_locked,
            batch_size_locked=batch_size_locked,
            chunk_size=effective_chunk_size,
            chunk_overlap=effective_chunk_overlap,
            embedding_batch_size=effective_batch_size,
        )

    # Clamp the tuned parameters to the same sane ranges as above.
    effective_chunk_size = max(256, tuned.chunk_size)
    effective_chunk_overlap = max(0, min(effective_chunk_size - 1, tuned.chunk_overlap))
    effective_batch_size = max(1, tuned.embedding_batch_size)

    for reason in tuned.reasons:
        print(f"[CONFIG] {reason}")

    print(
        f"[CONFIG] Parâmetros finais: "
        f"chunk_size={effective_chunk_size}, chunk_overlap={effective_chunk_overlap}, "
        f"embedding_batch={effective_batch_size}"
    )

    # Persist the resolved configuration for the next run.
    save_indexer_tuning_config(
        {
            "embedding_model": model_choice,
            "jina_quantization": jina_quantization,
            "perf_profile": perf_profile,
            "chunk_size": effective_chunk_size,
            "chunk_overlap": effective_chunk_overlap,
            "embedding_batch_size": effective_batch_size,
        }
    )

    splitter = get_text_splitter(effective_chunk_size, effective_chunk_overlap)

    print(f"\n[+] Varrendo e indexando arquivos em: {root_path}")
    files = list(scan_files(root_path))
    files_scanned = len(files)
    if files_scanned == 0:
        print("[AVISO] Nenhum arquivo encontrado. Verifique o caminho e os filtros.")
        sys.exit(0)

    print(f"[+] {files_scanned} arquivo(s) elegível(is) para indexação.")
    with tqdm(
        total=files_scanned,
        desc="Indexando",
        unit="arquivo",
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}, {rate_fmt}]",
    ) as pbar:
        for filepath in files:
            # A file may be routed to one or both targets (hybrid mode).
            target_models = _classify_file_targets(filepath, model_choice)

            for target_model in target_models:
                target = target_by_model.get(target_model)
                if target is None:
                    continue

                if target.model_choice not in loaded_models:
                    target_quantization = jina_quantization if target.model_choice == "jina" else "default"
                    try:
                        loaded_models[target.model_choice] = load_embedding_model(
                            target.model_choice,
                            target_quantization,
                        )
                    except Exception as load_error:
                        # In hybrid mode, keeping two large models resident can
                        # exhaust RAM: drop the previous model and retry once.
                        if model_choice == "hybrid" and loaded_models and _is_memory_related_error(load_error):
                            print(
                                "[AVISO] Falha ao carregar modelo adicional no hybrid por memória. "
                                "Liberando modelo anterior e tentando novamente."
                            )
                            loaded_models.clear()
                            gc.collect()
                            loaded_models[target.model_choice] = load_embedding_model(
                                target.model_choice,
                                target_quantization,
                            )
                        else:
                            raise

                model = loaded_models[target.model_choice]
                collection = collections[target.collection_name]
                files_eligible_by_collection[target.collection_name] += 1

                # Retry loop: index the file, adapting batch size and
                # recreating the collection on recoverable failures.
                while True:
                    try:
                        n_chunks = index_file(
                            filepath,
                            collection,
                            model,
                            splitter,
                            root_path,
                            embedding_batch_size=effective_batch_size,
                        )
                        total_chunks += n_chunks
                        files_processed_total += 1
                        chunks_by_collection[target.collection_name] += n_chunks
                        files_by_collection[target.collection_name] += 1
                        break
                    except Exception as e:
                        # Automatic fallback so machines near their RAM limit
                        # degrade gracefully instead of crashing outright.
                        if (
                            not batch_size_locked
                            and effective_batch_size > 1
                            and _is_memory_related_error(e)
                        ):
                            new_batch = max(1, effective_batch_size // 2)
                            if new_batch < effective_batch_size:
                                tqdm.write(
                                    f" [AJUSTE] Memória alta em {target.label}. "
                                    f"Batch reduzido {effective_batch_size} -> {new_batch}."
                                )
                                effective_batch_size = new_batch
                                gc.collect()
                                continue

                        if (
                            _is_dimension_mismatch_error(e)
                            and not collection_dimension_reset_done[target.collection_name]
                        ):
                            tqdm.write(
                                f" [AJUSTE] Dimensão incompatível detectada em '{target.collection_name}'. "
                                "Recriando coleção e tentando novamente."
                            )
                            try:
                                client.delete_collection(target.collection_name)
                            except Exception:
                                pass

                            collections[target.collection_name] = client.get_or_create_collection(
                                name=target.collection_name,
                                metadata={"hnsw:space": "cosine"},
                            )
                            # BUGFIX: rebind the local handle — the previous
                            # `collection` object pointed at the deleted
                            # collection, so the retry would have upserted
                            # into a stale handle.
                            collection = collections[target.collection_name]
                            collection_dimension_reset_done[target.collection_name] = True
                            gc.collect()
                            continue

                        # Unrecoverable for this file/target: record and move on.
                        errors += 1
                        errors_by_collection[target.collection_name] += 1
                        if len(error_samples_by_collection[target.collection_name]) < 3:
                            error_samples_by_collection[target.collection_name].append(
                                f"{filepath.name}: {_format_exception(e)}"
                            )
                        tqdm.write(f" [ERRO] {filepath} [{target.label}]: {_format_exception(e)}")
                        break

            pbar.set_postfix({"chunks": total_chunks, "atual": filepath.name[:20]})
            pbar.update(1)

    # Per-target summary and error samples.
    for target in targets:
        collection_name = target.collection_name
        eligible = files_eligible_by_collection[collection_name]
        processed = files_by_collection[collection_name]
        target_errors = errors_by_collection[collection_name]

        if eligible == 0:
            print(f"[AVISO] Nenhum arquivo elegível para {target.label}; etapa ignorada.")
        elif processed == 0 and target_errors > 0:
            print(
                f"[AVISO] {eligible} arquivo(s) elegível(is) para {target.label}, "
                "mas todos falharam."
            )

        if target_errors:
            print(f"[AVISO] {target_errors} erro(s) durante a indexação do target {target.label}.")
            for sample in error_samples_by_collection[collection_name]:
                print(f" - {sample}")

    # Final report.
    index_finished_at = datetime.now()
    elapsed_seconds = int((index_finished_at - index_started_at).total_seconds())
    elapsed_h = elapsed_seconds // 3600
    elapsed_m = (elapsed_seconds % 3600) // 60
    elapsed_s = elapsed_seconds % 60
    print(f"\n{'='*60}")
    print(" Indexação concluída!")
    print(f" Início : {index_started_at.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f" Fim : {index_finished_at.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f" Duração : {elapsed_h:02d}:{elapsed_m:02d}:{elapsed_s:02d}")
    print(f" Arquivos varridos : {files_scanned}")
    print(f" Arquivos processados : {files_processed_total}")
    print(f" Total de chunks : {total_chunks}")
    print(f" Erros : {errors}")
    for target in targets:
        collection_name = target.collection_name
        print(
            f" Coleção ChromaDB : '{collection_name}' "
            f"(elegíveis={files_eligible_by_collection.get(collection_name, 0)}, "
            f"arquivos={files_by_collection.get(collection_name, 0)}, "
            f"chunks={chunks_by_collection.get(collection_name, 0)})"
        )
    print(f"{'='*60}\n")
1416
+
1417
+
1418
# Script entry point: translate a raw MemoryError into actionable guidance
# (switch to the lighter BGE model, or add RAM/swap) instead of a traceback.
if __name__ == "__main__":
    try:
        main()
    except MemoryError:
        print(
            "[ERRO] Falha de memória durante a indexação. "
            "Use --embedding-model bge ou execute o Jina em máquina com mais RAM/swap."
        )
        sys.exit(1)