own-rag-cli 0.0.1-snapshot
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/MCP_USAGE.md +315 -0
- package/README.md +133 -0
- package/bin/docker-compose.yml +21 -0
- package/bin/download_model_from_hugginface.py +219 -0
- package/bin/download_model_from_modelscope.py +26 -0
- package/bin/indexer_full.py +1426 -0
- package/bin/mcp_server.py +1433 -0
- package/bin/postinstall.sh +102 -0
- package/bin/rag-remove.sh +198 -0
- package/bin/rag-wrapper.sh +186 -0
- package/bin/requirements.txt +21 -0
- package/chroma_monitor.sh +857 -0
- package/how-its-work.md +285 -0
- package/package.json +49 -0
- package/rag-setup-macos.run +1129 -0
- package/rag-setup.run +1179 -0
|
@@ -0,0 +1,1426 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
indexer_full.py — Script standalone de indexação do RAG local.
|
|
4
|
+
|
|
5
|
+
Uso:
|
|
6
|
+
python indexer_full.py [caminho_do_projeto]
|
|
7
|
+
|
|
8
|
+
Se nenhum caminho for passado, usa o diretório atual.
|
|
9
|
+
O ChromaDB deve estar rodando via Docker em localhost:8000.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
import sys
|
|
14
|
+
import hashlib
|
|
15
|
+
import argparse
|
|
16
|
+
import shutil
|
|
17
|
+
import logging
|
|
18
|
+
import gc
|
|
19
|
+
import json
|
|
20
|
+
from time import perf_counter, time
|
|
21
|
+
from collections.abc import Iterator
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from dataclasses import dataclass
|
|
24
|
+
from datetime import datetime
|
|
25
|
+
|
|
26
|
+
# Silence noisy "advisory" warnings from transformers in the interactive flow.
os.environ.setdefault("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class _TorchDtypeWarningFilter(logging.Filter):
|
|
31
|
+
def filter(self, record: logging.LogRecord) -> bool:
|
|
32
|
+
return "`torch_dtype` is deprecated! Use `dtype` instead!" not in record.getMessage()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Attach the suppression filter to the transformers loggers that emit the warning.
_noisy_transformers_loggers = ("transformers.configuration_utils", "transformers.modeling_utils")
for _logger_name in _noisy_transformers_loggers:
    logging.getLogger(_logger_name).addFilter(_TorchDtypeWarningFilter())
|
|
37
|
+
|
|
38
|
+
import chromadb
|
|
39
|
+
from sentence_transformers import SentenceTransformer
|
|
40
|
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
41
|
+
from tqdm import tqdm
|
|
42
|
+
from download_model_from_hugginface import download_model_with_fallback
|
|
43
|
+
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
# Configurações globais
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _env_int(name: str, default: int, *, min_value: int = 1) -> int:
|
|
50
|
+
raw = os.environ.get(name)
|
|
51
|
+
if raw is None:
|
|
52
|
+
return max(min_value, default)
|
|
53
|
+
try:
|
|
54
|
+
return max(min_value, int(raw))
|
|
55
|
+
except ValueError:
|
|
56
|
+
return max(min_value, default)
|
|
57
|
+
|
|
58
|
+
# ChromaDB connection settings (server expected to run via Docker).
CHROMA_HOST = "localhost"
CHROMA_PORT = _env_int("MCP_CHROMA_PORT", 8000, min_value=1)
# Collection names for the code (Jina) and documentation (BGE) branches.
COLLECTION_CODE_JINA = "code_vectors_jina"
COLLECTION_DOC_BGE = "doc_vectors_bge"

# Directories and extensions ignored during the scan
IGNORED_DIRS = {
    ".git", "node_modules", "__pycache__", ".venv", "venv", "env",
    "dist", "build", "out", ".next", ".nuxt", ".cache", "coverage",
    ".pytest_cache", ".mypy_cache", ".ruff_cache", "target", "bin", "obj",
    ".idea", ".vscode", ".DS_Store", "vendor", "tmp", "temp", "logs",
    ".rag_db",
}

IGNORED_EXTENSIONS = {
    # Binaries and images
    ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".webp", ".bmp",
    ".mp4", ".mp3", ".wav", ".ogg", ".avi", ".mov",
    # Archives and compiled artifacts
    ".zip", ".tar", ".gz", ".rar", ".7z", ".jar", ".war", ".ear",
    ".pyc", ".pyo", ".so", ".dll", ".exe", ".bin",
    # Lockfiles and generated files
    ".lock", ".sum",
    # Databases
    ".sqlite", ".db", ".sqlite3",
    # Fonts
    ".ttf", ".woff", ".woff2", ".eot",
    # PDF / binary documents
    ".pdf", ".docx", ".xlsx", ".pptx",
}

# Extensions routed to the code branch (hybrid mode).
CODE_EXTENSIONS = {
    ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".c", ".h", ".cpp", ".hpp",
    ".go", ".rs", ".rb", ".php", ".cs", ".swift", ".kt", ".kts", ".scala", ".sql",
    ".sh", ".bash", ".zsh", ".ps1", ".yaml", ".yml", ".toml", ".ini", ".conf",
    ".json", ".xml", ".html", ".css", ".scss", ".sass", ".vue", ".svelte", ".dart",
    ".lua", ".r", ".m", ".mm",
}

# Extensions routed to the documentation branch (hybrid mode).
DOC_EXTENSIONS = {
    ".md", ".mdx", ".rst", ".txt", ".adoc", ".org", ".tex", ".csv",
}

# Maximum file size (avoids indexing huge generated files)
MAX_FILE_SIZE_BYTES = 500 * 1024  # 500 KB

# Splitter and batch parameters (low-memory profile by default).
CHUNK_SIZE = _env_int("MCP_CHUNK_SIZE", 3000, min_value=256)
# Overlap is capped below CHUNK_SIZE so the splitter always makes progress.
CHUNK_OVERLAP = min(CHUNK_SIZE - 1, _env_int("MCP_CHUNK_OVERLAP", 400, min_value=0))
EMBEDDING_BATCH_SIZE = _env_int("MCP_EMBEDDING_BATCH_SIZE", 4, min_value=1)
DEFAULT_PERF_PROFILE = "autotune"
# Primary tuning-config location, overridable via env; fallback lives in the user cache.
INDEXER_CONFIG_PATH = Path(
    os.environ.get("MCP_INDEXER_CONFIG_FILE", str(Path.home() / ".rag_db" / "indexer_tuning.json"))
).expanduser()
INDEXER_CONFIG_FALLBACK_PATH = Path.home() / ".cache" / "my-custom-rag-python" / "indexer_tuning.json"

# Embedding models (run on CPU)
JINA_V3_EMBEDDING_MODEL = "jinaai/jina-embeddings-v3"
JINA_V2_EMBEDDING_MODEL = "jinaai/jina-embeddings-v2-base-code"
BGE_EMBEDDING_MODEL = "BAAI/bge-m3"
DEFAULT_EMBEDDING_MODEL_CHOICE = "jina"
DEFAULT_JINA_QUANTIZATION = "dynamic-int8"
MODEL_CACHE_BASE_DIR = Path(
    os.environ.get("MCP_MODEL_DIR", str(Path.home() / ".cache" / "my-custom-rag-python" / "models"))
).expanduser()
# RAM/swap sizing hints used by warn_if_jina_memory_risk to flag OOM risk.
JINA_RECOMMENDED_RAM_GB_DEFAULT = 64
JINA_RECOMMENDED_RAM_GB_DYNAMIC_INT8 = 48
JINA_RECOMMENDED_SWAP_GB = 16
JINA_MIN_AVAILABLE_RAM_GB_HINT = 12
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _env_bool(name: str, default: bool = False) -> bool:
|
|
130
|
+
raw = os.environ.get(name)
|
|
131
|
+
if raw is None:
|
|
132
|
+
return default
|
|
133
|
+
return raw.strip().lower() in {"1", "true", "yes", "on"}
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _clamp(value: float, low: float, high: float) -> float:
|
|
137
|
+
return max(low, min(high, value))
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _is_memory_related_error(exc: Exception) -> bool:
|
|
141
|
+
if isinstance(exc, MemoryError):
|
|
142
|
+
return True
|
|
143
|
+
msg = str(exc).lower()
|
|
144
|
+
memory_markers = (
|
|
145
|
+
"out of memory",
|
|
146
|
+
"oom",
|
|
147
|
+
"cannot allocate memory",
|
|
148
|
+
"std::bad_alloc",
|
|
149
|
+
"bad alloc",
|
|
150
|
+
"insufficient memory",
|
|
151
|
+
)
|
|
152
|
+
return any(marker in msg for marker in memory_markers)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _is_dimension_mismatch_error(exc: Exception) -> bool:
|
|
156
|
+
msg = str(exc).lower()
|
|
157
|
+
return (
|
|
158
|
+
"expecting embedding with dimension" in msg
|
|
159
|
+
or ("embedding" in msg and "dimension" in msg and "got" in msg)
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _format_exception(exc: Exception) -> str:
|
|
164
|
+
message = str(exc).strip()
|
|
165
|
+
if message:
|
|
166
|
+
return message
|
|
167
|
+
return repr(exc)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
@dataclass(frozen=True)
class IndexTarget:
    """Immutable (model, collection) pair describing one indexing destination."""

    # Embedding model key for this target: "jina", "jina-v2" or "bge".
    model_choice: str
    # Name of the ChromaDB collection that receives the vectors.
    collection_name: str
    # Short human-readable label used in progress/log output.
    label: str
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _resolve_model_id(model_choice: str) -> str:
    """Map a model key to its Hugging Face model id.

    Raises ``ValueError`` for unknown keys.
    """
    known_models = {
        "jina": JINA_V3_EMBEDDING_MODEL,
        "jina-v2": JINA_V2_EMBEDDING_MODEL,
        "bge": BGE_EMBEDDING_MODEL,
    }
    try:
        return known_models[model_choice]
    except KeyError:
        raise ValueError(f"Modelo não suportado: {model_choice}") from None
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _resolve_fallback_model_id(model_choice: str) -> str:
    """Return the fallback embedding model id.

    BGE is always the fallback regardless of *model_choice*; the parameter is
    kept for interface symmetry with ``_resolve_model_id``.
    """
    return BGE_EMBEDDING_MODEL
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _describe_embedding_choice(model_choice: str) -> str:
    """Produce a human-readable description of the selected embedding setup.

    Unknown keys are returned verbatim.
    """
    descriptions = {
        "jina": f"jina ({JINA_V3_EMBEDDING_MODEL})",
        "bge": f"bge ({BGE_EMBEDDING_MODEL})",
        "hybrid": f"hybrid ({JINA_V2_EMBEDDING_MODEL} + {BGE_EMBEDDING_MODEL})",
    }
    return descriptions.get(model_choice, model_choice)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _resolve_index_targets(model_choice: str) -> list[IndexTarget]:
    """Translate a model key into the list of (model, collection) index targets.

    "hybrid" produces two targets (code via Jina v2, docs via BGE); the single
    models produce one. Raises ``ValueError`` for unknown keys.
    """
    targets_by_choice: dict[str, list[IndexTarget]] = {
        "jina": [
            IndexTarget(
                model_choice="jina",
                collection_name=COLLECTION_CODE_JINA,
                label="Code/Jina",
            ),
        ],
        "bge": [
            IndexTarget(
                model_choice="bge",
                collection_name=COLLECTION_DOC_BGE,
                label="Doc/BGE",
            ),
        ],
        "hybrid": [
            IndexTarget(
                model_choice="jina-v2",
                collection_name=COLLECTION_CODE_JINA,
                label="Code/Jina v2",
            ),
            IndexTarget(
                model_choice="bge",
                collection_name=COLLECTION_DOC_BGE,
                label="Doc/BGE",
            ),
        ],
    }
    try:
        return targets_by_choice[model_choice]
    except KeyError:
        raise ValueError(f"Modelo não suportado: {model_choice}") from None
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _classify_file_targets(filepath: Path, model_choice: str) -> set[str]:
    """Decide which model branch(es) should index *filepath*.

    Outside hybrid mode every file goes to the single selected model. In
    hybrid mode the extension routes code to Jina v2 and docs to BGE.
    """
    if model_choice != "hybrid":
        return {model_choice}

    ext = filepath.suffix.lower()
    code_only = ext in CODE_EXTENSIONS and ext not in DOC_EXTENSIONS
    doc_only = ext in DOC_EXTENSIONS and ext not in CODE_EXTENSIONS

    if code_only:
        return {"jina-v2"}
    if doc_only:
        return {"bge"}
    # Unknown/ambiguous extension: index in both branches to preserve recall.
    return {"jina-v2", "bge"}
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _model_cache_dir(base_dir: Path, model_id: str) -> Path:
|
|
252
|
+
safe_name = model_id.replace("/", "__").replace(":", "_")
|
|
253
|
+
return base_dir / safe_name
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _pick_with_prompt(
    *,
    current_value: str | None,
    default_value: str,
    title: str,
    options: list[tuple[str, str]],
) -> str:
    """Resolve a configuration choice, prompting interactively when possible.

    Precedence: a truthy *current_value* wins outright; without a TTY the
    *default_value* is returned; otherwise the user may answer with the
    1-based option number or the option key itself (case-insensitive).
    Any other answer falls back to *default_value* with a warning.
    """
    if current_value:
        return current_value
    # Non-interactive session (pipe/CI): never block on input().
    if not sys.stdin.isatty():
        return default_value

    print(f"\n[CONFIG] {title}")
    for index, (_, description) in enumerate(options, start=1):
        print(f" {index}) {description}")
    print(f" Enter = padrão ({default_value})")

    answer = input("> Escolha: ").strip()
    if not answer:
        return default_value
    # A numeric answer selects by 1-based position in the options list.
    if answer.isdigit():
        idx = int(answer) - 1
        if 0 <= idx < len(options):
            return options[idx][0]
    # Otherwise accept the option key itself, case-insensitively.
    lowered = answer.lower()
    valid_keys = {k for k, _ in options}
    if lowered in valid_keys:
        return lowered
    print(f"[AVISO] Opção inválida '{answer}'. Usando padrão: {default_value}")
    return default_value
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def resolve_embedding_config(
    model_choice_arg: str | None,
    jina_quantization_arg: str | None,
    persisted_config: dict[str, object] | None = None,
) -> tuple[str, str]:
    """Resolve the (model_choice, jina_quantization) pair.

    Precedence for each value: CLI argument > environment variable >
    persisted config file > interactive prompt > hard-coded default.
    Invalid values are replaced by the defaults with a console warning.
    The quantization option only applies to the "jina" model; every other
    choice forces "default".
    """
    persisted_config = persisted_config or {}
    model_choice_from_config = persisted_config.get("embedding_model")
    model_choice = model_choice_arg or os.environ.get("MCP_EMBEDDING_MODEL")
    if not model_choice and isinstance(model_choice_from_config, str):
        model_choice = model_choice_from_config
    if model_choice:
        model_choice = model_choice.strip().lower()
    model_choice = _pick_with_prompt(
        current_value=model_choice,
        default_value=DEFAULT_EMBEDDING_MODEL_CHOICE,
        title="Escolha do modelo de embeddings",
        options=[
            (
                "jina",
                f"jina ({JINA_V3_EMBEDDING_MODEL}) - foco em código.",
            ),
            (
                "bge",
                f"bge ({BGE_EMBEDDING_MODEL}) - conteúdo misto.",
            ),
            (
                "hybrid",
                f"hybrid (Jina v2 {JINA_V2_EMBEDDING_MODEL} + BGE) - duas coleções.",
            ),
        ],
    )
    if model_choice not in {"jina", "bge", "hybrid"}:
        print(f"[AVISO] MCP_EMBEDDING_MODEL inválido '{model_choice}'. Usando '{DEFAULT_EMBEDDING_MODEL_CHOICE}'.")
        model_choice = DEFAULT_EMBEDDING_MODEL_CHOICE

    quantization_from_config = persisted_config.get("jina_quantization")
    jina_quantization = jina_quantization_arg or os.environ.get("MCP_JINA_QUANTIZATION")
    if not jina_quantization and isinstance(quantization_from_config, str):
        jina_quantization = quantization_from_config
    if jina_quantization:
        # Normalize spelling: case-insensitive, underscores treated as hyphens.
        jina_quantization = jina_quantization.strip().lower().replace("_", "-")

    if model_choice == "jina":
        jina_quantization = _pick_with_prompt(
            current_value=jina_quantization,
            default_value=DEFAULT_JINA_QUANTIZATION,
            title="Quantizacao do Jina (apenas para CPU)",
            options=[
                ("default", "default (sem quantizacao) - maior qualidade, indexacao mais lenta."),
                ("dynamic-int8", "dynamic-int8 - indexacao mais rapida e menor uso de RAM, com pequena perda de qualidade."),
            ],
        )
        if jina_quantization not in {"default", "dynamic-int8"}:
            print(
                f"[AVISO] MCP_JINA_QUANTIZATION inválido '{jina_quantization}'. "
                f"Usando '{DEFAULT_JINA_QUANTIZATION}'."
            )
            jina_quantization = DEFAULT_JINA_QUANTIZATION
    else:
        # Quantization is a Jina-only concern; other models always use "default".
        jina_quantization = "default"

    return model_choice, jina_quantization
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def _indexer_config_candidates() -> list[Path]:
    """Ordered config locations: the primary path first, then the cache fallback.

    The fallback is skipped when it coincides with the primary path.
    """
    candidates = [INDEXER_CONFIG_PATH]
    if INDEXER_CONFIG_FALLBACK_PATH not in candidates:
        candidates += [INDEXER_CONFIG_FALLBACK_PATH]
    return candidates
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def load_indexer_tuning_config(force_reconfigure: bool) -> dict[str, object]:
    """Load the persisted tuning config from the first readable candidate.

    Returns ``{}`` when *force_reconfigure* is set, when no candidate exists,
    or when every candidate is unreadable/corrupt/non-dict (best effort).
    """
    if force_reconfigure:
        return {}
    for candidate in _indexer_config_candidates():
        try:
            if not candidate.exists():
                continue
            parsed = json.loads(candidate.read_text(encoding="utf-8"))
        except Exception:
            # Unreadable or invalid JSON: silently try the next candidate.
            continue
        if isinstance(parsed, dict):
            return parsed
    return {}
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def save_indexer_tuning_config(config: dict[str, object]) -> None:
    """Persist *config* (plus an ``updated_at`` timestamp) to disk.

    Tries each candidate path in order and stops at the first successful
    write; when every candidate fails, prints a warning with all errors
    instead of raising.
    """
    payload = {
        **config,
        "updated_at": int(time()),
    }
    write_errors: list[tuple[Path, Exception]] = []

    for candidate in _indexer_config_candidates():
        try:
            candidate.parent.mkdir(parents=True, exist_ok=True)
            candidate.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
            if candidate == INDEXER_CONFIG_PATH:
                print(f"[CONFIG] Configuração persistida em: {candidate}")
            else:
                # Writing to the fallback implies the primary destination failed.
                print(
                    f"[CONFIG] Configuração persistida em fallback: {candidate} "
                    f"(destino primário sem permissão: {INDEXER_CONFIG_PATH})"
                )
            return
        except Exception as e:
            write_errors.append((candidate, e))

    details = " | ".join(f"{path}: {_format_exception(err)}" for path, err in write_errors)
    print(f"[AVISO] Não foi possível persistir configuração: {details}")
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def resolve_perf_profile(perf_profile_arg: str | None, persisted_config: dict[str, object]) -> str:
    """Resolve the indexing performance profile ("autotune" or "max-performance").

    Precedence: CLI argument > MCP_PERF_PROFILE env var > persisted config >
    interactive prompt > DEFAULT_PERF_PROFILE. Invalid values fall back to
    the default with a console warning.
    """
    profile_from_config = persisted_config.get("perf_profile")
    profile = perf_profile_arg or os.environ.get("MCP_PERF_PROFILE")
    if not profile and isinstance(profile_from_config, str):
        profile = profile_from_config
    if profile:
        profile = profile.strip().lower()

    profile = _pick_with_prompt(
        current_value=profile,
        default_value=DEFAULT_PERF_PROFILE,
        title="Perfil de performance da indexação",
        options=[
            (
                "autotune",
                "autotune - equilíbrio (recomendado).",
            ),
            (
                "max-performance",
                "max-performance - máximo throughput (mais RAM).",
            ),
        ],
    )
    if profile not in {"autotune", "max-performance"}:
        print(f"[AVISO] Perfil inválido '{profile}'. Usando '{DEFAULT_PERF_PROFILE}'.")
        profile = DEFAULT_PERF_PROFILE
    return profile
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def _parse_config_int(config: dict[str, object], key: str) -> int | None:
|
|
430
|
+
raw = config.get(key)
|
|
431
|
+
if isinstance(raw, int):
|
|
432
|
+
return raw
|
|
433
|
+
if isinstance(raw, str) and raw.isdigit():
|
|
434
|
+
return int(raw)
|
|
435
|
+
return None
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def _read_meminfo_gib() -> tuple[float | None, float | None, float | None]:
|
|
439
|
+
"""Retorna (mem_total, mem_available, swap_total) em GiB, quando disponível."""
|
|
440
|
+
mem_total_kib: int | None = None
|
|
441
|
+
mem_available_kib: int | None = None
|
|
442
|
+
swap_total_kib: int | None = None
|
|
443
|
+
|
|
444
|
+
try:
|
|
445
|
+
for line in Path("/proc/meminfo").read_text(encoding="utf-8").splitlines():
|
|
446
|
+
if line.startswith("MemTotal:"):
|
|
447
|
+
mem_total_kib = int(line.split()[1])
|
|
448
|
+
elif line.startswith("MemAvailable:"):
|
|
449
|
+
mem_available_kib = int(line.split()[1])
|
|
450
|
+
elif line.startswith("SwapTotal:"):
|
|
451
|
+
swap_total_kib = int(line.split()[1])
|
|
452
|
+
except (OSError, ValueError, IndexError):
|
|
453
|
+
return None, None, None
|
|
454
|
+
|
|
455
|
+
to_gib = lambda kib: (kib / (1024 * 1024)) if kib is not None else None
|
|
456
|
+
return to_gib(mem_total_kib), to_gib(mem_available_kib), to_gib(swap_total_kib)
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def warn_if_jina_memory_risk(model_choice: str, jina_quantization: str) -> None:
    """Print an OOM-risk warning for the Jina model on low-memory machines.

    No-op when the selected model does not involve Jina, or when memory
    information cannot be read (non-Linux platforms). Purely informational:
    never raises and never blocks.
    """
    if model_choice not in {"jina", "hybrid"}:
        return

    mem_total_gib, mem_available_gib, swap_total_gib = _read_meminfo_gib()
    # mem_total is None when /proc/meminfo is unavailable; nothing to report.
    if mem_total_gib is None:
        return

    # Quantized (dynamic-int8) Jina needs less RAM than the full-precision model.
    recommended_ram_gib = (
        JINA_RECOMMENDED_RAM_GB_DEFAULT
        if jina_quantization == "default"
        else JINA_RECOMMENDED_RAM_GB_DYNAMIC_INT8
    )

    reasons: list[str] = []
    if mem_total_gib < recommended_ram_gib:
        reasons.append(
            f"RAM total detectada: {mem_total_gib:.1f} GiB (recomendado >= {recommended_ram_gib} GiB para Jina/{jina_quantization})."
        )
    if swap_total_gib is not None and swap_total_gib < JINA_RECOMMENDED_SWAP_GB:
        reasons.append(
            f"Swap detectada: {swap_total_gib:.1f} GiB (recomendado >= {JINA_RECOMMENDED_SWAP_GB} GiB)."
        )
    if mem_available_gib is not None and mem_available_gib < JINA_MIN_AVAILABLE_RAM_GB_HINT:
        reasons.append(
            f"RAM livre atual: {mem_available_gib:.1f} GiB (baixo para a carga inicial do Jina)."
        )

    if not reasons:
        return

    print("[AVISO] Alto risco de OOM com Jina nesta máquina/carga.")
    for reason in reasons:
        print(f" - {reason}")
    print(" - Se ocorrer 'Killed' (exit 137), use BGE: --embedding-model bge")
    print(" - Ou rode o Jina em máquina com mais RAM/swap e menos processos concorrentes.")
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
@dataclass(frozen=True)
class RuntimeIndexingParams:
    """Effective chunking/batching parameters chosen for this indexing run."""

    # Characters per text chunk fed to the splitter.
    chunk_size: int
    # Characters of overlap between consecutive chunks.
    chunk_overlap: int
    # Number of chunks embedded per model.encode() call.
    embedding_batch_size: int
    # Human-readable explanations of how each value was chosen.
    reasons: list[str]
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def _resolve_max_performance_params(
    *,
    chunk_size_locked: bool,
    chunk_overlap_locked: bool,
    batch_size_locked: bool,
    chunk_size: int,
    chunk_overlap: int,
    embedding_batch_size: int,
) -> RuntimeIndexingParams:
    """Pick aggressive chunking/batching values for the max-performance profile.

    Each ``*_locked`` flag means the user pinned that value explicitly; locked
    values are passed through untouched. Unlocked values are raised based on
    total/available RAM read from /proc/meminfo. The returned ``reasons`` list
    records every adjustment for display to the user.
    """
    mem_total_gib, mem_available_gib, _ = _read_meminfo_gib()
    reasons = [
        "Perfil selecionado: max-performance.",
        "Modo pode elevar consideravelmente o consumo de memória e causar encerramento por OOM (exit 137).",
    ]

    tuned_chunk_size = chunk_size
    tuned_chunk_overlap = chunk_overlap
    tuned_batch = embedding_batch_size

    if not chunk_size_locked:
        # Big machines (>=64 GiB total, >=16 GiB free) get the largest chunks.
        if mem_total_gib is not None and mem_total_gib >= 64 and (mem_available_gib or 0) >= 16:
            tuned_chunk_size = 7000
        else:
            tuned_chunk_size = 6000
        reasons.append(f"chunk_size ajustado para {tuned_chunk_size} no perfil max-performance.")

    if not chunk_overlap_locked:
        # ~15% overlap, floored at 300 chars, always strictly below chunk_size.
        tuned_chunk_overlap = min(tuned_chunk_size - 1, max(300, int(tuned_chunk_size * 0.15)))
        reasons.append(f"chunk_overlap ajustado para {tuned_chunk_overlap}.")

    if not batch_size_locked:
        if mem_total_gib is not None and mem_total_gib >= 64 and (mem_available_gib or 0) >= 16:
            tuned_batch = 24
        elif mem_total_gib is not None and mem_total_gib >= 32:
            tuned_batch = 16
        else:
            tuned_batch = 12
        reasons.append(f"embedding_batch_size ajustado para {tuned_batch}.")

    return RuntimeIndexingParams(
        chunk_size=tuned_chunk_size,
        chunk_overlap=tuned_chunk_overlap,
        embedding_batch_size=max(1, tuned_batch),
        reasons=reasons,
    )
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def _resolve_autotuned_params(
    *,
    model: SentenceTransformer,
    chunk_size_locked: bool,
    chunk_overlap_locked: bool,
    batch_size_locked: bool,
    chunk_size: int,
    chunk_overlap: int,
    embedding_batch_size: int,
) -> RuntimeIndexingParams:
    """Auto-tune chunking/batching for the "autotune" profile.

    Unlocked values are sized from psutil memory statistics; the batch size is
    additionally chosen via a micro-benchmark that encodes synthetic samples
    with the loaded *model* and keeps the fastest batch that stays under a
    system-RAM usage target. Locked values pass through untouched. Without
    psutil, the incoming values are returned unchanged.
    """
    reasons: list[str] = ["Perfil selecionado: autotune (custo-benefício)."]
    verbose_autotune = _env_bool("MCP_AUTOTUNE_VERBOSE", default=False)

    try:
        import psutil  # type: ignore
    except Exception:
        # psutil is the only source of memory stats here; without it, no tuning.
        reasons.append("psutil indisponível; mantendo parâmetros atuais sem benchmark.")
        return RuntimeIndexingParams(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            embedding_batch_size=embedding_batch_size,
            reasons=reasons,
        )

    vm = psutil.virtual_memory()
    swap = psutil.swap_memory()
    mem_total_gib = vm.total / (1024**3)
    mem_available_gib = vm.available / (1024**3)
    swap_total_gib = swap.total / (1024**3)

    # Target system RAM usage during the benchmark, clamped to [60%, 75%].
    target_ram_percent = _clamp(
        float(os.environ.get("MCP_AUTOTUNE_TARGET_RAM_PERCENT", "68")),
        60.0,
        75.0,
    )
    # Tighten the target on machines with little free RAM or swap headroom.
    if mem_available_gib < 6 or swap_total_gib < 4:
        target_ram_percent = min(target_ram_percent, 63.0)
    reasons.append(
        f"Memória detectada: total={mem_total_gib:.1f} GiB, livre={mem_available_gib:.1f} GiB, "
        f"swap={swap_total_gib:.1f} GiB, alvo={target_ram_percent:.1f}%."
    )

    tuned_chunk_size = chunk_size
    tuned_chunk_overlap = chunk_overlap
    tuned_batch = embedding_batch_size

    if not chunk_size_locked:
        # Step the chunk size up with total/available RAM tiers.
        if mem_total_gib < 8 or mem_available_gib < 3:
            tuned_chunk_size = 1800
        elif mem_total_gib < 16 or mem_available_gib < 6:
            tuned_chunk_size = 2400
        elif mem_total_gib < 32 or mem_available_gib < 12:
            tuned_chunk_size = 3200
        else:
            tuned_chunk_size = 4200
        reasons.append(f"chunk_size autotunado para {tuned_chunk_size}.")

    if not chunk_overlap_locked:
        # ~15% overlap, floored at 120 chars, always strictly below chunk_size.
        tuned_chunk_overlap = min(tuned_chunk_size - 1, max(120, int(tuned_chunk_size * 0.15)))
        reasons.append(f"chunk_overlap autotunado para {tuned_chunk_overlap}.")

    if not batch_size_locked:
        # Cap the candidate batch sizes by memory tier before benchmarking.
        max_candidate = 16
        if mem_total_gib < 8 or mem_available_gib < 3 or swap_total_gib < 2:
            max_candidate = 2
        elif mem_total_gib < 16 or mem_available_gib < 6:
            max_candidate = 4
        elif mem_total_gib < 32 or mem_available_gib < 10:
            max_candidate = 8

        candidates = [2, 4, 6, 8, 12, 16]
        candidates = [c for c in candidates if c <= max_candidate]
        if not candidates:
            candidates = [2]

        process = psutil.Process()
        # Synthetic sample roughly the size of a real chunk (capped at 3000 chars).
        sample_size = min(max(512, tuned_chunk_size), 3000)
        sample_text = ("# autotune-sample\n" + ("x" * sample_size))

        best_batch = candidates[0]
        best_score = -1.0
        best_memory_pct = 100.0
        selected_benchmark_line: str | None = None
        benchmark_lines: list[str] = []

        # Short warmup to stabilize the model's internal caches before timing.
        try:
            _ = model.encode([sample_text], show_progress_bar=False, batch_size=1)
        except Exception:
            pass

        for candidate in candidates:
            docs = [sample_text] * candidate
            gc.collect()
            before_vm = psutil.virtual_memory().percent
            before_rss = process.memory_info().rss / (1024**2)
            started = perf_counter()
            try:
                embeddings = model.encode(
                    docs,
                    show_progress_bar=False,
                    batch_size=candidate,
                )
            except Exception as e:
                # A failing candidate is recorded and skipped, not fatal.
                benchmark_lines.append(f"batch={candidate}: erro ({e})")
                continue

            elapsed = max(perf_counter() - started, 1e-6)
            after_vm = psutil.virtual_memory().percent
            after_rss = process.memory_info().rss / (1024**2)
            # Free the embeddings before measuring the next candidate.
            del embeddings
            gc.collect()

            throughput = candidate / elapsed
            # "Safe" = post-encode system RAM stayed near the usage target.
            safe = after_vm <= (target_ram_percent + 3.0)
            benchmark_lines.append(
                f"batch={candidate}: {throughput:.2f} itens/s, vm={after_vm:.1f}%, rss_delta={after_rss - before_rss:+.1f} MiB"
            )

            if safe and throughput > best_score:
                best_score = throughput
                best_batch = candidate
                best_memory_pct = after_vm
                selected_benchmark_line = benchmark_lines[-1]
            elif best_score < 0 and after_vm < best_memory_pct:
                # If no candidate was "safe" yet, keep the least memory-hungry one.
                best_batch = candidate
                best_memory_pct = after_vm
                selected_benchmark_line = benchmark_lines[-1]

            # Already far past the limit: do not try larger batches.
            if after_vm > target_ram_percent + 8.0:
                break

            # Avoid choosing a candidate that already started above the limit.
            if before_vm > target_ram_percent + 5.0:
                break

        tuned_batch = max(1, best_batch)
        if verbose_autotune:
            reasons.extend(benchmark_lines)
        elif selected_benchmark_line:
            reasons.append(f"Micro-benchmark: {selected_benchmark_line}")
        reasons.append(
            f"embedding_batch_size autotunado para {tuned_batch} (alvo de memória: {target_ram_percent:.1f}%)."
        )

    return RuntimeIndexingParams(
        chunk_size=tuned_chunk_size,
        chunk_overlap=tuned_chunk_overlap,
        embedding_batch_size=max(1, tuned_batch),
        reasons=reasons,
    )
|
|
706
|
+
|
|
707
|
+
|
|
708
|
+
# ---------------------------------------------------------------------------
|
|
709
|
+
# Funções auxiliares
|
|
710
|
+
# ---------------------------------------------------------------------------
|
|
711
|
+
|
|
712
|
+
def get_text_splitter(chunk_size: int, chunk_overlap: int) -> RecursiveCharacterTextSplitter:
    """Build the project's shared text splitter with the given chunking params."""
    # Split preferentially on paragraph breaks, then lines, then words.
    splitter_separators = ["\n\n", "\n", " ", ""]
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=splitter_separators,
    )
|
|
720
|
+
|
|
721
|
+
|
|
722
|
+
def load_embedding_model(model_choice: str, jina_quantization: str) -> SentenceTransformer:
    """Load the embedding model onto CPU, with download and loading fallbacks.

    Flow:
      1. Resolve the preferred and fallback model IDs for ``model_choice``.
      2. Download (or reuse from cache) whichever of the two is available.
      3. Load it via SentenceTransformer, working around known issues in the
         Jina remote code (tokenizer kwargs, ``torch_dtype`` deprecation,
         corrupted dynamic-module cache).
      4. Optionally apply dynamic-int8 quantization for the Jina v3 model.
      5. If loading the preferred model fails, retry the whole pipeline with
         the fallback model; raise RuntimeError if the fallback itself fails.

    Args:
        model_choice: Logical model selection (e.g. "jina", "bge", "hybrid").
        jina_quantization: "default" or "dynamic-int8"; only affects Jina v3.

    Raises:
        RuntimeError: When even the fallback model cannot be loaded.
    """
    embedding_model_id = _resolve_model_id(model_choice)
    fallback_model_id = _resolve_fallback_model_id(model_choice)

    model_base_dir = MODEL_CACHE_BASE_DIR
    model_base_dir.mkdir(parents=True, exist_ok=True)
    preferred_model_cache_dir = _model_cache_dir(model_base_dir, embedding_model_id)

    print(f"[+] Baixando modelo preferido: {embedding_model_id}")
    print(f"[+] Diretório de download/cache do modelo: {preferred_model_cache_dir}")
    # May silently return the fallback model if the preferred one cannot be fetched.
    selection = download_model_with_fallback(
        preferred_model_id=embedding_model_id,
        fallback_model_id=fallback_model_id,
        local_dir=model_base_dir,
    )
    selected_model_dir = selection.local_dir
    print(
        f"[+] Modelo selecionado: {selection.model_id} "
        f"(provider={selection.provider}, path={selected_model_dir})"
    )

    def _clear_hf_dynamic_modules_cache() -> None:
        # Removes the transformers dynamic-code cache; a partial download here
        # causes FileNotFoundError on trust_remote_code models.
        cache_dir = Path.home() / ".cache" / "huggingface" / "modules" / "transformers_modules"
        if cache_dir.exists():
            print(f"[!] Limpando cache de módulos dinâmicos do Hugging Face: {cache_dir}")
            shutil.rmtree(cache_dir, ignore_errors=True)

    def _load_from_local_dir(model_id: str) -> SentenceTransformer:
        # The Jina model depends on remote code; the fallback normally does not.
        trust_remote_code = model_id.startswith("jinaai/")
        tokenizer_kwargs = {"fix_mistral_regex": True}

        def _instantiate_model() -> SentenceTransformer:
            # NOTE: reads `selected_model_dir` from the enclosing scope, so the
            # fallback path below can rebind it before retrying.
            return SentenceTransformer(
                str(selected_model_dir),
                device="cpu",
                trust_remote_code=trust_remote_code,
                tokenizer_kwargs=tokenizer_kwargs,
            )

        def _load_with_mistral_regex_patch() -> SentenceTransformer:
            # Jina's remote code instantiates an internal tokenizer without
            # forwarding tokenizer_kwargs, so we temporarily patch the
            # transformers factory methods to inject the needed kwargs.
            if not trust_remote_code:
                return _instantiate_model()

            from transformers import AutoModel, AutoTokenizer
            from transformers.modeling_utils import PreTrainedModel

            # Keep references to the originals so they can be restored in `finally`.
            original_from_pretrained = AutoTokenizer.from_pretrained
            original_model_from_pretrained = AutoModel.from_pretrained
            original_pretrained_model_from_pretrained = PreTrainedModel.from_pretrained
            original_pretrained_model_from_config = PreTrainedModel._from_config
            # Both the raw and resolved paths may be passed by remote code.
            model_refs = {str(selected_model_dir), str(selected_model_dir.resolve())}

            def _patched_from_pretrained(*args, **kwargs):
                # Inject fix_mistral_regex only for *this* model's tokenizer.
                model_ref = args[0] if args else kwargs.get("pretrained_model_name_or_path")
                if model_ref is not None and str(model_ref) in model_refs:
                    kwargs.setdefault("fix_mistral_regex", True)
                return original_from_pretrained(*args, **kwargs)

            def _patched_model_from_pretrained(*args, **kwargs):
                # Migrate the deprecated `torch_dtype` kwarg to `dtype`.
                model_ref = args[0] if args else kwargs.get("pretrained_model_name_or_path")
                if model_ref is not None and str(model_ref) in model_refs and "torch_dtype" in kwargs:
                    kwargs = dict(kwargs)
                    if "dtype" not in kwargs:
                        kwargs["dtype"] = kwargs["torch_dtype"]
                    kwargs.pop("torch_dtype", None)
                return original_model_from_pretrained(*args, **kwargs)

            original_pretrained_model_from_pretrained_fn = original_pretrained_model_from_pretrained.__func__

            @classmethod
            def _patched_pretrained_model_from_pretrained(cls, *args, **kwargs):
                # Same torch_dtype -> dtype migration on the base-class hook.
                if "torch_dtype" in kwargs:
                    kwargs = dict(kwargs)
                    if "dtype" not in kwargs:
                        kwargs["dtype"] = kwargs["torch_dtype"]
                    kwargs.pop("torch_dtype", None)
                return original_pretrained_model_from_pretrained_fn(cls, *args, **kwargs)

            original_pretrained_model_from_config_fn = original_pretrained_model_from_config.__func__

            @classmethod
            def _patched_pretrained_model_from_config(cls, *args, **kwargs):
                # And on the from-config constructor used by remote code.
                if "torch_dtype" in kwargs:
                    kwargs = dict(kwargs)
                    if "dtype" not in kwargs:
                        kwargs["dtype"] = kwargs["torch_dtype"]
                    kwargs.pop("torch_dtype", None)
                return original_pretrained_model_from_config_fn(cls, *args, **kwargs)

            # Install the patches, load, then always restore the originals.
            AutoTokenizer.from_pretrained = _patched_from_pretrained
            AutoModel.from_pretrained = _patched_model_from_pretrained
            PreTrainedModel.from_pretrained = _patched_pretrained_model_from_pretrained
            PreTrainedModel._from_config = _patched_pretrained_model_from_config
            try:
                return _instantiate_model()
            finally:
                AutoTokenizer.from_pretrained = original_from_pretrained
                AutoModel.from_pretrained = original_model_from_pretrained
                PreTrainedModel.from_pretrained = original_pretrained_model_from_pretrained
                PreTrainedModel._from_config = original_pretrained_model_from_config

        print(f"[+] Carregando modelo de embeddings a partir de: {selected_model_dir} (CPU)...")
        try:
            return _load_with_mistral_regex_patch()
        except FileNotFoundError as e:
            # Fixes corruption/incompleteness in the transformers dynamic cache:
            # clear it once and retry; anything else propagates.
            if trust_remote_code and "transformers_modules" in str(e):
                print(f"[!] Cache dinâmico inconsistente detectado: {e}")
                _clear_hf_dynamic_modules_cache()
                return _load_with_mistral_regex_patch()
            raise

    def _apply_jina_quantization_if_needed(model: SentenceTransformer, model_id: str) -> SentenceTransformer:
        # Dynamic-int8 applies only to the Jina v3 model when explicitly requested.
        if model_id != JINA_V3_EMBEDDING_MODEL or jina_quantization == "default":
            return model
        try:
            import torch
            import warnings

            quantized_layers = 0
            for module in model.modules():
                # Jina v3 wraps its linears (LoRA parametrization); match by class name.
                if type(module).__name__ != "ParametrizedLinear":
                    continue

                # Materialize the parametrized weights into a plain fp32 Linear
                # so torch's dynamic quantization can consume it.
                float_linear = torch.nn.Linear(
                    module.in_features,
                    module.out_features,
                    bias=module.bias is not None,
                )
                with torch.no_grad():
                    float_linear.weight.copy_(module.weight.detach().to(torch.float32))
                    if module.bias is not None:
                        float_linear.bias.copy_(module.bias.detach().to(torch.float32))

                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", category=DeprecationWarning)
                    quantized_linear = torch.quantization.quantize_dynamic(
                        torch.nn.Sequential(float_linear),
                        {torch.nn.Linear},
                        dtype=torch.qint8,
                    )[0]

                module._dynamic_int8_linear = quantized_linear

                def _forward_dynamic_int8(self, input, task_id=None, residual=False):
                    # Mirrors the ParametrizedLinear.forward signature, routing
                    # through the int8 linear; `task_id` is accepted but unused.
                    out = self._dynamic_int8_linear(input)
                    if residual:
                        return out, input
                    return out

                # Bind the replacement forward to this instance only.
                module.forward = _forward_dynamic_int8.__get__(module, module.__class__)
                quantized_layers += 1

            if quantized_layers == 0:
                print(
                    "[AVISO] Nenhuma camada ParametrizedLinear encontrada para dynamic-int8; usando modelo padrao."
                )
                return model

            print(f"[+] Quantizacao Jina aplicada: dynamic-int8 (CPU, {quantized_layers} camadas).")
            return model
        except Exception as quant_error:
            # Quantization is best-effort: on any failure, keep the fp32 model.
            print(f"[AVISO] Falha ao aplicar dynamic-int8 ({quant_error}); usando modelo padrao.")
            return model

    try:
        model = _load_from_local_dir(selection.model_id)
        model = _apply_jina_quantization_if_needed(model, selection.model_id)
        print("[+] Modelo carregado com sucesso.")
        return model
    except Exception as first_error:
        # If the failing model *is* the fallback, there is nothing left to try.
        if selection.model_id == fallback_model_id:
            raise RuntimeError(
                f"Falha ao carregar o modelo fallback '{fallback_model_id}': {first_error}"
            ) from first_error

        print(
            f"[!] Falha ao carregar '{selection.model_id}': {_format_exception(first_error)}\n"
            f" Tentando fallback de carregamento: {fallback_model_id}"
        )
        fallback_selection = download_model_with_fallback(
            preferred_model_id=fallback_model_id,
            fallback_model_id=fallback_model_id,
            local_dir=model_base_dir,
        )
        # Rebind so the nested loader closures pick up the fallback directory.
        selected_model_dir = fallback_selection.local_dir
        print(
            f"[+] Modelo selecionado: {fallback_selection.model_id} "
            f"(provider={fallback_selection.provider}, path={selected_model_dir})"
        )
        model = _load_from_local_dir(fallback_selection.model_id)
        model = _apply_jina_quantization_if_needed(model, fallback_selection.model_id)
        print("[+] Modelo fallback carregado com sucesso.")
        return model
|
|
919
|
+
|
|
920
|
+
|
|
921
|
+
def connect_to_chroma() -> chromadb.HttpClient:
    """Open an HTTP connection to ChromaDB and confirm the server responds.

    Exits the process with status 1 (after printing remediation hints) when
    the server cannot be reached.
    """
    try:
        chroma_client = chromadb.HttpClient(host=CHROMA_HOST, port=CHROMA_PORT)
        # A heartbeat round-trip proves the server is actually up, not just
        # that the client object could be constructed.
        chroma_client.heartbeat()
        print(f"[+] Conectado ao ChromaDB em {CHROMA_HOST}:{CHROMA_PORT}")
        return chroma_client
    except Exception as connection_error:
        print(f"[ERRO] Não foi possível conectar ao ChromaDB: {connection_error}")
        print(" Verifique se o container Docker está rodando:")
        print(" docker compose up -d")
        sys.exit(1)
|
|
934
|
+
|
|
935
|
+
|
|
936
|
+
def scan_files(root_path: Path) -> Iterator[Path]:
    """Walk *root_path* recursively, yielding indexable text files in a
    deterministic (sorted) order.

    Directories in IGNORED_DIRS and hidden directories are pruned from the
    walk; files are filtered by IGNORED_EXTENSIONS and MAX_FILE_SIZE_BYTES.
    """
    for current_dir, subdirs, names in os.walk(root_path):
        # Prune ignored/hidden directories in place so os.walk never descends
        # into them; sorting keeps traversal order stable across runs.
        subdirs[:] = [
            d for d in subdirs
            if d not in IGNORED_DIRS and not d.startswith(".")
        ]
        subdirs.sort()

        base = Path(current_dir)
        for name in sorted(names):
            candidate = base / name

            # Drop files whose extension marks them as irrelevant.
            if candidate.suffix.lower() in IGNORED_EXTENSIONS:
                continue

            # Drop oversized files; a stat failure means the file is
            # unreadable anyway, so it is skipped too.
            try:
                file_size = candidate.stat().st_size
            except OSError:
                continue
            if file_size > MAX_FILE_SIZE_BYTES:
                continue

            yield candidate
|
|
964
|
+
|
|
965
|
+
|
|
966
|
+
def make_chunk_id(file_path: str, chunk_index: int) -> str:
    """Derive a deterministic chunk ID from the file path and chunk index.

    The MD5 here is an ID generator, not a security primitive: the same
    (path, index) pair always maps to the same 32-char hex string, which
    makes re-indexing idempotent.
    """
    key = "::chunk::".join((file_path, str(chunk_index)))
    return hashlib.md5(key.encode()).hexdigest()
|
|
970
|
+
|
|
971
|
+
|
|
972
|
+
def read_file_safe(filepath: Path) -> str | None:
|
|
973
|
+
"""Lê um arquivo de texto, tentando múltiplos encodings."""
|
|
974
|
+
for encoding in ("utf-8", "latin-1", "cp1252"):
|
|
975
|
+
try:
|
|
976
|
+
return filepath.read_text(encoding=encoding)
|
|
977
|
+
except UnicodeDecodeError:
|
|
978
|
+
continue
|
|
979
|
+
except OSError as e:
|
|
980
|
+
print(f" [AVISO] Não foi possível ler {filepath}: {e}")
|
|
981
|
+
return None
|
|
982
|
+
# Se nenhum encoding funcionou, é provavelmente binário disfarçado
|
|
983
|
+
return None
|
|
984
|
+
|
|
985
|
+
|
|
986
|
+
def delete_file_chunks(collection: chromadb.Collection, file_path: str) -> None:
|
|
987
|
+
"""Remove todos os chunks de um arquivo específico da coleção."""
|
|
988
|
+
try:
|
|
989
|
+
# Pede somente IDs para evitar materializar docs/metadata na memória.
|
|
990
|
+
results = collection.get(where={"file_path": file_path}, include=[])
|
|
991
|
+
if results and results["ids"]:
|
|
992
|
+
collection.delete(ids=results["ids"])
|
|
993
|
+
except Exception as e:
|
|
994
|
+
print(f" [AVISO] Erro ao deletar chunks de {file_path}: {_format_exception(e)}")
|
|
995
|
+
|
|
996
|
+
|
|
997
|
+
# ---------------------------------------------------------------------------
|
|
998
|
+
# Indexação de um único arquivo
|
|
999
|
+
# ---------------------------------------------------------------------------
|
|
1000
|
+
|
|
1001
|
+
def index_file(
    filepath: Path,
    collection: chromadb.Collection,
    model: SentenceTransformer,
    splitter: RecursiveCharacterTextSplitter,
    root_path: Path,
    embedding_batch_size: int,
) -> int:
    """Index one file: read it, split into chunks, embed, and upsert.

    The upsert is idempotent: any previously stored chunks for this file are
    deleted first, and chunk IDs are deterministic (path + index). Embedding
    and upserting happen in batches of ``embedding_batch_size`` chunks to
    bound peak memory.

    Args:
        filepath: File to index.
        collection: Target ChromaDB collection.
        model: Embedding model (SentenceTransformer, CPU).
        splitter: Shared text splitter.
        root_path: Project root, used to compute the relative display path.
        embedding_batch_size: Chunks per encode/upsert batch.

    Returns:
        The number of chunks actually upserted (0 for empty/unreadable files).
    """
    content = read_file_safe(filepath)
    if not content or not content.strip():
        return 0

    # Absolute path is used as the canonical metadata key.
    abs_path = str(filepath.resolve())

    # Remove stale chunks for this file (idempotent re-indexing).
    delete_file_chunks(collection, abs_path)

    chunks = splitter.split_text(content)
    if not chunks:
        return 0

    relative_path = str(filepath.relative_to(root_path))
    inserted_chunks = 0
    # Shared accumulators; _flush_batch drains them in place.
    batch_ids: list[str] = []
    batch_docs: list[str] = []
    batch_metadatas: list[dict[str, object]] = []

    def _flush_batch() -> None:
        # Embed and upsert whatever is currently accumulated, then clear the
        # buffers and force a GC pass to keep the resident set small.
        nonlocal inserted_chunks
        if not batch_ids:
            return

        embeddings = model.encode(
            batch_docs,
            show_progress_bar=False,
            batch_size=embedding_batch_size,
        ).tolist()
        collection.upsert(
            ids=batch_ids,
            embeddings=embeddings,
            documents=batch_docs,
            metadatas=batch_metadatas,
        )
        inserted_chunks += len(batch_ids)
        del embeddings
        batch_ids.clear()
        batch_docs.clear()
        batch_metadatas.clear()
        gc.collect()

    for i, chunk in enumerate(chunks):
        batch_ids.append(make_chunk_id(abs_path, i))
        batch_docs.append(chunk)
        batch_metadatas.append(
            {
                "file_path": abs_path,
                "chunk_index": i,
                "file_name": filepath.name,
                # Path relative to the project root, for compact display.
                "relative_path": relative_path,
            }
        )
        if len(batch_ids) >= embedding_batch_size:
            _flush_batch()

    # Flush the final partial batch, if any.
    _flush_batch()
    return inserted_chunks
|
|
1073
|
+
|
|
1074
|
+
|
|
1075
|
+
# ---------------------------------------------------------------------------
|
|
1076
|
+
# Ponto de entrada principal
|
|
1077
|
+
# ---------------------------------------------------------------------------
|
|
1078
|
+
|
|
1079
|
+
def main():
    """CLI entry point: parse arguments, resolve tuning, and index the project.

    High-level flow:
      1. Parse CLI args and validate the project path.
      2. Merge configuration from CLI flags, environment variables, and the
         persisted tuning file (env vars "lock" chunk/batch parameters).
      3. Connect to ChromaDB and create/clear the target collections.
      4. Load the primary embedding model, auto-tune parameters, and persist
         the effective configuration.
      5. Scan and index all eligible files, with automatic recovery from
         memory pressure (batch halving) and embedding-dimension mismatches
         (collection recreation).
      6. Print a per-collection and overall summary.
    """
    parser = argparse.ArgumentParser(
        description="Indexa um projeto de código no ChromaDB para RAG local."
    )
    parser.add_argument(
        "project_path",
        nargs="?",
        default=".",
        help="Caminho raiz do projeto a indexar (padrão: diretório atual)",
    )
    parser.add_argument(
        "--clear",
        action="store_true",
        help="Limpa toda a coleção antes de reindexar",
    )
    parser.add_argument(
        "--embedding-model",
        choices=["jina", "bge", "hybrid"],
        help=(
            "Modelo de embeddings: 'jina' (codigo), "
            "'bge' (conteudo misto) ou 'hybrid' (duas colecoes: Jina v2 + BGE)."
        ),
    )
    parser.add_argument(
        "--jina-quantization",
        choices=["default", "dynamic-int8"],
        help="Quantizacao para Jina: 'default' (mais qualidade) ou 'dynamic-int8' (mais velocidade).",
    )
    parser.add_argument(
        "--perf-profile",
        choices=["autotune", "max-performance"],
        help=(
            "Perfil de performance da indexação: "
            "'autotune' (custo-benefício) ou 'max-performance' (mais throughput, maior uso de RAM)."
        ),
    )
    args = parser.parse_args()

    root_path = Path(args.project_path).resolve()
    if not root_path.is_dir():
        print(f"[ERRO] Caminho não existe ou não é um diretório: {root_path}")
        sys.exit(1)

    print(f"\n{'='*60}")
    print(f" RAG Indexer — Projeto: {root_path}")
    print(f"{'='*60}\n")
    index_started_at = datetime.now()
    print(f"[INFO] Início: {index_started_at.strftime('%Y-%m-%d %H:%M:%S')}")

    # Resolve model/profile config: CLI flags win, then persisted config.
    force_model_reconfigure = _env_bool("MCP_FORCE_MODEL_RECONFIG", default=False)
    persisted_config = load_indexer_tuning_config(force_model_reconfigure)
    model_choice, jina_quantization = resolve_embedding_config(
        args.embedding_model,
        args.jina_quantization,
        persisted_config=persisted_config,
    )
    perf_profile = resolve_perf_profile(args.perf_profile, persisted_config)

    # Env vars "lock" a parameter: persisted/autotuned values must not override it.
    chunk_size_locked = "MCP_CHUNK_SIZE" in os.environ
    chunk_overlap_locked = "MCP_CHUNK_OVERLAP" in os.environ
    batch_size_locked = "MCP_EMBEDDING_BATCH_SIZE" in os.environ

    persisted_chunk_size = _parse_config_int(persisted_config, "chunk_size")
    persisted_chunk_overlap = _parse_config_int(persisted_config, "chunk_overlap")
    persisted_batch_size = _parse_config_int(persisted_config, "embedding_batch_size")

    # Apply persisted values only when not locked, clamped to sane bounds.
    effective_chunk_size = CHUNK_SIZE
    if not chunk_size_locked and persisted_chunk_size is not None:
        effective_chunk_size = max(256, persisted_chunk_size)

    effective_chunk_overlap = CHUNK_OVERLAP
    if not chunk_overlap_locked and persisted_chunk_overlap is not None:
        effective_chunk_overlap = max(0, min(effective_chunk_size - 1, persisted_chunk_overlap))

    effective_batch_size = EMBEDDING_BATCH_SIZE
    if not batch_size_locked and persisted_batch_size is not None:
        effective_batch_size = max(1, persisted_batch_size)

    print(
        f"[CONFIG] Modelo escolhido: {model_choice} "
        f"({_describe_embedding_choice(model_choice)})"
    )
    if model_choice == "jina":
        print(f"[CONFIG] Quantizacao Jina: {jina_quantization}")
    elif model_choice == "hybrid":
        print("[CONFIG] Quantizacao Jina: nao aplicavel no hybrid (Jina v2 + BGE)")
    else:
        print("[CONFIG] Quantizacao Jina: nao aplicavel (modelo BGE selecionado)")
    print(f"[CONFIG] Perfil de performance: {perf_profile}")
    if perf_profile == "max-performance":
        print(
            "[AVISO] Este modo pode elevar consideravelmente o consumo de memória "
            "e causar encerramento por OOM (exit 137)."
        )
    warn_if_jina_memory_risk(model_choice, jina_quantization)

    # Initialize components.
    client = connect_to_chroma()
    targets = _resolve_index_targets(model_choice)

    # Get (or recreate, with --clear) every collection involved.
    collections: dict[str, chromadb.Collection] = {}
    collection_dimension_reset_done: dict[str, bool] = {}
    for target in targets:
        if args.clear:
            try:
                client.delete_collection(target.collection_name)
                print(f"[!] Coleção '{target.collection_name}' removida para reindexação limpa.")
            except Exception:
                # Collection may simply not exist yet; nothing to clear.
                pass
        collections[target.collection_name] = client.get_or_create_collection(
            name=target.collection_name,
            metadata={"hnsw:space": "cosine"},
        )
        collection_dimension_reset_done[target.collection_name] = False

    # Lazily load models and reuse them per target.
    loaded_models: dict[str, SentenceTransformer] = {}
    total_chunks = 0
    errors = 0
    files_scanned = 0
    files_processed_total = 0
    chunks_by_collection = {target.collection_name: 0 for target in targets}
    files_by_collection = {target.collection_name: 0 for target in targets}
    files_eligible_by_collection = {target.collection_name: 0 for target in targets}
    errors_by_collection = {target.collection_name: 0 for target in targets}
    error_samples_by_collection: dict[str, list[str]] = {target.collection_name: [] for target in targets}
    target_by_model = {target.model_choice: target for target in targets}

    # Load the first model up front so autotune can micro-benchmark it.
    primary_target = targets[0]
    primary_quantization = jina_quantization if primary_target.model_choice == "jina" else "default"
    loaded_models[primary_target.model_choice] = load_embedding_model(primary_target.model_choice, primary_quantization)
    primary_model = loaded_models[primary_target.model_choice]

    if perf_profile == "autotune":
        tuned = _resolve_autotuned_params(
            model=primary_model,
            chunk_size_locked=chunk_size_locked,
            chunk_overlap_locked=chunk_overlap_locked,
            batch_size_locked=batch_size_locked,
            chunk_size=effective_chunk_size,
            chunk_overlap=effective_chunk_overlap,
            embedding_batch_size=effective_batch_size,
        )
    else:
        tuned = _resolve_max_performance_params(
            chunk_size_locked=chunk_size_locked,
            chunk_overlap_locked=chunk_overlap_locked,
            batch_size_locked=batch_size_locked,
            chunk_size=effective_chunk_size,
            chunk_overlap=effective_chunk_overlap,
            embedding_batch_size=effective_batch_size,
        )

    # Clamp tuned values again: size >= 256, 0 <= overlap < size, batch >= 1.
    effective_chunk_size = max(256, tuned.chunk_size)
    effective_chunk_overlap = max(0, min(effective_chunk_size - 1, tuned.chunk_overlap))
    effective_batch_size = max(1, tuned.embedding_batch_size)

    for reason in tuned.reasons:
        print(f"[CONFIG] {reason}")

    print(
        f"[CONFIG] Parâmetros finais: "
        f"chunk_size={effective_chunk_size}, chunk_overlap={effective_chunk_overlap}, "
        f"embedding_batch={effective_batch_size}"
    )

    # Persist the effective configuration for the next run.
    save_indexer_tuning_config(
        {
            "embedding_model": model_choice,
            "jina_quantization": jina_quantization,
            "perf_profile": perf_profile,
            "chunk_size": effective_chunk_size,
            "chunk_overlap": effective_chunk_overlap,
            "embedding_batch_size": effective_batch_size,
        }
    )

    splitter = get_text_splitter(effective_chunk_size, effective_chunk_overlap)

    print(f"\n[+] Varrendo e indexando arquivos em: {root_path}")
    files = list(scan_files(root_path))
    files_scanned = len(files)
    if files_scanned == 0:
        print("[AVISO] Nenhum arquivo encontrado. Verifique o caminho e os filtros.")
        sys.exit(0)

    print(f"[+] {files_scanned} arquivo(s) elegível(is) para indexação.")
    with tqdm(
        total=files_scanned,
        desc="Indexando",
        unit="arquivo",
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}, {rate_fmt}]",
    ) as pbar:
        for filepath in files:
            # A file may map to one or more targets (e.g. hybrid mode).
            target_models = _classify_file_targets(filepath, model_choice)

            for target_model in target_models:
                target = target_by_model.get(target_model)
                if target is None:
                    continue

                if target.model_choice not in loaded_models:
                    target_quantization = jina_quantization if target.model_choice == "jina" else "default"
                    try:
                        loaded_models[target.model_choice] = load_embedding_model(
                            target.model_choice,
                            target_quantization,
                        )
                    except Exception as load_error:
                        # In hybrid, holding two large models at once may exhaust
                        # RAM; drop the previous model and retry once.
                        if model_choice == "hybrid" and loaded_models and _is_memory_related_error(load_error):
                            print(
                                "[AVISO] Falha ao carregar modelo adicional no hybrid por memória. "
                                "Liberando modelo anterior e tentando novamente."
                            )
                            loaded_models.clear()
                            gc.collect()
                            loaded_models[target.model_choice] = load_embedding_model(
                                target.model_choice,
                                target_quantization,
                            )
                        else:
                            raise

                model = loaded_models[target.model_choice]
                collection = collections[target.collection_name]
                files_eligible_by_collection[target.collection_name] += 1

                # Retry loop: the same file is re-attempted after recoverable
                # failures (batch halving, collection recreation).
                while True:
                    try:
                        n_chunks = index_file(
                            filepath,
                            collection,
                            model,
                            splitter,
                            root_path,
                            embedding_batch_size=effective_batch_size,
                        )
                        total_chunks += n_chunks
                        files_processed_total += 1
                        chunks_by_collection[target.collection_name] += n_chunks
                        files_by_collection[target.collection_name] += 1
                        break
                    except Exception as e:
                        # Automatic fallback to avoid total failure on machines
                        # at the edge of their RAM: halve the batch and retry.
                        if (
                            not batch_size_locked
                            and effective_batch_size > 1
                            and _is_memory_related_error(e)
                        ):
                            new_batch = max(1, effective_batch_size // 2)
                            if new_batch < effective_batch_size:
                                tqdm.write(
                                    f" [AJUSTE] Memória alta em {target.label}. "
                                    f"Batch reduzido {effective_batch_size} -> {new_batch}."
                                )
                                effective_batch_size = new_batch
                                gc.collect()
                                continue

                        # Embedding dimension changed (e.g. different model than
                        # the one that built the collection): recreate it once.
                        if (
                            _is_dimension_mismatch_error(e)
                            and not collection_dimension_reset_done[target.collection_name]
                        ):
                            tqdm.write(
                                f" [AJUSTE] Dimensão incompatível detectada em '{target.collection_name}'. "
                                "Recriando coleção e tentando novamente."
                            )
                            try:
                                client.delete_collection(target.collection_name)
                            except Exception:
                                pass

                            collections[target.collection_name] = client.get_or_create_collection(
                                name=target.collection_name,
                                metadata={"hnsw:space": "cosine"},
                            )
                            collection_dimension_reset_done[target.collection_name] = True
                            gc.collect()
                            continue

                        # Unrecoverable for this file: record and move on.
                        errors += 1
                        errors_by_collection[target.collection_name] += 1
                        if len(error_samples_by_collection[target.collection_name]) < 3:
                            error_samples_by_collection[target.collection_name].append(
                                f"{filepath.name}: {_format_exception(e)}"
                            )
                        tqdm.write(f" [ERRO] {filepath} [{target.label}]: {_format_exception(e)}")
                        break

            pbar.set_postfix({"chunks": total_chunks, "atual": filepath.name[:20]})
            pbar.update(1)

    # Per-target warnings about skipped or fully-failed stages.
    for target in targets:
        collection_name = target.collection_name
        eligible = files_eligible_by_collection[collection_name]
        processed = files_by_collection[collection_name]
        target_errors = errors_by_collection[collection_name]

        if eligible == 0:
            print(f"[AVISO] Nenhum arquivo elegível para {target.label}; etapa ignorada.")
        elif processed == 0 and target_errors > 0:
            print(
                f"[AVISO] {eligible} arquivo(s) elegível(is) para {target.label}, "
                "mas todos falharam."
            )

        if target_errors:
            print(f"[AVISO] {target_errors} erro(s) durante a indexação do target {target.label}.")
            for sample in error_samples_by_collection[collection_name]:
                print(f" - {sample}")

    # Final summary.
    index_finished_at = datetime.now()
    elapsed_seconds = int((index_finished_at - index_started_at).total_seconds())
    elapsed_h = elapsed_seconds // 3600
    elapsed_m = (elapsed_seconds % 3600) // 60
    elapsed_s = elapsed_seconds % 60
    print(f"\n{'='*60}")
    print(f" Indexação concluída!")
    print(f" Início : {index_started_at.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f" Fim : {index_finished_at.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f" Duração : {elapsed_h:02d}:{elapsed_m:02d}:{elapsed_s:02d}")
    print(f" Arquivos varridos : {files_scanned}")
    print(f" Arquivos processados : {files_processed_total}")
    print(f" Total de chunks : {total_chunks}")
    print(f" Erros : {errors}")
    for target in targets:
        collection_name = target.collection_name
        print(
            f" Coleção ChromaDB : '{collection_name}' "
            f"(elegíveis={files_eligible_by_collection.get(collection_name, 0)}, "
            f"arquivos={files_by_collection.get(collection_name, 0)}, "
            f"chunks={chunks_by_collection.get(collection_name, 0)})"
        )
    print(f"{'='*60}\n")
|
|
1416
|
+
|
|
1417
|
+
|
|
1418
|
+
if __name__ == "__main__":
    # Script entry point; MemoryError gets a friendly hint instead of a traceback.
    try:
        main()
    except MemoryError:
        _oom_hint = (
            "[ERRO] Falha de memória durante a indexação. "
            "Use --embedding-model bge ou execute o Jina em máquina com mais RAM/swap."
        )
        print(_oom_hint)
        sys.exit(1)
|