own-rag-cli 0.0.4-snapshot → 0.0.6-snapshot
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -1
- package/README.pt-br.md +24 -0
- package/bin/indexer_full.py +128 -21
- package/bin/mcp_server.py +146 -22
- package/bin/postinstall.sh +234 -0
- package/bin/rag-remove.sh +24 -0
- package/bin/rag-wrapper.sh +151 -10
- package/package.json +1 -1
- package/rag-setup-macos.run +2 -2
- package/rag-setup.run +2 -2
package/README.md
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
|
+
# MCP binary checksum (SHA-256, payload without shebang): `1413af4d4c7d01d57ec5195ea0c5f704f9fefabeb641d2f216a042ec638c2b59`
|
|
2
|
+
|
|
1
3
|
# own-rag
|
|
2
4
|
|
|
3
5
|
Local RAG for codebases with ChromaDB + MCP, focused on practical setup and lower LLM token waste.
|
|
4
6
|
|
|
5
|
-
Language: English (default) | Portuguese: `README.pt-br.md`
|
|
7
|
+
Language: English (default) | Portuguese: `README.pt-br.md` - https://github.com/JocsaPB/own-rag/blob/main/README.pt-br.md
|
|
6
8
|
|
|
7
9
|
## Why use own-rag
|
|
8
10
|
|
|
@@ -59,6 +61,30 @@ rag remove # full local uninstall (double confirmation)
|
|
|
59
61
|
rag remove --force # uninstall without confirmation prompts
|
|
60
62
|
```
|
|
61
63
|
|
|
64
|
+
## Indexing from URL (HTTP/HTTPS)
|
|
65
|
+
|
|
66
|
+
`rag run` now accepts remote URLs in addition to local folders.
|
|
67
|
+
|
|
68
|
+
How it works:
|
|
69
|
+
- If you pass `http://` or `https://`, the wrapper downloads content to a temporary folder.
|
|
70
|
+
- If the downloaded file is a ZIP, it is extracted and the extracted folder is indexed.
|
|
71
|
+
- Text files (`.txt`, `.md`, and other non-binary text files) are indexable.
|
|
72
|
+
- Binary files are skipped by the indexer.
|
|
73
|
+
- After indexing finishes, temporary downloaded/extracted files are removed automatically.
|
|
74
|
+
|
|
75
|
+
Download tool behavior:
|
|
76
|
+
- Uses `curl` when available.
|
|
77
|
+
- If `curl` is missing, the wrapper attempts automatic installation:
|
|
78
|
+
- Linux: package manager (`apt`, `dnf`, `yum`, `pacman`, `zypper`, `apk`)
|
|
79
|
+
- macOS: Homebrew (`brew`)
|
|
80
|
+
|
|
81
|
+
Examples:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
rag run https://example.com/docs/guide.md
|
|
85
|
+
rag run https://example.com/snapshots/project-docs.zip
|
|
86
|
+
```
|
|
87
|
+
|
|
62
88
|
## Configuration files and paths
|
|
63
89
|
|
|
64
90
|
### 1) Runtime config (CLI-level)
|
package/README.pt-br.md
CHANGED
|
@@ -59,6 +59,30 @@ rag remove # desinstalação completa local (dupla confirma
|
|
|
59
59
|
rag remove --force # desinstala sem prompts de confirmação
|
|
60
60
|
```
|
|
61
61
|
|
|
62
|
+
## Indexação a partir de URL (HTTP/HTTPS)
|
|
63
|
+
|
|
64
|
+
Agora o `rag run` aceita URL remota além de pasta local.
|
|
65
|
+
|
|
66
|
+
Como funciona:
|
|
67
|
+
- Se você passar `http://` ou `https://`, o wrapper baixa o conteúdo para uma pasta temporária.
|
|
68
|
+
- Se o arquivo baixado for ZIP, ele é descompactado e a pasta extraída é indexada.
|
|
69
|
+
- Arquivos de texto (`.txt`, `.md` e outros textos não binários) podem ser indexados.
|
|
70
|
+
- Arquivos binários são ignorados pelo indexador.
|
|
71
|
+
- Ao terminar a indexação, os arquivos temporários baixados/descompactados são removidos automaticamente.
|
|
72
|
+
|
|
73
|
+
Comportamento do downloader:
|
|
74
|
+
- Usa `curl` quando disponível.
|
|
75
|
+
- Se `curl` não existir, o wrapper tenta instalar automaticamente:
|
|
76
|
+
- Linux: gerenciador de pacotes (`apt`, `dnf`, `yum`, `pacman`, `zypper`, `apk`)
|
|
77
|
+
- macOS: Homebrew (`brew`)
|
|
78
|
+
|
|
79
|
+
Exemplos:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
rag run https://exemplo.com/docs/guia.md
|
|
83
|
+
rag run https://exemplo.com/snapshots/docs-projeto.zip
|
|
84
|
+
```
|
|
85
|
+
|
|
62
86
|
## Arquivos e caminhos de configuração
|
|
63
87
|
|
|
64
88
|
### 1) Configuração de runtime (nível CLI)
|
package/bin/indexer_full.py
CHANGED
|
@@ -194,7 +194,7 @@ IGNORED_EXTENSIONS = {
|
|
|
194
194
|
".mp4", ".mp3", ".wav", ".ogg", ".avi", ".mov",
|
|
195
195
|
# Pacotes e compilados
|
|
196
196
|
".zip", ".tar", ".gz", ".rar", ".7z", ".jar", ".war", ".ear",
|
|
197
|
-
".pyc", ".pyo", ".so", ".dll", ".exe", ".bin",
|
|
197
|
+
".pyc", ".pyo", ".so", ".dll", ".exe", ".bin", ".run",
|
|
198
198
|
# Lockfiles e gerados
|
|
199
199
|
".lock", ".sum",
|
|
200
200
|
# Banco de dados
|
|
@@ -1113,17 +1113,44 @@ def make_chunk_id(file_path: str, chunk_index: int) -> str:
|
|
|
1113
1113
|
return hashlib.md5(raw.encode()).hexdigest()
|
|
1114
1114
|
|
|
1115
1115
|
|
|
1116
|
+
def _looks_binary_content(raw: bytes) -> bool:
|
|
1117
|
+
"""Detecta conteúdo binário por heurística em amostra de bytes."""
|
|
1118
|
+
if not raw:
|
|
1119
|
+
return False
|
|
1120
|
+
|
|
1121
|
+
sample = raw[:4096]
|
|
1122
|
+
if b"\x00" in sample:
|
|
1123
|
+
return True
|
|
1124
|
+
|
|
1125
|
+
non_text_bytes = 0
|
|
1126
|
+
for byte in sample:
|
|
1127
|
+
if byte in (9, 10, 13): # \t \n \r
|
|
1128
|
+
continue
|
|
1129
|
+
if 32 <= byte <= 126: # ASCII imprimível
|
|
1130
|
+
continue
|
|
1131
|
+
if 160 <= byte <= 255: # Latin-1 estendido comum em texto
|
|
1132
|
+
continue
|
|
1133
|
+
non_text_bytes += 1
|
|
1134
|
+
|
|
1135
|
+
return (non_text_bytes / len(sample)) > 0.30
|
|
1136
|
+
|
|
1137
|
+
|
|
1116
1138
|
def read_file_safe(filepath: Path) -> str | None:
|
|
1117
|
-
"""Lê um arquivo de texto, tentando múltiplos encodings."""
|
|
1139
|
+
"""Lê um arquivo de texto, evitando binários e tentando múltiplos encodings."""
|
|
1140
|
+
try:
|
|
1141
|
+
raw = filepath.read_bytes()
|
|
1142
|
+
except OSError as e:
|
|
1143
|
+
print(f" [AVISO] Não foi possível ler {filepath}: {e}")
|
|
1144
|
+
return None
|
|
1145
|
+
|
|
1146
|
+
if _looks_binary_content(raw):
|
|
1147
|
+
return None
|
|
1148
|
+
|
|
1118
1149
|
for encoding in ("utf-8", "latin-1", "cp1252"):
|
|
1119
1150
|
try:
|
|
1120
|
-
return
|
|
1151
|
+
return raw.decode(encoding)
|
|
1121
1152
|
except UnicodeDecodeError:
|
|
1122
1153
|
continue
|
|
1123
|
-
except OSError as e:
|
|
1124
|
-
print(f" [AVISO] Não foi possível ler {filepath}: {e}")
|
|
1125
|
-
return None
|
|
1126
|
-
# Se nenhum encoding funcionou, é provavelmente binário disfarçado
|
|
1127
1154
|
return None
|
|
1128
1155
|
|
|
1129
1156
|
|
|
@@ -1170,34 +1197,110 @@ def index_file(
|
|
|
1170
1197
|
|
|
1171
1198
|
relative_path = str(filepath.relative_to(root_path))
|
|
1172
1199
|
inserted_chunks = 0
|
|
1200
|
+
skipped_chunks = 0
|
|
1201
|
+
stop_iteration_warnings = 0
|
|
1173
1202
|
batch_ids: list[str] = []
|
|
1174
1203
|
batch_docs: list[str] = []
|
|
1175
1204
|
batch_metadatas: list[dict[str, object]] = []
|
|
1176
1205
|
|
|
1206
|
+
def _warn_stop_iteration(message: str) -> None:
|
|
1207
|
+
nonlocal stop_iteration_warnings
|
|
1208
|
+
if stop_iteration_warnings < 3:
|
|
1209
|
+
tqdm.write(message)
|
|
1210
|
+
stop_iteration_warnings += 1
|
|
1211
|
+
|
|
1212
|
+
def _to_embedding_rows(encoded_embeddings: object) -> list[list[float]]:
|
|
1213
|
+
if hasattr(encoded_embeddings, "tolist"):
|
|
1214
|
+
rows = encoded_embeddings.tolist()
|
|
1215
|
+
if isinstance(rows, list):
|
|
1216
|
+
if rows and isinstance(rows[0], (int, float)):
|
|
1217
|
+
return [list(rows)]
|
|
1218
|
+
return rows
|
|
1219
|
+
return [list(row) for row in encoded_embeddings] # type: ignore[arg-type]
|
|
1220
|
+
|
|
1177
1221
|
def _flush_batch() -> None:
|
|
1178
|
-
nonlocal inserted_chunks
|
|
1222
|
+
nonlocal inserted_chunks, skipped_chunks
|
|
1179
1223
|
if not batch_ids:
|
|
1180
1224
|
return
|
|
1181
1225
|
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1226
|
+
pending_ids = list(batch_ids)
|
|
1227
|
+
pending_docs = list(batch_docs)
|
|
1228
|
+
pending_metadatas = list(batch_metadatas)
|
|
1229
|
+
|
|
1230
|
+
try:
|
|
1231
|
+
encoded = model.encode(
|
|
1232
|
+
pending_docs,
|
|
1233
|
+
show_progress_bar=False,
|
|
1234
|
+
batch_size=embedding_batch_size,
|
|
1235
|
+
)
|
|
1236
|
+
embeddings = _to_embedding_rows(encoded)
|
|
1237
|
+
collection.upsert(
|
|
1238
|
+
ids=pending_ids,
|
|
1239
|
+
embeddings=embeddings,
|
|
1240
|
+
documents=pending_docs,
|
|
1241
|
+
metadatas=pending_metadatas,
|
|
1242
|
+
)
|
|
1243
|
+
inserted_chunks += len(pending_ids)
|
|
1244
|
+
del embeddings
|
|
1245
|
+
except StopIteration:
|
|
1246
|
+
_warn_stop_iteration(
|
|
1247
|
+
f" [AVISO] {filepath.name}: StopIteration no batch de embeddings; tentando fallback por chunk."
|
|
1248
|
+
)
|
|
1249
|
+
for chunk_id, chunk_doc, chunk_metadata in zip(pending_ids, pending_docs, pending_metadatas):
|
|
1250
|
+
candidate_doc = chunk_doc.strip()
|
|
1251
|
+
if not candidate_doc:
|
|
1252
|
+
skipped_chunks += 1
|
|
1253
|
+
continue
|
|
1254
|
+
|
|
1255
|
+
try:
|
|
1256
|
+
encoded_single = model.encode(
|
|
1257
|
+
[candidate_doc],
|
|
1258
|
+
show_progress_bar=False,
|
|
1259
|
+
batch_size=1,
|
|
1260
|
+
)
|
|
1261
|
+
single_embeddings = _to_embedding_rows(encoded_single)
|
|
1262
|
+
collection.upsert(
|
|
1263
|
+
ids=[chunk_id],
|
|
1264
|
+
embeddings=single_embeddings,
|
|
1265
|
+
documents=[candidate_doc],
|
|
1266
|
+
metadatas=[chunk_metadata],
|
|
1267
|
+
)
|
|
1268
|
+
inserted_chunks += 1
|
|
1269
|
+
del single_embeddings
|
|
1270
|
+
except StopIteration:
|
|
1271
|
+
compact_doc = " ".join(candidate_doc.split())
|
|
1272
|
+
if not compact_doc:
|
|
1273
|
+
skipped_chunks += 1
|
|
1274
|
+
continue
|
|
1275
|
+
try:
|
|
1276
|
+
encoded_single = model.encode(
|
|
1277
|
+
[compact_doc],
|
|
1278
|
+
show_progress_bar=False,
|
|
1279
|
+
batch_size=1,
|
|
1280
|
+
)
|
|
1281
|
+
single_embeddings = _to_embedding_rows(encoded_single)
|
|
1282
|
+
collection.upsert(
|
|
1283
|
+
ids=[chunk_id],
|
|
1284
|
+
embeddings=single_embeddings,
|
|
1285
|
+
documents=[compact_doc],
|
|
1286
|
+
metadatas=[chunk_metadata],
|
|
1287
|
+
)
|
|
1288
|
+
inserted_chunks += 1
|
|
1289
|
+
del single_embeddings
|
|
1290
|
+
except StopIteration:
|
|
1291
|
+
skipped_chunks += 1
|
|
1292
|
+
_warn_stop_iteration(
|
|
1293
|
+
f" [AVISO] {filepath.name}: chunk ignorado após StopIteration repetido."
|
|
1294
|
+
)
|
|
1195
1295
|
batch_ids.clear()
|
|
1196
1296
|
batch_docs.clear()
|
|
1197
1297
|
batch_metadatas.clear()
|
|
1198
1298
|
gc.collect()
|
|
1199
1299
|
|
|
1200
1300
|
for i, chunk in enumerate(chunks):
|
|
1301
|
+
if not chunk or not chunk.strip():
|
|
1302
|
+
skipped_chunks += 1
|
|
1303
|
+
continue
|
|
1201
1304
|
batch_ids.append(make_chunk_id(abs_path, i))
|
|
1202
1305
|
batch_docs.append(chunk)
|
|
1203
1306
|
batch_metadatas.append(
|
|
@@ -1213,6 +1316,10 @@ def index_file(
|
|
|
1213
1316
|
_flush_batch()
|
|
1214
1317
|
|
|
1215
1318
|
_flush_batch()
|
|
1319
|
+
if skipped_chunks:
|
|
1320
|
+
_warn_stop_iteration(
|
|
1321
|
+
f" [AVISO] {filepath.name}: {skipped_chunks} chunk(s) vazio(s)/inválido(s) foram ignorados."
|
|
1322
|
+
)
|
|
1216
1323
|
return inserted_chunks
|
|
1217
1324
|
|
|
1218
1325
|
|
package/bin/mcp_server.py
CHANGED
|
@@ -399,7 +399,7 @@ IGNORED_EXTENSIONS = {
|
|
|
399
399
|
".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".webp", ".bmp",
|
|
400
400
|
".mp4", ".mp3", ".wav", ".ogg", ".avi", ".mov",
|
|
401
401
|
".zip", ".tar", ".gz", ".rar", ".7z", ".jar", ".war",
|
|
402
|
-
".pyc", ".pyo", ".so", ".dll", ".exe", ".bin",
|
|
402
|
+
".pyc", ".pyo", ".so", ".dll", ".exe", ".bin", ".run",
|
|
403
403
|
".lock", ".sum", ".sqlite", ".db", ".sqlite3",
|
|
404
404
|
".ttf", ".woff", ".woff2", ".eot",
|
|
405
405
|
".pdf", ".docx", ".xlsx", ".pptx",
|
|
@@ -804,16 +804,43 @@ def _delete_file_chunks(collection: chromadb.Collection, file_path: str) -> int:
|
|
|
804
804
|
|
|
805
805
|
|
|
806
806
|
def _read_file_safe(filepath: Path) -> str | None:
|
|
807
|
+
try:
|
|
808
|
+
raw = filepath.read_bytes()
|
|
809
|
+
except OSError:
|
|
810
|
+
return None
|
|
811
|
+
|
|
812
|
+
if _looks_binary_content(raw):
|
|
813
|
+
return None
|
|
814
|
+
|
|
807
815
|
for encoding in ("utf-8", "latin-1", "cp1252"):
|
|
808
816
|
try:
|
|
809
|
-
return
|
|
817
|
+
return raw.decode(encoding)
|
|
810
818
|
except UnicodeDecodeError:
|
|
811
819
|
continue
|
|
812
|
-
except OSError:
|
|
813
|
-
return None
|
|
814
820
|
return None
|
|
815
821
|
|
|
816
822
|
|
|
823
|
+
def _looks_binary_content(raw: bytes) -> bool:
|
|
824
|
+
if not raw:
|
|
825
|
+
return False
|
|
826
|
+
|
|
827
|
+
sample = raw[:4096]
|
|
828
|
+
if b"\x00" in sample:
|
|
829
|
+
return True
|
|
830
|
+
|
|
831
|
+
non_text_bytes = 0
|
|
832
|
+
for byte in sample:
|
|
833
|
+
if byte in (9, 10, 13): # \t \n \r
|
|
834
|
+
continue
|
|
835
|
+
if 32 <= byte <= 126: # ASCII imprimivel
|
|
836
|
+
continue
|
|
837
|
+
if 160 <= byte <= 255: # Latin-1 estendido
|
|
838
|
+
continue
|
|
839
|
+
non_text_bytes += 1
|
|
840
|
+
|
|
841
|
+
return (non_text_bytes / len(sample)) > 0.30
|
|
842
|
+
|
|
843
|
+
|
|
817
844
|
def _scan_folder(folder_path: Path) -> Iterator[Path]:
|
|
818
845
|
for dirpath, dirnames, filenames in os.walk(folder_path):
|
|
819
846
|
dirnames[:] = [
|
|
@@ -871,32 +898,108 @@ def _index_single_file_for_branch(
|
|
|
871
898
|
_delete_file_chunks(collection, abs_path)
|
|
872
899
|
|
|
873
900
|
inserted_chunks = 0
|
|
901
|
+
skipped_chunks = 0
|
|
902
|
+
stop_iteration_warnings = 0
|
|
874
903
|
batch_ids: list[str] = []
|
|
875
904
|
batch_docs: list[str] = []
|
|
876
905
|
batch_metadatas: list[dict[str, object]] = []
|
|
877
906
|
|
|
907
|
+
def _warn_stop_iteration(message: str) -> None:
|
|
908
|
+
nonlocal stop_iteration_warnings
|
|
909
|
+
if stop_iteration_warnings < 3:
|
|
910
|
+
log.warning(message)
|
|
911
|
+
stop_iteration_warnings += 1
|
|
912
|
+
|
|
913
|
+
def _to_embedding_rows(encoded_embeddings: object) -> list[list[float]]:
|
|
914
|
+
if hasattr(encoded_embeddings, "tolist"):
|
|
915
|
+
rows = encoded_embeddings.tolist()
|
|
916
|
+
if isinstance(rows, list):
|
|
917
|
+
if rows and isinstance(rows[0], (int, float)):
|
|
918
|
+
return [list(rows)]
|
|
919
|
+
return rows
|
|
920
|
+
return [list(row) for row in encoded_embeddings] # type: ignore[arg-type]
|
|
921
|
+
|
|
878
922
|
def _flush_batch() -> None:
|
|
879
|
-
nonlocal inserted_chunks
|
|
923
|
+
nonlocal inserted_chunks, skipped_chunks
|
|
880
924
|
if not batch_ids:
|
|
881
925
|
return
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
926
|
+
|
|
927
|
+
pending_ids = list(batch_ids)
|
|
928
|
+
pending_docs = list(batch_docs)
|
|
929
|
+
pending_metadatas = list(batch_metadatas)
|
|
930
|
+
|
|
931
|
+
try:
|
|
932
|
+
encoded = model.encode(
|
|
933
|
+
pending_docs,
|
|
934
|
+
show_progress_bar=False,
|
|
935
|
+
batch_size=EMBEDDING_BATCH_SIZE,
|
|
936
|
+
)
|
|
937
|
+
embeddings = _to_embedding_rows(encoded)
|
|
938
|
+
collection.upsert(
|
|
939
|
+
ids=pending_ids,
|
|
940
|
+
embeddings=embeddings,
|
|
941
|
+
documents=pending_docs,
|
|
942
|
+
metadatas=pending_metadatas,
|
|
943
|
+
)
|
|
944
|
+
inserted_chunks += len(pending_ids)
|
|
945
|
+
del embeddings
|
|
946
|
+
except StopIteration:
|
|
947
|
+
_warn_stop_iteration(
|
|
948
|
+
f"{filepath.name} [{branch.key}] StopIteration no batch; aplicando fallback por chunk."
|
|
949
|
+
)
|
|
950
|
+
for chunk_id, chunk_doc, chunk_metadata in zip(pending_ids, pending_docs, pending_metadatas):
|
|
951
|
+
candidate_doc = chunk_doc.strip()
|
|
952
|
+
if not candidate_doc:
|
|
953
|
+
skipped_chunks += 1
|
|
954
|
+
continue
|
|
955
|
+
try:
|
|
956
|
+
encoded_single = model.encode(
|
|
957
|
+
[candidate_doc],
|
|
958
|
+
show_progress_bar=False,
|
|
959
|
+
batch_size=1,
|
|
960
|
+
)
|
|
961
|
+
single_embeddings = _to_embedding_rows(encoded_single)
|
|
962
|
+
collection.upsert(
|
|
963
|
+
ids=[chunk_id],
|
|
964
|
+
embeddings=single_embeddings,
|
|
965
|
+
documents=[candidate_doc],
|
|
966
|
+
metadatas=[chunk_metadata],
|
|
967
|
+
)
|
|
968
|
+
inserted_chunks += 1
|
|
969
|
+
del single_embeddings
|
|
970
|
+
except StopIteration:
|
|
971
|
+
compact_doc = " ".join(candidate_doc.split())
|
|
972
|
+
if not compact_doc:
|
|
973
|
+
skipped_chunks += 1
|
|
974
|
+
continue
|
|
975
|
+
try:
|
|
976
|
+
encoded_single = model.encode(
|
|
977
|
+
[compact_doc],
|
|
978
|
+
show_progress_bar=False,
|
|
979
|
+
batch_size=1,
|
|
980
|
+
)
|
|
981
|
+
single_embeddings = _to_embedding_rows(encoded_single)
|
|
982
|
+
collection.upsert(
|
|
983
|
+
ids=[chunk_id],
|
|
984
|
+
embeddings=single_embeddings,
|
|
985
|
+
documents=[compact_doc],
|
|
986
|
+
metadatas=[chunk_metadata],
|
|
987
|
+
)
|
|
988
|
+
inserted_chunks += 1
|
|
989
|
+
del single_embeddings
|
|
990
|
+
except StopIteration:
|
|
991
|
+
skipped_chunks += 1
|
|
992
|
+
_warn_stop_iteration(
|
|
993
|
+
f"{filepath.name} [{branch.key}] chunk ignorado após StopIteration repetido."
|
|
994
|
+
)
|
|
895
995
|
batch_ids.clear()
|
|
896
996
|
batch_docs.clear()
|
|
897
997
|
batch_metadatas.clear()
|
|
898
998
|
|
|
899
999
|
for i, chunk in enumerate(chunks):
|
|
1000
|
+
if not chunk or not chunk.strip():
|
|
1001
|
+
skipped_chunks += 1
|
|
1002
|
+
continue
|
|
900
1003
|
batch_ids.append(_make_chunk_id(abs_path, i))
|
|
901
1004
|
batch_docs.append(chunk)
|
|
902
1005
|
batch_metadatas.append(
|
|
@@ -914,6 +1017,10 @@ def _index_single_file_for_branch(
|
|
|
914
1017
|
_flush_batch()
|
|
915
1018
|
|
|
916
1019
|
_flush_batch()
|
|
1020
|
+
if skipped_chunks:
|
|
1021
|
+
_warn_stop_iteration(
|
|
1022
|
+
f"{filepath.name} [{branch.key}] ignorou {skipped_chunks} chunk(s) vazio(s)/inválido(s)."
|
|
1023
|
+
)
|
|
917
1024
|
return inserted_chunks
|
|
918
1025
|
|
|
919
1026
|
|
|
@@ -1196,15 +1303,32 @@ def _run_ensemble_mode(query: str, top_k: int) -> tuple[list[FusedHit], list[str
|
|
|
1196
1303
|
# Servidor MCP via FastMCP
|
|
1197
1304
|
# ---------------------------------------------------------------------------
|
|
1198
1305
|
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1306
|
+
FORCE_SEMANTIC_FALLBACK = (
|
|
1307
|
+
os.environ.get("MCP_FORCE_SEMANTIC_FALLBACK", "true").strip().lower()
|
|
1308
|
+
in {"1", "true", "yes", "on"}
|
|
1309
|
+
)
|
|
1310
|
+
|
|
1311
|
+
|
|
1312
|
+
def _build_server_instructions() -> str:
|
|
1313
|
+
instructions = (
|
|
1202
1314
|
"Servidor RAG para busca semântica em código-fonte local com suporte a ensemble híbrido. "
|
|
1203
1315
|
"No modo hybrid, a branch de código usa Jina v2 e a de documentação usa BGE. "
|
|
1204
1316
|
"Use semantic_search_code(query, top_k, mode='ensemble') para combinar Jina+BGE com RRF e reranking. "
|
|
1205
1317
|
"Use update_file_index após editar um arquivo para manter as duas coleções sincronizadas. "
|
|
1206
1318
|
"Use index_specific_folder para indexação recursiva sob demanda."
|
|
1207
|
-
)
|
|
1319
|
+
)
|
|
1320
|
+
if FORCE_SEMANTIC_FALLBACK:
|
|
1321
|
+
instructions += (
|
|
1322
|
+
" Regra obrigatória: quando a pergunta mencionar função/classe/componente/arquivo "
|
|
1323
|
+
"e isso não estiver no contexto imediato, chame semantic_search_code(mode='ensemble') "
|
|
1324
|
+
"antes de responder que não encontrou. Só conclua ausência após ao menos uma busca semântica."
|
|
1325
|
+
)
|
|
1326
|
+
return instructions
|
|
1327
|
+
|
|
1328
|
+
|
|
1329
|
+
mcp = FastMCP(
|
|
1330
|
+
name="rag-codebase",
|
|
1331
|
+
instructions=_build_server_instructions(),
|
|
1208
1332
|
)
|
|
1209
1333
|
|
|
1210
1334
|
|