own-rag-cli 0.0.4-snapshot → 0.0.6-snapshot

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,8 +1,10 @@
1
+ # MCP binary checksum (SHA-256, payload without shebang): `1413af4d4c7d01d57ec5195ea0c5f704f9fefabeb641d2f216a042ec638c2b59`
2
+
1
3
  # own-rag
2
4
 
3
5
  Local RAG for codebases with ChromaDB + MCP, focused on practical setup and lower LLM token waste.
4
6
 
5
- Language: English (default) | Portuguese: `README.pt-br.md`
7
+ Language: English (default) | Portuguese: `README.pt-br.md` - https://github.com/JocsaPB/own-rag/blob/main/README.pt-br.md
6
8
 
7
9
  ## Why use own-rag
8
10
 
@@ -59,6 +61,30 @@ rag remove # full local uninstall (double confirmation)
59
61
  rag remove --force # uninstall without confirmation prompts
60
62
  ```
61
63
 
64
+ ## Indexing from URL (HTTP/HTTPS)
65
+
66
+ `rag run` now accepts remote URLs in addition to local folders.
67
+
68
+ How it works:
69
+ - If you pass `http://` or `https://`, the wrapper downloads content to a temporary folder.
70
+ - If the downloaded file is a ZIP, it is extracted and the extracted folder is indexed.
71
+ - Text files (`.txt`, `.md`, and other non-binary text files) are indexable.
72
+ - Binary files are skipped by the indexer.
73
+ - After indexing finishes, temporary downloaded/extracted files are removed automatically.
74
+
75
+ Download tool behavior:
76
+ - Uses `curl` when available.
77
+ - If `curl` is missing, the wrapper attempts automatic installation:
78
+ - Linux: package manager (`apt`, `dnf`, `yum`, `pacman`, `zypper`, `apk`)
79
+ - macOS: Homebrew (`brew`)
80
+
81
+ Examples:
82
+
83
+ ```bash
84
+ rag run https://example.com/docs/guide.md
85
+ rag run https://example.com/snapshots/project-docs.zip
86
+ ```
87
+
62
88
  ## Configuration files and paths
63
89
 
64
90
  ### 1) Runtime config (CLI-level)
package/README.pt-br.md CHANGED
@@ -59,6 +59,30 @@ rag remove # desinstalação completa local (dupla confirma
59
59
  rag remove --force # desinstala sem prompts de confirmação
60
60
  ```
61
61
 
62
+ ## Indexação a partir de URL (HTTP/HTTPS)
63
+
64
+ Agora o `rag run` aceita URL remota além de pasta local.
65
+
66
+ Como funciona:
67
+ - Se você passar `http://` ou `https://`, o wrapper baixa o conteúdo para uma pasta temporária.
67
+ - Se o arquivo baixado for ZIP, ele é descompactado e a pasta extraída é indexada.
68
+ - Arquivos de texto (`.txt`, `.md` e outros textos não binários) podem ser indexados.
69
+ - Arquivos binários são ignorados pelo indexador.
70
+ - Ao terminar a indexação, os arquivos temporários baixados/descompactados são removidos automaticamente.
72
+
73
+ Comportamento do downloader:
74
+ - Usa `curl` quando disponível.
75
+ - Se `curl` não existir, o wrapper tenta instalar automaticamente:
76
+ - Linux: gerenciador de pacotes (`apt`, `dnf`, `yum`, `pacman`, `zypper`, `apk`)
77
+ - macOS: Homebrew (`brew`)
78
+
79
+ Exemplos:
80
+
81
+ ```bash
82
+ rag run https://exemplo.com/docs/guia.md
83
+ rag run https://exemplo.com/snapshots/docs-projeto.zip
84
+ ```
85
+
62
86
  ## Arquivos e caminhos de configuração
63
87
 
64
88
  ### 1) Configuração de runtime (nível CLI)
@@ -194,7 +194,7 @@ IGNORED_EXTENSIONS = {
194
194
  ".mp4", ".mp3", ".wav", ".ogg", ".avi", ".mov",
195
195
  # Pacotes e compilados
196
196
  ".zip", ".tar", ".gz", ".rar", ".7z", ".jar", ".war", ".ear",
197
- ".pyc", ".pyo", ".so", ".dll", ".exe", ".bin",
197
+ ".pyc", ".pyo", ".so", ".dll", ".exe", ".bin", ".run",
198
198
  # Lockfiles e gerados
199
199
  ".lock", ".sum",
200
200
  # Banco de dados
@@ -1113,17 +1113,44 @@ def make_chunk_id(file_path: str, chunk_index: int) -> str:
1113
1113
  return hashlib.md5(raw.encode()).hexdigest()
1114
1114
 
1115
1115
 
1116
+ def _looks_binary_content(raw: bytes) -> bool:
1117
+ """Detecta conteúdo binário por heurística em amostra de bytes."""
1118
+ if not raw:
1119
+ return False
1120
+
1121
+ sample = raw[:4096]
1122
+ if b"\x00" in sample:
1123
+ return True
1124
+
1125
+ non_text_bytes = 0
1126
+ for byte in sample:
1127
+ if byte in (9, 10, 13): # \t \n \r
1128
+ continue
1129
+ if 32 <= byte <= 126: # ASCII imprimível
1130
+ continue
1131
+ if 160 <= byte <= 255: # Latin-1 estendido comum em texto
1132
+ continue
1133
+ non_text_bytes += 1
1134
+
1135
+ return (non_text_bytes / len(sample)) > 0.30
1136
+
1137
+
1116
1138
  def read_file_safe(filepath: Path) -> str | None:
1117
- """Lê um arquivo de texto, tentando múltiplos encodings."""
1139
+ """Lê um arquivo de texto, evitando binários e tentando múltiplos encodings."""
1140
+ try:
1141
+ raw = filepath.read_bytes()
1142
+ except OSError as e:
1143
+ print(f" [AVISO] Não foi possível ler {filepath}: {e}")
1144
+ return None
1145
+
1146
+ if _looks_binary_content(raw):
1147
+ return None
1148
+
1118
1149
  for encoding in ("utf-8", "latin-1", "cp1252"):
1119
1150
  try:
1120
- return filepath.read_text(encoding=encoding)
1151
+ return raw.decode(encoding)
1121
1152
  except UnicodeDecodeError:
1122
1153
  continue
1123
- except OSError as e:
1124
- print(f" [AVISO] Não foi possível ler {filepath}: {e}")
1125
- return None
1126
- # Se nenhum encoding funcionou, é provavelmente binário disfarçado
1127
1154
  return None
1128
1155
 
1129
1156
 
@@ -1170,34 +1197,110 @@ def index_file(
1170
1197
 
1171
1198
  relative_path = str(filepath.relative_to(root_path))
1172
1199
  inserted_chunks = 0
1200
+ skipped_chunks = 0
1201
+ stop_iteration_warnings = 0
1173
1202
  batch_ids: list[str] = []
1174
1203
  batch_docs: list[str] = []
1175
1204
  batch_metadatas: list[dict[str, object]] = []
1176
1205
 
1206
+ def _warn_stop_iteration(message: str) -> None:
1207
+ nonlocal stop_iteration_warnings
1208
+ if stop_iteration_warnings < 3:
1209
+ tqdm.write(message)
1210
+ stop_iteration_warnings += 1
1211
+
1212
+ def _to_embedding_rows(encoded_embeddings: object) -> list[list[float]]:
1213
+ if hasattr(encoded_embeddings, "tolist"):
1214
+ rows = encoded_embeddings.tolist()
1215
+ if isinstance(rows, list):
1216
+ if rows and isinstance(rows[0], (int, float)):
1217
+ return [list(rows)]
1218
+ return rows
1219
+ return [list(row) for row in encoded_embeddings] # type: ignore[arg-type]
1220
+
1177
1221
  def _flush_batch() -> None:
1178
- nonlocal inserted_chunks
1222
+ nonlocal inserted_chunks, skipped_chunks
1179
1223
  if not batch_ids:
1180
1224
  return
1181
1225
 
1182
- embeddings = model.encode(
1183
- batch_docs,
1184
- show_progress_bar=False,
1185
- batch_size=embedding_batch_size,
1186
- ).tolist()
1187
- collection.upsert(
1188
- ids=batch_ids,
1189
- embeddings=embeddings,
1190
- documents=batch_docs,
1191
- metadatas=batch_metadatas,
1192
- )
1193
- inserted_chunks += len(batch_ids)
1194
- del embeddings
1226
+ pending_ids = list(batch_ids)
1227
+ pending_docs = list(batch_docs)
1228
+ pending_metadatas = list(batch_metadatas)
1229
+
1230
+ try:
1231
+ encoded = model.encode(
1232
+ pending_docs,
1233
+ show_progress_bar=False,
1234
+ batch_size=embedding_batch_size,
1235
+ )
1236
+ embeddings = _to_embedding_rows(encoded)
1237
+ collection.upsert(
1238
+ ids=pending_ids,
1239
+ embeddings=embeddings,
1240
+ documents=pending_docs,
1241
+ metadatas=pending_metadatas,
1242
+ )
1243
+ inserted_chunks += len(pending_ids)
1244
+ del embeddings
1245
+ except StopIteration:
1246
+ _warn_stop_iteration(
1247
+ f" [AVISO] {filepath.name}: StopIteration no batch de embeddings; tentando fallback por chunk."
1248
+ )
1249
+ for chunk_id, chunk_doc, chunk_metadata in zip(pending_ids, pending_docs, pending_metadatas):
1250
+ candidate_doc = chunk_doc.strip()
1251
+ if not candidate_doc:
1252
+ skipped_chunks += 1
1253
+ continue
1254
+
1255
+ try:
1256
+ encoded_single = model.encode(
1257
+ [candidate_doc],
1258
+ show_progress_bar=False,
1259
+ batch_size=1,
1260
+ )
1261
+ single_embeddings = _to_embedding_rows(encoded_single)
1262
+ collection.upsert(
1263
+ ids=[chunk_id],
1264
+ embeddings=single_embeddings,
1265
+ documents=[candidate_doc],
1266
+ metadatas=[chunk_metadata],
1267
+ )
1268
+ inserted_chunks += 1
1269
+ del single_embeddings
1270
+ except StopIteration:
1271
+ compact_doc = " ".join(candidate_doc.split())
1272
+ if not compact_doc:
1273
+ skipped_chunks += 1
1274
+ continue
1275
+ try:
1276
+ encoded_single = model.encode(
1277
+ [compact_doc],
1278
+ show_progress_bar=False,
1279
+ batch_size=1,
1280
+ )
1281
+ single_embeddings = _to_embedding_rows(encoded_single)
1282
+ collection.upsert(
1283
+ ids=[chunk_id],
1284
+ embeddings=single_embeddings,
1285
+ documents=[compact_doc],
1286
+ metadatas=[chunk_metadata],
1287
+ )
1288
+ inserted_chunks += 1
1289
+ del single_embeddings
1290
+ except StopIteration:
1291
+ skipped_chunks += 1
1292
+ _warn_stop_iteration(
1293
+ f" [AVISO] {filepath.name}: chunk ignorado após StopIteration repetido."
1294
+ )
1195
1295
  batch_ids.clear()
1196
1296
  batch_docs.clear()
1197
1297
  batch_metadatas.clear()
1198
1298
  gc.collect()
1199
1299
 
1200
1300
  for i, chunk in enumerate(chunks):
1301
+ if not chunk or not chunk.strip():
1302
+ skipped_chunks += 1
1303
+ continue
1201
1304
  batch_ids.append(make_chunk_id(abs_path, i))
1202
1305
  batch_docs.append(chunk)
1203
1306
  batch_metadatas.append(
@@ -1213,6 +1316,10 @@ def index_file(
1213
1316
  _flush_batch()
1214
1317
 
1215
1318
  _flush_batch()
1319
+ if skipped_chunks:
1320
+ _warn_stop_iteration(
1321
+ f" [AVISO] {filepath.name}: {skipped_chunks} chunk(s) vazio(s)/inválido(s) foram ignorados."
1322
+ )
1216
1323
  return inserted_chunks
1217
1324
 
1218
1325
 
package/bin/mcp_server.py CHANGED
@@ -399,7 +399,7 @@ IGNORED_EXTENSIONS = {
399
399
  ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".webp", ".bmp",
400
400
  ".mp4", ".mp3", ".wav", ".ogg", ".avi", ".mov",
401
401
  ".zip", ".tar", ".gz", ".rar", ".7z", ".jar", ".war",
402
- ".pyc", ".pyo", ".so", ".dll", ".exe", ".bin",
402
+ ".pyc", ".pyo", ".so", ".dll", ".exe", ".bin", ".run",
403
403
  ".lock", ".sum", ".sqlite", ".db", ".sqlite3",
404
404
  ".ttf", ".woff", ".woff2", ".eot",
405
405
  ".pdf", ".docx", ".xlsx", ".pptx",
@@ -804,16 +804,43 @@ def _delete_file_chunks(collection: chromadb.Collection, file_path: str) -> int:
804
804
 
805
805
 
806
806
  def _read_file_safe(filepath: Path) -> str | None:
807
+ try:
808
+ raw = filepath.read_bytes()
809
+ except OSError:
810
+ return None
811
+
812
+ if _looks_binary_content(raw):
813
+ return None
814
+
807
815
  for encoding in ("utf-8", "latin-1", "cp1252"):
808
816
  try:
809
- return filepath.read_text(encoding=encoding)
817
+ return raw.decode(encoding)
810
818
  except UnicodeDecodeError:
811
819
  continue
812
- except OSError:
813
- return None
814
820
  return None
815
821
 
816
822
 
823
+ def _looks_binary_content(raw: bytes) -> bool:
824
+ if not raw:
825
+ return False
826
+
827
+ sample = raw[:4096]
828
+ if b"\x00" in sample:
829
+ return True
830
+
831
+ non_text_bytes = 0
832
+ for byte in sample:
833
+ if byte in (9, 10, 13): # \t \n \r
834
+ continue
835
+ if 32 <= byte <= 126: # ASCII imprimivel
836
+ continue
837
+ if 160 <= byte <= 255: # Latin-1 estendido
838
+ continue
839
+ non_text_bytes += 1
840
+
841
+ return (non_text_bytes / len(sample)) > 0.30
842
+
843
+
817
844
  def _scan_folder(folder_path: Path) -> Iterator[Path]:
818
845
  for dirpath, dirnames, filenames in os.walk(folder_path):
819
846
  dirnames[:] = [
@@ -871,32 +898,108 @@ def _index_single_file_for_branch(
871
898
  _delete_file_chunks(collection, abs_path)
872
899
 
873
900
  inserted_chunks = 0
901
+ skipped_chunks = 0
902
+ stop_iteration_warnings = 0
874
903
  batch_ids: list[str] = []
875
904
  batch_docs: list[str] = []
876
905
  batch_metadatas: list[dict[str, object]] = []
877
906
 
907
+ def _warn_stop_iteration(message: str) -> None:
908
+ nonlocal stop_iteration_warnings
909
+ if stop_iteration_warnings < 3:
910
+ log.warning(message)
911
+ stop_iteration_warnings += 1
912
+
913
+ def _to_embedding_rows(encoded_embeddings: object) -> list[list[float]]:
914
+ if hasattr(encoded_embeddings, "tolist"):
915
+ rows = encoded_embeddings.tolist()
916
+ if isinstance(rows, list):
917
+ if rows and isinstance(rows[0], (int, float)):
918
+ return [list(rows)]
919
+ return rows
920
+ return [list(row) for row in encoded_embeddings] # type: ignore[arg-type]
921
+
878
922
  def _flush_batch() -> None:
879
- nonlocal inserted_chunks
923
+ nonlocal inserted_chunks, skipped_chunks
880
924
  if not batch_ids:
881
925
  return
882
- embeddings = model.encode(
883
- batch_docs,
884
- show_progress_bar=False,
885
- batch_size=EMBEDDING_BATCH_SIZE,
886
- ).tolist()
887
- collection.upsert(
888
- ids=batch_ids,
889
- embeddings=embeddings,
890
- documents=batch_docs,
891
- metadatas=batch_metadatas,
892
- )
893
- inserted_chunks += len(batch_ids)
894
- del embeddings
926
+
927
+ pending_ids = list(batch_ids)
928
+ pending_docs = list(batch_docs)
929
+ pending_metadatas = list(batch_metadatas)
930
+
931
+ try:
932
+ encoded = model.encode(
933
+ pending_docs,
934
+ show_progress_bar=False,
935
+ batch_size=EMBEDDING_BATCH_SIZE,
936
+ )
937
+ embeddings = _to_embedding_rows(encoded)
938
+ collection.upsert(
939
+ ids=pending_ids,
940
+ embeddings=embeddings,
941
+ documents=pending_docs,
942
+ metadatas=pending_metadatas,
943
+ )
944
+ inserted_chunks += len(pending_ids)
945
+ del embeddings
946
+ except StopIteration:
947
+ _warn_stop_iteration(
948
+ f"{filepath.name} [{branch.key}] StopIteration no batch; aplicando fallback por chunk."
949
+ )
950
+ for chunk_id, chunk_doc, chunk_metadata in zip(pending_ids, pending_docs, pending_metadatas):
951
+ candidate_doc = chunk_doc.strip()
952
+ if not candidate_doc:
953
+ skipped_chunks += 1
954
+ continue
955
+ try:
956
+ encoded_single = model.encode(
957
+ [candidate_doc],
958
+ show_progress_bar=False,
959
+ batch_size=1,
960
+ )
961
+ single_embeddings = _to_embedding_rows(encoded_single)
962
+ collection.upsert(
963
+ ids=[chunk_id],
964
+ embeddings=single_embeddings,
965
+ documents=[candidate_doc],
966
+ metadatas=[chunk_metadata],
967
+ )
968
+ inserted_chunks += 1
969
+ del single_embeddings
970
+ except StopIteration:
971
+ compact_doc = " ".join(candidate_doc.split())
972
+ if not compact_doc:
973
+ skipped_chunks += 1
974
+ continue
975
+ try:
976
+ encoded_single = model.encode(
977
+ [compact_doc],
978
+ show_progress_bar=False,
979
+ batch_size=1,
980
+ )
981
+ single_embeddings = _to_embedding_rows(encoded_single)
982
+ collection.upsert(
983
+ ids=[chunk_id],
984
+ embeddings=single_embeddings,
985
+ documents=[compact_doc],
986
+ metadatas=[chunk_metadata],
987
+ )
988
+ inserted_chunks += 1
989
+ del single_embeddings
990
+ except StopIteration:
991
+ skipped_chunks += 1
992
+ _warn_stop_iteration(
993
+ f"{filepath.name} [{branch.key}] chunk ignorado após StopIteration repetido."
994
+ )
895
995
  batch_ids.clear()
896
996
  batch_docs.clear()
897
997
  batch_metadatas.clear()
898
998
 
899
999
  for i, chunk in enumerate(chunks):
1000
+ if not chunk or not chunk.strip():
1001
+ skipped_chunks += 1
1002
+ continue
900
1003
  batch_ids.append(_make_chunk_id(abs_path, i))
901
1004
  batch_docs.append(chunk)
902
1005
  batch_metadatas.append(
@@ -914,6 +1017,10 @@ def _index_single_file_for_branch(
914
1017
  _flush_batch()
915
1018
 
916
1019
  _flush_batch()
1020
+ if skipped_chunks:
1021
+ _warn_stop_iteration(
1022
+ f"{filepath.name} [{branch.key}] ignorou {skipped_chunks} chunk(s) vazio(s)/inválido(s)."
1023
+ )
917
1024
  return inserted_chunks
918
1025
 
919
1026
 
@@ -1196,15 +1303,32 @@ def _run_ensemble_mode(query: str, top_k: int) -> tuple[list[FusedHit], list[str
1196
1303
  # Servidor MCP via FastMCP
1197
1304
  # ---------------------------------------------------------------------------
1198
1305
 
1199
- mcp = FastMCP(
1200
- name="rag-codebase",
1201
- instructions=(
1306
+ FORCE_SEMANTIC_FALLBACK = (
1307
+ os.environ.get("MCP_FORCE_SEMANTIC_FALLBACK", "true").strip().lower()
1308
+ in {"1", "true", "yes", "on"}
1309
+ )
1310
+
1311
+
1312
+ def _build_server_instructions() -> str:
1313
+ instructions = (
1202
1314
  "Servidor RAG para busca semântica em código-fonte local com suporte a ensemble híbrido. "
1203
1315
  "No modo hybrid, a branch de código usa Jina v2 e a de documentação usa BGE. "
1204
1316
  "Use semantic_search_code(query, top_k, mode='ensemble') para combinar Jina+BGE com RRF e reranking. "
1205
1317
  "Use update_file_index após editar um arquivo para manter as duas coleções sincronizadas. "
1206
1318
  "Use index_specific_folder para indexação recursiva sob demanda."
1207
- ),
1319
+ )
1320
+ if FORCE_SEMANTIC_FALLBACK:
1321
+ instructions += (
1322
+ " Regra obrigatória: quando a pergunta mencionar função/classe/componente/arquivo "
1323
+ "e isso não estiver no contexto imediato, chame semantic_search_code(mode='ensemble') "
1324
+ "antes de responder que não encontrou. Só conclua ausência após ao menos uma busca semântica."
1325
+ )
1326
+ return instructions
1327
+
1328
+
1329
+ mcp = FastMCP(
1330
+ name="rag-codebase",
1331
+ instructions=_build_server_instructions(),
1208
1332
  )
1209
1333
 
1210
1334