own-rag-cli 0.0.8-snapshot → 0.0.9-snapshot

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # MCP binary checksum (SHA-256, payload without shebang): `1413af4d4c7d01d57ec5195ea0c5f704f9fefabeb641d2f216a042ec638c2b59`
1
+ # MCP binary checksum (SHA-256, payload without shebang): `80e56de30d6296582df5b6b09ab182df9879dc16603fe48604206d94264e8673`
2
2
 
3
3
  # own-rag
4
4
 
@@ -1176,6 +1176,7 @@ def index_file(
1176
1176
  splitter: RecursiveCharacterTextSplitter,
1177
1177
  root_path: Path,
1178
1178
  embedding_batch_size: int,
1179
+ raise_on_stop_iteration: bool = False,
1179
1180
  ) -> int:
1180
1181
  """
1181
1182
  Indexa um único arquivo: lê, divide em chunks, gera embeddings e faz upsert.
@@ -1320,6 +1321,10 @@ def index_file(
1320
1321
  _warn_stop_iteration(
1321
1322
  f" [AVISO] {filepath.name}: {skipped_chunks} chunk(s) vazio(s)/inválido(s) foram ignorados."
1322
1323
  )
1324
+ if raise_on_stop_iteration and inserted_chunks == 0 and stop_iteration_warnings > 0:
1325
+ raise RuntimeError(
1326
+ f"stop_iteration_all_chunks:{filepath.name}:{stop_iteration_warnings}"
1327
+ )
1323
1328
  return inserted_chunks
1324
1329
 
1325
1330
 
@@ -1532,8 +1537,8 @@ def main():
1532
1537
  if target is None:
1533
1538
  continue
1534
1539
 
1540
+ target_quantization = jina_quantization if target.model_choice == "jina" else "default"
1535
1541
  if target.model_choice not in loaded_models:
1536
- target_quantization = jina_quantization if target.model_choice == "jina" else "default"
1537
1542
  try:
1538
1543
  loaded_models[target.model_choice] = load_embedding_model(
1539
1544
  target.model_choice,
@@ -1568,6 +1573,7 @@ def main():
1568
1573
  splitter,
1569
1574
  root_path,
1570
1575
  embedding_batch_size=effective_batch_size,
1576
+ raise_on_stop_iteration=(target.model_choice == "bge"),
1571
1577
  )
1572
1578
  total_chunks += n_chunks
1573
1579
  files_processed_total += 1
@@ -1591,6 +1597,45 @@ def main():
1591
1597
  gc.collect()
1592
1598
  continue
1593
1599
 
1600
+ if target.model_choice == "bge":
1601
+ fallback_target = target_by_model.get("jina-v2") or target_by_model.get("jina")
1602
+ if fallback_target is not None:
1603
+ fallback_quantization = (
1604
+ jina_quantization if fallback_target.model_choice == "jina" else "default"
1605
+ )
1606
+ try:
1607
+ if fallback_target.model_choice not in loaded_models:
1608
+ loaded_models[fallback_target.model_choice] = load_embedding_model(
1609
+ fallback_target.model_choice,
1610
+ fallback_quantization,
1611
+ )
1612
+ fallback_model = loaded_models[fallback_target.model_choice]
1613
+ fallback_collection = collections[fallback_target.collection_name]
1614
+ tqdm.write(
1615
+ f" [FALLBACK] {filepath.name}: BGE falhou; "
1616
+ f"reindexando com {fallback_target.label}."
1617
+ )
1618
+ fallback_chunks = index_file(
1619
+ filepath,
1620
+ fallback_collection,
1621
+ fallback_model,
1622
+ splitter,
1623
+ root_path,
1624
+ embedding_batch_size=effective_batch_size,
1625
+ raise_on_stop_iteration=False,
1626
+ )
1627
+ total_chunks += fallback_chunks
1628
+ files_processed_total += 1
1629
+ files_eligible_by_collection[fallback_target.collection_name] += 1
1630
+ chunks_by_collection[fallback_target.collection_name] += fallback_chunks
1631
+ files_by_collection[fallback_target.collection_name] += 1
1632
+ break
1633
+ except Exception as fallback_error:
1634
+ tqdm.write(
1635
+ f" [ERRO] {filepath} [fallback {fallback_target.label}]: "
1636
+ f"{_format_exception(fallback_error)}"
1637
+ )
1638
+
1594
1639
  if (
1595
1640
  _is_dimension_mismatch_error(e)
1596
1641
  and not collection_dimension_reset_done[target.collection_name]
package/bin/mcp_server.py CHANGED
@@ -1021,6 +1021,10 @@ def _index_single_file_for_branch(
1021
1021
  _warn_stop_iteration(
1022
1022
  f"{filepath.name} [{branch.key}] ignorou {skipped_chunks} chunk(s) vazio(s)/inválido(s)."
1023
1023
  )
1024
+ if branch.key == "bge_doc" and inserted_chunks == 0 and stop_iteration_warnings > 0:
1025
+ raise RuntimeError(
1026
+ f"stop_iteration_all_chunks:{filepath.name}:{stop_iteration_warnings}"
1027
+ )
1024
1028
  return inserted_chunks
1025
1029
 
1026
1030
 
@@ -1139,6 +1143,7 @@ def _rrf_fuse(hits_by_branch: dict[str, list[RetrievedHit]], top_limit: int) ->
1139
1143
 
1140
1144
 
1141
1145
  def _apply_rerank(query: str, fused_hits: list[FusedHit], top_k: int) -> tuple[list[FusedHit], bool, str | None]:
1146
+ global _reranker, _reranker_error
1142
1147
  if not fused_hits:
1143
1148
  return [], False, None
1144
1149
 
@@ -1163,7 +1168,9 @@ def _apply_rerank(query: str, fused_hits: list[FusedHit], top_k: int) -> tuple[l
1163
1168
  )
1164
1169
  return fused_hits[:top_k], True, None
1165
1170
  except Exception as e:
1166
- return fused_hits[:top_k], False, str(e)
1171
+ _reranker = None
1172
+ _reranker_error = f"runtime_error:{e}"
1173
+ return fused_hits[:top_k], False, f"fallback_sem_rerank:{e}"
1167
1174
 
1168
1175
 
1169
1176
  def _format_similarity(similarity: float | None) -> str:
@@ -1483,6 +1490,23 @@ def update_file_index(file_path: str) -> str:
1483
1490
  )
1484
1491
  inserted_per_branch[branch.key] = inserted
1485
1492
  except Exception as e:
1493
+ if branch.key == "bge_doc" and "jina_code" not in inserted_per_branch:
1494
+ fallback_branch = BRANCH_SPECS["jina_code"]
1495
+ try:
1496
+ inserted = _index_single_file_for_branch(
1497
+ filepath,
1498
+ fallback_branch,
1499
+ splitter,
1500
+ delete_existing=False,
1501
+ )
1502
+ inserted_per_branch[fallback_branch.key] = inserted
1503
+ index_errors.append(f"{branch.key}: {e} (fallback para {fallback_branch.key} aplicado)")
1504
+ continue
1505
+ except Exception as fallback_error:
1506
+ index_errors.append(
1507
+ f"{branch.key}: {e} | fallback {fallback_branch.key}: {fallback_error}"
1508
+ )
1509
+ continue
1486
1510
  index_errors.append(f"{branch.key}: {e}")
1487
1511
 
1488
1512
  success_branches = [k for k, v in inserted_per_branch.items() if v > 0]
@@ -1611,13 +1635,31 @@ def index_specific_folder(folder_path: str) -> str:
1611
1635
  for filepath in _scan_folder(folder):
1612
1636
  processed_files += 1
1613
1637
  targets = _classify_file_targets(filepath)
1638
+ indexed_branches_for_file: set[str] = set()
1614
1639
 
1615
1640
  for branch in targets:
1616
1641
  try:
1617
1642
  n_chunks = _index_single_file_for_branch(filepath, branch, splitter)
1618
1643
  branch_file_counts[branch.key] += 1
1619
1644
  branch_chunk_counts[branch.key] += n_chunks
1645
+ indexed_branches_for_file.add(branch.key)
1620
1646
  except Exception as e:
1647
+ if branch.key == "bge_doc" and "jina_code" not in indexed_branches_for_file:
1648
+ fallback_branch = BRANCH_SPECS["jina_code"]
1649
+ try:
1650
+ n_chunks = _index_single_file_for_branch(filepath, fallback_branch, splitter)
1651
+ branch_file_counts[fallback_branch.key] += 1
1652
+ branch_chunk_counts[fallback_branch.key] += n_chunks
1653
+ indexed_branches_for_file.add(fallback_branch.key)
1654
+ error_count += 1
1655
+ if len(error_samples) < 10:
1656
+ error_samples.append(
1657
+ f"{filepath.name} [{branch.key}]: {e} "
1658
+ f"(fallback para {fallback_branch.key} aplicado)"
1659
+ )
1660
+ continue
1661
+ except Exception as fallback_error:
1662
+ e = RuntimeError(f"{e} | fallback {fallback_branch.key}: {fallback_error}")
1621
1663
  error_count += 1
1622
1664
  if len(error_samples) < 10:
1623
1665
  error_samples.append(f"{filepath.name} [{branch.key}]: {e}")
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "own-rag-cli",
3
- "version": "0.0.8-snapshot",
3
+ "version": "0.0.9-snapshot",
4
4
  "description": "Local RAG setup with ChromaDB + MCP server (Jina/BGE hybrid support).",
5
5
  "license": "MIT",
6
6
  "private": false,