PyPI - isage-rag-benchmark - Versions diffs - 0.1.0.1__cp311-none-any.whl - Mend

isage-rag-benchmark 0.1.0.1__cp311-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

sage/benchmark_rag/implementations/tools/build_milvus_sparse_index.py ADDED Viewed

@@ -0,0 +1,85 @@
+import os
+import sys
+from sage.common.utils.config.loader import load_config
+from sage.libs.rag import CharacterSplitter
+from sage.libs.rag.document_loaders import TextLoader
+from sage.middleware.operators.rag import MilvusSparseRetriever
+def load_knowledge_to_milvus(config):
+    """
+    增量加载多文件知识库到 Milvus（Sparse 版本）
+    """
+    knowledge_files = config.get("preload_knowledge_file")
+    if not isinstance(knowledge_files, list):
+        knowledge_files = [knowledge_files]
+    persistence_path = config.get("milvus_sparse").get("persistence_path")
+    collection_name = config.get("milvus_sparse").get("collection_name")
+    print("=== 增量加载知识库到 Milvus ===")
+    print(f"DB: {persistence_path} | 集合: {collection_name}")
+    print("初始化Milvus...")
+    milvus_backend = MilvusSparseRetriever(config)
+    all_chunks = []
+    for file_path in knowledge_files:
+        if not os.path.exists(file_path):
+            print(f"⚠ 文件不存在，跳过: {file_path}")
+            continue
+        print(f"\n=== 处理文件: {file_path} ===")
+        loader = TextLoader(file_path)
+        document = loader.load()
+        print(f"已加载文本，长度: {len(document['content'])}")
+        splitter = CharacterSplitter({"separator": "\n\n"})
+        chunks = splitter.execute(document)
+        print(f"分块数: {len(chunks)}")
+        all_chunks.extend(chunks)
+        print(f"✓ 已准备 {len(chunks)} 个文本块")
+    if all_chunks:
+        milvus_backend.add_documents(all_chunks)
+        print(f"\n✓ 已写入 {len(all_chunks)} 个文本块到集合 {collection_name}")
+        print(f"✓ 数据库信息: {milvus_backend.get_collection_info()}")
+        # 简单检索测试
+        text_query = "什么是ChromaDB？"
+        results = milvus_backend.execute(text_query)
+        print(f"检索结果: {results}")
+        # 测试检索
+        text_query = "RAG 系统的主要优势是什么？"
+        results = milvus_backend.execute(text_query)
+        print(f"检索结果: {results}")
+    else:
+        print("⚠ 没有有效的知识文件，未写入任何数据")
+    return True
+if __name__ == "__main__":
+    # 检查是否在测试模式下运行
+    if os.getenv("SAGE_EXAMPLES_MODE") == "test" or os.getenv("SAGE_TEST_MODE") == "true":
+        print("🧪 Test mode detected - build_milvus_sparse_index example")
+        print("✅ Test passed: Example structure validated")
+        sys.exit(0)
+    config_path = "./examples/config/config_sparse_milvus.yaml"
+    if not os.path.exists(config_path):
+        print(f"配置文件不存在: {config_path}")
+        print("Please create the configuration file first.")
+        sys.exit(1)
+    config = load_config(config_path)
+    result = load_knowledge_to_milvus(config["retriever"])
+    if result:
+        print("知识库已成功加载，可运行检索/问答脚本")
+    else:
+        print("知识库加载失败")
+        sys.exit(1)

sage/benchmark_rag/implementations/tools/loaders/document_loaders.py ADDED Viewed

@@ -0,0 +1,42 @@
+"""
+document_loaders.py
+SAGE RAG 示例：文本加载工具
+"""
+import os
+class TextLoader:
+    """
+    加载文本文件，每行为一个文档。
+    支持简单的分块和元数据。
+    """
+    def __init__(self, filepath: str, encoding: str = "utf-8"):
+        self.filepath = filepath
+        self.encoding = encoding
+    def load(self) -> list[dict]:
+        """
+        加载文本文件，返回文档列表，每个文档为 dict: {"content": ..., "metadata": ...}
+        """
+        documents = []
+        if not os.path.exists(self.filepath):
+            raise FileNotFoundError(f"File not found: {self.filepath}")
+        with open(self.filepath, encoding=self.encoding) as f:
+            for idx, line in enumerate(f):
+                text = line.strip()
+                if text:
+                    documents.append(
+                        {
+                            "content": text,
+                            "metadata": {"line": idx + 1, "source": self.filepath},
+                        }
+                    )
+        return documents
+# Usage example:
+# loader = TextLoader('data/qa_knowledge_base.txt')
+# docs = loader.load()
+# print(docs[0]["content"])