PyPI - realtimex-deeptutor - Versions diffs - 0.5.0.post1__py3-none-any.whl - Mend

realtimex-deeptutor 0.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (276) hide show

realtimex_deeptutor/__init__.py +67 -0
realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
src/__init__.py +40 -0
src/agents/__init__.py +24 -0
src/agents/base_agent.py +657 -0
src/agents/chat/__init__.py +24 -0
src/agents/chat/chat_agent.py +435 -0
src/agents/chat/prompts/en/chat_agent.yaml +35 -0
src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
src/agents/chat/session_manager.py +311 -0
src/agents/co_writer/__init__.py +0 -0
src/agents/co_writer/edit_agent.py +260 -0
src/agents/co_writer/narrator_agent.py +423 -0
src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
src/agents/guide/__init__.py +16 -0
src/agents/guide/agents/__init__.py +11 -0
src/agents/guide/agents/chat_agent.py +104 -0
src/agents/guide/agents/interactive_agent.py +223 -0
src/agents/guide/agents/locate_agent.py +149 -0
src/agents/guide/agents/summary_agent.py +150 -0
src/agents/guide/guide_manager.py +500 -0
src/agents/guide/prompts/en/chat_agent.yaml +41 -0
src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
src/agents/guide/prompts/en/locate_agent.yaml +68 -0
src/agents/guide/prompts/en/summary_agent.yaml +157 -0
src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
src/agents/ideagen/__init__.py +12 -0
src/agents/ideagen/idea_generation_workflow.py +426 -0
src/agents/ideagen/material_organizer_agent.py +173 -0
src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
src/agents/question/__init__.py +24 -0
src/agents/question/agents/__init__.py +18 -0
src/agents/question/agents/generate_agent.py +381 -0
src/agents/question/agents/relevance_analyzer.py +207 -0
src/agents/question/agents/retrieve_agent.py +239 -0
src/agents/question/coordinator.py +718 -0
src/agents/question/example.py +109 -0
src/agents/question/prompts/en/coordinator.yaml +75 -0
src/agents/question/prompts/en/generate_agent.yaml +77 -0
src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
src/agents/question/prompts/zh/coordinator.yaml +75 -0
src/agents/question/prompts/zh/generate_agent.yaml +77 -0
src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
src/agents/research/agents/__init__.py +23 -0
src/agents/research/agents/decompose_agent.py +507 -0
src/agents/research/agents/manager_agent.py +228 -0
src/agents/research/agents/note_agent.py +180 -0
src/agents/research/agents/rephrase_agent.py +263 -0
src/agents/research/agents/reporting_agent.py +1333 -0
src/agents/research/agents/research_agent.py +714 -0
src/agents/research/data_structures.py +451 -0
src/agents/research/main.py +188 -0
src/agents/research/prompts/en/decompose_agent.yaml +89 -0
src/agents/research/prompts/en/manager_agent.yaml +24 -0
src/agents/research/prompts/en/note_agent.yaml +121 -0
src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
src/agents/research/prompts/en/reporting_agent.yaml +380 -0
src/agents/research/prompts/en/research_agent.yaml +173 -0
src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
src/agents/research/prompts/zh/manager_agent.yaml +24 -0
src/agents/research/prompts/zh/note_agent.yaml +121 -0
src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
src/agents/research/prompts/zh/research_agent.yaml +173 -0
src/agents/research/research_pipeline.py +1309 -0
src/agents/research/utils/__init__.py +60 -0
src/agents/research/utils/citation_manager.py +799 -0
src/agents/research/utils/json_utils.py +98 -0
src/agents/research/utils/token_tracker.py +297 -0
src/agents/solve/__init__.py +80 -0
src/agents/solve/analysis_loop/__init__.py +14 -0
src/agents/solve/analysis_loop/investigate_agent.py +414 -0
src/agents/solve/analysis_loop/note_agent.py +190 -0
src/agents/solve/main_solver.py +862 -0
src/agents/solve/memory/__init__.py +34 -0
src/agents/solve/memory/citation_memory.py +353 -0
src/agents/solve/memory/investigate_memory.py +226 -0
src/agents/solve/memory/solve_memory.py +340 -0
src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
src/agents/solve/solve_loop/__init__.py +22 -0
src/agents/solve/solve_loop/citation_manager.py +74 -0
src/agents/solve/solve_loop/manager_agent.py +274 -0
src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
src/agents/solve/solve_loop/response_agent.py +301 -0
src/agents/solve/solve_loop/solve_agent.py +325 -0
src/agents/solve/solve_loop/tool_agent.py +470 -0
src/agents/solve/utils/__init__.py +64 -0
src/agents/solve/utils/config_validator.py +313 -0
src/agents/solve/utils/display_manager.py +223 -0
src/agents/solve/utils/error_handler.py +363 -0
src/agents/solve/utils/json_utils.py +98 -0
src/agents/solve/utils/performance_monitor.py +407 -0
src/agents/solve/utils/token_tracker.py +541 -0
src/api/__init__.py +0 -0
src/api/main.py +240 -0
src/api/routers/__init__.py +1 -0
src/api/routers/agent_config.py +69 -0
src/api/routers/chat.py +296 -0
src/api/routers/co_writer.py +337 -0
src/api/routers/config.py +627 -0
src/api/routers/dashboard.py +18 -0
src/api/routers/guide.py +337 -0
src/api/routers/ideagen.py +436 -0
src/api/routers/knowledge.py +821 -0
src/api/routers/notebook.py +247 -0
src/api/routers/question.py +537 -0
src/api/routers/research.py +394 -0
src/api/routers/settings.py +164 -0
src/api/routers/solve.py +305 -0
src/api/routers/system.py +252 -0
src/api/run_server.py +61 -0
src/api/utils/history.py +172 -0
src/api/utils/log_interceptor.py +21 -0
src/api/utils/notebook_manager.py +415 -0
src/api/utils/progress_broadcaster.py +72 -0
src/api/utils/task_id_manager.py +100 -0
src/config/__init__.py +0 -0
src/config/accessors.py +18 -0
src/config/constants.py +34 -0
src/config/defaults.py +18 -0
src/config/schema.py +38 -0
src/config/settings.py +50 -0
src/core/errors.py +62 -0
src/knowledge/__init__.py +23 -0
src/knowledge/add_documents.py +606 -0
src/knowledge/config.py +65 -0
src/knowledge/example_add_documents.py +236 -0
src/knowledge/extract_numbered_items.py +1039 -0
src/knowledge/initializer.py +621 -0
src/knowledge/kb.py +22 -0
src/knowledge/manager.py +782 -0
src/knowledge/progress_tracker.py +182 -0
src/knowledge/start_kb.py +535 -0
src/logging/__init__.py +103 -0
src/logging/adapters/__init__.py +17 -0
src/logging/adapters/lightrag.py +184 -0
src/logging/adapters/llamaindex.py +141 -0
src/logging/config.py +80 -0
src/logging/handlers/__init__.py +20 -0
src/logging/handlers/console.py +75 -0
src/logging/handlers/file.py +201 -0
src/logging/handlers/websocket.py +127 -0
src/logging/logger.py +709 -0
src/logging/stats/__init__.py +16 -0
src/logging/stats/llm_stats.py +179 -0
src/services/__init__.py +56 -0
src/services/config/__init__.py +61 -0
src/services/config/knowledge_base_config.py +210 -0
src/services/config/loader.py +260 -0
src/services/config/unified_config.py +603 -0
src/services/embedding/__init__.py +45 -0
src/services/embedding/adapters/__init__.py +22 -0
src/services/embedding/adapters/base.py +106 -0
src/services/embedding/adapters/cohere.py +127 -0
src/services/embedding/adapters/jina.py +99 -0
src/services/embedding/adapters/ollama.py +116 -0
src/services/embedding/adapters/openai_compatible.py +96 -0
src/services/embedding/client.py +159 -0
src/services/embedding/config.py +156 -0
src/services/embedding/provider.py +119 -0
src/services/llm/__init__.py +152 -0
src/services/llm/capabilities.py +313 -0
src/services/llm/client.py +302 -0
src/services/llm/cloud_provider.py +530 -0
src/services/llm/config.py +200 -0
src/services/llm/error_mapping.py +103 -0
src/services/llm/exceptions.py +152 -0
src/services/llm/factory.py +450 -0
src/services/llm/local_provider.py +347 -0
src/services/llm/providers/anthropic.py +95 -0
src/services/llm/providers/base_provider.py +93 -0
src/services/llm/providers/open_ai.py +83 -0
src/services/llm/registry.py +71 -0
src/services/llm/telemetry.py +40 -0
src/services/llm/types.py +27 -0
src/services/llm/utils.py +333 -0
src/services/prompt/__init__.py +25 -0
src/services/prompt/manager.py +206 -0
src/services/rag/__init__.py +64 -0
src/services/rag/components/__init__.py +29 -0
src/services/rag/components/base.py +59 -0
src/services/rag/components/chunkers/__init__.py +18 -0
src/services/rag/components/chunkers/base.py +34 -0
src/services/rag/components/chunkers/fixed.py +71 -0
src/services/rag/components/chunkers/numbered_item.py +94 -0
src/services/rag/components/chunkers/semantic.py +97 -0
src/services/rag/components/embedders/__init__.py +14 -0
src/services/rag/components/embedders/base.py +32 -0
src/services/rag/components/embedders/openai.py +63 -0
src/services/rag/components/indexers/__init__.py +18 -0
src/services/rag/components/indexers/base.py +35 -0
src/services/rag/components/indexers/graph.py +172 -0
src/services/rag/components/indexers/lightrag.py +156 -0
src/services/rag/components/indexers/vector.py +146 -0
src/services/rag/components/parsers/__init__.py +18 -0
src/services/rag/components/parsers/base.py +35 -0
src/services/rag/components/parsers/markdown.py +52 -0
src/services/rag/components/parsers/pdf.py +115 -0
src/services/rag/components/parsers/text.py +86 -0
src/services/rag/components/retrievers/__init__.py +18 -0
src/services/rag/components/retrievers/base.py +34 -0
src/services/rag/components/retrievers/dense.py +200 -0
src/services/rag/components/retrievers/hybrid.py +164 -0
src/services/rag/components/retrievers/lightrag.py +169 -0
src/services/rag/components/routing.py +286 -0
src/services/rag/factory.py +234 -0
src/services/rag/pipeline.py +215 -0
src/services/rag/pipelines/__init__.py +32 -0
src/services/rag/pipelines/academic.py +44 -0
src/services/rag/pipelines/lightrag.py +43 -0
src/services/rag/pipelines/llamaindex.py +313 -0
src/services/rag/pipelines/raganything.py +384 -0
src/services/rag/service.py +244 -0
src/services/rag/types.py +73 -0
src/services/search/__init__.py +284 -0
src/services/search/base.py +87 -0
src/services/search/consolidation.py +398 -0
src/services/search/providers/__init__.py +128 -0
src/services/search/providers/baidu.py +188 -0
src/services/search/providers/exa.py +194 -0
src/services/search/providers/jina.py +161 -0
src/services/search/providers/perplexity.py +153 -0
src/services/search/providers/serper.py +209 -0
src/services/search/providers/tavily.py +161 -0
src/services/search/types.py +114 -0
src/services/setup/__init__.py +34 -0
src/services/setup/init.py +285 -0
src/services/tts/__init__.py +16 -0
src/services/tts/config.py +99 -0
src/tools/__init__.py +91 -0
src/tools/code_executor.py +536 -0
src/tools/paper_search_tool.py +171 -0
src/tools/query_item_tool.py +310 -0
src/tools/question/__init__.py +15 -0
src/tools/question/exam_mimic.py +616 -0
src/tools/question/pdf_parser.py +211 -0
src/tools/question/question_extractor.py +397 -0
src/tools/rag_tool.py +173 -0
src/tools/tex_chunker.py +339 -0
src/tools/tex_downloader.py +253 -0
src/tools/web_search.py +71 -0
src/utils/config_manager.py +206 -0
src/utils/document_validator.py +168 -0
src/utils/error_rate_tracker.py +111 -0
src/utils/error_utils.py +82 -0
src/utils/json_parser.py +110 -0
src/utils/network/circuit_breaker.py +79 -0

src/knowledge/add_documents.py ADDED Viewed

@@ -0,0 +1,606 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Incrementally add documents to existing knowledge base.
+Improved version with Hash-based duplicate checking, robust error handling,
+and architectural improvements for data integrity and vision support.
+"""
+import argparse
+import asyncio
+from datetime import datetime
+from functools import partial
+import hashlib
+import json
+import os
+from pathlib import Path
+import shutil
+import sys
+import tempfile
+from typing import TYPE_CHECKING, Any, Dict, List
+from dotenv import load_dotenv
+# lightrag is optional at import time: when it is not installed both names
+# fall back to None, so code that calls them has to cope with None.
+try:
+    from lightrag.llm.openai import openai_complete_if_cache
+    from lightrag.utils import EmbeddingFunc
+except ImportError:
+    # Import failure is deferred; nothing is raised at module import.
+    openai_complete_if_cache = None
+    EmbeddingFunc = None
+# Type hinting support for dynamic imports: the real classes are only imported
+# for static type checkers; at runtime both names are plain None.
+if TYPE_CHECKING:
+    try:
+        from raganything import RAGAnything
+        from raganything import RAGAnythingConfig as RAGAnythingConfigType
+    except ImportError:
+        RAGAnything = Any
+        RAGAnythingConfigType = Any
+else:
+    RAGAnything = None
+    RAGAnythingConfigType = None
+# Placeholders for the runtime classes; load_dynamic_imports() rebinds them
+# when the raganything package can be imported.
+raganything_cls = None
+RAGAnythingConfig = None
+def load_dynamic_imports(project_root: Path):
+    """Handle the path injections and dynamic imports safely."""
+    global raganything_cls, RAGAnythingConfig
+    sys.path.insert(0, str(project_root))
+    raganything_path = project_root.parent / "raganything" / "RAG-Anything"
+    if raganything_path.exists():
+        sys.path.insert(0, str(raganything_path))
+    try:
+        from raganything import RAGAnything as RA
+        from raganything import RAGAnythingConfig as RAC
+        raganything_cls = RA
+        RAGAnythingConfig = RAC
+    except ImportError:
+        pass
+from src.knowledge.extract_numbered_items import process_content_list
+from src.logging import LightRAGLogContext, get_logger
+from src.services.embedding import (
+    get_embedding_client,
+    get_embedding_config,
+    reset_embedding_client,
+)
+from src.services.llm import get_llm_config
+# Module-level logger shared by everything in this file.
+# NOTE(review): it is registered as "KnowledgeInit" although this module adds
+# documents to an existing knowledge base -- confirm the name is intentional.
+logger = get_logger("KnowledgeInit")
+# Default base directory for knowledge bases (relative path).
+DEFAULT_BASE_DIR = "./data/knowledge_bases"
+class DocumentAdder:
+    """Add documents to existing knowledge base with Hash-validation"""
+    def __init__(
+        self,
+        kb_name: str,
+        base_dir=DEFAULT_BASE_DIR,
+        api_key: str | None = None,
+        base_url: str | None = None,
+        progress_tracker=None,
+        rag_provider: str | None = None,
+    ):
+        self.kb_name = kb_name
+        self.base_dir = Path(base_dir)
+        self.kb_dir = self.base_dir / kb_name
+        if not self.kb_dir.exists():
+            raise ValueError(f"Knowledge base does not exist: {kb_name}")
+        self.raw_dir = self.kb_dir / "raw"
+        self.images_dir = self.kb_dir / "images"
+        self.rag_storage_dir = self.kb_dir / "rag_storage"
+        self.content_list_dir = self.kb_dir / "content_list"
+        self.metadata_file = self.kb_dir / "metadata.json"
+        if not self.rag_storage_dir.exists():
+            raise ValueError(f"Knowledge base not initialized: {kb_name}")
+        self.api_key = api_key
+        self.base_url = base_url
+        self.progress_tracker = progress_tracker
+        self.rag_provider = rag_provider
+        self._ensure_working_directories()
+    def _ensure_working_directories(self):
+        for directory in [self.raw_dir, self.images_dir, self.content_list_dir]:
+            directory.mkdir(parents=True, exist_ok=True)
+    def _get_file_hash(self, file_path: Path) -> str:
+        """
+        Calculate SHA-256 hash of a file.
+        Uses 64KB chunks for better throughput on SSDs.
+        """
+        sha256_hash = hashlib.sha256()
+        chunk_size = 65536  # 64KB
+        with open(file_path, "rb") as f:
+            for byte_block in iter(lambda: f.read(chunk_size), b""):
+                sha256_hash.update(byte_block)
+        return sha256_hash.hexdigest()
+    def get_ingested_hashes(self) -> Dict[str, str]:
+        """Get map of filename -> hash from metadata."""
+        if self.metadata_file.exists():
+            try:
+                with open(self.metadata_file, "r", encoding="utf-8") as f:
+                    data = json.load(f)
+                    return data.get("file_hashes", {})
+            except Exception:
+                return {}
+        return {}
+    async def _run_in_executor(self, func, *args, **kwargs):
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(None, partial(func, *args, **kwargs))
+    def add_documents(self, source_files: List[str], allow_duplicates: bool = False) -> List[Path]:
+        """
+        Synchronous phase: Validates hashes and prepares files.
+        Treats 'raw/' as a Write-Ahead Log: files exist there before being canonized in metadata.
+        """
+        logger.info(f"Validating documents for '{self.kb_name}'...")
+        ingested_hashes = self.get_ingested_hashes()
+        files_to_process = []
+        for source in source_files:
+            source_path = Path(source)
+            if not source_path.exists():
+                logger.warning(f"  ⚠ Missing: {source}")
+                continue
+            current_hash = self._get_file_hash(source_path)
+            # 1. Check if content is already fully ingested (Canon Check)
+            # We look for value matches in the metadata hash map
+            if current_hash in ingested_hashes.values() and not allow_duplicates:
+                logger.info(f"  → Skipped (content already indexed): {source_path.name}")
+                continue
+            # 2. Prepare file in raw/ (Write-Ahead Log)
+            dest_path = self.raw_dir / source_path.name
+            should_copy = True
+            if dest_path.exists():
+                # If file exists in raw, check if it's the same content
+                dest_hash = self._get_file_hash(dest_path)
+                if dest_hash == current_hash:
+                    should_copy = False
+                    logger.info(f"  ⚠ Recovering staged file (interrupted run): {source_path.name}")
+                else:
+                    if not allow_duplicates:
+                        # Name collision with different content
+                        logger.info(
+                            f"  → Skipped (filename collision with different content): {source_path.name}"
+                        )
+                        continue
+                    else:
+                        logger.info(f"  → Overwriting existing raw file: {source_path.name}")
+            if should_copy:
+                shutil.copy2(source_path, dest_path)
+                logger.info(f"  ✓ Staged to raw: {source_path.name}")
+            files_to_process.append(dest_path)
+        return files_to_process
+    async def process_new_documents(self, new_files: List[Path]):
+        """
+        Async phase: Ingests files into the RAG system.
+        Uses FileTypeRouter to classify files and route them appropriately:
+        - PDF/DOCX/images -> MinerU parser (full document analysis)
+        - Text/Markdown -> Direct read + LightRAG insert (fast)
+        """
+        if not new_files:
+            return None
+        if raganything_cls is None:
+            raise ImportError("RAGAnything module not found.")
+        from src.services.rag.components.routing import FileTypeRouter
+        # Pre-import progress stage if needed to avoid overhead in loop
+        ProgressStage: Any = None
+        if self.progress_tracker:
+            from src.knowledge.progress_tracker import ProgressStage
+        self.llm_cfg = get_llm_config()
+        model = self.llm_cfg.model
+        api_key = self.api_key or self.llm_cfg.api_key
+        base_url = self.base_url or self.llm_cfg.base_url
+        # LLM Function Wrapper
+        def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
+            if history_messages is None:
+                history_messages = []
+            return openai_complete_if_cache(
+                model,
+                prompt,
+                system_prompt=system_prompt,
+                history_messages=history_messages,
+                api_key=api_key,
+                base_url=base_url,
+                **kwargs,
+            )
+        # Vision Function Wrapper - Robust history handling
+        def vision_model_func(
+            prompt,
+            system_prompt=None,
+            history_messages=None,
+            image_data=None,
+            messages=None,
+            **kwargs,
+        ):
+            if history_messages is None:
+                history_messages = []
+            # If pre-formatted messages are provided, sanitize them
+            if messages:
+                safe_messages = self._filter_valid_messages(messages)
+                return openai_complete_if_cache(
+                    model,
+                    prompt="",
+                    messages=safe_messages,
+                    api_key=api_key,
+                    base_url=base_url,
+                    **kwargs,
+                )
+            # --- Construct Message History ---
+            current_messages = []
+            # 1. Add System Prompt (if provided)
+            if system_prompt:
+                current_messages.append({"role": "system", "content": system_prompt})
+            # 2. Add History (Filtering out conflicting system prompts)
+            if history_messages:
+                # Filter out system messages from history to avoid duplicates/conflicts with the new system_prompt
+                filtered_history = [
+                    msg
+                    for msg in history_messages
+                    if isinstance(msg, dict) and msg.get("role") != "system"
+                ]
+                current_messages.extend(filtered_history)
+            # 3. Construct New User Message
+            user_content: List[Dict[str, Any]] = [{"type": "text", "text": prompt}]
+            if image_data:
+                user_content.append(
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
+                    }
+                )
+            # 4. Merge Logic: Avoid back-to-back user messages
+            if current_messages and current_messages[-1].get("role") == "user":
+                last_msg = current_messages[-1]
+                # If last content is string, convert to list format first
+                if isinstance(last_msg["content"], str):
+                    last_msg["content"] = [{"type": "text", "text": last_msg["content"]}]
+                # Append new content blocks
+                if isinstance(last_msg["content"], list):
+                    last_msg["content"].extend(user_content)
+                else:
+                    # Fallback if structure is unexpected, just append new message
+                    current_messages.append({"role": "user", "content": user_content})
+            else:
+                current_messages.append({"role": "user", "content": user_content})
+            return openai_complete_if_cache(
+                model,
+                prompt="",
+                messages=current_messages,
+                api_key=api_key,
+                base_url=base_url,
+                **kwargs,
+            )
+        # Embedding Setup
+        reset_embedding_client()
+        embedding_cfg = get_embedding_config()
+        embedding_client = get_embedding_client()
+        async def unified_embed_func(texts):
+            return await embedding_client.embed(texts)
+        embedding_func = EmbeddingFunc(
+            embedding_dim=embedding_cfg.dim,
+            max_token_size=embedding_cfg.max_tokens,
+            func=unified_embed_func,
+        )
+        config = RAGAnythingConfig(
+            working_dir=str(self.rag_storage_dir),
+            parser="mineru",
+            enable_image_processing=True,
+            enable_table_processing=True,
+            enable_equation_processing=True,
+        )
+        with LightRAGLogContext(scene="knowledge_init"):
+            rag = raganything_cls(
+                config=config,
+                llm_model_func=llm_model_func,
+                vision_model_func=vision_model_func,
+                embedding_func=embedding_func,
+            )
+            if hasattr(rag, "_ensure_lightrag_initialized"):
+                await rag._ensure_lightrag_initialized()
+        # Classify files by type
+        file_paths_str = [str(f) for f in new_files]
+        classification = FileTypeRouter.classify_files(file_paths_str)
+        logger.info(
+            f"File classification: {len(classification.needs_mineru)} need MinerU, "
+            f"{len(classification.text_files)} text files, "
+            f"{len(classification.unsupported)} unsupported"
+        )
+        processed_files = []
+        total_files = len(classification.needs_mineru) + len(classification.text_files)
+        idx = 0
+        # Process files requiring MinerU (PDF, DOCX, images)
+        for doc_file_str in classification.needs_mineru:
+            doc_file = Path(doc_file_str)
+            idx += 1
+            try:
+                if self.progress_tracker and ProgressStage:
+                    self.progress_tracker.update(
+                        ProgressStage.PROCESSING_FILE,
+                        f"Ingesting (MinerU) {doc_file.name}",
+                        current=idx,
+                        total=total_files,
+                    )
+                # Verify file still exists in raw/ (it should, as we staged it)
+                if not doc_file.exists():
+                    logger.error(f"  ✗ Failed: Staged file missing {doc_file.name}")
+                    continue
+                await asyncio.wait_for(
+                    rag.process_document_complete(
+                        file_path=str(doc_file),
+                        output_dir=str(self.content_list_dir),
+                        parse_method="auto",
+                    ),
+                    timeout=600.0,
+                )
+                processed_files.append(doc_file)
+                # Store hash on success - "Canonizing" the file
+                self._record_successful_hash(doc_file)
+                logger.info(f"  ✓ Processed (MinerU): {doc_file.name}")
+            except Exception as e:
+                logger.exception(f"  ✗ Failed {doc_file.name}: {e}")
+        # Process text files directly (fast path - no MinerU)
+        for doc_file_str in classification.text_files:
+            doc_file = Path(doc_file_str)
+            idx += 1
+            try:
+                if self.progress_tracker and ProgressStage:
+                    self.progress_tracker.update(
+                        ProgressStage.PROCESSING_FILE,
+                        f"Ingesting (text) {doc_file.name}",
+                        current=idx,
+                        total=total_files,
+                    )
+                # Verify file still exists
+                if not doc_file.exists():
+                    logger.error(f"  ✗ Failed: Staged file missing {doc_file.name}")
+                    continue
+                # Read text file directly
+                content = await FileTypeRouter.read_text_file(str(doc_file))
+                if content.strip():
+                    # Insert directly into LightRAG, bypassing MinerU
+                    await rag.lightrag.ainsert(content)
+                    processed_files.append(doc_file)
+                    self._record_successful_hash(doc_file)
+                    logger.info(f"  ✓ Processed (text): {doc_file.name}")
+                else:
+                    logger.warning(f"  ⚠ Skipped empty file: {doc_file.name}")
+            except Exception as e:
+                logger.exception(f"  ✗ Failed {doc_file.name}: {e}")
+        # Log unsupported files
+        for doc_file_str in classification.unsupported:
+            logger.warning(f"  ⚠ Skipped unsupported file: {Path(doc_file_str).name}")
+        await self.fix_structure()
+        return processed_files
+    def _record_successful_hash(self, file_path: Path):
+        """Update metadata with the hash of a successfully processed file."""
+        file_hash = self._get_file_hash(file_path)
+        try:
+            metadata = {}
+            if self.metadata_file.exists():
+                with open(self.metadata_file, "r", encoding="utf-8") as f:
+                    metadata = json.load(f)
+            if "file_hashes" not in metadata:
+                metadata["file_hashes"] = {}
+            metadata["file_hashes"][file_path.name] = file_hash
+            # Atomic write: write to temp file, then rename
+            fd, tmp_path = tempfile.mkstemp(dir=self.kb_dir, suffix=".json")
+            try:
+                with os.fdopen(fd, "w", encoding="utf-8") as f:
+                    json.dump(metadata, f, indent=2, ensure_ascii=False)
+                os.replace(tmp_path, self.metadata_file)
+            except Exception:
+                os.unlink(tmp_path)
+                raise
+        except Exception as e:
+            logger.warning(f"Could not update hash metadata: {e}")
+    @staticmethod
+    def _filter_valid_messages(messages):
+        return [
+            m
+            for m in messages
+            if isinstance(m, dict) and m.get("role") is not None and m.get("content") is not None
+        ]
+    async def fix_structure(self):
+        """Robustly moves nested outputs and cleans up."""
+        logger.info("Organizing storage structure...")
+        # 1. Identify moves
+        moves = []
+        for doc_dir in self.content_list_dir.glob("*"):
+            if not doc_dir.is_dir():
+                continue
+            # Content List
+            json_src = next(doc_dir.glob("auto/*_content_list.json"), None)
+            if json_src:
+                moves.append((json_src, self.content_list_dir / f"{doc_dir.name}.json"))
+            # Images
+            for img in doc_dir.glob("auto/images/*"):
+                moves.append((img, self.images_dir / img.name))
+        # 2. Execute moves
+        for src, dest in moves:
+            if src.exists():
+                await self._run_in_executor(shutil.copy2, src, dest)
+        # 3. Safe Cleanup: Only delete directories we actually processed
+        for doc_dir in self.content_list_dir.glob("*"):
+            if doc_dir.is_dir():
+                # Safety check: only delete if it looks like a parser output (has 'auto' subdir)
+                # This prevents wiping manual user folders in content_list_dir
+                if (doc_dir / "auto").exists():
+                    await self._run_in_executor(shutil.rmtree, doc_dir, ignore_errors=True)
+    def extract_numbered_items_for_new_docs(self, processed_files, batch_size=20):
+        if not processed_files:
+            return
+        llm_cfg = getattr(self, "llm_cfg", None)
+        if llm_cfg is None:
+            llm_cfg = get_llm_config()
+        api_key = self.api_key or llm_cfg.api_key
+        base_url = self.base_url or llm_cfg.base_url
+        output_file = self.kb_dir / "numbered_items.json"
+        for doc_file in processed_files:
+            content_list_file = self.content_list_dir / f"{doc_file.stem}.json"
+            if content_list_file.exists():
+                process_content_list(
+                    content_list_file=content_list_file,
+                    output_file=output_file,
+                    api_key=api_key,
+                    base_url=base_url,
+                    batch_size=batch_size,
+                    merge=output_file.exists(),
+                )
+    def update_metadata(self, added_count: int):
+        if not self.metadata_file.exists():
+            return
+        try:
+            with open(self.metadata_file, "r", encoding="utf-8") as f:
+                metadata = json.load(f)
+            metadata["last_updated"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            # Update RAG provider if specified
+            if self.rag_provider:
+                metadata["rag_provider"] = self.rag_provider
+                # Also save to centralized config file
+                try:
+                    from src.services.config import get_kb_config_service
+                    kb_config_service = get_kb_config_service()
+                    kb_config_service.set_rag_provider(self.kb_name, self.rag_provider)
+                except Exception as config_err:
+                    logger.warning(f"Failed to save to centralized config: {config_err}")
+            history = metadata.get("update_history", [])
+            history.append(
+                {
+                    "timestamp": metadata["last_updated"],
+                    "action": "incremental_add",
+                    "count": added_count,
+                }
+            )
+            metadata["update_history"] = history
+            with open(self.metadata_file, "w", encoding="utf-8") as f:
+                json.dump(metadata, f, indent=2, ensure_ascii=False)
+        except Exception as e:
+            logger.warning(f"Metadata update failed: {e}")
+async def main():
+    parser = argparse.ArgumentParser(description="Incrementally add documents to RAG KB")
+    parser.add_argument("kb_name", help="KB Name")
+    parser.add_argument("--docs", nargs="+", help="Files")
+    parser.add_argument("--docs-dir", help="Directory")
+    parser.add_argument("--base-dir", default=DEFAULT_BASE_DIR)
+    parser.add_argument("--api-key", default=os.getenv("LLM_API_KEY"))
+    parser.add_argument("--base-url", default=os.getenv("LLM_HOST"))
+    parser.add_argument("--allow-duplicates", action="store_true")
+    args = parser.parse_args()
+    # Initialize dynamic paths
+    project_root = Path(__file__).parent.parent.parent
+    load_dynamic_imports(project_root)
+    load_dotenv()
+    doc_files = []
+    if args.docs:
+        doc_files.extend(args.docs)
+    if args.docs_dir:
+        p = Path(args.docs_dir)
+        for ext in ["*.pdf", "*.docx", "*.txt", "*.md"]:
+            doc_files.extend([str(f) for f in p.glob(ext)])
+    if not doc_files:
+        logger.error("No documents provided.")
+        return
+    adder = DocumentAdder(args.kb_name, args.base_dir, args.api_key, args.base_url)
+    # 1. Sync Phase (Validate and Stage)
+    new_files = adder.add_documents(doc_files, allow_duplicates=args.allow_duplicates)
+    # 2. Async Ingestion (Process and Canonize)
+    if new_files:
+        processed = await adder.process_new_documents(new_files)
+        if processed:
+            adder.extract_numbered_items_for_new_docs(processed)
+            adder.update_metadata(len(processed))
+            logger.info(f"Done! Successfully added {len(processed)} documents.")
+    else:
+        logger.info("No new unique documents to add.")
+# Script entry point: run the async CLI driver when this file is executed directly.
+if __name__ == "__main__":
+    asyncio.run(main())

src/knowledge/config.py ADDED Viewed

@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+"""
+Knowledge Base Path Configuration Module - Unified management of all paths
+"""
+import os
+from pathlib import Path
+# Project root directory (DeepTutor/): three levels up from this file
+# (src/knowledge/config.py), resolved to an absolute path
+PROJECT_ROOT = Path(__file__).parent.parent.parent.resolve()
+# Base directory that holds all knowledge bases: <PROJECT_ROOT>/data/knowledge_bases
+KNOWLEDGE_BASES_DIR = PROJECT_ROOT / "data" / "knowledge_bases"
+# Expected location of the raganything module: a "raganything/RAG-Anything"
+# directory next to (not inside) the project root
+RAGANYTHING_PATH = PROJECT_ROOT.parent / "raganything" / "RAG-Anything"
+# Check whether the raganything path exists on disk
+def check_raganything():
+    """Check if raganything module exists"""
+    return RAGANYTHING_PATH.exists()
+# Environment variable configuration
+def get_env_config():
+    """Get environment variable configuration (unified read from env_config)"""
+    try:
+        from src.services.llm import get_llm_config
+        cfg = get_llm_config()
+        return {
+            "api_key": cfg.api_key,
+            "base_url": cfg.base_url,
+        }
+    except Exception:
+        # Compatibility fallback: directly read environment variables
+        return {
+            "api_key": os.getenv("LLM_API_KEY"),
+            "base_url": os.getenv("LLM_HOST"),
+        }
+# Add necessary paths to sys.path
+def setup_paths():
+    """Set Python module search paths"""
+    import sys
+    # Add project root directory
+    if str(PROJECT_ROOT) not in sys.path:
+        sys.path.insert(0, str(PROJECT_ROOT))
+    # Add raganything path (if exists)
+    if check_raganything() and str(RAGANYTHING_PATH) not in sys.path:
+        sys.path.insert(0, str(RAGANYTHING_PATH))
+# Names exported by this module via "from ... import *"
+__all__ = [
+    "KNOWLEDGE_BASES_DIR",
+    "PROJECT_ROOT",
+    "RAGANYTHING_PATH",
+    "check_raganything",
+    "get_env_config",
+    "setup_paths",
+]