PyPI - realtimex-deeptutor - Versions diffs - 0.5.0.post1__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl - Mend

realtimex-deeptutor 0.5.0.post1__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (145)

{realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/METADATA +24 -17
{realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/RECORD +143 -123
{realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/WHEEL +1 -1
realtimex_deeptutor-0.5.0.post3.dist-info/entry_points.txt +4 -0
{realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/top_level.txt +1 -0
scripts/__init__.py +1 -0
scripts/audit_prompts.py +179 -0
scripts/check_install.py +460 -0
scripts/generate_roster.py +327 -0
scripts/install_all.py +653 -0
scripts/migrate_kb.py +655 -0
scripts/start.py +807 -0
scripts/start_web.py +632 -0
scripts/sync_prompts_from_en.py +147 -0
src/__init__.py +2 -2
src/agents/ideagen/material_organizer_agent.py +2 -0
src/agents/solve/__init__.py +6 -0
src/agents/solve/main_solver.py +9 -0
src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +9 -7
src/agents/solve/session_manager.py +345 -0
src/api/main.py +14 -0
src/api/routers/chat.py +3 -3
src/api/routers/co_writer.py +12 -7
src/api/routers/config.py +1 -0
src/api/routers/guide.py +3 -1
src/api/routers/ideagen.py +7 -0
src/api/routers/knowledge.py +64 -12
src/api/routers/question.py +2 -0
src/api/routers/realtimex.py +137 -0
src/api/routers/research.py +9 -0
src/api/routers/solve.py +120 -2
src/cli/__init__.py +13 -0
src/cli/start.py +209 -0
src/config/constants.py +11 -9
src/knowledge/add_documents.py +453 -213
src/knowledge/extract_numbered_items.py +9 -10
src/knowledge/initializer.py +102 -101
src/knowledge/manager.py +251 -74
src/knowledge/progress_tracker.py +43 -2
src/knowledge/start_kb.py +11 -2
src/logging/__init__.py +5 -0
src/logging/adapters/__init__.py +1 -0
src/logging/adapters/lightrag.py +25 -18
src/logging/adapters/llamaindex.py +1 -0
src/logging/config.py +30 -27
src/logging/handlers/__init__.py +1 -0
src/logging/handlers/console.py +7 -50
src/logging/handlers/file.py +5 -20
src/logging/handlers/websocket.py +23 -19
src/logging/logger.py +161 -126
src/logging/stats/__init__.py +1 -0
src/logging/stats/llm_stats.py +37 -17
src/services/__init__.py +17 -1
src/services/config/__init__.py +1 -0
src/services/config/knowledge_base_config.py +1 -0
src/services/config/loader.py +1 -1
src/services/config/unified_config.py +211 -4
src/services/embedding/__init__.py +1 -0
src/services/embedding/adapters/__init__.py +3 -0
src/services/embedding/adapters/base.py +1 -0
src/services/embedding/adapters/cohere.py +1 -0
src/services/embedding/adapters/jina.py +1 -0
src/services/embedding/adapters/ollama.py +1 -0
src/services/embedding/adapters/openai_compatible.py +1 -0
src/services/embedding/adapters/realtimex.py +125 -0
src/services/embedding/client.py +27 -0
src/services/embedding/config.py +3 -0
src/services/embedding/provider.py +1 -0
src/services/llm/__init__.py +17 -3
src/services/llm/capabilities.py +47 -0
src/services/llm/client.py +32 -0
src/services/llm/cloud_provider.py +21 -4
src/services/llm/config.py +36 -2
src/services/llm/error_mapping.py +1 -0
src/services/llm/exceptions.py +30 -0
src/services/llm/factory.py +55 -16
src/services/llm/local_provider.py +1 -0
src/services/llm/providers/anthropic.py +1 -0
src/services/llm/providers/base_provider.py +1 -0
src/services/llm/providers/open_ai.py +1 -0
src/services/llm/realtimex_provider.py +240 -0
src/services/llm/registry.py +1 -0
src/services/llm/telemetry.py +1 -0
src/services/llm/types.py +1 -0
src/services/llm/utils.py +1 -0
src/services/prompt/__init__.py +1 -0
src/services/prompt/manager.py +3 -2
src/services/rag/__init__.py +27 -5
src/services/rag/components/__init__.py +1 -0
src/services/rag/components/base.py +1 -0
src/services/rag/components/chunkers/__init__.py +1 -0
src/services/rag/components/chunkers/base.py +1 -0
src/services/rag/components/chunkers/fixed.py +1 -0
src/services/rag/components/chunkers/numbered_item.py +1 -0
src/services/rag/components/chunkers/semantic.py +1 -0
src/services/rag/components/embedders/__init__.py +1 -0
src/services/rag/components/embedders/base.py +1 -0
src/services/rag/components/embedders/openai.py +1 -0
src/services/rag/components/indexers/__init__.py +1 -0
src/services/rag/components/indexers/base.py +1 -0
src/services/rag/components/indexers/graph.py +5 -44
src/services/rag/components/indexers/lightrag.py +5 -44
src/services/rag/components/indexers/vector.py +1 -0
src/services/rag/components/parsers/__init__.py +1 -0
src/services/rag/components/parsers/base.py +1 -0
src/services/rag/components/parsers/markdown.py +1 -0
src/services/rag/components/parsers/pdf.py +1 -0
src/services/rag/components/parsers/text.py +1 -0
src/services/rag/components/retrievers/__init__.py +1 -0
src/services/rag/components/retrievers/base.py +1 -0
src/services/rag/components/retrievers/dense.py +1 -0
src/services/rag/components/retrievers/hybrid.py +5 -44
src/services/rag/components/retrievers/lightrag.py +5 -44
src/services/rag/components/routing.py +48 -0
src/services/rag/factory.py +112 -46
src/services/rag/pipeline.py +1 -0
src/services/rag/pipelines/__init__.py +27 -18
src/services/rag/pipelines/lightrag.py +1 -0
src/services/rag/pipelines/llamaindex.py +99 -0
src/services/rag/pipelines/raganything.py +67 -100
src/services/rag/pipelines/raganything_docling.py +368 -0
src/services/rag/service.py +5 -12
src/services/rag/types.py +1 -0
src/services/rag/utils/__init__.py +17 -0
src/services/rag/utils/image_migration.py +279 -0
src/services/search/__init__.py +1 -0
src/services/search/base.py +1 -0
src/services/search/consolidation.py +1 -0
src/services/search/providers/__init__.py +1 -0
src/services/search/providers/baidu.py +1 -0
src/services/search/providers/exa.py +1 -0
src/services/search/providers/jina.py +1 -0
src/services/search/providers/perplexity.py +1 -0
src/services/search/providers/serper.py +1 -0
src/services/search/providers/tavily.py +1 -0
src/services/search/types.py +1 -0
src/services/settings/__init__.py +1 -0
src/services/settings/interface_settings.py +78 -0
src/services/setup/__init__.py +1 -0
src/services/tts/__init__.py +1 -0
src/services/tts/config.py +1 -0
src/utils/realtimex.py +284 -0
realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +0 -2
src/services/rag/pipelines/academic.py +0 -44
{realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/licenses/LICENSE +0 -0

src/services/rag/pipelines/raganything.py CHANGED Viewed

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 RAGAnything Pipeline
 ====================
@@ -9,11 +10,13 @@ from pathlib import Path
 import sys
 from typing import Any, Dict, List, Optional
-from lightrag.llm.openai import openai_complete_if_cache
 from src.logging import get_logger
 from src.logging.adapters import LightRAGLogContext
+# Load LLM config early to ensure OPENAI_API_KEY env var is set before LightRAG imports
+# This is critical because LightRAG reads os.environ["OPENAI_API_KEY"] directly
+from src.services.llm.config import get_llm_config as _early_config_load  # noqa: F401
 class RAGAnythingPipeline:
     """
@@ -71,106 +74,19 @@ class RAGAnythingPipeline:
         self._setup_raganything_path()
-        from openai import AsyncOpenAI
         from raganything import RAGAnything, RAGAnythingConfig
         from src.services.embedding import get_embedding_client
         from src.services.llm import get_llm_client
+        # Use unified LLM client from src/services/llm
         llm_client = get_llm_client()
         embed_client = get_embedding_client()
-        # Create AsyncOpenAI client directly - bypasses LightRAG's response_format handling
-        openai_client = AsyncOpenAI(
-            api_key=llm_client.config.api_key,
-            base_url=llm_client.config.base_url,
-        )
-        async def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
-            """Custom async LLM function that bypasses LightRAG's openai_complete_if_cache."""
-            if history_messages is None:
-                history_messages = []
-            # Build messages array
-            messages = []
-            if system_prompt:
-                messages.append({"role": "system", "content": system_prompt})
-            # Add history
-            messages.extend(history_messages)
-            # Add current prompt
-            messages.append({"role": "user", "content": prompt})
-            # Whitelist only valid OpenAI parameters, filter out LightRAG-specific ones
-            valid_params = {
-                "temperature",
-                "top_p",
-                "n",
-                "stream",
-                "stop",
-                "max_tokens",
-                "presence_penalty",
-                "frequency_penalty",
-                "logit_bias",
-                "user",
-                "seed",
-            }
-            clean_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}
-            # Call OpenAI API directly (async)
-            response = await openai_client.chat.completions.create(
-                model=llm_client.config.model,
-                messages=messages,
-                **clean_kwargs,
-            )
-            return response.choices[0].message.content
-        def vision_model_func(
-            prompt,
-            system_prompt=None,
-            history_messages=[],
-            image_data=None,
-            messages=None,
-            **kwargs,
-        ):
-            # Handle multimodal messages
-            if messages:
-                clean_kwargs = {
-                    k: v
-                    for k, v in kwargs.items()
-                    if k not in ["messages", "prompt", "system_prompt", "history_messages"]
-                }
-                return openai_complete_if_cache(
-                    llm_client.config.model,
-                    prompt="",
-                    messages=messages,
-                    api_key=llm_client.config.api_key,
-                    base_url=llm_client.config.base_url,
-                    **clean_kwargs,
-                )
-            if image_data:
-                # Build image message
-                image_message = {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": prompt},
-                        {
-                            "type": "image_url",
-                            "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
-                        },
-                    ],
-                }
-                return openai_complete_if_cache(
-                    llm_client.config.model,
-                    prompt="",
-                    messages=[image_message],
-                    api_key=llm_client.config.api_key,
-                    base_url=llm_client.config.base_url,
-                    **kwargs,
-                )
-            return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
+        # Get model functions from unified LLM client
+        # These handle all provider differences (OpenAI, Anthropic, Azure, local, etc.)
+        llm_model_func = llm_client.get_model_func()
+        vision_model_func = llm_client.get_vision_model_func()
         config = RAGAnythingConfig(
             working_dir=working_dir,
@@ -197,7 +113,15 @@ class RAGAnythingPipeline:
         **kwargs,
     ) -> bool:
         """
-        Initialize KB using RAG-Anything's process_document_complete().
+        Initialize KB using RAG-Anything with MinerU parser.
+        Processing flow:
+        1. Parse documents using MinerU (generates content_list with nested image paths)
+        2. Migrate images to canonical location (kb/images/) and update paths in content_list
+        3. Insert updated content_list into RAG (now with correct image paths)
+        4. Clean up temporary parser output directories
+        This ensures RAG stores the final image paths, avoiding path mismatches during retrieval.
         Uses FileTypeRouter to classify files and route them appropriately:
         - PDF files -> MinerU parser (full document analysis)
@@ -212,13 +136,21 @@ class RAGAnythingPipeline:
         Returns:
             True if successful
         """
+        import json
         from ..components.routing import FileTypeRouter
+        from ..utils.image_migration import (
+            cleanup_parser_output_dirs,
+            migrate_images_and_update_paths,
+        )
         self.logger.info(f"Initializing KB '{kb_name}' with {len(file_paths)} files")
         kb_dir = Path(self.kb_base_dir) / kb_name
         content_list_dir = kb_dir / "content_list"
+        images_dir = kb_dir / "images"
         content_list_dir.mkdir(parents=True, exist_ok=True)
+        images_dir.mkdir(parents=True, exist_ok=True)
         # Classify files by type
         classification = FileTypeRouter.classify_files(file_paths)
@@ -235,19 +167,47 @@ class RAGAnythingPipeline:
             total_files = len(classification.needs_mineru) + len(classification.text_files)
             idx = 0
+            total_images_migrated = 0
             # Process files requiring MinerU (PDF, DOCX, images)
             for file_path in classification.needs_mineru:
                 idx += 1
-                self.logger.info(
-                    f"Processing [{idx}/{total_files}] (MinerU): {Path(file_path).name}"
-                )
-                await rag.process_document_complete(
+                file_name = Path(file_path).name
+                self.logger.info(f"Processing [{idx}/{total_files}] (MinerU): {file_name}")
+                # Step 1: Parse document (without RAG insertion)
+                self.logger.info("  Step 1/3: Parsing document...")
+                content_list, doc_id = await rag.parse_document(
                     file_path=file_path,
                     output_dir=str(content_list_dir),
                     parse_method="auto",
                 )
+                # Step 2: Migrate images and update paths
+                self.logger.info("  Step 2/3: Migrating images to canonical location...")
+                updated_content_list, num_migrated = await migrate_images_and_update_paths(
+                    content_list=content_list,
+                    source_base_dir=content_list_dir,
+                    target_images_dir=images_dir,
+                    batch_size=50,
+                )
+                total_images_migrated += num_migrated
+                # Save updated content_list for future reference
+                content_list_file = content_list_dir / f"{Path(file_path).stem}.json"
+                with open(content_list_file, "w", encoding="utf-8") as f:
+                    json.dump(updated_content_list, f, ensure_ascii=False, indent=2)
+                # Step 3: Insert into RAG with corrected paths
+                self.logger.info("  Step 3/3: Inserting into RAG knowledge graph...")
+                await rag.insert_content_list(
+                    content_list=updated_content_list,
+                    file_path=file_path,
+                    doc_id=doc_id,
+                )
+                self.logger.info(f"  ✓ Completed: {file_name}")
             # Process text files directly (fast path)
             for file_path in classification.text_files:
                 idx += 1
@@ -263,10 +223,17 @@ class RAGAnythingPipeline:
             for file_path in classification.unsupported:
                 self.logger.warning(f"Skipped unsupported file: {Path(file_path).name}")
+            # Clean up temporary parser output directories
+            if total_images_migrated > 0:
+                self.logger.info("Cleaning up temporary parser output directories...")
+                await cleanup_parser_output_dirs(content_list_dir)
         if extract_numbered_items:
             await self._extract_numbered_items(kb_name)
-        self.logger.info(f"KB '{kb_name}' initialized successfully")
+        self.logger.info(
+            f"KB '{kb_name}' initialized successfully ({total_images_migrated} images migrated)"
+        )
         return True
     async def _extract_numbered_items(self, kb_name: str):

src/services/rag/pipelines/raganything_docling.py ADDED Viewed

@@ -0,0 +1,368 @@
+# -*- coding: utf-8 -*-
+"""
+RAGAnything Docling Pipeline
+============================
+End-to-end pipeline wrapping RAG-Anything with Docling parser for document processing.
+Uses Docling instead of MinerU for better Office document and HTML support.
+"""
+from pathlib import Path
+import sys
+from typing import Any, Dict, List, Optional
+from src.logging import get_logger
+from src.logging.adapters import LightRAGLogContext
+# Load LLM config early to ensure OPENAI_API_KEY env var is set before LightRAG imports
+# This is critical because LightRAG reads os.environ["OPENAI_API_KEY"] directly
+from src.services.llm.config import get_llm_config as _early_config_load  # noqa: F401
+class RAGAnythingDoclingPipeline:
+    """
+    RAG-Anything Pipeline with Docling Parser.
+    Uses RAG-Anything's complete processing with Docling as the document parser:
+    - Docling document parsing (supports PDF, Office documents, HTML)
+    - LightRAG knowledge graph construction
+    - Hybrid retrieval (hybrid/local/global/naive modes)
+    Advantages over MinerU:
+    - Better support for Office documents (.doc, .docx, .ppt, .pptx, .xls, .xlsx)
+    - Native HTML parsing support
+    - Easier installation (no CUDA dependencies)
+    Note: For academic PDFs with complex equations and formulas,
+    use RAGAnythingPipeline (MinerU) instead for better accuracy.
+    """
+    name = "raganything_docling"
+    def __init__(
+        self,
+        kb_base_dir: Optional[str] = None,
+        enable_image_processing: bool = True,
+        enable_table_processing: bool = True,
+        enable_equation_processing: bool = True,
+    ):
+        """
+        Initialize RAGAnything Docling pipeline.
+        Args:
+            kb_base_dir: Base directory for knowledge bases
+            enable_image_processing: Enable image extraction and processing
+            enable_table_processing: Enable table extraction and processing
+            enable_equation_processing: Enable equation extraction and processing
+        """
+        self.logger = get_logger("RAGAnythingDoclingPipeline")
+        self.kb_base_dir = kb_base_dir or str(
+            Path(__file__).resolve().parent.parent.parent.parent.parent / "data" / "knowledge_bases"
+        )
+        self.enable_image = enable_image_processing
+        self.enable_table = enable_table_processing
+        self.enable_equation = enable_equation_processing
+        self._instances: Dict[str, Any] = {}
+    def _setup_raganything_path(self):
+        """Add RAG-Anything to sys.path if available."""
+        project_root = Path(__file__).resolve().parent.parent.parent.parent.parent
+        raganything_path = project_root.parent / "raganything" / "RAG-Anything"
+        if raganything_path.exists() and str(raganything_path) not in sys.path:
+            sys.path.insert(0, str(raganything_path))
+    def _get_rag_instance(self, kb_name: str):
+        """Get or create RAGAnything instance with Docling parser."""
+        kb_dir = Path(self.kb_base_dir) / kb_name
+        working_dir = str(kb_dir / "rag_storage")
+        if working_dir in self._instances:
+            return self._instances[working_dir]
+        self._setup_raganything_path()
+        from raganything import RAGAnything, RAGAnythingConfig
+        from src.services.embedding import get_embedding_client
+        from src.services.llm import get_llm_client
+        # Use unified LLM client from src/services/llm
+        llm_client = get_llm_client()
+        embed_client = get_embedding_client()
+        # Get model functions from unified LLM client
+        # These handle all provider differences (OpenAI, Anthropic, Azure, local, etc.)
+        llm_model_func = llm_client.get_model_func()
+        vision_model_func = llm_client.get_vision_model_func()
+        # Configure RAGAnything with Docling parser
+        # Note: content_format should be "auto" or "minerU" because DoclingParser
+        # converts its output to MinerU-compatible format internally
+        config = RAGAnythingConfig(
+            working_dir=working_dir,
+            parser="docling",  # Use Docling instead of MinerU
+            content_format="auto",  # Auto-detect format (Docling outputs MinerU-compatible format)
+            enable_image_processing=self.enable_image,
+            enable_table_processing=self.enable_table,
+            enable_equation_processing=self.enable_equation,
+        )
+        rag = RAGAnything(
+            config=config,
+            llm_model_func=llm_model_func,
+            vision_model_func=vision_model_func,
+            embedding_func=embed_client.get_embedding_func(),
+        )
+        self._instances[working_dir] = rag
+        return rag
+    async def initialize(
+        self,
+        kb_name: str,
+        file_paths: List[str],
+        extract_numbered_items: bool = True,
+        **kwargs,
+    ) -> bool:
+        """
+        Initialize KB using RAG-Anything with Docling parser.
+        Processing flow:
+        1. Parse documents using Docling (generates content_list with nested image paths)
+        2. Migrate images to canonical location (kb/images/) and update paths in content_list
+        3. Insert updated content_list into RAG (now with correct image paths)
+        4. Clean up temporary parser output directories
+        This ensures RAG stores the final image paths, avoiding path mismatches during retrieval.
+        Uses FileTypeRouter to classify files and route them appropriately:
+        - PDF files -> Docling parser
+        - Office files (.doc, .docx, .ppt, .pptx) -> Docling parser (direct support)
+        - HTML files -> Docling parser
+        - Text files -> Direct read + LightRAG insert (fast)
+        Args:
+            kb_name: Knowledge base name
+            file_paths: List of file paths to process
+            extract_numbered_items: Whether to extract numbered items after processing
+            **kwargs: Additional arguments
+        Returns:
+            True if successful
+        """
+        import json
+        from ..components.routing import FileTypeRouter
+        from ..utils.image_migration import (
+            cleanup_parser_output_dirs,
+            migrate_images_and_update_paths,
+        )
+        self.logger.info(
+            f"Initializing KB '{kb_name}' with {len(file_paths)} files (Docling parser)"
+        )
+        kb_dir = Path(self.kb_base_dir) / kb_name
+        content_list_dir = kb_dir / "content_list"
+        images_dir = kb_dir / "images"
+        content_list_dir.mkdir(parents=True, exist_ok=True)
+        images_dir.mkdir(parents=True, exist_ok=True)
+        # Classify files by type
+        classification = FileTypeRouter.classify_files(file_paths)
+        self.logger.info(
+            f"File classification: {len(classification.needs_mineru)} need Docling, "
+            f"{len(classification.text_files)} text files, "
+            f"{len(classification.unsupported)} unsupported"
+        )
+        with LightRAGLogContext(scene="knowledge_init"):
+            rag = self._get_rag_instance(kb_name)
+            await rag._ensure_lightrag_initialized()
+            total_files = len(classification.needs_mineru) + len(classification.text_files)
+            idx = 0
+            total_images_migrated = 0
+            # Process files requiring Docling (PDF, DOCX, images, HTML)
+            for file_path in classification.needs_mineru:
+                idx += 1
+                file_name = Path(file_path).name
+                self.logger.info(f"Processing [{idx}/{total_files}] (Docling): {file_name}")
+                # Step 1: Parse document (without RAG insertion)
+                self.logger.info("  Step 1/3: Parsing document...")
+                content_list, doc_id = await rag.parse_document(
+                    file_path=file_path,
+                    output_dir=str(content_list_dir),
+                    parse_method="auto",
+                )
+                # Step 2: Migrate images and update paths
+                self.logger.info("  Step 2/3: Migrating images to canonical location...")
+                updated_content_list, num_migrated = await migrate_images_and_update_paths(
+                    content_list=content_list,
+                    source_base_dir=content_list_dir,
+                    target_images_dir=images_dir,
+                    batch_size=50,
+                )
+                total_images_migrated += num_migrated
+                # Save updated content_list for future reference
+                content_list_file = content_list_dir / f"{Path(file_path).stem}.json"
+                with open(content_list_file, "w", encoding="utf-8") as f:
+                    json.dump(updated_content_list, f, ensure_ascii=False, indent=2)
+                # Step 3: Insert into RAG with corrected paths
+                self.logger.info("  Step 3/3: Inserting into RAG knowledge graph...")
+                await rag.insert_content_list(
+                    content_list=updated_content_list,
+                    file_path=file_path,
+                    doc_id=doc_id,
+                )
+                self.logger.info(f"  ✓ Completed: {file_name}")
+            # Process text files directly (fast path)
+            for file_path in classification.text_files:
+                idx += 1
+                self.logger.info(
+                    f"Processing [{idx}/{total_files}] (direct text): {Path(file_path).name}"
+                )
+                content = await FileTypeRouter.read_text_file(file_path)
+                if content.strip():
+                    # Insert directly into LightRAG, bypassing parser
+                    await rag.lightrag.ainsert(content)
+            # Log unsupported files
+            for file_path in classification.unsupported:
+                self.logger.warning(f"Skipped unsupported file: {Path(file_path).name}")
+            # Clean up temporary parser output directories
+            if total_images_migrated > 0:
+                self.logger.info("Cleaning up temporary parser output directories...")
+                await cleanup_parser_output_dirs(content_list_dir)
+        if extract_numbered_items:
+            await self._extract_numbered_items(kb_name)
+        self.logger.info(
+            f"KB '{kb_name}' initialized successfully with Docling parser "
+            f"({total_images_migrated} images migrated)"
+        )
+        return True
+    async def _extract_numbered_items(self, kb_name: str):
+        """Extract numbered items using existing extraction logic."""
+        try:
+            import json
+            from src.knowledge.extract_numbered_items import (
+                extract_numbered_items_with_llm_async,
+            )
+            from src.services.llm import get_llm_client
+            kb_dir = Path(self.kb_base_dir) / kb_name
+            content_list_dir = kb_dir / "content_list"
+            if not content_list_dir.exists():
+                self.logger.warning("No content_list directory found, skipping extraction")
+                return
+            # Load all content list files
+            all_content_items = []
+            for json_file in content_list_dir.glob("*.json"):
+                with open(json_file, "r", encoding="utf-8") as f:
+                    content_items = json.load(f)
+                    all_content_items.extend(content_items)
+            if not all_content_items:
+                self.logger.warning("No content items found for extraction")
+                return
+            self.logger.info(
+                f"Extracting numbered items from {len(all_content_items)} content items"
+            )
+            llm_client = get_llm_client()
+            items = await extract_numbered_items_with_llm_async(
+                all_content_items,
+                api_key=llm_client.config.api_key,
+                base_url=llm_client.config.base_url,
+            )
+            # Save numbered items
+            if items:
+                output_file = kb_dir / "numbered_items.json"
+                with open(output_file, "w", encoding="utf-8") as f:
+                    json.dump(items, f, ensure_ascii=False, indent=2)
+                self.logger.info(f"Extracted {len(items)} numbered items")
+        except ImportError as e:
+            self.logger.warning(f"Could not import extraction module: {e}")
+        except Exception as e:
+            self.logger.error(f"Failed to extract numbered items: {e}")
+    async def search(
+        self,
+        query: str,
+        kb_name: str,
+        mode: str = "hybrid",
+        only_need_context: bool = False,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """
+        Search using RAG-Anything's aquery().
+        Args:
+            query: Search query
+            kb_name: Knowledge base name
+            mode: Search mode (hybrid, local, global, naive)
+            only_need_context: Whether to only return context without answer
+            **kwargs: Additional arguments
+        Returns:
+            Search results dictionary
+        """
+        with LightRAGLogContext(scene="rag_search"):
+            rag = self._get_rag_instance(kb_name)
+            await rag._ensure_lightrag_initialized()
+            answer = await rag.aquery(query, mode=mode, only_need_context=only_need_context)
+            answer_str = answer if isinstance(answer, str) else str(answer)
+            return {
+                "query": query,
+                "answer": answer_str,
+                "content": answer_str,
+                "mode": mode,
+                "provider": "raganything_docling",
+            }
+    async def delete(self, kb_name: str) -> bool:
+        """
+        Delete knowledge base.
+        Args:
+            kb_name: Knowledge base name
+        Returns:
+            True if successful
+        """
+        import shutil
+        kb_dir = Path(self.kb_base_dir) / kb_name
+        working_dir = str(kb_dir / "rag_storage")
+        # Remove from cache
+        if working_dir in self._instances:
+            del self._instances[working_dir]
+        # Delete directory
+        if kb_dir.exists():
+            shutil.rmtree(kb_dir)
+            self.logger.info(f"Deleted KB '{kb_name}'")
+            return True
+        return False

src/services/rag/service.py CHANGED Viewed

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 RAG Service
 ===========
@@ -5,12 +6,16 @@ RAG Service
 Unified RAG service providing a single entry point for all RAG operations.
 """
+import json
 import os
 from pathlib import Path
+import shutil
 from typing import Any, Dict, List, Optional
 from src.logging import get_logger
+from .factory import get_pipeline, has_pipeline, list_pipelines
 # Default knowledge base directory
 DEFAULT_KB_BASE_DIR = str(
     Path(__file__).resolve().parent.parent.parent.parent / "data" / "knowledge_bases"
@@ -59,8 +64,6 @@ class RAGService:
     def _get_pipeline(self):
         """Get or create pipeline instance."""
         if self._pipeline is None:
-            from .factory import get_pipeline
             self._pipeline = get_pipeline(self.provider, kb_base_dir=self.kb_base_dir)
         return self._pipeline
@@ -117,8 +120,6 @@ class RAGService:
         )
         # Get pipeline for the specific provider
-        from .factory import get_pipeline
         pipeline = get_pipeline(provider, kb_base_dir=self.kb_base_dir)
         result = await pipeline.search(query=query, kb_name=kb_name, mode=mode, **kwargs)
@@ -149,8 +150,6 @@ class RAGService:
             Provider name (e.g., 'llamaindex', 'lightrag', 'raganything')
         """
         try:
-            import json
             metadata_file = Path(self.kb_base_dir) / kb_name / "metadata.json"
             if metadata_file.exists():
@@ -192,8 +191,6 @@ class RAGService:
             return await pipeline.delete(kb_name=kb_name)
         # Fallback: delete directory manually
-        import shutil
         kb_dir = Path(self.kb_base_dir) / kb_name
         if kb_dir.exists():
             shutil.rmtree(kb_dir)
@@ -214,8 +211,6 @@ class RAGService:
             for p in providers:
                 print(f"{p['id']}: {p['description']}")
         """
-        from .factory import list_pipelines
         return list_pipelines()
     @staticmethod
@@ -239,6 +234,4 @@ class RAGService:
         Returns:
             True if provider exists
         """
-        from .factory import has_pipeline
         return has_pipeline(name)

src/services/rag/types.py CHANGED Viewed

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 RAG Types
 =========

realtimex-deeptutor 0.5.0.post1__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl

realtimex-deeptutor 0.5.0.post1__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl