realtimex-deeptutor 0.5.0.post1__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/METADATA +24 -17
  2. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/RECORD +143 -123
  3. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/WHEEL +1 -1
  4. realtimex_deeptutor-0.5.0.post3.dist-info/entry_points.txt +4 -0
  5. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/top_level.txt +1 -0
  6. scripts/__init__.py +1 -0
  7. scripts/audit_prompts.py +179 -0
  8. scripts/check_install.py +460 -0
  9. scripts/generate_roster.py +327 -0
  10. scripts/install_all.py +653 -0
  11. scripts/migrate_kb.py +655 -0
  12. scripts/start.py +807 -0
  13. scripts/start_web.py +632 -0
  14. scripts/sync_prompts_from_en.py +147 -0
  15. src/__init__.py +2 -2
  16. src/agents/ideagen/material_organizer_agent.py +2 -0
  17. src/agents/solve/__init__.py +6 -0
  18. src/agents/solve/main_solver.py +9 -0
  19. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +9 -7
  20. src/agents/solve/session_manager.py +345 -0
  21. src/api/main.py +14 -0
  22. src/api/routers/chat.py +3 -3
  23. src/api/routers/co_writer.py +12 -7
  24. src/api/routers/config.py +1 -0
  25. src/api/routers/guide.py +3 -1
  26. src/api/routers/ideagen.py +7 -0
  27. src/api/routers/knowledge.py +64 -12
  28. src/api/routers/question.py +2 -0
  29. src/api/routers/realtimex.py +137 -0
  30. src/api/routers/research.py +9 -0
  31. src/api/routers/solve.py +120 -2
  32. src/cli/__init__.py +13 -0
  33. src/cli/start.py +209 -0
  34. src/config/constants.py +11 -9
  35. src/knowledge/add_documents.py +453 -213
  36. src/knowledge/extract_numbered_items.py +9 -10
  37. src/knowledge/initializer.py +102 -101
  38. src/knowledge/manager.py +251 -74
  39. src/knowledge/progress_tracker.py +43 -2
  40. src/knowledge/start_kb.py +11 -2
  41. src/logging/__init__.py +5 -0
  42. src/logging/adapters/__init__.py +1 -0
  43. src/logging/adapters/lightrag.py +25 -18
  44. src/logging/adapters/llamaindex.py +1 -0
  45. src/logging/config.py +30 -27
  46. src/logging/handlers/__init__.py +1 -0
  47. src/logging/handlers/console.py +7 -50
  48. src/logging/handlers/file.py +5 -20
  49. src/logging/handlers/websocket.py +23 -19
  50. src/logging/logger.py +161 -126
  51. src/logging/stats/__init__.py +1 -0
  52. src/logging/stats/llm_stats.py +37 -17
  53. src/services/__init__.py +17 -1
  54. src/services/config/__init__.py +1 -0
  55. src/services/config/knowledge_base_config.py +1 -0
  56. src/services/config/loader.py +1 -1
  57. src/services/config/unified_config.py +211 -4
  58. src/services/embedding/__init__.py +1 -0
  59. src/services/embedding/adapters/__init__.py +3 -0
  60. src/services/embedding/adapters/base.py +1 -0
  61. src/services/embedding/adapters/cohere.py +1 -0
  62. src/services/embedding/adapters/jina.py +1 -0
  63. src/services/embedding/adapters/ollama.py +1 -0
  64. src/services/embedding/adapters/openai_compatible.py +1 -0
  65. src/services/embedding/adapters/realtimex.py +125 -0
  66. src/services/embedding/client.py +27 -0
  67. src/services/embedding/config.py +3 -0
  68. src/services/embedding/provider.py +1 -0
  69. src/services/llm/__init__.py +17 -3
  70. src/services/llm/capabilities.py +47 -0
  71. src/services/llm/client.py +32 -0
  72. src/services/llm/cloud_provider.py +21 -4
  73. src/services/llm/config.py +36 -2
  74. src/services/llm/error_mapping.py +1 -0
  75. src/services/llm/exceptions.py +30 -0
  76. src/services/llm/factory.py +55 -16
  77. src/services/llm/local_provider.py +1 -0
  78. src/services/llm/providers/anthropic.py +1 -0
  79. src/services/llm/providers/base_provider.py +1 -0
  80. src/services/llm/providers/open_ai.py +1 -0
  81. src/services/llm/realtimex_provider.py +240 -0
  82. src/services/llm/registry.py +1 -0
  83. src/services/llm/telemetry.py +1 -0
  84. src/services/llm/types.py +1 -0
  85. src/services/llm/utils.py +1 -0
  86. src/services/prompt/__init__.py +1 -0
  87. src/services/prompt/manager.py +3 -2
  88. src/services/rag/__init__.py +27 -5
  89. src/services/rag/components/__init__.py +1 -0
  90. src/services/rag/components/base.py +1 -0
  91. src/services/rag/components/chunkers/__init__.py +1 -0
  92. src/services/rag/components/chunkers/base.py +1 -0
  93. src/services/rag/components/chunkers/fixed.py +1 -0
  94. src/services/rag/components/chunkers/numbered_item.py +1 -0
  95. src/services/rag/components/chunkers/semantic.py +1 -0
  96. src/services/rag/components/embedders/__init__.py +1 -0
  97. src/services/rag/components/embedders/base.py +1 -0
  98. src/services/rag/components/embedders/openai.py +1 -0
  99. src/services/rag/components/indexers/__init__.py +1 -0
  100. src/services/rag/components/indexers/base.py +1 -0
  101. src/services/rag/components/indexers/graph.py +5 -44
  102. src/services/rag/components/indexers/lightrag.py +5 -44
  103. src/services/rag/components/indexers/vector.py +1 -0
  104. src/services/rag/components/parsers/__init__.py +1 -0
  105. src/services/rag/components/parsers/base.py +1 -0
  106. src/services/rag/components/parsers/markdown.py +1 -0
  107. src/services/rag/components/parsers/pdf.py +1 -0
  108. src/services/rag/components/parsers/text.py +1 -0
  109. src/services/rag/components/retrievers/__init__.py +1 -0
  110. src/services/rag/components/retrievers/base.py +1 -0
  111. src/services/rag/components/retrievers/dense.py +1 -0
  112. src/services/rag/components/retrievers/hybrid.py +5 -44
  113. src/services/rag/components/retrievers/lightrag.py +5 -44
  114. src/services/rag/components/routing.py +48 -0
  115. src/services/rag/factory.py +112 -46
  116. src/services/rag/pipeline.py +1 -0
  117. src/services/rag/pipelines/__init__.py +27 -18
  118. src/services/rag/pipelines/lightrag.py +1 -0
  119. src/services/rag/pipelines/llamaindex.py +99 -0
  120. src/services/rag/pipelines/raganything.py +67 -100
  121. src/services/rag/pipelines/raganything_docling.py +368 -0
  122. src/services/rag/service.py +5 -12
  123. src/services/rag/types.py +1 -0
  124. src/services/rag/utils/__init__.py +17 -0
  125. src/services/rag/utils/image_migration.py +279 -0
  126. src/services/search/__init__.py +1 -0
  127. src/services/search/base.py +1 -0
  128. src/services/search/consolidation.py +1 -0
  129. src/services/search/providers/__init__.py +1 -0
  130. src/services/search/providers/baidu.py +1 -0
  131. src/services/search/providers/exa.py +1 -0
  132. src/services/search/providers/jina.py +1 -0
  133. src/services/search/providers/perplexity.py +1 -0
  134. src/services/search/providers/serper.py +1 -0
  135. src/services/search/providers/tavily.py +1 -0
  136. src/services/search/types.py +1 -0
  137. src/services/settings/__init__.py +1 -0
  138. src/services/settings/interface_settings.py +78 -0
  139. src/services/setup/__init__.py +1 -0
  140. src/services/tts/__init__.py +1 -0
  141. src/services/tts/config.py +1 -0
  142. src/utils/realtimex.py +284 -0
  143. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +0 -2
  144. src/services/rag/pipelines/academic.py +0 -44
  145. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/licenses/LICENSE +0 -0
src/services/rag/pipelines/raganything.py CHANGED
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 RAGAnything Pipeline
 ====================
@@ -9,11 +10,13 @@ from pathlib import Path
 import sys
 from typing import Any, Dict, List, Optional
 
-from lightrag.llm.openai import openai_complete_if_cache
-
 from src.logging import get_logger
 from src.logging.adapters import LightRAGLogContext
 
+# Load LLM config early to ensure OPENAI_API_KEY env var is set before LightRAG imports
+# This is critical because LightRAG reads os.environ["OPENAI_API_KEY"] directly
+from src.services.llm.config import get_llm_config as _early_config_load  # noqa: F401
+
 
 class RAGAnythingPipeline:
     """
@@ -71,106 +74,19 @@ class RAGAnythingPipeline:
 
         self._setup_raganything_path()
 
-        from openai import AsyncOpenAI
         from raganything import RAGAnything, RAGAnythingConfig
 
         from src.services.embedding import get_embedding_client
         from src.services.llm import get_llm_client
 
+        # Use unified LLM client from src/services/llm
         llm_client = get_llm_client()
         embed_client = get_embedding_client()
 
-        # Create AsyncOpenAI client directly - bypasses LightRAG's response_format handling
-        openai_client = AsyncOpenAI(
-            api_key=llm_client.config.api_key,
-            base_url=llm_client.config.base_url,
-        )
-
-        async def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
-            """Custom async LLM function that bypasses LightRAG's openai_complete_if_cache."""
-            if history_messages is None:
-                history_messages = []
-
-            # Build messages array
-            messages = []
-            if system_prompt:
-                messages.append({"role": "system", "content": system_prompt})
-
-            # Add history
-            messages.extend(history_messages)
-
-            # Add current prompt
-            messages.append({"role": "user", "content": prompt})
-
-            # Whitelist only valid OpenAI parameters, filter out LightRAG-specific ones
-            valid_params = {
-                "temperature",
-                "top_p",
-                "n",
-                "stream",
-                "stop",
-                "max_tokens",
-                "presence_penalty",
-                "frequency_penalty",
-                "logit_bias",
-                "user",
-                "seed",
-            }
-            clean_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}
-
-            # Call OpenAI API directly (async)
-            response = await openai_client.chat.completions.create(
-                model=llm_client.config.model,
-                messages=messages,
-                **clean_kwargs,
-            )
-
-            return response.choices[0].message.content
-
-        def vision_model_func(
-            prompt,
-            system_prompt=None,
-            history_messages=[],
-            image_data=None,
-            messages=None,
-            **kwargs,
-        ):
-            # Handle multimodal messages
-            if messages:
-                clean_kwargs = {
-                    k: v
-                    for k, v in kwargs.items()
-                    if k not in ["messages", "prompt", "system_prompt", "history_messages"]
-                }
-                return openai_complete_if_cache(
-                    llm_client.config.model,
-                    prompt="",
-                    messages=messages,
-                    api_key=llm_client.config.api_key,
-                    base_url=llm_client.config.base_url,
-                    **clean_kwargs,
-                )
-            if image_data:
-                # Build image message
-                image_message = {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": prompt},
-                        {
-                            "type": "image_url",
-                            "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
-                        },
-                    ],
-                }
-                return openai_complete_if_cache(
-                    llm_client.config.model,
-                    prompt="",
-                    messages=[image_message],
-                    api_key=llm_client.config.api_key,
-                    base_url=llm_client.config.base_url,
-                    **kwargs,
-                )
-            return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
+        # Get model functions from unified LLM client
+        # These handle all provider differences (OpenAI, Anthropic, Azure, local, etc.)
+        llm_model_func = llm_client.get_model_func()
+        vision_model_func = llm_client.get_vision_model_func()
 
         config = RAGAnythingConfig(
             working_dir=working_dir,
@@ -197,7 +113,15 @@
         **kwargs,
     ) -> bool:
         """
-        Initialize KB using RAG-Anything's process_document_complete().
+        Initialize KB using RAG-Anything with MinerU parser.
+
+        Processing flow:
+        1. Parse documents using MinerU (generates content_list with nested image paths)
+        2. Migrate images to canonical location (kb/images/) and update paths in content_list
+        3. Insert updated content_list into RAG (now with correct image paths)
+        4. Clean up temporary parser output directories
+
+        This ensures RAG stores the final image paths, avoiding path mismatches during retrieval.
 
         Uses FileTypeRouter to classify files and route them appropriately:
         - PDF files -> MinerU parser (full document analysis)
@@ -212,13 +136,21 @@
         Returns:
             True if successful
         """
+        import json
+
         from ..components.routing import FileTypeRouter
+        from ..utils.image_migration import (
+            cleanup_parser_output_dirs,
+            migrate_images_and_update_paths,
+        )
 
         self.logger.info(f"Initializing KB '{kb_name}' with {len(file_paths)} files")
 
         kb_dir = Path(self.kb_base_dir) / kb_name
         content_list_dir = kb_dir / "content_list"
+        images_dir = kb_dir / "images"
         content_list_dir.mkdir(parents=True, exist_ok=True)
+        images_dir.mkdir(parents=True, exist_ok=True)
 
         # Classify files by type
         classification = FileTypeRouter.classify_files(file_paths)
@@ -235,19 +167,47 @@
 
             total_files = len(classification.needs_mineru) + len(classification.text_files)
             idx = 0
+            total_images_migrated = 0
 
             # Process files requiring MinerU (PDF, DOCX, images)
             for file_path in classification.needs_mineru:
                 idx += 1
-                self.logger.info(
-                    f"Processing [{idx}/{total_files}] (MinerU): {Path(file_path).name}"
-                )
-                await rag.process_document_complete(
+                file_name = Path(file_path).name
+                self.logger.info(f"Processing [{idx}/{total_files}] (MinerU): {file_name}")
+
+                # Step 1: Parse document (without RAG insertion)
+                self.logger.info(" Step 1/3: Parsing document...")
+                content_list, doc_id = await rag.parse_document(
                     file_path=file_path,
                     output_dir=str(content_list_dir),
                     parse_method="auto",
                 )
 
+                # Step 2: Migrate images and update paths
+                self.logger.info(" Step 2/3: Migrating images to canonical location...")
+                updated_content_list, num_migrated = await migrate_images_and_update_paths(
+                    content_list=content_list,
+                    source_base_dir=content_list_dir,
+                    target_images_dir=images_dir,
+                    batch_size=50,
+                )
+                total_images_migrated += num_migrated
+
+                # Save updated content_list for future reference
+                content_list_file = content_list_dir / f"{Path(file_path).stem}.json"
+                with open(content_list_file, "w", encoding="utf-8") as f:
+                    json.dump(updated_content_list, f, ensure_ascii=False, indent=2)
+
+                # Step 3: Insert into RAG with corrected paths
+                self.logger.info(" Step 3/3: Inserting into RAG knowledge graph...")
+                await rag.insert_content_list(
+                    content_list=updated_content_list,
+                    file_path=file_path,
+                    doc_id=doc_id,
+                )
+
+                self.logger.info(f" ✓ Completed: {file_name}")
+
             # Process text files directly (fast path)
             for file_path in classification.text_files:
                 idx += 1
@@ -263,10 +223,17 @@
             for file_path in classification.unsupported:
                 self.logger.warning(f"Skipped unsupported file: {Path(file_path).name}")
 
+            # Clean up temporary parser output directories
+            if total_images_migrated > 0:
+                self.logger.info("Cleaning up temporary parser output directories...")
+                await cleanup_parser_output_dirs(content_list_dir)
+
         if extract_numbered_items:
             await self._extract_numbered_items(kb_name)
 
-        self.logger.info(f"KB '{kb_name}' initialized successfully")
+        self.logger.info(
+            f"KB '{kb_name}' initialized successfully ({total_images_migrated} images migrated)"
+        )
         return True
 
     async def _extract_numbered_items(self, kb_name: str):
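
For orientation, the change above replaces the single process_document_complete() call with an explicit parse → migrate-images → insert sequence. A minimal usage sketch under stated assumptions (the module path is taken from the file listing, the initialize() signature from the diff; the no-argument constructor, KB name, and file paths are placeholders, not part of the diff):

    import asyncio

    from src.services.rag.pipelines.raganything import RAGAnythingPipeline

    async def build_kb() -> None:
        # Assumption: the constructor accepts defaults, as in the Docling variant below.
        pipeline = RAGAnythingPipeline()
        # initialize() now parses with MinerU, migrates extracted images into kb/images/,
        # and inserts the updated content_list into the knowledge graph.
        ok = await pipeline.initialize(
            kb_name="demo_kb",              # placeholder
            file_paths=["docs/paper.pdf"],  # placeholder
            extract_numbered_items=True,
        )
        print("KB initialized:", ok)

    asyncio.run(build_kb())
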
src/services/rag/pipelines/raganything_docling.py ADDED
@@ -0,0 +1,368 @@
+# -*- coding: utf-8 -*-
+"""
+RAGAnything Docling Pipeline
+============================
+
+End-to-end pipeline wrapping RAG-Anything with Docling parser for document processing.
+Uses Docling instead of MinerU for better Office document and HTML support.
+"""
+
+from pathlib import Path
+import sys
+from typing import Any, Dict, List, Optional
+
+from src.logging import get_logger
+from src.logging.adapters import LightRAGLogContext
+
+# Load LLM config early to ensure OPENAI_API_KEY env var is set before LightRAG imports
+# This is critical because LightRAG reads os.environ["OPENAI_API_KEY"] directly
+from src.services.llm.config import get_llm_config as _early_config_load  # noqa: F401
+
+
+class RAGAnythingDoclingPipeline:
+    """
+    RAG-Anything Pipeline with Docling Parser.
+
+    Uses RAG-Anything's complete processing with Docling as the document parser:
+    - Docling document parsing (supports PDF, Office documents, HTML)
+    - LightRAG knowledge graph construction
+    - Hybrid retrieval (hybrid/local/global/naive modes)
+
+    Advantages over MinerU:
+    - Better support for Office documents (.doc, .docx, .ppt, .pptx, .xls, .xlsx)
+    - Native HTML parsing support
+    - Easier installation (no CUDA dependencies)
+
+    Note: For academic PDFs with complex equations and formulas,
+    use RAGAnythingPipeline (MinerU) instead for better accuracy.
+    """
+
+    name = "raganything_docling"
+
+    def __init__(
+        self,
+        kb_base_dir: Optional[str] = None,
+        enable_image_processing: bool = True,
+        enable_table_processing: bool = True,
+        enable_equation_processing: bool = True,
+    ):
+        """
+        Initialize RAGAnything Docling pipeline.
+
+        Args:
+            kb_base_dir: Base directory for knowledge bases
+            enable_image_processing: Enable image extraction and processing
+            enable_table_processing: Enable table extraction and processing
+            enable_equation_processing: Enable equation extraction and processing
+        """
+        self.logger = get_logger("RAGAnythingDoclingPipeline")
+        self.kb_base_dir = kb_base_dir or str(
+            Path(__file__).resolve().parent.parent.parent.parent.parent / "data" / "knowledge_bases"
+        )
+        self.enable_image = enable_image_processing
+        self.enable_table = enable_table_processing
+        self.enable_equation = enable_equation_processing
+        self._instances: Dict[str, Any] = {}
+
+    def _setup_raganything_path(self):
+        """Add RAG-Anything to sys.path if available."""
+        project_root = Path(__file__).resolve().parent.parent.parent.parent.parent
+        raganything_path = project_root.parent / "raganything" / "RAG-Anything"
+        if raganything_path.exists() and str(raganything_path) not in sys.path:
+            sys.path.insert(0, str(raganything_path))
+
+    def _get_rag_instance(self, kb_name: str):
+        """Get or create RAGAnything instance with Docling parser."""
+        kb_dir = Path(self.kb_base_dir) / kb_name
+        working_dir = str(kb_dir / "rag_storage")
+
+        if working_dir in self._instances:
+            return self._instances[working_dir]
+
+        self._setup_raganything_path()
+
+        from raganything import RAGAnything, RAGAnythingConfig
+
+        from src.services.embedding import get_embedding_client
+        from src.services.llm import get_llm_client
+
+        # Use unified LLM client from src/services/llm
+        llm_client = get_llm_client()
+        embed_client = get_embedding_client()
+
+        # Get model functions from unified LLM client
+        # These handle all provider differences (OpenAI, Anthropic, Azure, local, etc.)
+        llm_model_func = llm_client.get_model_func()
+        vision_model_func = llm_client.get_vision_model_func()
+
+        # Configure RAGAnything with Docling parser
+        # Note: content_format should be "auto" or "minerU" because DoclingParser
+        # converts its output to MinerU-compatible format internally
+        config = RAGAnythingConfig(
+            working_dir=working_dir,
+            parser="docling",  # Use Docling instead of MinerU
+            content_format="auto",  # Auto-detect format (Docling outputs MinerU-compatible format)
+            enable_image_processing=self.enable_image,
+            enable_table_processing=self.enable_table,
+            enable_equation_processing=self.enable_equation,
+        )
+
+        rag = RAGAnything(
+            config=config,
+            llm_model_func=llm_model_func,
+            vision_model_func=vision_model_func,
+            embedding_func=embed_client.get_embedding_func(),
+        )
+
+        self._instances[working_dir] = rag
+        return rag
+
+    async def initialize(
+        self,
+        kb_name: str,
+        file_paths: List[str],
+        extract_numbered_items: bool = True,
+        **kwargs,
+    ) -> bool:
+        """
+        Initialize KB using RAG-Anything with Docling parser.
+
+        Processing flow:
+        1. Parse documents using Docling (generates content_list with nested image paths)
+        2. Migrate images to canonical location (kb/images/) and update paths in content_list
+        3. Insert updated content_list into RAG (now with correct image paths)
+        4. Clean up temporary parser output directories
+
+        This ensures RAG stores the final image paths, avoiding path mismatches during retrieval.
+
+        Uses FileTypeRouter to classify files and route them appropriately:
+        - PDF files -> Docling parser
+        - Office files (.doc, .docx, .ppt, .pptx) -> Docling parser (direct support)
+        - HTML files -> Docling parser
+        - Text files -> Direct read + LightRAG insert (fast)
+
+        Args:
+            kb_name: Knowledge base name
+            file_paths: List of file paths to process
+            extract_numbered_items: Whether to extract numbered items after processing
+            **kwargs: Additional arguments
+
+        Returns:
+            True if successful
+        """
+        import json
+
+        from ..components.routing import FileTypeRouter
+        from ..utils.image_migration import (
+            cleanup_parser_output_dirs,
+            migrate_images_and_update_paths,
+        )
+
+        self.logger.info(
+            f"Initializing KB '{kb_name}' with {len(file_paths)} files (Docling parser)"
+        )
+
+        kb_dir = Path(self.kb_base_dir) / kb_name
+        content_list_dir = kb_dir / "content_list"
+        images_dir = kb_dir / "images"
+        content_list_dir.mkdir(parents=True, exist_ok=True)
+        images_dir.mkdir(parents=True, exist_ok=True)
+
+        # Classify files by type
+        classification = FileTypeRouter.classify_files(file_paths)
+
+        self.logger.info(
+            f"File classification: {len(classification.needs_mineru)} need Docling, "
+            f"{len(classification.text_files)} text files, "
+            f"{len(classification.unsupported)} unsupported"
+        )
+
+        with LightRAGLogContext(scene="knowledge_init"):
+            rag = self._get_rag_instance(kb_name)
+            await rag._ensure_lightrag_initialized()
+
+            total_files = len(classification.needs_mineru) + len(classification.text_files)
+            idx = 0
+            total_images_migrated = 0
+
+            # Process files requiring Docling (PDF, DOCX, images, HTML)
+            for file_path in classification.needs_mineru:
+                idx += 1
+                file_name = Path(file_path).name
+                self.logger.info(f"Processing [{idx}/{total_files}] (Docling): {file_name}")
+
+                # Step 1: Parse document (without RAG insertion)
+                self.logger.info(" Step 1/3: Parsing document...")
+                content_list, doc_id = await rag.parse_document(
+                    file_path=file_path,
+                    output_dir=str(content_list_dir),
+                    parse_method="auto",
+                )
+
+                # Step 2: Migrate images and update paths
+                self.logger.info(" Step 2/3: Migrating images to canonical location...")
+                updated_content_list, num_migrated = await migrate_images_and_update_paths(
+                    content_list=content_list,
+                    source_base_dir=content_list_dir,
+                    target_images_dir=images_dir,
+                    batch_size=50,
+                )
+                total_images_migrated += num_migrated
+
+                # Save updated content_list for future reference
+                content_list_file = content_list_dir / f"{Path(file_path).stem}.json"
+                with open(content_list_file, "w", encoding="utf-8") as f:
+                    json.dump(updated_content_list, f, ensure_ascii=False, indent=2)
+
+                # Step 3: Insert into RAG with corrected paths
+                self.logger.info(" Step 3/3: Inserting into RAG knowledge graph...")
+                await rag.insert_content_list(
+                    content_list=updated_content_list,
+                    file_path=file_path,
+                    doc_id=doc_id,
+                )
+
+                self.logger.info(f" ✓ Completed: {file_name}")
+
+            # Process text files directly (fast path)
+            for file_path in classification.text_files:
+                idx += 1
+                self.logger.info(
+                    f"Processing [{idx}/{total_files}] (direct text): {Path(file_path).name}"
+                )
+                content = await FileTypeRouter.read_text_file(file_path)
+                if content.strip():
+                    # Insert directly into LightRAG, bypassing parser
+                    await rag.lightrag.ainsert(content)
+
+            # Log unsupported files
+            for file_path in classification.unsupported:
+                self.logger.warning(f"Skipped unsupported file: {Path(file_path).name}")
+
+            # Clean up temporary parser output directories
+            if total_images_migrated > 0:
+                self.logger.info("Cleaning up temporary parser output directories...")
+                await cleanup_parser_output_dirs(content_list_dir)
+
+        if extract_numbered_items:
+            await self._extract_numbered_items(kb_name)
+
+        self.logger.info(
+            f"KB '{kb_name}' initialized successfully with Docling parser "
+            f"({total_images_migrated} images migrated)"
+        )
+        return True
+
+    async def _extract_numbered_items(self, kb_name: str):
+        """Extract numbered items using existing extraction logic."""
+        try:
+            import json
+
+            from src.knowledge.extract_numbered_items import (
+                extract_numbered_items_with_llm_async,
+            )
+            from src.services.llm import get_llm_client
+
+            kb_dir = Path(self.kb_base_dir) / kb_name
+            content_list_dir = kb_dir / "content_list"
+
+            if not content_list_dir.exists():
+                self.logger.warning("No content_list directory found, skipping extraction")
+                return
+
+            # Load all content list files
+            all_content_items = []
+            for json_file in content_list_dir.glob("*.json"):
+                with open(json_file, "r", encoding="utf-8") as f:
+                    content_items = json.load(f)
+                all_content_items.extend(content_items)
+
+            if not all_content_items:
+                self.logger.warning("No content items found for extraction")
+                return
+
+            self.logger.info(
+                f"Extracting numbered items from {len(all_content_items)} content items"
+            )
+
+            llm_client = get_llm_client()
+            items = await extract_numbered_items_with_llm_async(
+                all_content_items,
+                api_key=llm_client.config.api_key,
+                base_url=llm_client.config.base_url,
+            )
+
+            # Save numbered items
+            if items:
+                output_file = kb_dir / "numbered_items.json"
+                with open(output_file, "w", encoding="utf-8") as f:
+                    json.dump(items, f, ensure_ascii=False, indent=2)
+                self.logger.info(f"Extracted {len(items)} numbered items")
+
+        except ImportError as e:
+            self.logger.warning(f"Could not import extraction module: {e}")
+        except Exception as e:
+            self.logger.error(f"Failed to extract numbered items: {e}")
+
+    async def search(
+        self,
+        query: str,
+        kb_name: str,
+        mode: str = "hybrid",
+        only_need_context: bool = False,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """
+        Search using RAG-Anything's aquery().
+
+        Args:
+            query: Search query
+            kb_name: Knowledge base name
+            mode: Search mode (hybrid, local, global, naive)
+            only_need_context: Whether to only return context without answer
+            **kwargs: Additional arguments
+
+        Returns:
+            Search results dictionary
+        """
+        with LightRAGLogContext(scene="rag_search"):
+            rag = self._get_rag_instance(kb_name)
+            await rag._ensure_lightrag_initialized()
+
+            answer = await rag.aquery(query, mode=mode, only_need_context=only_need_context)
+            answer_str = answer if isinstance(answer, str) else str(answer)
+
+            return {
+                "query": query,
+                "answer": answer_str,
+                "content": answer_str,
+                "mode": mode,
+                "provider": "raganything_docling",
+            }
+
+    async def delete(self, kb_name: str) -> bool:
+        """
+        Delete knowledge base.
+
+        Args:
+            kb_name: Knowledge base name
+
+        Returns:
+            True if successful
+        """
+        import shutil
+
+        kb_dir = Path(self.kb_base_dir) / kb_name
+        working_dir = str(kb_dir / "rag_storage")
+
+        # Remove from cache
+        if working_dir in self._instances:
+            del self._instances[working_dir]
+
+        # Delete directory
+        if kb_dir.exists():
+            shutil.rmtree(kb_dir)
+            self.logger.info(f"Deleted KB '{kb_name}'")
+            return True
+
+        return False
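
A short usage sketch of the new Docling pipeline (illustrative only: the module path is assumed from the file listing, the constructor, initialize(), search(), and delete() signatures are taken from the added file, and the KB name and file paths are placeholders):

    import asyncio

    from src.services.rag.pipelines.raganything_docling import RAGAnythingDoclingPipeline

    async def main() -> None:
        # Docling handles Office documents and HTML without MinerU/CUDA dependencies.
        pipeline = RAGAnythingDoclingPipeline(enable_equation_processing=False)
        await pipeline.initialize(
            kb_name="office_docs",                        # placeholder
            file_paths=["report.docx", "overview.html"],  # placeholder
        )
        result = await pipeline.search(
            "Summarize the key findings", kb_name="office_docs", mode="hybrid"
        )
        print(result["answer"])
        await pipeline.delete("office_docs")

    asyncio.run(main())
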
src/services/rag/service.py CHANGED
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 RAG Service
 ===========
@@ -5,12 +6,16 @@ RAG Service
 Unified RAG service providing a single entry point for all RAG operations.
 """
 
+import json
 import os
 from pathlib import Path
+import shutil
 from typing import Any, Dict, List, Optional
 
 from src.logging import get_logger
 
+from .factory import get_pipeline, has_pipeline, list_pipelines
+
 # Default knowledge base directory
 DEFAULT_KB_BASE_DIR = str(
     Path(__file__).resolve().parent.parent.parent.parent / "data" / "knowledge_bases"
@@ -59,8 +64,6 @@ class RAGService:
     def _get_pipeline(self):
         """Get or create pipeline instance."""
         if self._pipeline is None:
-            from .factory import get_pipeline
-
             self._pipeline = get_pipeline(self.provider, kb_base_dir=self.kb_base_dir)
         return self._pipeline
 
@@ -117,8 +120,6 @@ class RAGService:
         )
 
         # Get pipeline for the specific provider
-        from .factory import get_pipeline
-
         pipeline = get_pipeline(provider, kb_base_dir=self.kb_base_dir)
 
         result = await pipeline.search(query=query, kb_name=kb_name, mode=mode, **kwargs)
@@ -149,8 +150,6 @@ class RAGService:
             Provider name (e.g., 'llamaindex', 'lightrag', 'raganything')
         """
         try:
-            import json
-
             metadata_file = Path(self.kb_base_dir) / kb_name / "metadata.json"
 
             if metadata_file.exists():
@@ -192,8 +191,6 @@ class RAGService:
            return await pipeline.delete(kb_name=kb_name)
 
        # Fallback: delete directory manually
-        import shutil
-
        kb_dir = Path(self.kb_base_dir) / kb_name
        if kb_dir.exists():
            shutil.rmtree(kb_dir)
@@ -214,8 +211,6 @@ class RAGService:
             for p in providers:
                 print(f"{p['id']}: {p['description']}")
         """
-        from .factory import list_pipelines
-
         return list_pipelines()
 
     @staticmethod
@@ -239,6 +234,4 @@ class RAGService:
         Returns:
             True if provider exists
         """
-        from .factory import has_pipeline
-
         return has_pipeline(name)
src/services/rag/types.py CHANGED
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 RAG Types
 =========