realtimex-deeptutor 0.5.0.post1__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/METADATA +24 -17
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/RECORD +143 -123
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/WHEEL +1 -1
- realtimex_deeptutor-0.5.0.post3.dist-info/entry_points.txt +4 -0
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/top_level.txt +1 -0
- scripts/__init__.py +1 -0
- scripts/audit_prompts.py +179 -0
- scripts/check_install.py +460 -0
- scripts/generate_roster.py +327 -0
- scripts/install_all.py +653 -0
- scripts/migrate_kb.py +655 -0
- scripts/start.py +807 -0
- scripts/start_web.py +632 -0
- scripts/sync_prompts_from_en.py +147 -0
- src/__init__.py +2 -2
- src/agents/ideagen/material_organizer_agent.py +2 -0
- src/agents/solve/__init__.py +6 -0
- src/agents/solve/main_solver.py +9 -0
- src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +9 -7
- src/agents/solve/session_manager.py +345 -0
- src/api/main.py +14 -0
- src/api/routers/chat.py +3 -3
- src/api/routers/co_writer.py +12 -7
- src/api/routers/config.py +1 -0
- src/api/routers/guide.py +3 -1
- src/api/routers/ideagen.py +7 -0
- src/api/routers/knowledge.py +64 -12
- src/api/routers/question.py +2 -0
- src/api/routers/realtimex.py +137 -0
- src/api/routers/research.py +9 -0
- src/api/routers/solve.py +120 -2
- src/cli/__init__.py +13 -0
- src/cli/start.py +209 -0
- src/config/constants.py +11 -9
- src/knowledge/add_documents.py +453 -213
- src/knowledge/extract_numbered_items.py +9 -10
- src/knowledge/initializer.py +102 -101
- src/knowledge/manager.py +251 -74
- src/knowledge/progress_tracker.py +43 -2
- src/knowledge/start_kb.py +11 -2
- src/logging/__init__.py +5 -0
- src/logging/adapters/__init__.py +1 -0
- src/logging/adapters/lightrag.py +25 -18
- src/logging/adapters/llamaindex.py +1 -0
- src/logging/config.py +30 -27
- src/logging/handlers/__init__.py +1 -0
- src/logging/handlers/console.py +7 -50
- src/logging/handlers/file.py +5 -20
- src/logging/handlers/websocket.py +23 -19
- src/logging/logger.py +161 -126
- src/logging/stats/__init__.py +1 -0
- src/logging/stats/llm_stats.py +37 -17
- src/services/__init__.py +17 -1
- src/services/config/__init__.py +1 -0
- src/services/config/knowledge_base_config.py +1 -0
- src/services/config/loader.py +1 -1
- src/services/config/unified_config.py +211 -4
- src/services/embedding/__init__.py +1 -0
- src/services/embedding/adapters/__init__.py +3 -0
- src/services/embedding/adapters/base.py +1 -0
- src/services/embedding/adapters/cohere.py +1 -0
- src/services/embedding/adapters/jina.py +1 -0
- src/services/embedding/adapters/ollama.py +1 -0
- src/services/embedding/adapters/openai_compatible.py +1 -0
- src/services/embedding/adapters/realtimex.py +125 -0
- src/services/embedding/client.py +27 -0
- src/services/embedding/config.py +3 -0
- src/services/embedding/provider.py +1 -0
- src/services/llm/__init__.py +17 -3
- src/services/llm/capabilities.py +47 -0
- src/services/llm/client.py +32 -0
- src/services/llm/cloud_provider.py +21 -4
- src/services/llm/config.py +36 -2
- src/services/llm/error_mapping.py +1 -0
- src/services/llm/exceptions.py +30 -0
- src/services/llm/factory.py +55 -16
- src/services/llm/local_provider.py +1 -0
- src/services/llm/providers/anthropic.py +1 -0
- src/services/llm/providers/base_provider.py +1 -0
- src/services/llm/providers/open_ai.py +1 -0
- src/services/llm/realtimex_provider.py +240 -0
- src/services/llm/registry.py +1 -0
- src/services/llm/telemetry.py +1 -0
- src/services/llm/types.py +1 -0
- src/services/llm/utils.py +1 -0
- src/services/prompt/__init__.py +1 -0
- src/services/prompt/manager.py +3 -2
- src/services/rag/__init__.py +27 -5
- src/services/rag/components/__init__.py +1 -0
- src/services/rag/components/base.py +1 -0
- src/services/rag/components/chunkers/__init__.py +1 -0
- src/services/rag/components/chunkers/base.py +1 -0
- src/services/rag/components/chunkers/fixed.py +1 -0
- src/services/rag/components/chunkers/numbered_item.py +1 -0
- src/services/rag/components/chunkers/semantic.py +1 -0
- src/services/rag/components/embedders/__init__.py +1 -0
- src/services/rag/components/embedders/base.py +1 -0
- src/services/rag/components/embedders/openai.py +1 -0
- src/services/rag/components/indexers/__init__.py +1 -0
- src/services/rag/components/indexers/base.py +1 -0
- src/services/rag/components/indexers/graph.py +5 -44
- src/services/rag/components/indexers/lightrag.py +5 -44
- src/services/rag/components/indexers/vector.py +1 -0
- src/services/rag/components/parsers/__init__.py +1 -0
- src/services/rag/components/parsers/base.py +1 -0
- src/services/rag/components/parsers/markdown.py +1 -0
- src/services/rag/components/parsers/pdf.py +1 -0
- src/services/rag/components/parsers/text.py +1 -0
- src/services/rag/components/retrievers/__init__.py +1 -0
- src/services/rag/components/retrievers/base.py +1 -0
- src/services/rag/components/retrievers/dense.py +1 -0
- src/services/rag/components/retrievers/hybrid.py +5 -44
- src/services/rag/components/retrievers/lightrag.py +5 -44
- src/services/rag/components/routing.py +48 -0
- src/services/rag/factory.py +112 -46
- src/services/rag/pipeline.py +1 -0
- src/services/rag/pipelines/__init__.py +27 -18
- src/services/rag/pipelines/lightrag.py +1 -0
- src/services/rag/pipelines/llamaindex.py +99 -0
- src/services/rag/pipelines/raganything.py +67 -100
- src/services/rag/pipelines/raganything_docling.py +368 -0
- src/services/rag/service.py +5 -12
- src/services/rag/types.py +1 -0
- src/services/rag/utils/__init__.py +17 -0
- src/services/rag/utils/image_migration.py +279 -0
- src/services/search/__init__.py +1 -0
- src/services/search/base.py +1 -0
- src/services/search/consolidation.py +1 -0
- src/services/search/providers/__init__.py +1 -0
- src/services/search/providers/baidu.py +1 -0
- src/services/search/providers/exa.py +1 -0
- src/services/search/providers/jina.py +1 -0
- src/services/search/providers/perplexity.py +1 -0
- src/services/search/providers/serper.py +1 -0
- src/services/search/providers/tavily.py +1 -0
- src/services/search/types.py +1 -0
- src/services/settings/__init__.py +1 -0
- src/services/settings/interface_settings.py +78 -0
- src/services/setup/__init__.py +1 -0
- src/services/tts/__init__.py +1 -0
- src/services/tts/config.py +1 -0
- src/utils/realtimex.py +284 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +0 -2
- src/services/rag/pipelines/academic.py +0 -44
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
1
2
|
"""
|
|
2
3
|
RAGAnything Pipeline
|
|
3
4
|
====================
|
|
@@ -9,11 +10,13 @@ from pathlib import Path
|
|
|
9
10
|
import sys
|
|
10
11
|
from typing import Any, Dict, List, Optional
|
|
11
12
|
|
|
12
|
-
from lightrag.llm.openai import openai_complete_if_cache
|
|
13
|
-
|
|
14
13
|
from src.logging import get_logger
|
|
15
14
|
from src.logging.adapters import LightRAGLogContext
|
|
16
15
|
|
|
16
|
+
# Load LLM config early to ensure OPENAI_API_KEY env var is set before LightRAG imports
|
|
17
|
+
# This is critical because LightRAG reads os.environ["OPENAI_API_KEY"] directly
|
|
18
|
+
from src.services.llm.config import get_llm_config as _early_config_load # noqa: F401
|
|
19
|
+
|
|
17
20
|
|
|
18
21
|
class RAGAnythingPipeline:
|
|
19
22
|
"""
|
|
@@ -71,106 +74,19 @@ class RAGAnythingPipeline:
|
|
|
71
74
|
|
|
72
75
|
self._setup_raganything_path()
|
|
73
76
|
|
|
74
|
-
from openai import AsyncOpenAI
|
|
75
77
|
from raganything import RAGAnything, RAGAnythingConfig
|
|
76
78
|
|
|
77
79
|
from src.services.embedding import get_embedding_client
|
|
78
80
|
from src.services.llm import get_llm_client
|
|
79
81
|
|
|
82
|
+
# Use unified LLM client from src/services/llm
|
|
80
83
|
llm_client = get_llm_client()
|
|
81
84
|
embed_client = get_embedding_client()
|
|
82
85
|
|
|
83
|
-
#
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
)
|
|
88
|
-
|
|
89
|
-
async def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
|
|
90
|
-
"""Custom async LLM function that bypasses LightRAG's openai_complete_if_cache."""
|
|
91
|
-
if history_messages is None:
|
|
92
|
-
history_messages = []
|
|
93
|
-
|
|
94
|
-
# Build messages array
|
|
95
|
-
messages = []
|
|
96
|
-
if system_prompt:
|
|
97
|
-
messages.append({"role": "system", "content": system_prompt})
|
|
98
|
-
|
|
99
|
-
# Add history
|
|
100
|
-
messages.extend(history_messages)
|
|
101
|
-
|
|
102
|
-
# Add current prompt
|
|
103
|
-
messages.append({"role": "user", "content": prompt})
|
|
104
|
-
|
|
105
|
-
# Whitelist only valid OpenAI parameters, filter out LightRAG-specific ones
|
|
106
|
-
valid_params = {
|
|
107
|
-
"temperature",
|
|
108
|
-
"top_p",
|
|
109
|
-
"n",
|
|
110
|
-
"stream",
|
|
111
|
-
"stop",
|
|
112
|
-
"max_tokens",
|
|
113
|
-
"presence_penalty",
|
|
114
|
-
"frequency_penalty",
|
|
115
|
-
"logit_bias",
|
|
116
|
-
"user",
|
|
117
|
-
"seed",
|
|
118
|
-
}
|
|
119
|
-
clean_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}
|
|
120
|
-
|
|
121
|
-
# Call OpenAI API directly (async)
|
|
122
|
-
response = await openai_client.chat.completions.create(
|
|
123
|
-
model=llm_client.config.model,
|
|
124
|
-
messages=messages,
|
|
125
|
-
**clean_kwargs,
|
|
126
|
-
)
|
|
127
|
-
|
|
128
|
-
return response.choices[0].message.content
|
|
129
|
-
|
|
130
|
-
def vision_model_func(
|
|
131
|
-
prompt,
|
|
132
|
-
system_prompt=None,
|
|
133
|
-
history_messages=[],
|
|
134
|
-
image_data=None,
|
|
135
|
-
messages=None,
|
|
136
|
-
**kwargs,
|
|
137
|
-
):
|
|
138
|
-
# Handle multimodal messages
|
|
139
|
-
if messages:
|
|
140
|
-
clean_kwargs = {
|
|
141
|
-
k: v
|
|
142
|
-
for k, v in kwargs.items()
|
|
143
|
-
if k not in ["messages", "prompt", "system_prompt", "history_messages"]
|
|
144
|
-
}
|
|
145
|
-
return openai_complete_if_cache(
|
|
146
|
-
llm_client.config.model,
|
|
147
|
-
prompt="",
|
|
148
|
-
messages=messages,
|
|
149
|
-
api_key=llm_client.config.api_key,
|
|
150
|
-
base_url=llm_client.config.base_url,
|
|
151
|
-
**clean_kwargs,
|
|
152
|
-
)
|
|
153
|
-
if image_data:
|
|
154
|
-
# Build image message
|
|
155
|
-
image_message = {
|
|
156
|
-
"role": "user",
|
|
157
|
-
"content": [
|
|
158
|
-
{"type": "text", "text": prompt},
|
|
159
|
-
{
|
|
160
|
-
"type": "image_url",
|
|
161
|
-
"image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
|
|
162
|
-
},
|
|
163
|
-
],
|
|
164
|
-
}
|
|
165
|
-
return openai_complete_if_cache(
|
|
166
|
-
llm_client.config.model,
|
|
167
|
-
prompt="",
|
|
168
|
-
messages=[image_message],
|
|
169
|
-
api_key=llm_client.config.api_key,
|
|
170
|
-
base_url=llm_client.config.base_url,
|
|
171
|
-
**kwargs,
|
|
172
|
-
)
|
|
173
|
-
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
|
|
86
|
+
# Get model functions from unified LLM client
|
|
87
|
+
# These handle all provider differences (OpenAI, Anthropic, Azure, local, etc.)
|
|
88
|
+
llm_model_func = llm_client.get_model_func()
|
|
89
|
+
vision_model_func = llm_client.get_vision_model_func()
|
|
174
90
|
|
|
175
91
|
config = RAGAnythingConfig(
|
|
176
92
|
working_dir=working_dir,
|
|
@@ -197,7 +113,15 @@ class RAGAnythingPipeline:
|
|
|
197
113
|
**kwargs,
|
|
198
114
|
) -> bool:
|
|
199
115
|
"""
|
|
200
|
-
Initialize KB using RAG-Anything
|
|
116
|
+
Initialize KB using RAG-Anything with MinerU parser.
|
|
117
|
+
|
|
118
|
+
Processing flow:
|
|
119
|
+
1. Parse documents using MinerU (generates content_list with nested image paths)
|
|
120
|
+
2. Migrate images to canonical location (kb/images/) and update paths in content_list
|
|
121
|
+
3. Insert updated content_list into RAG (now with correct image paths)
|
|
122
|
+
4. Clean up temporary parser output directories
|
|
123
|
+
|
|
124
|
+
This ensures RAG stores the final image paths, avoiding path mismatches during retrieval.
|
|
201
125
|
|
|
202
126
|
Uses FileTypeRouter to classify files and route them appropriately:
|
|
203
127
|
- PDF files -> MinerU parser (full document analysis)
|
|
@@ -212,13 +136,21 @@ class RAGAnythingPipeline:
|
|
|
212
136
|
Returns:
|
|
213
137
|
True if successful
|
|
214
138
|
"""
|
|
139
|
+
import json
|
|
140
|
+
|
|
215
141
|
from ..components.routing import FileTypeRouter
|
|
142
|
+
from ..utils.image_migration import (
|
|
143
|
+
cleanup_parser_output_dirs,
|
|
144
|
+
migrate_images_and_update_paths,
|
|
145
|
+
)
|
|
216
146
|
|
|
217
147
|
self.logger.info(f"Initializing KB '{kb_name}' with {len(file_paths)} files")
|
|
218
148
|
|
|
219
149
|
kb_dir = Path(self.kb_base_dir) / kb_name
|
|
220
150
|
content_list_dir = kb_dir / "content_list"
|
|
151
|
+
images_dir = kb_dir / "images"
|
|
221
152
|
content_list_dir.mkdir(parents=True, exist_ok=True)
|
|
153
|
+
images_dir.mkdir(parents=True, exist_ok=True)
|
|
222
154
|
|
|
223
155
|
# Classify files by type
|
|
224
156
|
classification = FileTypeRouter.classify_files(file_paths)
|
|
@@ -235,19 +167,47 @@ class RAGAnythingPipeline:
|
|
|
235
167
|
|
|
236
168
|
total_files = len(classification.needs_mineru) + len(classification.text_files)
|
|
237
169
|
idx = 0
|
|
170
|
+
total_images_migrated = 0
|
|
238
171
|
|
|
239
172
|
# Process files requiring MinerU (PDF, DOCX, images)
|
|
240
173
|
for file_path in classification.needs_mineru:
|
|
241
174
|
idx += 1
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
175
|
+
file_name = Path(file_path).name
|
|
176
|
+
self.logger.info(f"Processing [{idx}/{total_files}] (MinerU): {file_name}")
|
|
177
|
+
|
|
178
|
+
# Step 1: Parse document (without RAG insertion)
|
|
179
|
+
self.logger.info(" Step 1/3: Parsing document...")
|
|
180
|
+
content_list, doc_id = await rag.parse_document(
|
|
246
181
|
file_path=file_path,
|
|
247
182
|
output_dir=str(content_list_dir),
|
|
248
183
|
parse_method="auto",
|
|
249
184
|
)
|
|
250
185
|
|
|
186
|
+
# Step 2: Migrate images and update paths
|
|
187
|
+
self.logger.info(" Step 2/3: Migrating images to canonical location...")
|
|
188
|
+
updated_content_list, num_migrated = await migrate_images_and_update_paths(
|
|
189
|
+
content_list=content_list,
|
|
190
|
+
source_base_dir=content_list_dir,
|
|
191
|
+
target_images_dir=images_dir,
|
|
192
|
+
batch_size=50,
|
|
193
|
+
)
|
|
194
|
+
total_images_migrated += num_migrated
|
|
195
|
+
|
|
196
|
+
# Save updated content_list for future reference
|
|
197
|
+
content_list_file = content_list_dir / f"{Path(file_path).stem}.json"
|
|
198
|
+
with open(content_list_file, "w", encoding="utf-8") as f:
|
|
199
|
+
json.dump(updated_content_list, f, ensure_ascii=False, indent=2)
|
|
200
|
+
|
|
201
|
+
# Step 3: Insert into RAG with corrected paths
|
|
202
|
+
self.logger.info(" Step 3/3: Inserting into RAG knowledge graph...")
|
|
203
|
+
await rag.insert_content_list(
|
|
204
|
+
content_list=updated_content_list,
|
|
205
|
+
file_path=file_path,
|
|
206
|
+
doc_id=doc_id,
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
self.logger.info(f" ✓ Completed: {file_name}")
|
|
210
|
+
|
|
251
211
|
# Process text files directly (fast path)
|
|
252
212
|
for file_path in classification.text_files:
|
|
253
213
|
idx += 1
|
|
@@ -263,10 +223,17 @@ class RAGAnythingPipeline:
|
|
|
263
223
|
for file_path in classification.unsupported:
|
|
264
224
|
self.logger.warning(f"Skipped unsupported file: {Path(file_path).name}")
|
|
265
225
|
|
|
226
|
+
# Clean up temporary parser output directories
|
|
227
|
+
if total_images_migrated > 0:
|
|
228
|
+
self.logger.info("Cleaning up temporary parser output directories...")
|
|
229
|
+
await cleanup_parser_output_dirs(content_list_dir)
|
|
230
|
+
|
|
266
231
|
if extract_numbered_items:
|
|
267
232
|
await self._extract_numbered_items(kb_name)
|
|
268
233
|
|
|
269
|
-
self.logger.info(
|
|
234
|
+
self.logger.info(
|
|
235
|
+
f"KB '{kb_name}' initialized successfully ({total_images_migrated} images migrated)"
|
|
236
|
+
)
|
|
270
237
|
return True
|
|
271
238
|
|
|
272
239
|
async def _extract_numbered_items(self, kb_name: str):
|
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
RAGAnything Docling Pipeline
|
|
4
|
+
============================
|
|
5
|
+
|
|
6
|
+
End-to-end pipeline wrapping RAG-Anything with Docling parser for document processing.
|
|
7
|
+
Uses Docling instead of MinerU for better Office document and HTML support.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
import sys
|
|
12
|
+
from typing import Any, Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
from src.logging import get_logger
|
|
15
|
+
from src.logging.adapters import LightRAGLogContext
|
|
16
|
+
|
|
17
|
+
# Load LLM config early to ensure OPENAI_API_KEY env var is set before LightRAG imports
|
|
18
|
+
# This is critical because LightRAG reads os.environ["OPENAI_API_KEY"] directly
|
|
19
|
+
from src.services.llm.config import get_llm_config as _early_config_load # noqa: F401
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class RAGAnythingDoclingPipeline:
|
|
23
|
+
"""
|
|
24
|
+
RAG-Anything Pipeline with Docling Parser.
|
|
25
|
+
|
|
26
|
+
Uses RAG-Anything's complete processing with Docling as the document parser:
|
|
27
|
+
- Docling document parsing (supports PDF, Office documents, HTML)
|
|
28
|
+
- LightRAG knowledge graph construction
|
|
29
|
+
- Hybrid retrieval (hybrid/local/global/naive modes)
|
|
30
|
+
|
|
31
|
+
Advantages over MinerU:
|
|
32
|
+
- Better support for Office documents (.doc, .docx, .ppt, .pptx, .xls, .xlsx)
|
|
33
|
+
- Native HTML parsing support
|
|
34
|
+
- Easier installation (no CUDA dependencies)
|
|
35
|
+
|
|
36
|
+
Note: For academic PDFs with complex equations and formulas,
|
|
37
|
+
use RAGAnythingPipeline (MinerU) instead for better accuracy.
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
name = "raganything_docling"
|
|
41
|
+
|
|
42
|
+
def __init__(
|
|
43
|
+
self,
|
|
44
|
+
kb_base_dir: Optional[str] = None,
|
|
45
|
+
enable_image_processing: bool = True,
|
|
46
|
+
enable_table_processing: bool = True,
|
|
47
|
+
enable_equation_processing: bool = True,
|
|
48
|
+
):
|
|
49
|
+
"""
|
|
50
|
+
Initialize RAGAnything Docling pipeline.
|
|
51
|
+
|
|
52
|
+
Args:
|
|
53
|
+
kb_base_dir: Base directory for knowledge bases
|
|
54
|
+
enable_image_processing: Enable image extraction and processing
|
|
55
|
+
enable_table_processing: Enable table extraction and processing
|
|
56
|
+
enable_equation_processing: Enable equation extraction and processing
|
|
57
|
+
"""
|
|
58
|
+
self.logger = get_logger("RAGAnythingDoclingPipeline")
|
|
59
|
+
self.kb_base_dir = kb_base_dir or str(
|
|
60
|
+
Path(__file__).resolve().parent.parent.parent.parent.parent / "data" / "knowledge_bases"
|
|
61
|
+
)
|
|
62
|
+
self.enable_image = enable_image_processing
|
|
63
|
+
self.enable_table = enable_table_processing
|
|
64
|
+
self.enable_equation = enable_equation_processing
|
|
65
|
+
self._instances: Dict[str, Any] = {}
|
|
66
|
+
|
|
67
|
+
def _setup_raganything_path(self):
|
|
68
|
+
"""Add RAG-Anything to sys.path if available."""
|
|
69
|
+
project_root = Path(__file__).resolve().parent.parent.parent.parent.parent
|
|
70
|
+
raganything_path = project_root.parent / "raganything" / "RAG-Anything"
|
|
71
|
+
if raganything_path.exists() and str(raganything_path) not in sys.path:
|
|
72
|
+
sys.path.insert(0, str(raganything_path))
|
|
73
|
+
|
|
74
|
+
def _get_rag_instance(self, kb_name: str):
|
|
75
|
+
"""Get or create RAGAnything instance with Docling parser."""
|
|
76
|
+
kb_dir = Path(self.kb_base_dir) / kb_name
|
|
77
|
+
working_dir = str(kb_dir / "rag_storage")
|
|
78
|
+
|
|
79
|
+
if working_dir in self._instances:
|
|
80
|
+
return self._instances[working_dir]
|
|
81
|
+
|
|
82
|
+
self._setup_raganything_path()
|
|
83
|
+
|
|
84
|
+
from raganything import RAGAnything, RAGAnythingConfig
|
|
85
|
+
|
|
86
|
+
from src.services.embedding import get_embedding_client
|
|
87
|
+
from src.services.llm import get_llm_client
|
|
88
|
+
|
|
89
|
+
# Use unified LLM client from src/services/llm
|
|
90
|
+
llm_client = get_llm_client()
|
|
91
|
+
embed_client = get_embedding_client()
|
|
92
|
+
|
|
93
|
+
# Get model functions from unified LLM client
|
|
94
|
+
# These handle all provider differences (OpenAI, Anthropic, Azure, local, etc.)
|
|
95
|
+
llm_model_func = llm_client.get_model_func()
|
|
96
|
+
vision_model_func = llm_client.get_vision_model_func()
|
|
97
|
+
|
|
98
|
+
# Configure RAGAnything with Docling parser
|
|
99
|
+
# Note: content_format should be "auto" or "minerU" because DoclingParser
|
|
100
|
+
# converts its output to MinerU-compatible format internally
|
|
101
|
+
config = RAGAnythingConfig(
|
|
102
|
+
working_dir=working_dir,
|
|
103
|
+
parser="docling", # Use Docling instead of MinerU
|
|
104
|
+
content_format="auto", # Auto-detect format (Docling outputs MinerU-compatible format)
|
|
105
|
+
enable_image_processing=self.enable_image,
|
|
106
|
+
enable_table_processing=self.enable_table,
|
|
107
|
+
enable_equation_processing=self.enable_equation,
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
rag = RAGAnything(
|
|
111
|
+
config=config,
|
|
112
|
+
llm_model_func=llm_model_func,
|
|
113
|
+
vision_model_func=vision_model_func,
|
|
114
|
+
embedding_func=embed_client.get_embedding_func(),
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
self._instances[working_dir] = rag
|
|
118
|
+
return rag
|
|
119
|
+
|
|
120
|
+
async def initialize(
|
|
121
|
+
self,
|
|
122
|
+
kb_name: str,
|
|
123
|
+
file_paths: List[str],
|
|
124
|
+
extract_numbered_items: bool = True,
|
|
125
|
+
**kwargs,
|
|
126
|
+
) -> bool:
|
|
127
|
+
"""
|
|
128
|
+
Initialize KB using RAG-Anything with Docling parser.
|
|
129
|
+
|
|
130
|
+
Processing flow:
|
|
131
|
+
1. Parse documents using Docling (generates content_list with nested image paths)
|
|
132
|
+
2. Migrate images to canonical location (kb/images/) and update paths in content_list
|
|
133
|
+
3. Insert updated content_list into RAG (now with correct image paths)
|
|
134
|
+
4. Clean up temporary parser output directories
|
|
135
|
+
|
|
136
|
+
This ensures RAG stores the final image paths, avoiding path mismatches during retrieval.
|
|
137
|
+
|
|
138
|
+
Uses FileTypeRouter to classify files and route them appropriately:
|
|
139
|
+
- PDF files -> Docling parser
|
|
140
|
+
- Office files (.doc, .docx, .ppt, .pptx) -> Docling parser (direct support)
|
|
141
|
+
- HTML files -> Docling parser
|
|
142
|
+
- Text files -> Direct read + LightRAG insert (fast)
|
|
143
|
+
|
|
144
|
+
Args:
|
|
145
|
+
kb_name: Knowledge base name
|
|
146
|
+
file_paths: List of file paths to process
|
|
147
|
+
extract_numbered_items: Whether to extract numbered items after processing
|
|
148
|
+
**kwargs: Additional arguments
|
|
149
|
+
|
|
150
|
+
Returns:
|
|
151
|
+
True if successful
|
|
152
|
+
"""
|
|
153
|
+
import json
|
|
154
|
+
|
|
155
|
+
from ..components.routing import FileTypeRouter
|
|
156
|
+
from ..utils.image_migration import (
|
|
157
|
+
cleanup_parser_output_dirs,
|
|
158
|
+
migrate_images_and_update_paths,
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
self.logger.info(
|
|
162
|
+
f"Initializing KB '{kb_name}' with {len(file_paths)} files (Docling parser)"
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
kb_dir = Path(self.kb_base_dir) / kb_name
|
|
166
|
+
content_list_dir = kb_dir / "content_list"
|
|
167
|
+
images_dir = kb_dir / "images"
|
|
168
|
+
content_list_dir.mkdir(parents=True, exist_ok=True)
|
|
169
|
+
images_dir.mkdir(parents=True, exist_ok=True)
|
|
170
|
+
|
|
171
|
+
# Classify files by type
|
|
172
|
+
classification = FileTypeRouter.classify_files(file_paths)
|
|
173
|
+
|
|
174
|
+
self.logger.info(
|
|
175
|
+
f"File classification: {len(classification.needs_mineru)} need Docling, "
|
|
176
|
+
f"{len(classification.text_files)} text files, "
|
|
177
|
+
f"{len(classification.unsupported)} unsupported"
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
with LightRAGLogContext(scene="knowledge_init"):
|
|
181
|
+
rag = self._get_rag_instance(kb_name)
|
|
182
|
+
await rag._ensure_lightrag_initialized()
|
|
183
|
+
|
|
184
|
+
total_files = len(classification.needs_mineru) + len(classification.text_files)
|
|
185
|
+
idx = 0
|
|
186
|
+
total_images_migrated = 0
|
|
187
|
+
|
|
188
|
+
# Process files requiring Docling (PDF, DOCX, images, HTML)
|
|
189
|
+
for file_path in classification.needs_mineru:
|
|
190
|
+
idx += 1
|
|
191
|
+
file_name = Path(file_path).name
|
|
192
|
+
self.logger.info(f"Processing [{idx}/{total_files}] (Docling): {file_name}")
|
|
193
|
+
|
|
194
|
+
# Step 1: Parse document (without RAG insertion)
|
|
195
|
+
self.logger.info(" Step 1/3: Parsing document...")
|
|
196
|
+
content_list, doc_id = await rag.parse_document(
|
|
197
|
+
file_path=file_path,
|
|
198
|
+
output_dir=str(content_list_dir),
|
|
199
|
+
parse_method="auto",
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
# Step 2: Migrate images and update paths
|
|
203
|
+
self.logger.info(" Step 2/3: Migrating images to canonical location...")
|
|
204
|
+
updated_content_list, num_migrated = await migrate_images_and_update_paths(
|
|
205
|
+
content_list=content_list,
|
|
206
|
+
source_base_dir=content_list_dir,
|
|
207
|
+
target_images_dir=images_dir,
|
|
208
|
+
batch_size=50,
|
|
209
|
+
)
|
|
210
|
+
total_images_migrated += num_migrated
|
|
211
|
+
|
|
212
|
+
# Save updated content_list for future reference
|
|
213
|
+
content_list_file = content_list_dir / f"{Path(file_path).stem}.json"
|
|
214
|
+
with open(content_list_file, "w", encoding="utf-8") as f:
|
|
215
|
+
json.dump(updated_content_list, f, ensure_ascii=False, indent=2)
|
|
216
|
+
|
|
217
|
+
# Step 3: Insert into RAG with corrected paths
|
|
218
|
+
self.logger.info(" Step 3/3: Inserting into RAG knowledge graph...")
|
|
219
|
+
await rag.insert_content_list(
|
|
220
|
+
content_list=updated_content_list,
|
|
221
|
+
file_path=file_path,
|
|
222
|
+
doc_id=doc_id,
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
self.logger.info(f" ✓ Completed: {file_name}")
|
|
226
|
+
|
|
227
|
+
# Process text files directly (fast path)
|
|
228
|
+
for file_path in classification.text_files:
|
|
229
|
+
idx += 1
|
|
230
|
+
self.logger.info(
|
|
231
|
+
f"Processing [{idx}/{total_files}] (direct text): {Path(file_path).name}"
|
|
232
|
+
)
|
|
233
|
+
content = await FileTypeRouter.read_text_file(file_path)
|
|
234
|
+
if content.strip():
|
|
235
|
+
# Insert directly into LightRAG, bypassing parser
|
|
236
|
+
await rag.lightrag.ainsert(content)
|
|
237
|
+
|
|
238
|
+
# Log unsupported files
|
|
239
|
+
for file_path in classification.unsupported:
|
|
240
|
+
self.logger.warning(f"Skipped unsupported file: {Path(file_path).name}")
|
|
241
|
+
|
|
242
|
+
# Clean up temporary parser output directories
|
|
243
|
+
if total_images_migrated > 0:
|
|
244
|
+
self.logger.info("Cleaning up temporary parser output directories...")
|
|
245
|
+
await cleanup_parser_output_dirs(content_list_dir)
|
|
246
|
+
|
|
247
|
+
if extract_numbered_items:
|
|
248
|
+
await self._extract_numbered_items(kb_name)
|
|
249
|
+
|
|
250
|
+
self.logger.info(
|
|
251
|
+
f"KB '{kb_name}' initialized successfully with Docling parser "
|
|
252
|
+
f"({total_images_migrated} images migrated)"
|
|
253
|
+
)
|
|
254
|
+
return True
|
|
255
|
+
|
|
256
|
+
async def _extract_numbered_items(self, kb_name: str):
|
|
257
|
+
"""Extract numbered items using existing extraction logic."""
|
|
258
|
+
try:
|
|
259
|
+
import json
|
|
260
|
+
|
|
261
|
+
from src.knowledge.extract_numbered_items import (
|
|
262
|
+
extract_numbered_items_with_llm_async,
|
|
263
|
+
)
|
|
264
|
+
from src.services.llm import get_llm_client
|
|
265
|
+
|
|
266
|
+
kb_dir = Path(self.kb_base_dir) / kb_name
|
|
267
|
+
content_list_dir = kb_dir / "content_list"
|
|
268
|
+
|
|
269
|
+
if not content_list_dir.exists():
|
|
270
|
+
self.logger.warning("No content_list directory found, skipping extraction")
|
|
271
|
+
return
|
|
272
|
+
|
|
273
|
+
# Load all content list files
|
|
274
|
+
all_content_items = []
|
|
275
|
+
for json_file in content_list_dir.glob("*.json"):
|
|
276
|
+
with open(json_file, "r", encoding="utf-8") as f:
|
|
277
|
+
content_items = json.load(f)
|
|
278
|
+
all_content_items.extend(content_items)
|
|
279
|
+
|
|
280
|
+
if not all_content_items:
|
|
281
|
+
self.logger.warning("No content items found for extraction")
|
|
282
|
+
return
|
|
283
|
+
|
|
284
|
+
self.logger.info(
|
|
285
|
+
f"Extracting numbered items from {len(all_content_items)} content items"
|
|
286
|
+
)
|
|
287
|
+
|
|
288
|
+
llm_client = get_llm_client()
|
|
289
|
+
items = await extract_numbered_items_with_llm_async(
|
|
290
|
+
all_content_items,
|
|
291
|
+
api_key=llm_client.config.api_key,
|
|
292
|
+
base_url=llm_client.config.base_url,
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
# Save numbered items
|
|
296
|
+
if items:
|
|
297
|
+
output_file = kb_dir / "numbered_items.json"
|
|
298
|
+
with open(output_file, "w", encoding="utf-8") as f:
|
|
299
|
+
json.dump(items, f, ensure_ascii=False, indent=2)
|
|
300
|
+
self.logger.info(f"Extracted {len(items)} numbered items")
|
|
301
|
+
|
|
302
|
+
except ImportError as e:
|
|
303
|
+
self.logger.warning(f"Could not import extraction module: {e}")
|
|
304
|
+
except Exception as e:
|
|
305
|
+
self.logger.error(f"Failed to extract numbered items: {e}")
|
|
306
|
+
|
|
307
|
+
async def search(
|
|
308
|
+
self,
|
|
309
|
+
query: str,
|
|
310
|
+
kb_name: str,
|
|
311
|
+
mode: str = "hybrid",
|
|
312
|
+
only_need_context: bool = False,
|
|
313
|
+
**kwargs,
|
|
314
|
+
) -> Dict[str, Any]:
|
|
315
|
+
"""
|
|
316
|
+
Search using RAG-Anything's aquery().
|
|
317
|
+
|
|
318
|
+
Args:
|
|
319
|
+
query: Search query
|
|
320
|
+
kb_name: Knowledge base name
|
|
321
|
+
mode: Search mode (hybrid, local, global, naive)
|
|
322
|
+
only_need_context: Whether to only return context without answer
|
|
323
|
+
**kwargs: Additional arguments
|
|
324
|
+
|
|
325
|
+
Returns:
|
|
326
|
+
Search results dictionary
|
|
327
|
+
"""
|
|
328
|
+
with LightRAGLogContext(scene="rag_search"):
|
|
329
|
+
rag = self._get_rag_instance(kb_name)
|
|
330
|
+
await rag._ensure_lightrag_initialized()
|
|
331
|
+
|
|
332
|
+
answer = await rag.aquery(query, mode=mode, only_need_context=only_need_context)
|
|
333
|
+
answer_str = answer if isinstance(answer, str) else str(answer)
|
|
334
|
+
|
|
335
|
+
return {
|
|
336
|
+
"query": query,
|
|
337
|
+
"answer": answer_str,
|
|
338
|
+
"content": answer_str,
|
|
339
|
+
"mode": mode,
|
|
340
|
+
"provider": "raganything_docling",
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
async def delete(self, kb_name: str) -> bool:
|
|
344
|
+
"""
|
|
345
|
+
Delete knowledge base.
|
|
346
|
+
|
|
347
|
+
Args:
|
|
348
|
+
kb_name: Knowledge base name
|
|
349
|
+
|
|
350
|
+
Returns:
|
|
351
|
+
True if successful
|
|
352
|
+
"""
|
|
353
|
+
import shutil
|
|
354
|
+
|
|
355
|
+
kb_dir = Path(self.kb_base_dir) / kb_name
|
|
356
|
+
working_dir = str(kb_dir / "rag_storage")
|
|
357
|
+
|
|
358
|
+
# Remove from cache
|
|
359
|
+
if working_dir in self._instances:
|
|
360
|
+
del self._instances[working_dir]
|
|
361
|
+
|
|
362
|
+
# Delete directory
|
|
363
|
+
if kb_dir.exists():
|
|
364
|
+
shutil.rmtree(kb_dir)
|
|
365
|
+
self.logger.info(f"Deleted KB '{kb_name}'")
|
|
366
|
+
return True
|
|
367
|
+
|
|
368
|
+
return False
|
src/services/rag/service.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
1
2
|
"""
|
|
2
3
|
RAG Service
|
|
3
4
|
===========
|
|
@@ -5,12 +6,16 @@ RAG Service
|
|
|
5
6
|
Unified RAG service providing a single entry point for all RAG operations.
|
|
6
7
|
"""
|
|
7
8
|
|
|
9
|
+
import json
|
|
8
10
|
import os
|
|
9
11
|
from pathlib import Path
|
|
12
|
+
import shutil
|
|
10
13
|
from typing import Any, Dict, List, Optional
|
|
11
14
|
|
|
12
15
|
from src.logging import get_logger
|
|
13
16
|
|
|
17
|
+
from .factory import get_pipeline, has_pipeline, list_pipelines
|
|
18
|
+
|
|
14
19
|
# Default knowledge base directory
|
|
15
20
|
DEFAULT_KB_BASE_DIR = str(
|
|
16
21
|
Path(__file__).resolve().parent.parent.parent.parent / "data" / "knowledge_bases"
|
|
@@ -59,8 +64,6 @@ class RAGService:
|
|
|
59
64
|
def _get_pipeline(self):
|
|
60
65
|
"""Get or create pipeline instance."""
|
|
61
66
|
if self._pipeline is None:
|
|
62
|
-
from .factory import get_pipeline
|
|
63
|
-
|
|
64
67
|
self._pipeline = get_pipeline(self.provider, kb_base_dir=self.kb_base_dir)
|
|
65
68
|
return self._pipeline
|
|
66
69
|
|
|
@@ -117,8 +120,6 @@ class RAGService:
|
|
|
117
120
|
)
|
|
118
121
|
|
|
119
122
|
# Get pipeline for the specific provider
|
|
120
|
-
from .factory import get_pipeline
|
|
121
|
-
|
|
122
123
|
pipeline = get_pipeline(provider, kb_base_dir=self.kb_base_dir)
|
|
123
124
|
|
|
124
125
|
result = await pipeline.search(query=query, kb_name=kb_name, mode=mode, **kwargs)
|
|
@@ -149,8 +150,6 @@ class RAGService:
|
|
|
149
150
|
Provider name (e.g., 'llamaindex', 'lightrag', 'raganything')
|
|
150
151
|
"""
|
|
151
152
|
try:
|
|
152
|
-
import json
|
|
153
|
-
|
|
154
153
|
metadata_file = Path(self.kb_base_dir) / kb_name / "metadata.json"
|
|
155
154
|
|
|
156
155
|
if metadata_file.exists():
|
|
@@ -192,8 +191,6 @@ class RAGService:
|
|
|
192
191
|
return await pipeline.delete(kb_name=kb_name)
|
|
193
192
|
|
|
194
193
|
# Fallback: delete directory manually
|
|
195
|
-
import shutil
|
|
196
|
-
|
|
197
194
|
kb_dir = Path(self.kb_base_dir) / kb_name
|
|
198
195
|
if kb_dir.exists():
|
|
199
196
|
shutil.rmtree(kb_dir)
|
|
@@ -214,8 +211,6 @@ class RAGService:
|
|
|
214
211
|
for p in providers:
|
|
215
212
|
print(f"{p['id']}: {p['description']}")
|
|
216
213
|
"""
|
|
217
|
-
from .factory import list_pipelines
|
|
218
|
-
|
|
219
214
|
return list_pipelines()
|
|
220
215
|
|
|
221
216
|
@staticmethod
|
|
@@ -239,6 +234,4 @@ class RAGService:
|
|
|
239
234
|
Returns:
|
|
240
235
|
True if provider exists
|
|
241
236
|
"""
|
|
242
|
-
from .factory import has_pipeline
|
|
243
|
-
|
|
244
237
|
return has_pipeline(name)
|
src/services/rag/types.py
CHANGED