realtimex-deeptutor 0.5.0.post1__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/METADATA +24 -17
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/RECORD +143 -123
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/WHEEL +1 -1
- realtimex_deeptutor-0.5.0.post3.dist-info/entry_points.txt +4 -0
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/top_level.txt +1 -0
- scripts/__init__.py +1 -0
- scripts/audit_prompts.py +179 -0
- scripts/check_install.py +460 -0
- scripts/generate_roster.py +327 -0
- scripts/install_all.py +653 -0
- scripts/migrate_kb.py +655 -0
- scripts/start.py +807 -0
- scripts/start_web.py +632 -0
- scripts/sync_prompts_from_en.py +147 -0
- src/__init__.py +2 -2
- src/agents/ideagen/material_organizer_agent.py +2 -0
- src/agents/solve/__init__.py +6 -0
- src/agents/solve/main_solver.py +9 -0
- src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +9 -7
- src/agents/solve/session_manager.py +345 -0
- src/api/main.py +14 -0
- src/api/routers/chat.py +3 -3
- src/api/routers/co_writer.py +12 -7
- src/api/routers/config.py +1 -0
- src/api/routers/guide.py +3 -1
- src/api/routers/ideagen.py +7 -0
- src/api/routers/knowledge.py +64 -12
- src/api/routers/question.py +2 -0
- src/api/routers/realtimex.py +137 -0
- src/api/routers/research.py +9 -0
- src/api/routers/solve.py +120 -2
- src/cli/__init__.py +13 -0
- src/cli/start.py +209 -0
- src/config/constants.py +11 -9
- src/knowledge/add_documents.py +453 -213
- src/knowledge/extract_numbered_items.py +9 -10
- src/knowledge/initializer.py +102 -101
- src/knowledge/manager.py +251 -74
- src/knowledge/progress_tracker.py +43 -2
- src/knowledge/start_kb.py +11 -2
- src/logging/__init__.py +5 -0
- src/logging/adapters/__init__.py +1 -0
- src/logging/adapters/lightrag.py +25 -18
- src/logging/adapters/llamaindex.py +1 -0
- src/logging/config.py +30 -27
- src/logging/handlers/__init__.py +1 -0
- src/logging/handlers/console.py +7 -50
- src/logging/handlers/file.py +5 -20
- src/logging/handlers/websocket.py +23 -19
- src/logging/logger.py +161 -126
- src/logging/stats/__init__.py +1 -0
- src/logging/stats/llm_stats.py +37 -17
- src/services/__init__.py +17 -1
- src/services/config/__init__.py +1 -0
- src/services/config/knowledge_base_config.py +1 -0
- src/services/config/loader.py +1 -1
- src/services/config/unified_config.py +211 -4
- src/services/embedding/__init__.py +1 -0
- src/services/embedding/adapters/__init__.py +3 -0
- src/services/embedding/adapters/base.py +1 -0
- src/services/embedding/adapters/cohere.py +1 -0
- src/services/embedding/adapters/jina.py +1 -0
- src/services/embedding/adapters/ollama.py +1 -0
- src/services/embedding/adapters/openai_compatible.py +1 -0
- src/services/embedding/adapters/realtimex.py +125 -0
- src/services/embedding/client.py +27 -0
- src/services/embedding/config.py +3 -0
- src/services/embedding/provider.py +1 -0
- src/services/llm/__init__.py +17 -3
- src/services/llm/capabilities.py +47 -0
- src/services/llm/client.py +32 -0
- src/services/llm/cloud_provider.py +21 -4
- src/services/llm/config.py +36 -2
- src/services/llm/error_mapping.py +1 -0
- src/services/llm/exceptions.py +30 -0
- src/services/llm/factory.py +55 -16
- src/services/llm/local_provider.py +1 -0
- src/services/llm/providers/anthropic.py +1 -0
- src/services/llm/providers/base_provider.py +1 -0
- src/services/llm/providers/open_ai.py +1 -0
- src/services/llm/realtimex_provider.py +240 -0
- src/services/llm/registry.py +1 -0
- src/services/llm/telemetry.py +1 -0
- src/services/llm/types.py +1 -0
- src/services/llm/utils.py +1 -0
- src/services/prompt/__init__.py +1 -0
- src/services/prompt/manager.py +3 -2
- src/services/rag/__init__.py +27 -5
- src/services/rag/components/__init__.py +1 -0
- src/services/rag/components/base.py +1 -0
- src/services/rag/components/chunkers/__init__.py +1 -0
- src/services/rag/components/chunkers/base.py +1 -0
- src/services/rag/components/chunkers/fixed.py +1 -0
- src/services/rag/components/chunkers/numbered_item.py +1 -0
- src/services/rag/components/chunkers/semantic.py +1 -0
- src/services/rag/components/embedders/__init__.py +1 -0
- src/services/rag/components/embedders/base.py +1 -0
- src/services/rag/components/embedders/openai.py +1 -0
- src/services/rag/components/indexers/__init__.py +1 -0
- src/services/rag/components/indexers/base.py +1 -0
- src/services/rag/components/indexers/graph.py +5 -44
- src/services/rag/components/indexers/lightrag.py +5 -44
- src/services/rag/components/indexers/vector.py +1 -0
- src/services/rag/components/parsers/__init__.py +1 -0
- src/services/rag/components/parsers/base.py +1 -0
- src/services/rag/components/parsers/markdown.py +1 -0
- src/services/rag/components/parsers/pdf.py +1 -0
- src/services/rag/components/parsers/text.py +1 -0
- src/services/rag/components/retrievers/__init__.py +1 -0
- src/services/rag/components/retrievers/base.py +1 -0
- src/services/rag/components/retrievers/dense.py +1 -0
- src/services/rag/components/retrievers/hybrid.py +5 -44
- src/services/rag/components/retrievers/lightrag.py +5 -44
- src/services/rag/components/routing.py +48 -0
- src/services/rag/factory.py +112 -46
- src/services/rag/pipeline.py +1 -0
- src/services/rag/pipelines/__init__.py +27 -18
- src/services/rag/pipelines/lightrag.py +1 -0
- src/services/rag/pipelines/llamaindex.py +99 -0
- src/services/rag/pipelines/raganything.py +67 -100
- src/services/rag/pipelines/raganything_docling.py +368 -0
- src/services/rag/service.py +5 -12
- src/services/rag/types.py +1 -0
- src/services/rag/utils/__init__.py +17 -0
- src/services/rag/utils/image_migration.py +279 -0
- src/services/search/__init__.py +1 -0
- src/services/search/base.py +1 -0
- src/services/search/consolidation.py +1 -0
- src/services/search/providers/__init__.py +1 -0
- src/services/search/providers/baidu.py +1 -0
- src/services/search/providers/exa.py +1 -0
- src/services/search/providers/jina.py +1 -0
- src/services/search/providers/perplexity.py +1 -0
- src/services/search/providers/serper.py +1 -0
- src/services/search/providers/tavily.py +1 -0
- src/services/search/types.py +1 -0
- src/services/settings/__init__.py +1 -0
- src/services/settings/interface_settings.py +78 -0
- src/services/setup/__init__.py +1 -0
- src/services/tts/__init__.py +1 -0
- src/services/tts/config.py +1 -0
- src/utils/realtimex.py +284 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +0 -2
- src/services/rag/pipelines/academic.py +0 -44
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/licenses/LICENSE +0 -0
src/knowledge/manager.py
CHANGED
|
@@ -6,11 +6,61 @@ Knowledge Base Manager
|
|
|
6
6
|
Manages multiple knowledge bases and provides utilities for accessing them.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
+
from contextlib import contextmanager
|
|
9
10
|
from datetime import datetime
|
|
10
11
|
import hashlib
|
|
11
12
|
import json
|
|
13
|
+
import os
|
|
12
14
|
from pathlib import Path
|
|
13
15
|
import shutil
|
|
16
|
+
import sys
|
|
17
|
+
|
|
18
|
+
from src.services.rag.components.routing import FileTypeRouter
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Cross-platform file locking
|
|
22
|
+
@contextmanager
|
|
23
|
+
def file_lock_shared(file_handle):
|
|
24
|
+
"""Acquire a shared (read) lock on a file - cross-platform."""
|
|
25
|
+
if sys.platform == "win32":
|
|
26
|
+
import msvcrt
|
|
27
|
+
|
|
28
|
+
msvcrt.locking(file_handle.fileno(), msvcrt.LK_NBLCK, 1)
|
|
29
|
+
try:
|
|
30
|
+
yield
|
|
31
|
+
finally:
|
|
32
|
+
file_handle.seek(0)
|
|
33
|
+
msvcrt.locking(file_handle.fileno(), msvcrt.LK_UNLCK, 1)
|
|
34
|
+
else:
|
|
35
|
+
import fcntl
|
|
36
|
+
|
|
37
|
+
fcntl.flock(file_handle.fileno(), fcntl.LOCK_SH)
|
|
38
|
+
try:
|
|
39
|
+
yield
|
|
40
|
+
finally:
|
|
41
|
+
fcntl.flock(file_handle.fileno(), fcntl.LOCK_UN)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@contextmanager
|
|
45
|
+
def file_lock_exclusive(file_handle):
|
|
46
|
+
"""Acquire an exclusive (write) lock on a file - cross-platform."""
|
|
47
|
+
if sys.platform == "win32":
|
|
48
|
+
import msvcrt
|
|
49
|
+
|
|
50
|
+
msvcrt.locking(file_handle.fileno(), msvcrt.LK_NBLCK, 1)
|
|
51
|
+
try:
|
|
52
|
+
yield
|
|
53
|
+
finally:
|
|
54
|
+
file_handle.seek(0)
|
|
55
|
+
msvcrt.locking(file_handle.fileno(), msvcrt.LK_UNLCK, 1)
|
|
56
|
+
else:
|
|
57
|
+
import fcntl
|
|
58
|
+
|
|
59
|
+
fcntl.flock(file_handle.fileno(), fcntl.LOCK_EX)
|
|
60
|
+
try:
|
|
61
|
+
yield
|
|
62
|
+
finally:
|
|
63
|
+
fcntl.flock(file_handle.fileno(), fcntl.LOCK_UN)
|
|
14
64
|
|
|
15
65
|
|
|
16
66
|
class KnowledgeBaseManager:
|
|
@@ -27,42 +77,113 @@ class KnowledgeBaseManager:
|
|
|
27
77
|
def _load_config(self) -> dict:
|
|
28
78
|
"""Load knowledge base configuration (kb_config.json only stores KB list)"""
|
|
29
79
|
if self.config_file.exists():
|
|
30
|
-
|
|
31
|
-
|
|
80
|
+
try:
|
|
81
|
+
with open(self.config_file, encoding="utf-8") as f:
|
|
82
|
+
with file_lock_shared(f):
|
|
83
|
+
content = f.read()
|
|
84
|
+
if not content.strip():
|
|
85
|
+
# Empty file, return default
|
|
86
|
+
return {"knowledge_bases": {}}
|
|
87
|
+
config = json.loads(content)
|
|
88
|
+
|
|
89
|
+
# Ensure knowledge_bases key exists
|
|
90
|
+
if "knowledge_bases" not in config:
|
|
91
|
+
config["knowledge_bases"] = {}
|
|
92
|
+
|
|
32
93
|
# Migration: remove old "default" field if present
|
|
33
94
|
if "default" in config:
|
|
34
95
|
del config["default"]
|
|
35
|
-
#
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
json.dump(config, wf, indent=2, ensure_ascii=False)
|
|
39
|
-
except Exception:
|
|
40
|
-
pass
|
|
96
|
+
# Note: Don't save during load to avoid recursion issues
|
|
97
|
+
# The next _save_config() call will persist this change
|
|
98
|
+
|
|
41
99
|
return config
|
|
100
|
+
except (json.JSONDecodeError, Exception) as e:
|
|
101
|
+
print(f"[KnowledgeBaseManager] Error loading config: {e}")
|
|
102
|
+
return {"knowledge_bases": {}}
|
|
42
103
|
return {"knowledge_bases": {}}
|
|
43
104
|
|
|
44
105
|
def _save_config(self):
|
|
45
|
-
"""Save knowledge base configuration"""
|
|
106
|
+
"""Save knowledge base configuration (thread-safe with file locking)"""
|
|
107
|
+
# Use exclusive lock for writing
|
|
46
108
|
with open(self.config_file, "w", encoding="utf-8") as f:
|
|
47
|
-
|
|
109
|
+
with file_lock_exclusive(f):
|
|
110
|
+
json.dump(self.config, f, indent=2, ensure_ascii=False)
|
|
111
|
+
f.flush()
|
|
112
|
+
os.fsync(f.fileno()) # Ensure data is written to disk
|
|
113
|
+
|
|
114
|
+
def update_kb_status(
|
|
115
|
+
self,
|
|
116
|
+
name: str,
|
|
117
|
+
status: str,
|
|
118
|
+
progress: dict | None = None,
|
|
119
|
+
):
|
|
120
|
+
"""
|
|
121
|
+
Update knowledge base status and progress in kb_config.json.
|
|
122
|
+
|
|
123
|
+
Args:
|
|
124
|
+
name: Knowledge base name
|
|
125
|
+
status: Status string ("initializing", "processing", "ready", "error")
|
|
126
|
+
progress: Optional progress dict with keys like:
|
|
127
|
+
- stage: Current stage name
|
|
128
|
+
- message: Human-readable message
|
|
129
|
+
- percent: Progress percentage (0-100)
|
|
130
|
+
- current: Current item number
|
|
131
|
+
- total: Total items
|
|
132
|
+
- file_name: Current file being processed
|
|
133
|
+
- error: Error message (if status is "error")
|
|
134
|
+
"""
|
|
135
|
+
# Reload config to get latest state
|
|
136
|
+
self.config = self._load_config()
|
|
137
|
+
|
|
138
|
+
if "knowledge_bases" not in self.config:
|
|
139
|
+
self.config["knowledge_bases"] = {}
|
|
140
|
+
|
|
141
|
+
if name not in self.config["knowledge_bases"]:
|
|
142
|
+
# Auto-register if not exists
|
|
143
|
+
self.config["knowledge_bases"][name] = {
|
|
144
|
+
"path": name,
|
|
145
|
+
"description": f"Knowledge base: {name}",
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
kb_config = self.config["knowledge_bases"][name]
|
|
149
|
+
kb_config["status"] = status
|
|
150
|
+
kb_config["updated_at"] = datetime.now().isoformat()
|
|
151
|
+
|
|
152
|
+
if progress is not None:
|
|
153
|
+
kb_config["progress"] = progress
|
|
154
|
+
elif status == "ready":
|
|
155
|
+
# Clear progress when ready
|
|
156
|
+
kb_config["progress"] = {
|
|
157
|
+
"stage": "completed",
|
|
158
|
+
"message": "Ready",
|
|
159
|
+
"percent": 100,
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
self._save_config()
|
|
163
|
+
|
|
164
|
+
def get_kb_status(self, name: str) -> dict | None:
|
|
165
|
+
"""Get status and progress for a knowledge base."""
|
|
166
|
+
self.config = self._load_config()
|
|
167
|
+
kb_config = self.config.get("knowledge_bases", {}).get(name)
|
|
168
|
+
if not kb_config:
|
|
169
|
+
return None
|
|
170
|
+
return {
|
|
171
|
+
"status": kb_config.get("status", "unknown"),
|
|
172
|
+
"progress": kb_config.get("progress"),
|
|
173
|
+
"updated_at": kb_config.get("updated_at"),
|
|
174
|
+
}
|
|
48
175
|
|
|
49
176
|
def list_knowledge_bases(self) -> list[str]:
|
|
50
177
|
"""List all available knowledge bases from kb_config.json"""
|
|
51
|
-
|
|
178
|
+
# Always reload config from file to ensure we have the latest data
|
|
179
|
+
# This is important when new KBs are created by other processes/requests
|
|
180
|
+
self.config = self._load_config()
|
|
52
181
|
|
|
53
182
|
# Read knowledge base list from config file (this is the authoritative source)
|
|
183
|
+
# Return all KBs in config, regardless of directory status
|
|
184
|
+
# (status field indicates if KB is ready or still initializing)
|
|
54
185
|
config_kbs = self.config.get("knowledge_bases", {})
|
|
55
|
-
|
|
56
|
-
for kb_name in config_kbs.keys():
|
|
57
|
-
# Verify knowledge base directory exists
|
|
58
|
-
kb_dir = self.base_dir / kb_name
|
|
59
|
-
if kb_dir.exists() and kb_dir.is_dir():
|
|
60
|
-
kb_list.append(kb_name)
|
|
61
|
-
else:
|
|
62
|
-
# If in config but directory doesn't exist, log warning but don't add
|
|
63
|
-
print(
|
|
64
|
-
f"Warning: Knowledge base '{kb_name}' is in config but directory does not exist: {kb_dir}"
|
|
65
|
-
)
|
|
186
|
+
kb_list = list(config_kbs.keys())
|
|
66
187
|
|
|
67
188
|
# If no config file or config is empty, fallback to scanning directory (backward compatibility)
|
|
68
189
|
if not kb_list and self.base_dir.exists():
|
|
@@ -183,83 +304,115 @@ class KnowledgeBaseManager:
|
|
|
183
304
|
|
|
184
305
|
This method:
|
|
185
306
|
1. Gets the KB name (from parameter or default)
|
|
186
|
-
2. Reads
|
|
187
|
-
3.
|
|
307
|
+
2. Reads status and progress from kb_config.json
|
|
308
|
+
3. Reads metadata.json from the KB directory (if exists)
|
|
309
|
+
4. Collects statistics about files and RAG status
|
|
188
310
|
"""
|
|
311
|
+
# Reload config to get latest status
|
|
312
|
+
self.config = self._load_config()
|
|
313
|
+
|
|
189
314
|
kb_name = name or self.get_default()
|
|
190
315
|
if kb_name is None:
|
|
191
316
|
raise ValueError("No knowledge base name provided and no default set")
|
|
192
317
|
|
|
193
318
|
# Get knowledge base path
|
|
194
319
|
kb_dir = self.base_dir / kb_name
|
|
195
|
-
if not kb_dir.exists():
|
|
196
|
-
raise ValueError(f"Knowledge base directory does not exist: {kb_dir}")
|
|
197
320
|
|
|
198
|
-
#
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
321
|
+
# Get status and progress from kb_config.json
|
|
322
|
+
kb_config = self.config.get("knowledge_bases", {}).get(kb_name, {})
|
|
323
|
+
status = kb_config.get("status")
|
|
324
|
+
progress = kb_config.get("progress")
|
|
325
|
+
|
|
326
|
+
# KB might not have a directory yet if still initializing
|
|
327
|
+
dir_exists = kb_dir.exists()
|
|
328
|
+
|
|
329
|
+
# For old KBs without status field, determine status from rag_storage
|
|
330
|
+
if not status and dir_exists:
|
|
331
|
+
rag_storage_dir = kb_dir / "rag_storage"
|
|
332
|
+
if rag_storage_dir.exists() and any(rag_storage_dir.iterdir()):
|
|
333
|
+
status = "ready"
|
|
334
|
+
else:
|
|
335
|
+
status = "unknown"
|
|
336
|
+
elif not status:
|
|
337
|
+
status = "unknown"
|
|
203
338
|
|
|
204
339
|
info = {
|
|
205
340
|
"name": kb_name,
|
|
206
341
|
"path": str(kb_dir),
|
|
207
342
|
"is_default": kb_name == self.get_default(),
|
|
208
343
|
"metadata": {},
|
|
344
|
+
"status": status,
|
|
345
|
+
"progress": progress,
|
|
209
346
|
}
|
|
210
347
|
|
|
211
348
|
# Read metadata.json (if exists)
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
# metadata.json doesn't exist, use empty dict
|
|
222
|
-
info["metadata"] = {}
|
|
349
|
+
if dir_exists:
|
|
350
|
+
metadata_file = kb_dir / "metadata.json"
|
|
351
|
+
if metadata_file.exists():
|
|
352
|
+
try:
|
|
353
|
+
with open(metadata_file, encoding="utf-8") as f:
|
|
354
|
+
info["metadata"] = json.load(f)
|
|
355
|
+
except Exception as e:
|
|
356
|
+
print(f"Warning: Failed to read metadata.json for KB '{kb_name}': {e}")
|
|
357
|
+
info["metadata"] = {}
|
|
223
358
|
|
|
224
359
|
# Count files - handle errors gracefully
|
|
225
|
-
raw_dir = kb_dir / "raw"
|
|
226
|
-
images_dir = kb_dir / "images"
|
|
227
|
-
content_list_dir = kb_dir / "content_list"
|
|
228
|
-
rag_storage_dir = kb_dir / "rag_storage"
|
|
360
|
+
raw_dir = kb_dir / "raw" if dir_exists else None
|
|
361
|
+
images_dir = kb_dir / "images" if dir_exists else None
|
|
362
|
+
content_list_dir = kb_dir / "content_list" if dir_exists else None
|
|
363
|
+
rag_storage_dir = kb_dir / "rag_storage" if dir_exists else None
|
|
229
364
|
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
)
|
|
234
|
-
except Exception:
|
|
235
|
-
raw_count = 0
|
|
365
|
+
raw_count = 0
|
|
366
|
+
images_count = 0
|
|
367
|
+
content_lists_count = 0
|
|
236
368
|
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
369
|
+
if dir_exists:
|
|
370
|
+
try:
|
|
371
|
+
raw_count = (
|
|
372
|
+
len([f for f in raw_dir.iterdir() if f.is_file()]) if raw_dir.exists() else 0
|
|
373
|
+
)
|
|
374
|
+
except Exception:
|
|
375
|
+
pass
|
|
243
376
|
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
377
|
+
try:
|
|
378
|
+
images_count = (
|
|
379
|
+
len([f for f in images_dir.iterdir() if f.is_file()])
|
|
380
|
+
if images_dir.exists()
|
|
381
|
+
else 0
|
|
382
|
+
)
|
|
383
|
+
except Exception:
|
|
384
|
+
pass
|
|
385
|
+
|
|
386
|
+
try:
|
|
387
|
+
content_lists_count = (
|
|
388
|
+
len(list(content_list_dir.glob("*.json"))) if content_list_dir.exists() else 0
|
|
389
|
+
)
|
|
390
|
+
except Exception:
|
|
391
|
+
pass
|
|
250
392
|
|
|
251
393
|
metadata = info["metadata"]
|
|
252
394
|
rag_provider = metadata.get("rag_provider") if isinstance(metadata, dict) else None
|
|
395
|
+
# Also check kb_config for rag_provider (fallback)
|
|
396
|
+
if not rag_provider:
|
|
397
|
+
rag_provider = kb_config.get("rag_provider")
|
|
398
|
+
|
|
399
|
+
rag_initialized = (
|
|
400
|
+
dir_exists and rag_storage_dir and rag_storage_dir.exists() and rag_storage_dir.is_dir()
|
|
401
|
+
)
|
|
402
|
+
|
|
253
403
|
info["statistics"] = {
|
|
254
404
|
"raw_documents": raw_count,
|
|
255
405
|
"images": images_count,
|
|
256
406
|
"content_lists": content_lists_count,
|
|
257
|
-
"rag_initialized":
|
|
258
|
-
"rag_provider": rag_provider,
|
|
407
|
+
"rag_initialized": rag_initialized,
|
|
408
|
+
"rag_provider": rag_provider,
|
|
409
|
+
# Include status and progress in statistics for backward compatibility
|
|
410
|
+
"status": status,
|
|
411
|
+
"progress": progress,
|
|
259
412
|
}
|
|
260
413
|
|
|
261
414
|
# Try to get RAG statistics
|
|
262
|
-
if
|
|
415
|
+
if rag_initialized:
|
|
263
416
|
try:
|
|
264
417
|
entities_file = rag_storage_dir / "kv_store_full_entities.json"
|
|
265
418
|
relations_file = rag_storage_dir / "kv_store_full_relations.json"
|
|
@@ -406,14 +559,25 @@ class KnowledgeBaseManager:
|
|
|
406
559
|
if not folder.is_dir():
|
|
407
560
|
raise ValueError(f"Path is not a directory: {folder}")
|
|
408
561
|
|
|
409
|
-
# Get
|
|
410
|
-
|
|
562
|
+
# Get RAG provider from KB metadata to determine supported extensions
|
|
563
|
+
kb_dir = self.base_dir / kb_name
|
|
564
|
+
metadata_file = kb_dir / "metadata.json"
|
|
565
|
+
provider = "raganything" # default to most comprehensive
|
|
566
|
+
if metadata_file.exists():
|
|
567
|
+
try:
|
|
568
|
+
with open(metadata_file, encoding="utf-8") as f:
|
|
569
|
+
kb_meta = json.load(f)
|
|
570
|
+
provider = kb_meta.get("rag_provider") or "raganything"
|
|
571
|
+
except Exception:
|
|
572
|
+
pass
|
|
573
|
+
|
|
574
|
+
# Get supported files in folder based on provider
|
|
575
|
+
supported_extensions = FileTypeRouter.get_extensions_for_provider(provider)
|
|
411
576
|
files: list[Path] = []
|
|
412
577
|
for ext in supported_extensions:
|
|
413
578
|
files.extend(folder.glob(f"**/*{ext}"))
|
|
414
579
|
|
|
415
580
|
# Generate folder ID
|
|
416
|
-
import hashlib
|
|
417
581
|
|
|
418
582
|
folder_id = hashlib.md5( # noqa: S324
|
|
419
583
|
str(folder).encode(), usedforsecurity=False
|
|
@@ -523,12 +687,13 @@ class KnowledgeBaseManager:
|
|
|
523
687
|
|
|
524
688
|
return True
|
|
525
689
|
|
|
526
|
-
def scan_linked_folder(self, folder_path: str) -> list[str]:
|
|
690
|
+
def scan_linked_folder(self, folder_path: str, provider: str = "raganything") -> list[str]:
|
|
527
691
|
"""
|
|
528
692
|
Scan a linked folder and return list of supported file paths.
|
|
529
693
|
|
|
530
694
|
Args:
|
|
531
695
|
folder_path: Path to folder
|
|
696
|
+
provider: RAG provider to determine supported extensions (default: raganything)
|
|
532
697
|
|
|
533
698
|
Returns:
|
|
534
699
|
List of file paths (as strings)
|
|
@@ -538,7 +703,7 @@ class KnowledgeBaseManager:
|
|
|
538
703
|
if not folder.exists() or not folder.is_dir():
|
|
539
704
|
return []
|
|
540
705
|
|
|
541
|
-
supported_extensions =
|
|
706
|
+
supported_extensions = FileTypeRouter.get_extensions_for_provider(provider)
|
|
542
707
|
files = []
|
|
543
708
|
|
|
544
709
|
for ext in supported_extensions:
|
|
@@ -583,8 +748,20 @@ class KnowledgeBaseManager:
|
|
|
583
748
|
except Exception:
|
|
584
749
|
pass
|
|
585
750
|
|
|
586
|
-
#
|
|
587
|
-
|
|
751
|
+
# Get RAG provider from KB metadata to determine supported extensions
|
|
752
|
+
kb_dir = self.base_dir / kb_name
|
|
753
|
+
metadata_file = kb_dir / "metadata.json"
|
|
754
|
+
provider = "raganything" # default to most comprehensive
|
|
755
|
+
if metadata_file.exists():
|
|
756
|
+
try:
|
|
757
|
+
with open(metadata_file, encoding="utf-8") as f:
|
|
758
|
+
metadata = json.load(f)
|
|
759
|
+
provider = metadata.get("rag_provider") or "raganything"
|
|
760
|
+
except Exception:
|
|
761
|
+
pass
|
|
762
|
+
|
|
763
|
+
# Scan current files based on provider's supported extensions
|
|
764
|
+
supported_extensions = FileTypeRouter.get_extensions_for_provider(provider)
|
|
588
765
|
new_files = []
|
|
589
766
|
modified_files = []
|
|
590
767
|
|
|
@@ -99,13 +99,54 @@ class ProgressTracker:
|
|
|
99
99
|
print(f"[ProgressTracker] Callback error: {e}")
|
|
100
100
|
|
|
101
101
|
def _save_progress(self, progress: dict):
|
|
102
|
-
"""Save progress to file"""
|
|
102
|
+
"""Save progress to kb_config.json and local .progress.json file"""
|
|
103
|
+
# Save to kb_config.json (centralized config)
|
|
104
|
+
try:
|
|
105
|
+
from src.knowledge.manager import KnowledgeBaseManager
|
|
106
|
+
|
|
107
|
+
manager = KnowledgeBaseManager(base_dir=str(self.base_dir))
|
|
108
|
+
|
|
109
|
+
# Determine status based on stage
|
|
110
|
+
stage = progress.get("stage", "")
|
|
111
|
+
if stage == "completed":
|
|
112
|
+
status = "ready"
|
|
113
|
+
elif stage == "error":
|
|
114
|
+
status = "error"
|
|
115
|
+
elif stage in [
|
|
116
|
+
"initializing",
|
|
117
|
+
"processing_documents",
|
|
118
|
+
"processing_file",
|
|
119
|
+
"extracting_items",
|
|
120
|
+
]:
|
|
121
|
+
status = "processing"
|
|
122
|
+
else:
|
|
123
|
+
status = "initializing"
|
|
124
|
+
|
|
125
|
+
# Update kb_config.json with status and progress
|
|
126
|
+
manager.update_kb_status(
|
|
127
|
+
name=self.kb_name,
|
|
128
|
+
status=status,
|
|
129
|
+
progress={
|
|
130
|
+
"stage": progress.get("stage"),
|
|
131
|
+
"message": progress.get("message"),
|
|
132
|
+
"percent": progress.get("progress_percent", 0),
|
|
133
|
+
"current": progress.get("current", 0),
|
|
134
|
+
"total": progress.get("total", 0),
|
|
135
|
+
"file_name": progress.get("file_name"),
|
|
136
|
+
"error": progress.get("error"),
|
|
137
|
+
"timestamp": progress.get("timestamp"),
|
|
138
|
+
},
|
|
139
|
+
)
|
|
140
|
+
except Exception as e:
|
|
141
|
+
print(f"[ProgressTracker] Failed to save progress to kb_config.json: {e}")
|
|
142
|
+
|
|
143
|
+
# Also save to local .progress.json file (for backward compatibility)
|
|
103
144
|
try:
|
|
104
145
|
self.kb_dir.mkdir(parents=True, exist_ok=True)
|
|
105
146
|
with open(self.progress_file, "w", encoding="utf-8") as f:
|
|
106
147
|
json.dump(progress, f, indent=2, ensure_ascii=False)
|
|
107
148
|
except Exception as e:
|
|
108
|
-
print(f"[ProgressTracker] Failed to save progress: {e}")
|
|
149
|
+
print(f"[ProgressTracker] Failed to save progress to local file: {e}")
|
|
109
150
|
|
|
110
151
|
def update(
|
|
111
152
|
self,
|
src/knowledge/start_kb.py
CHANGED
|
@@ -15,6 +15,8 @@ try:
|
|
|
15
15
|
from .config import KNOWLEDGE_BASES_DIR, get_env_config, setup_paths
|
|
16
16
|
|
|
17
17
|
setup_paths()
|
|
18
|
+
from src.services.rag.components.routing import FileTypeRouter
|
|
19
|
+
|
|
18
20
|
from .extract_numbered_items import process_content_list
|
|
19
21
|
from .initializer import KnowledgeBaseInitializer
|
|
20
22
|
from .manager import KnowledgeBaseManager
|
|
@@ -28,6 +30,7 @@ except ImportError:
|
|
|
28
30
|
from src.knowledge.extract_numbered_items import process_content_list
|
|
29
31
|
from src.knowledge.initializer import KnowledgeBaseInitializer
|
|
30
32
|
from src.knowledge.manager import KnowledgeBaseManager
|
|
33
|
+
from src.services.rag.components.routing import FileTypeRouter
|
|
31
34
|
|
|
32
35
|
|
|
33
36
|
def list_knowledge_bases():
|
|
@@ -123,6 +126,12 @@ async def init_knowledge_base(args):
|
|
|
123
126
|
return
|
|
124
127
|
|
|
125
128
|
# Collect document files
|
|
129
|
+
# Use provider from env var or default to raganything (most comprehensive)
|
|
130
|
+
import os
|
|
131
|
+
|
|
132
|
+
provider = os.getenv("RAG_PROVIDER", "raganything")
|
|
133
|
+
glob_patterns = FileTypeRouter.get_glob_patterns_for_provider(provider)
|
|
134
|
+
|
|
126
135
|
doc_files = []
|
|
127
136
|
if args.docs:
|
|
128
137
|
doc_files.extend(args.docs)
|
|
@@ -130,8 +139,8 @@ async def init_knowledge_base(args):
|
|
|
130
139
|
if args.docs_dir:
|
|
131
140
|
docs_dir = Path(args.docs_dir)
|
|
132
141
|
if docs_dir.exists() and docs_dir.is_dir():
|
|
133
|
-
for
|
|
134
|
-
doc_files.extend([str(f) for f in docs_dir.glob(
|
|
142
|
+
for pattern in glob_patterns:
|
|
143
|
+
doc_files.extend([str(f) for f in docs_dir.glob(pattern)])
|
|
135
144
|
else:
|
|
136
145
|
print(f"✗ Error: Document directory does not exist: {args.docs_dir}\n")
|
|
137
146
|
return
|
src/logging/__init__.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
1
2
|
"""
|
|
2
3
|
Unified Logging System for DeepTutor
|
|
3
4
|
=====================================
|
|
@@ -39,6 +40,7 @@ from .adapters import (
|
|
|
39
40
|
from .config import (
|
|
40
41
|
LoggingConfig,
|
|
41
42
|
get_default_log_dir,
|
|
43
|
+
get_global_log_level,
|
|
42
44
|
load_logging_config,
|
|
43
45
|
)
|
|
44
46
|
|
|
@@ -58,6 +60,7 @@ from .logger import (
|
|
|
58
60
|
LogLevel,
|
|
59
61
|
get_logger,
|
|
60
62
|
reset_logger,
|
|
63
|
+
set_default_service_prefix,
|
|
61
64
|
)
|
|
62
65
|
|
|
63
66
|
# Statistics tracking
|
|
@@ -75,6 +78,7 @@ __all__ = [
|
|
|
75
78
|
"LogLevel",
|
|
76
79
|
"get_logger",
|
|
77
80
|
"reset_logger",
|
|
81
|
+
"set_default_service_prefix",
|
|
78
82
|
"ConsoleFormatter",
|
|
79
83
|
"FileFormatter",
|
|
80
84
|
# Handlers
|
|
@@ -100,4 +104,5 @@ __all__ = [
|
|
|
100
104
|
"LoggingConfig",
|
|
101
105
|
"load_logging_config",
|
|
102
106
|
"get_default_log_dir",
|
|
107
|
+
"get_global_log_level",
|
|
103
108
|
]
|
src/logging/adapters/__init__.py
CHANGED
src/logging/adapters/lightrag.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
2
3
|
"""
|
|
3
4
|
LightRAG Log Forwarder
|
|
4
5
|
======================
|
|
5
6
|
|
|
6
7
|
Forwards LightRAG and RAG-Anything logs to DeepTutor's unified logging system.
|
|
8
|
+
Uses the unified global log level from config/main.yaml -> logging.level
|
|
7
9
|
"""
|
|
8
10
|
|
|
9
11
|
from contextlib import contextmanager
|
|
@@ -31,17 +33,21 @@ class LightRAGLogForwarder(logging.Handler):
|
|
|
31
33
|
|
|
32
34
|
def emit(self, record: logging.LogRecord):
|
|
33
35
|
"""
|
|
34
|
-
Forward log record to DeepTutor logger.
|
|
35
|
-
All logs are forwarded as info level to maintain consistent format.
|
|
36
|
+
Forward log record to DeepTutor logger with proper level mapping.
|
|
36
37
|
"""
|
|
37
38
|
try:
|
|
38
|
-
# Get the original message directly without adding [LightRAG] prefix
|
|
39
|
-
# LightRAG already formats messages appropriately (e.g., "DEBUG: xxx" for debug logs)
|
|
40
39
|
message = record.getMessage()
|
|
41
40
|
|
|
42
|
-
#
|
|
43
|
-
|
|
44
|
-
|
|
41
|
+
# Map LightRAG log levels to DeepTutor logger methods
|
|
42
|
+
level = record.levelno
|
|
43
|
+
if level >= logging.ERROR:
|
|
44
|
+
self.ai_tutor_logger.error(message)
|
|
45
|
+
elif level >= logging.WARNING:
|
|
46
|
+
self.ai_tutor_logger.warning(message)
|
|
47
|
+
elif level >= logging.INFO:
|
|
48
|
+
self.ai_tutor_logger.info(message)
|
|
49
|
+
else:
|
|
50
|
+
self.ai_tutor_logger.debug(message)
|
|
45
51
|
|
|
46
52
|
except Exception:
|
|
47
53
|
# Avoid errors in forwarding from affecting main flow
|
|
@@ -50,7 +56,7 @@ class LightRAGLogForwarder(logging.Handler):
|
|
|
50
56
|
|
|
51
57
|
def get_lightrag_forwarding_config() -> dict:
|
|
52
58
|
"""
|
|
53
|
-
Load LightRAG forwarding configuration from main.yaml.
|
|
59
|
+
Load LightRAG forwarding configuration from config/main.yaml.
|
|
54
60
|
|
|
55
61
|
Returns:
|
|
56
62
|
dict: Configuration dictionary with defaults if not found
|
|
@@ -58,27 +64,28 @@ def get_lightrag_forwarding_config() -> dict:
|
|
|
58
64
|
try:
|
|
59
65
|
from src.services.config import load_config_with_main
|
|
60
66
|
|
|
61
|
-
|
|
67
|
+
from ..config import get_global_log_level
|
|
68
|
+
|
|
62
69
|
project_root = Path(__file__).resolve().parent.parent.parent.parent
|
|
63
70
|
config = load_config_with_main("solve_config.yaml", project_root)
|
|
71
|
+
logging_config = config.get("logging", {})
|
|
64
72
|
|
|
65
|
-
|
|
73
|
+
# Use the unified global log level
|
|
74
|
+
level = get_global_log_level()
|
|
66
75
|
|
|
67
76
|
return {
|
|
68
|
-
"enabled":
|
|
69
|
-
"min_level":
|
|
70
|
-
"
|
|
71
|
-
|
|
72
|
-
"logger_names", {"knowledge_init": "KnowledgeInit", "rag_tool": "RAGTool"}
|
|
77
|
+
"enabled": True,
|
|
78
|
+
"min_level": level,
|
|
79
|
+
"logger_names": logging_config.get(
|
|
80
|
+
"rag_logger_names", {"knowledge_init": "RAG-Init", "rag_tool": "RAG"}
|
|
73
81
|
),
|
|
74
82
|
}
|
|
75
83
|
except Exception:
|
|
76
84
|
# Return defaults if config loading fails
|
|
77
85
|
return {
|
|
78
86
|
"enabled": True,
|
|
79
|
-
"min_level": "
|
|
80
|
-
"
|
|
81
|
-
"logger_names": {"knowledge_init": "KnowledgeInit", "rag_tool": "RAGTool"},
|
|
87
|
+
"min_level": "DEBUG",
|
|
88
|
+
"logger_names": {"knowledge_init": "RAG-Init", "rag_tool": "RAG"},
|
|
82
89
|
}
|
|
83
90
|
|
|
84
91
|
|