realtimex-deeptutor 0.5.0.post1__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/METADATA +24 -17
  2. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/RECORD +143 -123
  3. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/WHEEL +1 -1
  4. realtimex_deeptutor-0.5.0.post3.dist-info/entry_points.txt +4 -0
  5. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/top_level.txt +1 -0
  6. scripts/__init__.py +1 -0
  7. scripts/audit_prompts.py +179 -0
  8. scripts/check_install.py +460 -0
  9. scripts/generate_roster.py +327 -0
  10. scripts/install_all.py +653 -0
  11. scripts/migrate_kb.py +655 -0
  12. scripts/start.py +807 -0
  13. scripts/start_web.py +632 -0
  14. scripts/sync_prompts_from_en.py +147 -0
  15. src/__init__.py +2 -2
  16. src/agents/ideagen/material_organizer_agent.py +2 -0
  17. src/agents/solve/__init__.py +6 -0
  18. src/agents/solve/main_solver.py +9 -0
  19. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +9 -7
  20. src/agents/solve/session_manager.py +345 -0
  21. src/api/main.py +14 -0
  22. src/api/routers/chat.py +3 -3
  23. src/api/routers/co_writer.py +12 -7
  24. src/api/routers/config.py +1 -0
  25. src/api/routers/guide.py +3 -1
  26. src/api/routers/ideagen.py +7 -0
  27. src/api/routers/knowledge.py +64 -12
  28. src/api/routers/question.py +2 -0
  29. src/api/routers/realtimex.py +137 -0
  30. src/api/routers/research.py +9 -0
  31. src/api/routers/solve.py +120 -2
  32. src/cli/__init__.py +13 -0
  33. src/cli/start.py +209 -0
  34. src/config/constants.py +11 -9
  35. src/knowledge/add_documents.py +453 -213
  36. src/knowledge/extract_numbered_items.py +9 -10
  37. src/knowledge/initializer.py +102 -101
  38. src/knowledge/manager.py +251 -74
  39. src/knowledge/progress_tracker.py +43 -2
  40. src/knowledge/start_kb.py +11 -2
  41. src/logging/__init__.py +5 -0
  42. src/logging/adapters/__init__.py +1 -0
  43. src/logging/adapters/lightrag.py +25 -18
  44. src/logging/adapters/llamaindex.py +1 -0
  45. src/logging/config.py +30 -27
  46. src/logging/handlers/__init__.py +1 -0
  47. src/logging/handlers/console.py +7 -50
  48. src/logging/handlers/file.py +5 -20
  49. src/logging/handlers/websocket.py +23 -19
  50. src/logging/logger.py +161 -126
  51. src/logging/stats/__init__.py +1 -0
  52. src/logging/stats/llm_stats.py +37 -17
  53. src/services/__init__.py +17 -1
  54. src/services/config/__init__.py +1 -0
  55. src/services/config/knowledge_base_config.py +1 -0
  56. src/services/config/loader.py +1 -1
  57. src/services/config/unified_config.py +211 -4
  58. src/services/embedding/__init__.py +1 -0
  59. src/services/embedding/adapters/__init__.py +3 -0
  60. src/services/embedding/adapters/base.py +1 -0
  61. src/services/embedding/adapters/cohere.py +1 -0
  62. src/services/embedding/adapters/jina.py +1 -0
  63. src/services/embedding/adapters/ollama.py +1 -0
  64. src/services/embedding/adapters/openai_compatible.py +1 -0
  65. src/services/embedding/adapters/realtimex.py +125 -0
  66. src/services/embedding/client.py +27 -0
  67. src/services/embedding/config.py +3 -0
  68. src/services/embedding/provider.py +1 -0
  69. src/services/llm/__init__.py +17 -3
  70. src/services/llm/capabilities.py +47 -0
  71. src/services/llm/client.py +32 -0
  72. src/services/llm/cloud_provider.py +21 -4
  73. src/services/llm/config.py +36 -2
  74. src/services/llm/error_mapping.py +1 -0
  75. src/services/llm/exceptions.py +30 -0
  76. src/services/llm/factory.py +55 -16
  77. src/services/llm/local_provider.py +1 -0
  78. src/services/llm/providers/anthropic.py +1 -0
  79. src/services/llm/providers/base_provider.py +1 -0
  80. src/services/llm/providers/open_ai.py +1 -0
  81. src/services/llm/realtimex_provider.py +240 -0
  82. src/services/llm/registry.py +1 -0
  83. src/services/llm/telemetry.py +1 -0
  84. src/services/llm/types.py +1 -0
  85. src/services/llm/utils.py +1 -0
  86. src/services/prompt/__init__.py +1 -0
  87. src/services/prompt/manager.py +3 -2
  88. src/services/rag/__init__.py +27 -5
  89. src/services/rag/components/__init__.py +1 -0
  90. src/services/rag/components/base.py +1 -0
  91. src/services/rag/components/chunkers/__init__.py +1 -0
  92. src/services/rag/components/chunkers/base.py +1 -0
  93. src/services/rag/components/chunkers/fixed.py +1 -0
  94. src/services/rag/components/chunkers/numbered_item.py +1 -0
  95. src/services/rag/components/chunkers/semantic.py +1 -0
  96. src/services/rag/components/embedders/__init__.py +1 -0
  97. src/services/rag/components/embedders/base.py +1 -0
  98. src/services/rag/components/embedders/openai.py +1 -0
  99. src/services/rag/components/indexers/__init__.py +1 -0
  100. src/services/rag/components/indexers/base.py +1 -0
  101. src/services/rag/components/indexers/graph.py +5 -44
  102. src/services/rag/components/indexers/lightrag.py +5 -44
  103. src/services/rag/components/indexers/vector.py +1 -0
  104. src/services/rag/components/parsers/__init__.py +1 -0
  105. src/services/rag/components/parsers/base.py +1 -0
  106. src/services/rag/components/parsers/markdown.py +1 -0
  107. src/services/rag/components/parsers/pdf.py +1 -0
  108. src/services/rag/components/parsers/text.py +1 -0
  109. src/services/rag/components/retrievers/__init__.py +1 -0
  110. src/services/rag/components/retrievers/base.py +1 -0
  111. src/services/rag/components/retrievers/dense.py +1 -0
  112. src/services/rag/components/retrievers/hybrid.py +5 -44
  113. src/services/rag/components/retrievers/lightrag.py +5 -44
  114. src/services/rag/components/routing.py +48 -0
  115. src/services/rag/factory.py +112 -46
  116. src/services/rag/pipeline.py +1 -0
  117. src/services/rag/pipelines/__init__.py +27 -18
  118. src/services/rag/pipelines/lightrag.py +1 -0
  119. src/services/rag/pipelines/llamaindex.py +99 -0
  120. src/services/rag/pipelines/raganything.py +67 -100
  121. src/services/rag/pipelines/raganything_docling.py +368 -0
  122. src/services/rag/service.py +5 -12
  123. src/services/rag/types.py +1 -0
  124. src/services/rag/utils/__init__.py +17 -0
  125. src/services/rag/utils/image_migration.py +279 -0
  126. src/services/search/__init__.py +1 -0
  127. src/services/search/base.py +1 -0
  128. src/services/search/consolidation.py +1 -0
  129. src/services/search/providers/__init__.py +1 -0
  130. src/services/search/providers/baidu.py +1 -0
  131. src/services/search/providers/exa.py +1 -0
  132. src/services/search/providers/jina.py +1 -0
  133. src/services/search/providers/perplexity.py +1 -0
  134. src/services/search/providers/serper.py +1 -0
  135. src/services/search/providers/tavily.py +1 -0
  136. src/services/search/types.py +1 -0
  137. src/services/settings/__init__.py +1 -0
  138. src/services/settings/interface_settings.py +78 -0
  139. src/services/setup/__init__.py +1 -0
  140. src/services/tts/__init__.py +1 -0
  141. src/services/tts/config.py +1 -0
  142. src/utils/realtimex.py +284 -0
  143. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +0 -2
  144. src/services/rag/pipelines/academic.py +0 -44
  145. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/licenses/LICENSE +0 -0
src/knowledge/manager.py CHANGED
@@ -6,11 +6,61 @@ Knowledge Base Manager
6
6
  Manages multiple knowledge bases and provides utilities for accessing them.
7
7
  """
8
8
 
9
+ from contextlib import contextmanager
9
10
  from datetime import datetime
10
11
  import hashlib
11
12
  import json
13
+ import os
12
14
  from pathlib import Path
13
15
  import shutil
16
+ import sys
17
+
18
+ from src.services.rag.components.routing import FileTypeRouter
19
+
20
+
21
+ # Cross-platform file locking
22
+ @contextmanager
23
+ def file_lock_shared(file_handle):
24
+ """Acquire a shared (read) lock on a file - cross-platform."""
25
+ if sys.platform == "win32":
26
+ import msvcrt
27
+
28
+ msvcrt.locking(file_handle.fileno(), msvcrt.LK_NBLCK, 1)
29
+ try:
30
+ yield
31
+ finally:
32
+ file_handle.seek(0)
33
+ msvcrt.locking(file_handle.fileno(), msvcrt.LK_UNLCK, 1)
34
+ else:
35
+ import fcntl
36
+
37
+ fcntl.flock(file_handle.fileno(), fcntl.LOCK_SH)
38
+ try:
39
+ yield
40
+ finally:
41
+ fcntl.flock(file_handle.fileno(), fcntl.LOCK_UN)
42
+
43
+
44
+ @contextmanager
45
+ def file_lock_exclusive(file_handle):
46
+ """Acquire an exclusive (write) lock on a file - cross-platform."""
47
+ if sys.platform == "win32":
48
+ import msvcrt
49
+
50
+ msvcrt.locking(file_handle.fileno(), msvcrt.LK_NBLCK, 1)
51
+ try:
52
+ yield
53
+ finally:
54
+ file_handle.seek(0)
55
+ msvcrt.locking(file_handle.fileno(), msvcrt.LK_UNLCK, 1)
56
+ else:
57
+ import fcntl
58
+
59
+ fcntl.flock(file_handle.fileno(), fcntl.LOCK_EX)
60
+ try:
61
+ yield
62
+ finally:
63
+ fcntl.flock(file_handle.fileno(), fcntl.LOCK_UN)
14
64
 
15
65
 
16
66
  class KnowledgeBaseManager:
@@ -27,42 +77,113 @@ class KnowledgeBaseManager:
27
77
  def _load_config(self) -> dict:
28
78
  """Load knowledge base configuration (kb_config.json only stores KB list)"""
29
79
  if self.config_file.exists():
30
- with open(self.config_file, encoding="utf-8") as f:
31
- config = json.load(f)
80
+ try:
81
+ with open(self.config_file, encoding="utf-8") as f:
82
+ with file_lock_shared(f):
83
+ content = f.read()
84
+ if not content.strip():
85
+ # Empty file, return default
86
+ return {"knowledge_bases": {}}
87
+ config = json.loads(content)
88
+
89
+ # Ensure knowledge_bases key exists
90
+ if "knowledge_bases" not in config:
91
+ config["knowledge_bases"] = {}
92
+
32
93
  # Migration: remove old "default" field if present
33
94
  if "default" in config:
34
95
  del config["default"]
35
- # Save cleaned config
36
- try:
37
- with open(self.config_file, "w", encoding="utf-8") as wf:
38
- json.dump(config, wf, indent=2, ensure_ascii=False)
39
- except Exception:
40
- pass
96
+ # Note: Don't save during load to avoid recursion issues
97
+ # The next _save_config() call will persist this change
98
+
41
99
  return config
100
+ except (json.JSONDecodeError, Exception) as e:
101
+ print(f"[KnowledgeBaseManager] Error loading config: {e}")
102
+ return {"knowledge_bases": {}}
42
103
  return {"knowledge_bases": {}}
43
104
 
44
105
  def _save_config(self):
45
- """Save knowledge base configuration"""
106
+ """Save knowledge base configuration (thread-safe with file locking)"""
107
+ # Use exclusive lock for writing
46
108
  with open(self.config_file, "w", encoding="utf-8") as f:
47
- json.dump(self.config, f, indent=2, ensure_ascii=False)
109
+ with file_lock_exclusive(f):
110
+ json.dump(self.config, f, indent=2, ensure_ascii=False)
111
+ f.flush()
112
+ os.fsync(f.fileno()) # Ensure data is written to disk
113
+
114
+ def update_kb_status(
115
+ self,
116
+ name: str,
117
+ status: str,
118
+ progress: dict | None = None,
119
+ ):
120
+ """
121
+ Update knowledge base status and progress in kb_config.json.
122
+
123
+ Args:
124
+ name: Knowledge base name
125
+ status: Status string ("initializing", "processing", "ready", "error")
126
+ progress: Optional progress dict with keys like:
127
+ - stage: Current stage name
128
+ - message: Human-readable message
129
+ - percent: Progress percentage (0-100)
130
+ - current: Current item number
131
+ - total: Total items
132
+ - file_name: Current file being processed
133
+ - error: Error message (if status is "error")
134
+ """
135
+ # Reload config to get latest state
136
+ self.config = self._load_config()
137
+
138
+ if "knowledge_bases" not in self.config:
139
+ self.config["knowledge_bases"] = {}
140
+
141
+ if name not in self.config["knowledge_bases"]:
142
+ # Auto-register if not exists
143
+ self.config["knowledge_bases"][name] = {
144
+ "path": name,
145
+ "description": f"Knowledge base: {name}",
146
+ }
147
+
148
+ kb_config = self.config["knowledge_bases"][name]
149
+ kb_config["status"] = status
150
+ kb_config["updated_at"] = datetime.now().isoformat()
151
+
152
+ if progress is not None:
153
+ kb_config["progress"] = progress
154
+ elif status == "ready":
155
+ # Clear progress when ready
156
+ kb_config["progress"] = {
157
+ "stage": "completed",
158
+ "message": "Ready",
159
+ "percent": 100,
160
+ }
161
+
162
+ self._save_config()
163
+
164
+ def get_kb_status(self, name: str) -> dict | None:
165
+ """Get status and progress for a knowledge base."""
166
+ self.config = self._load_config()
167
+ kb_config = self.config.get("knowledge_bases", {}).get(name)
168
+ if not kb_config:
169
+ return None
170
+ return {
171
+ "status": kb_config.get("status", "unknown"),
172
+ "progress": kb_config.get("progress"),
173
+ "updated_at": kb_config.get("updated_at"),
174
+ }
48
175
 
49
176
  def list_knowledge_bases(self) -> list[str]:
50
177
  """List all available knowledge bases from kb_config.json"""
51
- kb_list = []
178
+ # Always reload config from file to ensure we have the latest data
179
+ # This is important when new KBs are created by other processes/requests
180
+ self.config = self._load_config()
52
181
 
53
182
  # Read knowledge base list from config file (this is the authoritative source)
183
+ # Return all KBs in config, regardless of directory status
184
+ # (status field indicates if KB is ready or still initializing)
54
185
  config_kbs = self.config.get("knowledge_bases", {})
55
-
56
- for kb_name in config_kbs.keys():
57
- # Verify knowledge base directory exists
58
- kb_dir = self.base_dir / kb_name
59
- if kb_dir.exists() and kb_dir.is_dir():
60
- kb_list.append(kb_name)
61
- else:
62
- # If in config but directory doesn't exist, log warning but don't add
63
- print(
64
- f"Warning: Knowledge base '{kb_name}' is in config but directory does not exist: {kb_dir}"
65
- )
186
+ kb_list = list(config_kbs.keys())
66
187
 
67
188
  # If no config file or config is empty, fallback to scanning directory (backward compatibility)
68
189
  if not kb_list and self.base_dir.exists():
@@ -183,83 +304,115 @@ class KnowledgeBaseManager:
183
304
 
184
305
  This method:
185
306
  1. Gets the KB name (from parameter or default)
186
- 2. Reads metadata.json from the KB directory
187
- 3. Collects statistics about files and RAG status
307
+ 2. Reads status and progress from kb_config.json
308
+ 3. Reads metadata.json from the KB directory (if exists)
309
+ 4. Collects statistics about files and RAG status
188
310
  """
311
+ # Reload config to get latest status
312
+ self.config = self._load_config()
313
+
189
314
  kb_name = name or self.get_default()
190
315
  if kb_name is None:
191
316
  raise ValueError("No knowledge base name provided and no default set")
192
317
 
193
318
  # Get knowledge base path
194
319
  kb_dir = self.base_dir / kb_name
195
- if not kb_dir.exists():
196
- raise ValueError(f"Knowledge base directory does not exist: {kb_dir}")
197
320
 
198
- # Verify knowledge base is in config (if not, give warning but don't block)
199
- if kb_name not in self.config.get("knowledge_bases", {}):
200
- print(
201
- f"Warning: Knowledge base '{kb_name}' is not in kb_config.json, but directory exists"
202
- )
321
+ # Get status and progress from kb_config.json
322
+ kb_config = self.config.get("knowledge_bases", {}).get(kb_name, {})
323
+ status = kb_config.get("status")
324
+ progress = kb_config.get("progress")
325
+
326
+ # KB might not have a directory yet if still initializing
327
+ dir_exists = kb_dir.exists()
328
+
329
+ # For old KBs without status field, determine status from rag_storage
330
+ if not status and dir_exists:
331
+ rag_storage_dir = kb_dir / "rag_storage"
332
+ if rag_storage_dir.exists() and any(rag_storage_dir.iterdir()):
333
+ status = "ready"
334
+ else:
335
+ status = "unknown"
336
+ elif not status:
337
+ status = "unknown"
203
338
 
204
339
  info = {
205
340
  "name": kb_name,
206
341
  "path": str(kb_dir),
207
342
  "is_default": kb_name == self.get_default(),
208
343
  "metadata": {},
344
+ "status": status,
345
+ "progress": progress,
209
346
  }
210
347
 
211
348
  # Read metadata.json (if exists)
212
- metadata_file = kb_dir / "metadata.json"
213
- if metadata_file.exists():
214
- try:
215
- with open(metadata_file, encoding="utf-8") as f:
216
- info["metadata"] = json.load(f)
217
- except Exception as e:
218
- print(f"Warning: Failed to read metadata.json for KB '{kb_name}': {e}")
219
- info["metadata"] = {}
220
- else:
221
- # metadata.json doesn't exist, use empty dict
222
- info["metadata"] = {}
349
+ if dir_exists:
350
+ metadata_file = kb_dir / "metadata.json"
351
+ if metadata_file.exists():
352
+ try:
353
+ with open(metadata_file, encoding="utf-8") as f:
354
+ info["metadata"] = json.load(f)
355
+ except Exception as e:
356
+ print(f"Warning: Failed to read metadata.json for KB '{kb_name}': {e}")
357
+ info["metadata"] = {}
223
358
 
224
359
  # Count files - handle errors gracefully
225
- raw_dir = kb_dir / "raw"
226
- images_dir = kb_dir / "images"
227
- content_list_dir = kb_dir / "content_list"
228
- rag_storage_dir = kb_dir / "rag_storage"
360
+ raw_dir = kb_dir / "raw" if dir_exists else None
361
+ images_dir = kb_dir / "images" if dir_exists else None
362
+ content_list_dir = kb_dir / "content_list" if dir_exists else None
363
+ rag_storage_dir = kb_dir / "rag_storage" if dir_exists else None
229
364
 
230
- try:
231
- raw_count = (
232
- len([f for f in raw_dir.iterdir() if f.is_file()]) if raw_dir.exists() else 0
233
- )
234
- except Exception:
235
- raw_count = 0
365
+ raw_count = 0
366
+ images_count = 0
367
+ content_lists_count = 0
236
368
 
237
- try:
238
- images_count = (
239
- len([f for f in images_dir.iterdir() if f.is_file()]) if images_dir.exists() else 0
240
- )
241
- except Exception:
242
- images_count = 0
369
+ if dir_exists:
370
+ try:
371
+ raw_count = (
372
+ len([f for f in raw_dir.iterdir() if f.is_file()]) if raw_dir.exists() else 0
373
+ )
374
+ except Exception:
375
+ pass
243
376
 
244
- try:
245
- content_lists_count = (
246
- len(list(content_list_dir.glob("*.json"))) if content_list_dir.exists() else 0
247
- )
248
- except Exception:
249
- content_lists_count = 0
377
+ try:
378
+ images_count = (
379
+ len([f for f in images_dir.iterdir() if f.is_file()])
380
+ if images_dir.exists()
381
+ else 0
382
+ )
383
+ except Exception:
384
+ pass
385
+
386
+ try:
387
+ content_lists_count = (
388
+ len(list(content_list_dir.glob("*.json"))) if content_list_dir.exists() else 0
389
+ )
390
+ except Exception:
391
+ pass
250
392
 
251
393
  metadata = info["metadata"]
252
394
  rag_provider = metadata.get("rag_provider") if isinstance(metadata, dict) else None
395
+ # Also check kb_config for rag_provider (fallback)
396
+ if not rag_provider:
397
+ rag_provider = kb_config.get("rag_provider")
398
+
399
+ rag_initialized = (
400
+ dir_exists and rag_storage_dir and rag_storage_dir.exists() and rag_storage_dir.is_dir()
401
+ )
402
+
253
403
  info["statistics"] = {
254
404
  "raw_documents": raw_count,
255
405
  "images": images_count,
256
406
  "content_lists": content_lists_count,
257
- "rag_initialized": rag_storage_dir.exists() and rag_storage_dir.is_dir(),
258
- "rag_provider": rag_provider, # Add RAG provider info
407
+ "rag_initialized": rag_initialized,
408
+ "rag_provider": rag_provider,
409
+ # Include status and progress in statistics for backward compatibility
410
+ "status": status,
411
+ "progress": progress,
259
412
  }
260
413
 
261
414
  # Try to get RAG statistics
262
- if rag_storage_dir.exists() and rag_storage_dir.is_dir():
415
+ if rag_initialized:
263
416
  try:
264
417
  entities_file = rag_storage_dir / "kv_store_full_entities.json"
265
418
  relations_file = rag_storage_dir / "kv_store_full_relations.json"
@@ -406,14 +559,25 @@ class KnowledgeBaseManager:
406
559
  if not folder.is_dir():
407
560
  raise ValueError(f"Path is not a directory: {folder}")
408
561
 
409
- # Get supported files in folder
410
- supported_extensions = {".pdf", ".docx", ".doc", ".txt", ".md", ".markdown"}
562
+ # Get RAG provider from KB metadata to determine supported extensions
563
+ kb_dir = self.base_dir / kb_name
564
+ metadata_file = kb_dir / "metadata.json"
565
+ provider = "raganything" # default to most comprehensive
566
+ if metadata_file.exists():
567
+ try:
568
+ with open(metadata_file, encoding="utf-8") as f:
569
+ kb_meta = json.load(f)
570
+ provider = kb_meta.get("rag_provider") or "raganything"
571
+ except Exception:
572
+ pass
573
+
574
+ # Get supported files in folder based on provider
575
+ supported_extensions = FileTypeRouter.get_extensions_for_provider(provider)
411
576
  files: list[Path] = []
412
577
  for ext in supported_extensions:
413
578
  files.extend(folder.glob(f"**/*{ext}"))
414
579
 
415
580
  # Generate folder ID
416
- import hashlib
417
581
 
418
582
  folder_id = hashlib.md5( # noqa: S324
419
583
  str(folder).encode(), usedforsecurity=False
@@ -523,12 +687,13 @@ class KnowledgeBaseManager:
523
687
 
524
688
  return True
525
689
 
526
- def scan_linked_folder(self, folder_path: str) -> list[str]:
690
+ def scan_linked_folder(self, folder_path: str, provider: str = "raganything") -> list[str]:
527
691
  """
528
692
  Scan a linked folder and return list of supported file paths.
529
693
 
530
694
  Args:
531
695
  folder_path: Path to folder
696
+ provider: RAG provider to determine supported extensions (default: raganything)
532
697
 
533
698
  Returns:
534
699
  List of file paths (as strings)
@@ -538,7 +703,7 @@ class KnowledgeBaseManager:
538
703
  if not folder.exists() or not folder.is_dir():
539
704
  return []
540
705
 
541
- supported_extensions = {".pdf", ".docx", ".doc", ".txt", ".md", ".markdown"}
706
+ supported_extensions = FileTypeRouter.get_extensions_for_provider(provider)
542
707
  files = []
543
708
 
544
709
  for ext in supported_extensions:
@@ -583,8 +748,20 @@ class KnowledgeBaseManager:
583
748
  except Exception:
584
749
  pass
585
750
 
586
- # Scan current files
587
- supported_extensions = {".pdf", ".docx", ".doc", ".txt", ".md", ".markdown"}
751
+ # Get RAG provider from KB metadata to determine supported extensions
752
+ kb_dir = self.base_dir / kb_name
753
+ metadata_file = kb_dir / "metadata.json"
754
+ provider = "raganything" # default to most comprehensive
755
+ if metadata_file.exists():
756
+ try:
757
+ with open(metadata_file, encoding="utf-8") as f:
758
+ metadata = json.load(f)
759
+ provider = metadata.get("rag_provider") or "raganything"
760
+ except Exception:
761
+ pass
762
+
763
+ # Scan current files based on provider's supported extensions
764
+ supported_extensions = FileTypeRouter.get_extensions_for_provider(provider)
588
765
  new_files = []
589
766
  modified_files = []
590
767
 
src/knowledge/progress_tracker.py CHANGED
@@ -99,13 +99,54 @@ class ProgressTracker:
99
99
  print(f"[ProgressTracker] Callback error: {e}")
100
100
 
101
101
  def _save_progress(self, progress: dict):
102
- """Save progress to file"""
102
+ """Save progress to kb_config.json and local .progress.json file"""
103
+ # Save to kb_config.json (centralized config)
104
+ try:
105
+ from src.knowledge.manager import KnowledgeBaseManager
106
+
107
+ manager = KnowledgeBaseManager(base_dir=str(self.base_dir))
108
+
109
+ # Determine status based on stage
110
+ stage = progress.get("stage", "")
111
+ if stage == "completed":
112
+ status = "ready"
113
+ elif stage == "error":
114
+ status = "error"
115
+ elif stage in [
116
+ "initializing",
117
+ "processing_documents",
118
+ "processing_file",
119
+ "extracting_items",
120
+ ]:
121
+ status = "processing"
122
+ else:
123
+ status = "initializing"
124
+
125
+ # Update kb_config.json with status and progress
126
+ manager.update_kb_status(
127
+ name=self.kb_name,
128
+ status=status,
129
+ progress={
130
+ "stage": progress.get("stage"),
131
+ "message": progress.get("message"),
132
+ "percent": progress.get("progress_percent", 0),
133
+ "current": progress.get("current", 0),
134
+ "total": progress.get("total", 0),
135
+ "file_name": progress.get("file_name"),
136
+ "error": progress.get("error"),
137
+ "timestamp": progress.get("timestamp"),
138
+ },
139
+ )
140
+ except Exception as e:
141
+ print(f"[ProgressTracker] Failed to save progress to kb_config.json: {e}")
142
+
143
+ # Also save to local .progress.json file (for backward compatibility)
103
144
  try:
104
145
  self.kb_dir.mkdir(parents=True, exist_ok=True)
105
146
  with open(self.progress_file, "w", encoding="utf-8") as f:
106
147
  json.dump(progress, f, indent=2, ensure_ascii=False)
107
148
  except Exception as e:
108
- print(f"[ProgressTracker] Failed to save progress: {e}")
149
+ print(f"[ProgressTracker] Failed to save progress to local file: {e}")
109
150
 
110
151
  def update(
111
152
  self,
src/knowledge/start_kb.py CHANGED
@@ -15,6 +15,8 @@ try:
15
15
  from .config import KNOWLEDGE_BASES_DIR, get_env_config, setup_paths
16
16
 
17
17
  setup_paths()
18
+ from src.services.rag.components.routing import FileTypeRouter
19
+
18
20
  from .extract_numbered_items import process_content_list
19
21
  from .initializer import KnowledgeBaseInitializer
20
22
  from .manager import KnowledgeBaseManager
@@ -28,6 +30,7 @@ except ImportError:
28
30
  from src.knowledge.extract_numbered_items import process_content_list
29
31
  from src.knowledge.initializer import KnowledgeBaseInitializer
30
32
  from src.knowledge.manager import KnowledgeBaseManager
33
+ from src.services.rag.components.routing import FileTypeRouter
31
34
 
32
35
 
33
36
  def list_knowledge_bases():
@@ -123,6 +126,12 @@ async def init_knowledge_base(args):
123
126
  return
124
127
 
125
128
  # Collect document files
129
+ # Use provider from env var or default to raganything (most comprehensive)
130
+ import os
131
+
132
+ provider = os.getenv("RAG_PROVIDER", "raganything")
133
+ glob_patterns = FileTypeRouter.get_glob_patterns_for_provider(provider)
134
+
126
135
  doc_files = []
127
136
  if args.docs:
128
137
  doc_files.extend(args.docs)
@@ -130,8 +139,8 @@ async def init_knowledge_base(args):
130
139
  if args.docs_dir:
131
140
  docs_dir = Path(args.docs_dir)
132
141
  if docs_dir.exists() and docs_dir.is_dir():
133
- for ext in ["*.pdf", "*.docx", "*.doc", "*.txt", "*.md"]:
134
- doc_files.extend([str(f) for f in docs_dir.glob(ext)])
142
+ for pattern in glob_patterns:
143
+ doc_files.extend([str(f) for f in docs_dir.glob(pattern)])
135
144
  else:
136
145
  print(f"✗ Error: Document directory does not exist: {args.docs_dir}\n")
137
146
  return
src/logging/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  """
2
3
  Unified Logging System for DeepTutor
3
4
  =====================================
@@ -39,6 +40,7 @@ from .adapters import (
39
40
  from .config import (
40
41
  LoggingConfig,
41
42
  get_default_log_dir,
43
+ get_global_log_level,
42
44
  load_logging_config,
43
45
  )
44
46
 
@@ -58,6 +60,7 @@ from .logger import (
58
60
  LogLevel,
59
61
  get_logger,
60
62
  reset_logger,
63
+ set_default_service_prefix,
61
64
  )
62
65
 
63
66
  # Statistics tracking
@@ -75,6 +78,7 @@ __all__ = [
75
78
  "LogLevel",
76
79
  "get_logger",
77
80
  "reset_logger",
81
+ "set_default_service_prefix",
78
82
  "ConsoleFormatter",
79
83
  "FileFormatter",
80
84
  # Handlers
@@ -100,4 +104,5 @@ __all__ = [
100
104
  "LoggingConfig",
101
105
  "load_logging_config",
102
106
  "get_default_log_dir",
107
+ "get_global_log_level",
103
108
  ]
src/logging/adapters/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  """
2
3
  Log Adapters
3
4
  ============
src/logging/adapters/lightrag.py CHANGED
@@ -1,9 +1,11 @@
1
1
  #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
2
3
  """
3
4
  LightRAG Log Forwarder
4
5
  ======================
5
6
 
6
7
  Forwards LightRAG and RAG-Anything logs to DeepTutor's unified logging system.
8
+ Uses the unified global log level from config/main.yaml -> logging.level
7
9
  """
8
10
 
9
11
  from contextlib import contextmanager
@@ -31,17 +33,21 @@ class LightRAGLogForwarder(logging.Handler):
31
33
 
32
34
  def emit(self, record: logging.LogRecord):
33
35
  """
34
- Forward log record to DeepTutor logger.
35
- All logs are forwarded as info level to maintain consistent format.
36
+ Forward log record to DeepTutor logger with proper level mapping.
36
37
  """
37
38
  try:
38
- # Get the original message directly without adding [LightRAG] prefix
39
- # LightRAG already formats messages appropriately (e.g., "DEBUG: xxx" for debug logs)
40
39
  message = record.getMessage()
41
40
 
42
- # Use info() for all levels to maintain consistent format
43
- # This ensures all logs appear as [RAGTool] ... (or [RAGTool] DEBUG: ... for debug)
44
- self.ai_tutor_logger.info(message)
41
+ # Map LightRAG log levels to DeepTutor logger methods
42
+ level = record.levelno
43
+ if level >= logging.ERROR:
44
+ self.ai_tutor_logger.error(message)
45
+ elif level >= logging.WARNING:
46
+ self.ai_tutor_logger.warning(message)
47
+ elif level >= logging.INFO:
48
+ self.ai_tutor_logger.info(message)
49
+ else:
50
+ self.ai_tutor_logger.debug(message)
45
51
 
46
52
  except Exception:
47
53
  # Avoid errors in forwarding from affecting main flow
@@ -50,7 +56,7 @@ class LightRAGLogForwarder(logging.Handler):
50
56
 
51
57
  def get_lightrag_forwarding_config() -> dict:
52
58
  """
53
- Load LightRAG forwarding configuration from main.yaml.
59
+ Load LightRAG forwarding configuration from config/main.yaml.
54
60
 
55
61
  Returns:
56
62
  dict: Configuration dictionary with defaults if not found
@@ -58,27 +64,28 @@ def get_lightrag_forwarding_config() -> dict:
58
64
  try:
59
65
  from src.services.config import load_config_with_main
60
66
 
61
- # Use resolve() to get absolute path, ensuring correct project root regardless of working directory
67
+ from ..config import get_global_log_level
68
+
62
69
  project_root = Path(__file__).resolve().parent.parent.parent.parent
63
70
  config = load_config_with_main("solve_config.yaml", project_root)
71
+ logging_config = config.get("logging", {})
64
72
 
65
- forwarding_config = config.get("logging", {}).get("lightrag_forwarding", {})
73
+ # Use the unified global log level
74
+ level = get_global_log_level()
66
75
 
67
76
  return {
68
- "enabled": forwarding_config.get("enabled", True),
69
- "min_level": forwarding_config.get("min_level", "INFO"),
70
- "add_prefix": forwarding_config.get("add_prefix", True),
71
- "logger_names": forwarding_config.get(
72
- "logger_names", {"knowledge_init": "KnowledgeInit", "rag_tool": "RAGTool"}
77
+ "enabled": True,
78
+ "min_level": level,
79
+ "logger_names": logging_config.get(
80
+ "rag_logger_names", {"knowledge_init": "RAG-Init", "rag_tool": "RAG"}
73
81
  ),
74
82
  }
75
83
  except Exception:
76
84
  # Return defaults if config loading fails
77
85
  return {
78
86
  "enabled": True,
79
- "min_level": "INFO",
80
- "add_prefix": True,
81
- "logger_names": {"knowledge_init": "KnowledgeInit", "rag_tool": "RAGTool"},
87
+ "min_level": "DEBUG",
88
+ "logger_names": {"knowledge_init": "RAG-Init", "rag_tool": "RAG"},
82
89
  }
83
90
 
84
91
 
src/logging/adapters/llamaindex.py CHANGED
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
2
3
  """
3
4
  LlamaIndex Log Forwarder
4
5
  ========================