realtimex-deeptutor 0.5.0.post1__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/METADATA +24 -17
  2. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/RECORD +143 -123
  3. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/WHEEL +1 -1
  4. realtimex_deeptutor-0.5.0.post3.dist-info/entry_points.txt +4 -0
  5. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/top_level.txt +1 -0
  6. scripts/__init__.py +1 -0
  7. scripts/audit_prompts.py +179 -0
  8. scripts/check_install.py +460 -0
  9. scripts/generate_roster.py +327 -0
  10. scripts/install_all.py +653 -0
  11. scripts/migrate_kb.py +655 -0
  12. scripts/start.py +807 -0
  13. scripts/start_web.py +632 -0
  14. scripts/sync_prompts_from_en.py +147 -0
  15. src/__init__.py +2 -2
  16. src/agents/ideagen/material_organizer_agent.py +2 -0
  17. src/agents/solve/__init__.py +6 -0
  18. src/agents/solve/main_solver.py +9 -0
  19. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +9 -7
  20. src/agents/solve/session_manager.py +345 -0
  21. src/api/main.py +14 -0
  22. src/api/routers/chat.py +3 -3
  23. src/api/routers/co_writer.py +12 -7
  24. src/api/routers/config.py +1 -0
  25. src/api/routers/guide.py +3 -1
  26. src/api/routers/ideagen.py +7 -0
  27. src/api/routers/knowledge.py +64 -12
  28. src/api/routers/question.py +2 -0
  29. src/api/routers/realtimex.py +137 -0
  30. src/api/routers/research.py +9 -0
  31. src/api/routers/solve.py +120 -2
  32. src/cli/__init__.py +13 -0
  33. src/cli/start.py +209 -0
  34. src/config/constants.py +11 -9
  35. src/knowledge/add_documents.py +453 -213
  36. src/knowledge/extract_numbered_items.py +9 -10
  37. src/knowledge/initializer.py +102 -101
  38. src/knowledge/manager.py +251 -74
  39. src/knowledge/progress_tracker.py +43 -2
  40. src/knowledge/start_kb.py +11 -2
  41. src/logging/__init__.py +5 -0
  42. src/logging/adapters/__init__.py +1 -0
  43. src/logging/adapters/lightrag.py +25 -18
  44. src/logging/adapters/llamaindex.py +1 -0
  45. src/logging/config.py +30 -27
  46. src/logging/handlers/__init__.py +1 -0
  47. src/logging/handlers/console.py +7 -50
  48. src/logging/handlers/file.py +5 -20
  49. src/logging/handlers/websocket.py +23 -19
  50. src/logging/logger.py +161 -126
  51. src/logging/stats/__init__.py +1 -0
  52. src/logging/stats/llm_stats.py +37 -17
  53. src/services/__init__.py +17 -1
  54. src/services/config/__init__.py +1 -0
  55. src/services/config/knowledge_base_config.py +1 -0
  56. src/services/config/loader.py +1 -1
  57. src/services/config/unified_config.py +211 -4
  58. src/services/embedding/__init__.py +1 -0
  59. src/services/embedding/adapters/__init__.py +3 -0
  60. src/services/embedding/adapters/base.py +1 -0
  61. src/services/embedding/adapters/cohere.py +1 -0
  62. src/services/embedding/adapters/jina.py +1 -0
  63. src/services/embedding/adapters/ollama.py +1 -0
  64. src/services/embedding/adapters/openai_compatible.py +1 -0
  65. src/services/embedding/adapters/realtimex.py +125 -0
  66. src/services/embedding/client.py +27 -0
  67. src/services/embedding/config.py +3 -0
  68. src/services/embedding/provider.py +1 -0
  69. src/services/llm/__init__.py +17 -3
  70. src/services/llm/capabilities.py +47 -0
  71. src/services/llm/client.py +32 -0
  72. src/services/llm/cloud_provider.py +21 -4
  73. src/services/llm/config.py +36 -2
  74. src/services/llm/error_mapping.py +1 -0
  75. src/services/llm/exceptions.py +30 -0
  76. src/services/llm/factory.py +55 -16
  77. src/services/llm/local_provider.py +1 -0
  78. src/services/llm/providers/anthropic.py +1 -0
  79. src/services/llm/providers/base_provider.py +1 -0
  80. src/services/llm/providers/open_ai.py +1 -0
  81. src/services/llm/realtimex_provider.py +240 -0
  82. src/services/llm/registry.py +1 -0
  83. src/services/llm/telemetry.py +1 -0
  84. src/services/llm/types.py +1 -0
  85. src/services/llm/utils.py +1 -0
  86. src/services/prompt/__init__.py +1 -0
  87. src/services/prompt/manager.py +3 -2
  88. src/services/rag/__init__.py +27 -5
  89. src/services/rag/components/__init__.py +1 -0
  90. src/services/rag/components/base.py +1 -0
  91. src/services/rag/components/chunkers/__init__.py +1 -0
  92. src/services/rag/components/chunkers/base.py +1 -0
  93. src/services/rag/components/chunkers/fixed.py +1 -0
  94. src/services/rag/components/chunkers/numbered_item.py +1 -0
  95. src/services/rag/components/chunkers/semantic.py +1 -0
  96. src/services/rag/components/embedders/__init__.py +1 -0
  97. src/services/rag/components/embedders/base.py +1 -0
  98. src/services/rag/components/embedders/openai.py +1 -0
  99. src/services/rag/components/indexers/__init__.py +1 -0
  100. src/services/rag/components/indexers/base.py +1 -0
  101. src/services/rag/components/indexers/graph.py +5 -44
  102. src/services/rag/components/indexers/lightrag.py +5 -44
  103. src/services/rag/components/indexers/vector.py +1 -0
  104. src/services/rag/components/parsers/__init__.py +1 -0
  105. src/services/rag/components/parsers/base.py +1 -0
  106. src/services/rag/components/parsers/markdown.py +1 -0
  107. src/services/rag/components/parsers/pdf.py +1 -0
  108. src/services/rag/components/parsers/text.py +1 -0
  109. src/services/rag/components/retrievers/__init__.py +1 -0
  110. src/services/rag/components/retrievers/base.py +1 -0
  111. src/services/rag/components/retrievers/dense.py +1 -0
  112. src/services/rag/components/retrievers/hybrid.py +5 -44
  113. src/services/rag/components/retrievers/lightrag.py +5 -44
  114. src/services/rag/components/routing.py +48 -0
  115. src/services/rag/factory.py +112 -46
  116. src/services/rag/pipeline.py +1 -0
  117. src/services/rag/pipelines/__init__.py +27 -18
  118. src/services/rag/pipelines/lightrag.py +1 -0
  119. src/services/rag/pipelines/llamaindex.py +99 -0
  120. src/services/rag/pipelines/raganything.py +67 -100
  121. src/services/rag/pipelines/raganything_docling.py +368 -0
  122. src/services/rag/service.py +5 -12
  123. src/services/rag/types.py +1 -0
  124. src/services/rag/utils/__init__.py +17 -0
  125. src/services/rag/utils/image_migration.py +279 -0
  126. src/services/search/__init__.py +1 -0
  127. src/services/search/base.py +1 -0
  128. src/services/search/consolidation.py +1 -0
  129. src/services/search/providers/__init__.py +1 -0
  130. src/services/search/providers/baidu.py +1 -0
  131. src/services/search/providers/exa.py +1 -0
  132. src/services/search/providers/jina.py +1 -0
  133. src/services/search/providers/perplexity.py +1 -0
  134. src/services/search/providers/serper.py +1 -0
  135. src/services/search/providers/tavily.py +1 -0
  136. src/services/search/types.py +1 -0
  137. src/services/settings/__init__.py +1 -0
  138. src/services/settings/interface_settings.py +78 -0
  139. src/services/setup/__init__.py +1 -0
  140. src/services/tts/__init__.py +1 -0
  141. src/services/tts/config.py +1 -0
  142. src/utils/realtimex.py +284 -0
  143. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +0 -2
  144. src/services/rag/pipelines/academic.py +0 -44
  145. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/licenses/LICENSE +0 -0
@@ -19,9 +19,8 @@ from typing import Any
19
19
  sys.path.append(str(Path(__file__).parent.parent.parent))
20
20
 
21
21
  from dotenv import load_dotenv
22
- from lightrag.llm.openai import openai_complete_if_cache
23
22
 
24
- from src.services.llm import get_llm_config
23
+ from src.services.llm import get_llm_client, get_llm_config
25
24
 
26
25
  load_dotenv(dotenv_path=".env", override=False)
27
26
 
@@ -62,18 +61,18 @@ async def _call_llm_async(
62
61
  temperature: float = 0.1,
63
62
  model: str = None,
64
63
  ) -> str:
65
- """Asynchronously call LLM"""
66
- # If model not specified, get from env_config
64
+ """Asynchronously call LLM using unified LLM service"""
65
+ # Get unified LLM client (handles all provider differences and env var setup)
66
+ llm_client = get_llm_client()
67
+ llm_model_func = llm_client.get_model_func()
68
+
69
+ # If model not specified, use configured model
67
70
  if model is None:
68
- llm_cfg = get_llm_config()
69
- model = llm_cfg.model
71
+ model = llm_client.config.model
70
72
 
71
- result = openai_complete_if_cache(
72
- model,
73
+ result = llm_model_func(
73
74
  prompt,
74
75
  system_prompt=system_prompt,
75
- api_key=api_key,
76
- base_url=base_url,
77
76
  max_tokens=max_tokens,
78
77
  temperature=temperature,
79
78
  )
@@ -21,6 +21,7 @@ import shutil
21
21
  from src.logging import get_logger
22
22
  from src.services.embedding import get_embedding_config
23
23
  from src.services.llm import get_llm_config
24
+ from src.services.rag.components.routing import FileTypeRouter
24
25
  from src.services.rag.service import RAGService
25
26
 
26
27
  logger = get_logger("KnowledgeInit")
@@ -59,39 +60,44 @@ class KnowledgeBaseInitializer:
59
60
  self.rag_provider = rag_provider
60
61
 
61
62
  def _register_to_config(self):
62
- """Register KB to kb_config.json (only knowledge_bases list, no default)."""
63
- config_file = self.base_dir / "kb_config.json"
64
- if config_file.exists():
65
- try:
66
- with open(config_file, encoding="utf-8") as f:
67
- config = json.load(f)
68
- except Exception as e:
69
- logger.warning(f"Failed to read config: {e}, creating new")
70
- config = {"knowledge_bases": {}}
71
- else:
72
- config = {"knowledge_bases": {}}
73
-
74
- if "knowledge_bases" not in config:
75
- config["knowledge_bases"] = {}
76
-
77
- # Remove old "default" field if exists (migration)
78
- if "default" in config:
79
- del config["default"]
63
+ """Register KB to kb_config.json using KnowledgeBaseManager for consistency."""
64
+ try:
65
+ from src.knowledge.manager import KnowledgeBaseManager
66
+
67
+ manager = KnowledgeBaseManager(base_dir=str(self.base_dir))
68
+
69
+ # Check if already registered (reload config to get latest)
70
+ manager.config = manager._load_config()
71
+ if self.kb_name in manager.config.get("knowledge_bases", {}):
72
+ logger.info(" ✓ Already registered in kb_config.json")
73
+ return
74
+
75
+ # Register with initializing status
76
+ manager.update_kb_status(
77
+ name=self.kb_name,
78
+ status="initializing",
79
+ progress={
80
+ "stage": "initializing",
81
+ "message": "Creating directory structure...",
82
+ "percent": 0,
83
+ "current": 0,
84
+ "total": 0,
85
+ },
86
+ )
87
+ logger.info(" ✓ Registered to kb_config.json")
88
+ except Exception as e:
89
+ logger.warning(f"Failed to register to config: {e}")
80
90
 
81
- if self.kb_name not in config.get("knowledge_bases", {}):
82
- config["knowledge_bases"][self.kb_name] = {
83
- "path": self.kb_name,
84
- "description": f"Knowledge base: {self.kb_name}",
85
- }
91
+ def _get_file_hash(self, file_path: Path) -> str:
92
+ """Calculate SHA-256 hash of a file."""
93
+ import hashlib
86
94
 
87
- try:
88
- with open(config_file, "w", encoding="utf-8") as f:
89
- json.dump(config, indent=2, ensure_ascii=False, fp=f)
90
- logger.info(" ✓ Registered to kb_config.json")
91
- except Exception as e:
92
- logger.warning(f"Failed to update config: {e}")
93
- else:
94
- logger.info(" ✓ Already registered in kb_config.json")
95
+ sha256_hash = hashlib.sha256()
96
+ chunk_size = 65536 # 64KB
97
+ with open(file_path, "rb") as f:
98
+ for byte_block in iter(lambda: f.read(chunk_size), b""):
99
+ sha256_hash.update(byte_block)
100
+ return sha256_hash.hexdigest()
95
101
 
96
102
  def _update_metadata_with_provider(self, provider: str):
97
103
  """Update metadata.json and centralized config with the RAG provider used."""
@@ -106,10 +112,22 @@ class KnowledgeBaseInitializer:
106
112
  metadata["rag_provider"] = provider
107
113
  metadata["last_updated"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
108
114
 
115
+ # Record file hashes for all successfully processed files in raw/
116
+ # This enables incremental add to detect duplicates
117
+ file_hashes = metadata.get("file_hashes", {})
118
+ for raw_file in self.raw_dir.glob("*"):
119
+ if raw_file.is_file():
120
+ try:
121
+ file_hashes[raw_file.name] = self._get_file_hash(raw_file)
122
+ except Exception as hash_err:
123
+ logger.warning(f"Failed to hash {raw_file.name}: {hash_err}")
124
+ metadata["file_hashes"] = file_hashes
125
+
109
126
  with open(metadata_file, "w", encoding="utf-8") as f:
110
127
  json.dump(metadata, indent=2, ensure_ascii=False, fp=f)
111
128
 
112
129
  logger.info(f" ✓ Updated metadata with RAG provider: {provider}")
130
+ logger.info(f" ✓ Recorded {len(file_hashes)} file hashes for incremental add")
113
131
 
114
132
  # Also save to centralized config file
115
133
  try:
@@ -186,10 +204,11 @@ class KnowledgeBaseInitializer:
186
204
  total=0,
187
205
  )
188
206
 
189
- # Get all documents in raw directory
207
+ # Get all documents in raw directory based on provider's supported extensions
190
208
  doc_files = []
191
- for ext in ["*.pdf", "*.docx", "*.doc", "*.txt", "*.md"]:
192
- doc_files.extend(list(self.raw_dir.glob(ext)))
209
+ glob_patterns = FileTypeRouter.get_glob_patterns_for_provider(provider)
210
+ for pattern in glob_patterns:
211
+ doc_files.extend(list(self.raw_dir.glob(pattern)))
193
212
 
194
213
  if not doc_files:
195
214
  logger.warning("No documents found to process")
@@ -274,80 +293,58 @@ class KnowledgeBaseInitializer:
274
293
 
275
294
  async def fix_structure(self):
276
295
  """
277
- Fix the nested structure created by process_document_complete.
278
- Flattens content_list directories and moves images to the correct location.
279
- """
280
- logger.info("\nFixing directory structure...")
296
+ Clean up parser output directories after image migration.
281
297
 
282
- # Find nested content lists
283
- content_list_moves = []
284
- for doc_dir in self.content_list_dir.glob("*"):
285
- if not doc_dir.is_dir():
286
- continue
298
+ NOTE: Image migration and path updates are now handled by the RAG pipeline
299
+ (raganything.py / raganything_docling.py) BEFORE RAG insertion. This ensures
300
+ RAG stores the correct canonical image paths (kb/images/) from the start.
287
301
 
288
- auto_dir = doc_dir / "auto"
289
- if not auto_dir.exists():
290
- continue
302
+ This method now only:
303
+ 1. Checks if there are any leftover nested directories to clean up
304
+ 2. Removes empty temporary parser output directories
291
305
 
292
- # Find the _content_list.json file
293
- for json_file in auto_dir.glob("*_content_list.json"):
294
- target_file = self.content_list_dir / f"{doc_dir.name}.json"
295
- content_list_moves.append((json_file, target_file))
306
+ Supports both 'auto' (MinerU) and 'docling' parser output directories.
307
+ """
308
+ logger.info("\nChecking for leftover parser output directories...")
296
309
 
297
- # Move content list files
298
- for source, target in content_list_moves:
299
- try:
300
- shutil.copy2(source, target)
301
- logger.info(f" ✓ Moved: {source.name} -> {target.name}")
302
- except Exception as e:
303
- logger.error(f" ✗ Error moving {source.name}: {e!s}")
310
+ # Support both 'auto' (MinerU) and 'docling' parser output directories
311
+ parser_subdirs = ["auto", "docling"]
312
+ cleaned_count = 0
304
313
 
305
- # Find and move nested images
306
- for doc_dir in self.content_list_dir.glob("*"):
314
+ # Find and remove empty parser output directories
315
+ for doc_dir in list(self.content_list_dir.glob("*")):
307
316
  if not doc_dir.is_dir():
308
317
  continue
309
318
 
310
- auto_dir = doc_dir / "auto"
311
- if not auto_dir.exists():
312
- continue
319
+ for parser_subdir in parser_subdirs:
320
+ subdir = doc_dir / parser_subdir
321
+ if subdir.exists():
322
+ try:
323
+ # Check if directory is empty or only contains empty subdirs
324
+ has_content = any(
325
+ f.is_file() or (f.is_dir() and any(f.iterdir()))
326
+ for f in subdir.iterdir()
327
+ )
328
+
329
+ if not has_content:
330
+ shutil.rmtree(subdir)
331
+ cleaned_count += 1
332
+ logger.debug(f" Removed empty directory: {subdir}")
333
+ except Exception as e:
334
+ logger.debug(f" Could not clean up {subdir}: {e}")
335
+
336
+ # Remove doc_dir if it's now empty
337
+ try:
338
+ if doc_dir.exists() and not any(doc_dir.iterdir()):
339
+ doc_dir.rmdir()
340
+ logger.debug(f" Removed empty directory: {doc_dir}")
341
+ except Exception:
342
+ pass
313
343
 
314
- images_dir = auto_dir / "images"
315
- if images_dir.exists() and images_dir.is_dir():
316
- image_count = 0
317
- # Ensure target directory exists
318
- self.images_dir.mkdir(parents=True, exist_ok=True)
319
-
320
- for img_file in images_dir.glob("*"):
321
- if img_file.is_file() and img_file.exists():
322
- target_img = self.images_dir / img_file.name
323
- if not target_img.exists():
324
- try:
325
- # Ensure source file exists
326
- if not img_file.exists():
327
- logger.warning(f" ⚠ Source image not found: {img_file}")
328
- continue
329
- shutil.copy2(img_file, target_img)
330
- image_count += 1
331
- except FileNotFoundError:
332
- logger.error(
333
- f" ✗ Error moving image {img_file.name}: Source file not found: {img_file}"
334
- )
335
- except Exception as e:
336
- logger.error(f" ✗ Error moving image {img_file.name}: {e!s}")
337
-
338
- if image_count > 0:
339
- logger.info(f" ✓ Moved {image_count} images from {doc_dir.name}/auto/images/")
340
-
341
- # Clean up nested directories
342
- for doc_dir in self.content_list_dir.glob("*"):
343
- if doc_dir.is_dir():
344
- try:
345
- shutil.rmtree(doc_dir)
346
- logger.info(f" ✓ Cleaned up: {doc_dir.name}/")
347
- except Exception as e:
348
- logger.error(f" ✗ Error removing {doc_dir.name}: {e!s}")
349
-
350
- logger.info("✓ Structure fixed!")
344
+ if cleaned_count > 0:
345
+ logger.info(f"✓ Cleaned up {cleaned_count} empty parser directories")
346
+ else:
347
+ logger.info("✓ No cleanup needed (structure already organized)")
351
348
 
352
349
  def extract_numbered_items(self, batch_size: int = 20):
353
350
  """
@@ -563,6 +560,10 @@ Example usage:
563
560
  return
564
561
 
565
562
  # Collect document files
563
+ # Use provider from env var or default to raganything (most comprehensive)
564
+ provider = os.getenv("RAG_PROVIDER", "raganything")
565
+ glob_patterns = FileTypeRouter.get_glob_patterns_for_provider(provider)
566
+
566
567
  doc_files = []
567
568
  if args.docs:
568
569
  doc_files.extend(args.docs)
@@ -570,8 +571,8 @@ Example usage:
570
571
  if args.docs_dir:
571
572
  docs_dir = Path(args.docs_dir)
572
573
  if docs_dir.exists() and docs_dir.is_dir():
573
- for ext in ["*.pdf", "*.docx", "*.doc", "*.txt", "*.md"]:
574
- doc_files.extend([str(f) for f in docs_dir.glob(ext)])
574
+ for pattern in glob_patterns:
575
+ doc_files.extend([str(f) for f in docs_dir.glob(pattern)])
575
576
  else:
576
577
  logger.error(f"Error: Documents directory not found: {args.docs_dir}")
577
578
  return