realtimex-deeptutor 0.5.0.post1__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145) hide show
  1. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/METADATA +24 -17
  2. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/RECORD +143 -123
  3. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/WHEEL +1 -1
  4. realtimex_deeptutor-0.5.0.post3.dist-info/entry_points.txt +4 -0
  5. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/top_level.txt +1 -0
  6. scripts/__init__.py +1 -0
  7. scripts/audit_prompts.py +179 -0
  8. scripts/check_install.py +460 -0
  9. scripts/generate_roster.py +327 -0
  10. scripts/install_all.py +653 -0
  11. scripts/migrate_kb.py +655 -0
  12. scripts/start.py +807 -0
  13. scripts/start_web.py +632 -0
  14. scripts/sync_prompts_from_en.py +147 -0
  15. src/__init__.py +2 -2
  16. src/agents/ideagen/material_organizer_agent.py +2 -0
  17. src/agents/solve/__init__.py +6 -0
  18. src/agents/solve/main_solver.py +9 -0
  19. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +9 -7
  20. src/agents/solve/session_manager.py +345 -0
  21. src/api/main.py +14 -0
  22. src/api/routers/chat.py +3 -3
  23. src/api/routers/co_writer.py +12 -7
  24. src/api/routers/config.py +1 -0
  25. src/api/routers/guide.py +3 -1
  26. src/api/routers/ideagen.py +7 -0
  27. src/api/routers/knowledge.py +64 -12
  28. src/api/routers/question.py +2 -0
  29. src/api/routers/realtimex.py +137 -0
  30. src/api/routers/research.py +9 -0
  31. src/api/routers/solve.py +120 -2
  32. src/cli/__init__.py +13 -0
  33. src/cli/start.py +209 -0
  34. src/config/constants.py +11 -9
  35. src/knowledge/add_documents.py +453 -213
  36. src/knowledge/extract_numbered_items.py +9 -10
  37. src/knowledge/initializer.py +102 -101
  38. src/knowledge/manager.py +251 -74
  39. src/knowledge/progress_tracker.py +43 -2
  40. src/knowledge/start_kb.py +11 -2
  41. src/logging/__init__.py +5 -0
  42. src/logging/adapters/__init__.py +1 -0
  43. src/logging/adapters/lightrag.py +25 -18
  44. src/logging/adapters/llamaindex.py +1 -0
  45. src/logging/config.py +30 -27
  46. src/logging/handlers/__init__.py +1 -0
  47. src/logging/handlers/console.py +7 -50
  48. src/logging/handlers/file.py +5 -20
  49. src/logging/handlers/websocket.py +23 -19
  50. src/logging/logger.py +161 -126
  51. src/logging/stats/__init__.py +1 -0
  52. src/logging/stats/llm_stats.py +37 -17
  53. src/services/__init__.py +17 -1
  54. src/services/config/__init__.py +1 -0
  55. src/services/config/knowledge_base_config.py +1 -0
  56. src/services/config/loader.py +1 -1
  57. src/services/config/unified_config.py +211 -4
  58. src/services/embedding/__init__.py +1 -0
  59. src/services/embedding/adapters/__init__.py +3 -0
  60. src/services/embedding/adapters/base.py +1 -0
  61. src/services/embedding/adapters/cohere.py +1 -0
  62. src/services/embedding/adapters/jina.py +1 -0
  63. src/services/embedding/adapters/ollama.py +1 -0
  64. src/services/embedding/adapters/openai_compatible.py +1 -0
  65. src/services/embedding/adapters/realtimex.py +125 -0
  66. src/services/embedding/client.py +27 -0
  67. src/services/embedding/config.py +3 -0
  68. src/services/embedding/provider.py +1 -0
  69. src/services/llm/__init__.py +17 -3
  70. src/services/llm/capabilities.py +47 -0
  71. src/services/llm/client.py +32 -0
  72. src/services/llm/cloud_provider.py +21 -4
  73. src/services/llm/config.py +36 -2
  74. src/services/llm/error_mapping.py +1 -0
  75. src/services/llm/exceptions.py +30 -0
  76. src/services/llm/factory.py +55 -16
  77. src/services/llm/local_provider.py +1 -0
  78. src/services/llm/providers/anthropic.py +1 -0
  79. src/services/llm/providers/base_provider.py +1 -0
  80. src/services/llm/providers/open_ai.py +1 -0
  81. src/services/llm/realtimex_provider.py +240 -0
  82. src/services/llm/registry.py +1 -0
  83. src/services/llm/telemetry.py +1 -0
  84. src/services/llm/types.py +1 -0
  85. src/services/llm/utils.py +1 -0
  86. src/services/prompt/__init__.py +1 -0
  87. src/services/prompt/manager.py +3 -2
  88. src/services/rag/__init__.py +27 -5
  89. src/services/rag/components/__init__.py +1 -0
  90. src/services/rag/components/base.py +1 -0
  91. src/services/rag/components/chunkers/__init__.py +1 -0
  92. src/services/rag/components/chunkers/base.py +1 -0
  93. src/services/rag/components/chunkers/fixed.py +1 -0
  94. src/services/rag/components/chunkers/numbered_item.py +1 -0
  95. src/services/rag/components/chunkers/semantic.py +1 -0
  96. src/services/rag/components/embedders/__init__.py +1 -0
  97. src/services/rag/components/embedders/base.py +1 -0
  98. src/services/rag/components/embedders/openai.py +1 -0
  99. src/services/rag/components/indexers/__init__.py +1 -0
  100. src/services/rag/components/indexers/base.py +1 -0
  101. src/services/rag/components/indexers/graph.py +5 -44
  102. src/services/rag/components/indexers/lightrag.py +5 -44
  103. src/services/rag/components/indexers/vector.py +1 -0
  104. src/services/rag/components/parsers/__init__.py +1 -0
  105. src/services/rag/components/parsers/base.py +1 -0
  106. src/services/rag/components/parsers/markdown.py +1 -0
  107. src/services/rag/components/parsers/pdf.py +1 -0
  108. src/services/rag/components/parsers/text.py +1 -0
  109. src/services/rag/components/retrievers/__init__.py +1 -0
  110. src/services/rag/components/retrievers/base.py +1 -0
  111. src/services/rag/components/retrievers/dense.py +1 -0
  112. src/services/rag/components/retrievers/hybrid.py +5 -44
  113. src/services/rag/components/retrievers/lightrag.py +5 -44
  114. src/services/rag/components/routing.py +48 -0
  115. src/services/rag/factory.py +112 -46
  116. src/services/rag/pipeline.py +1 -0
  117. src/services/rag/pipelines/__init__.py +27 -18
  118. src/services/rag/pipelines/lightrag.py +1 -0
  119. src/services/rag/pipelines/llamaindex.py +99 -0
  120. src/services/rag/pipelines/raganything.py +67 -100
  121. src/services/rag/pipelines/raganything_docling.py +368 -0
  122. src/services/rag/service.py +5 -12
  123. src/services/rag/types.py +1 -0
  124. src/services/rag/utils/__init__.py +17 -0
  125. src/services/rag/utils/image_migration.py +279 -0
  126. src/services/search/__init__.py +1 -0
  127. src/services/search/base.py +1 -0
  128. src/services/search/consolidation.py +1 -0
  129. src/services/search/providers/__init__.py +1 -0
  130. src/services/search/providers/baidu.py +1 -0
  131. src/services/search/providers/exa.py +1 -0
  132. src/services/search/providers/jina.py +1 -0
  133. src/services/search/providers/perplexity.py +1 -0
  134. src/services/search/providers/serper.py +1 -0
  135. src/services/search/providers/tavily.py +1 -0
  136. src/services/search/types.py +1 -0
  137. src/services/settings/__init__.py +1 -0
  138. src/services/settings/interface_settings.py +78 -0
  139. src/services/setup/__init__.py +1 -0
  140. src/services/tts/__init__.py +1 -0
  141. src/services/tts/config.py +1 -0
  142. src/utils/realtimex.py +284 -0
  143. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +0 -2
  144. src/services/rag/pipelines/academic.py +0 -44
  145. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,17 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ RAG Utilities
4
+ =============
5
+
6
+ Utility modules for RAG operations.
7
+ """
8
+
9
+ from .image_migration import (
10
+ cleanup_parser_output_dirs,
11
+ migrate_images_and_update_paths,
12
+ )
13
+
14
+ __all__ = [
15
+ "migrate_images_and_update_paths",
16
+ "cleanup_parser_output_dirs",
17
+ ]
@@ -0,0 +1,279 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Image Migration Utilities
4
+ =========================
5
+
6
+ Utilities for migrating images from parser output directories to the canonical
7
+ knowledge base images directory, and updating content_list paths accordingly.
8
+
9
+ This is needed because:
10
+ 1. Parsers (MinerU/Docling) output images to nested directories like:
11
+ content_list/{doc}/auto/images/ or content_list/{doc}/docling/images/
12
+ 2. RAG stores these paths in chunks, so if we move files later, retrieval breaks
13
+ 3. By migrating images BEFORE RAG indexing, we ensure correct paths are stored
14
+ """
15
+
16
+ import asyncio
17
+ from pathlib import Path
18
+ import shutil
19
+ from typing import Any, Dict, List, Tuple
20
+
21
+ from src.logging import get_logger
22
+
23
+ logger = get_logger("ImageMigration")
24
+
25
+ # Maximum concurrent file operations to avoid overwhelming I/O
26
+ MAX_CONCURRENT_COPIES = 10
27
+
28
+
29
async def migrate_images_and_update_paths(
    content_list: List[Dict[str, Any]],
    source_base_dir: Path,
    target_images_dir: Path,
    batch_size: int = 50,
) -> Tuple[List[Dict[str, Any]], int]:
    """
    Migrate images from parser output to canonical images directory and update paths.

    This function:
    1. Scans content_list for image paths (both ``img_path`` and ``image_path``)
    2. Copies each distinct image to target_images_dir, in batches
    3. Builds a new content_list whose image fields point at the copied files
    4. Returns the updated content_list and the number of distinct images migrated

    Args:
        content_list: Parsed content list from MinerU/Docling
        source_base_dir: Base directory where parser outputs are located
        target_images_dir: Canonical images directory (e.g., kb/images/);
            created if it does not exist
        batch_size: Number of images to process in each batch; values below 1
            are treated as 1

    Returns:
        Tuple of (updated_content_list, num_images_migrated). When no image
        paths are found, the original content_list object is returned as-is.
    """
    # Ensure target directory exists
    target_images_dir.mkdir(parents=True, exist_ok=True)

    # Collect every distinct image reference. Both field names are inspected so
    # an item carrying both (or an empty "img_path" next to a real
    # "image_path") is handled, and the recorded field name is always the one
    # the path was actually read from.
    image_items: List[Tuple[int, str, str]] = []
    seen_paths = set()
    for idx, item in enumerate(content_list):
        if not isinstance(item, dict):
            continue

        for field_name in ("img_path", "image_path"):
            img_path = item.get(field_name)
            if not img_path:
                continue
            try:
                if img_path in seen_paths:
                    # Already scheduled; one copy serves every reference.
                    continue
                seen_paths.add(img_path)
            except TypeError:
                # Unhashable value: cannot be de-duplicated, pass it through
                # and let the per-image step report the failure.
                pass
            image_items.append((idx, img_path, field_name))

    if not image_items:
        logger.debug("No images found in content_list, skipping migration")
        return content_list, 0

    total = len(image_items)
    logger.info(f"Found {total} images to migrate")

    # range() rejects a zero step and yields nothing for a negative one, which
    # would silently skip the whole migration.
    step = max(1, batch_size)

    # Process images in batches to handle large quantities
    path_updates: Dict[str, str] = {}  # old_path -> new_path mapping
    for batch_start in range(0, total, step):
        batch = image_items[batch_start : batch_start + step]
        batch_updates = await _process_image_batch(batch, source_base_dir, target_images_dir)
        path_updates.update(batch_updates)

        processed = batch_start + len(batch)
        if processed < total:
            logger.info(f"Migrated {processed}/{total} images...")

    # Count from the merged mapping so a path is counted once overall.
    migrated_count = sum(1 for new_path in path_updates.values() if new_path)

    # Update content_list with new paths
    updated_content_list = _update_content_list_paths(content_list, path_updates)

    logger.info(f"Image migration complete: {migrated_count} images migrated")
    return updated_content_list, migrated_count
91
+
92
+
93
async def _process_image_batch(
    batch: List[Tuple[int, str, str]],
    source_base_dir: Path,
    target_images_dir: Path,
) -> Dict[str, str]:
    """
    Process a batch of images concurrently.

    Images that share a filename are migrated one after another inside a
    single task. The per-image step decides on a free target name and then
    copies; running two same-named sources at once would let both pick the
    same target and overwrite each other. Images with different filenames
    still run concurrently, bounded by MAX_CONCURRENT_COPIES.

    Args:
        batch: List of (index, image_path, field_name) tuples
        source_base_dir: Base directory for resolving relative paths
        target_images_dir: Target directory for images

    Returns:
        Dict mapping old paths to new paths; images that could not be
        migrated are omitted
    """
    if not batch:
        return {}

    semaphore = asyncio.Semaphore(MAX_CONCURRENT_COPIES)

    # Group paths by the filename they will compete for in the target dir.
    groups: Dict[Any, List[str]] = {}
    for _idx, img_path, _field_name in batch:
        try:
            name_key = Path(img_path).name
        except TypeError:
            # Not path-like; the per-image step will report the failure.
            name_key = None
        groups.setdefault(name_key, []).append(img_path)

    async def migrate_group(paths: List[str]) -> List[Any]:
        outcomes: List[Any] = []
        async with semaphore:
            for path in paths:
                try:
                    outcomes.append(
                        await _migrate_single_image(path, source_base_dir, target_images_dir)
                    )
                except Exception as exc:
                    # Keep going: one failing image must not abort the batch.
                    outcomes.append(exc)
        return outcomes

    group_results = await asyncio.gather(*(migrate_group(paths) for paths in groups.values()))

    path_updates: Dict[str, str] = {}
    for outcomes in group_results:
        for outcome in outcomes:
            if isinstance(outcome, Exception):
                logger.warning(f"Error migrating image: {outcome}")
                continue
            old_path, new_path = outcome
            if new_path:
                path_updates[old_path] = new_path

    return path_updates
129
+
130
+
131
async def _migrate_single_image(
    img_path: str,
    source_base_dir: Path,
    target_images_dir: Path,
) -> Tuple[str, str]:
    """
    Migrate a single image file.

    The image keeps its original filename when possible. If a file with that
    name already exists in the target directory and has identical content, it
    is reused without copying. If the content differs, numbered candidates
    (``name_1.ext``, ``name_2.ext``, ...) are tried; an identical candidate is
    reused, otherwise the first free name is used. Reusing identical
    candidates keeps repeated migrations from piling up duplicate copies.

    Args:
        img_path: Original image path (may be absolute or relative)
        source_base_dir: Base directory for resolving relative paths
        target_images_dir: Target directory for images

    Returns:
        Tuple of (original_path, new_path) or (original_path, None) if failed
    """
    # Imported locally so this helper does not rely on a module-level import.
    import filecmp

    try:
        # Resolve the source path
        source_path = Path(img_path)
        if not source_path.is_absolute():
            source_path = source_base_dir / img_path

        # is_file() also rejects directories, which cannot be copied as images.
        if not source_path.is_file():
            logger.warning(f"Source image not found: {img_path}")
            return (img_path, None)

        loop = asyncio.get_running_loop()
        stem = source_path.stem
        suffix = source_path.suffix

        # Start with the original name, then fall back to numbered suffixes.
        target_path = target_images_dir / source_path.name
        counter = 1
        while target_path.exists():
            # Byte-wise comparison (shallow=False): equal size alone does not
            # prove two images are the same. Runs in the thread pool because
            # it reads both files.
            already_migrated = await loop.run_in_executor(
                None, filecmp.cmp, str(source_path), str(target_path), False
            )
            if already_migrated:
                # Same file already exists, just update path
                return (img_path, str(target_path))

            target_path = target_images_dir / f"{stem}_{counter}{suffix}"
            counter += 1

        # Copy file using thread pool to avoid blocking
        await loop.run_in_executor(None, shutil.copy2, str(source_path), str(target_path))

        logger.debug(f"Migrated: {source_path.name} -> {target_path}")
        return (img_path, str(target_path))

    except Exception as e:
        # Best-effort: a single bad image is reported, not raised.
        logger.error(f"Failed to migrate image {img_path}: {e}")
        return (img_path, None)
187
+
188
+
189
def _update_content_list_paths(
    content_list: List[Dict[str, Any]],
    path_updates: Dict[str, str],
) -> List[Dict[str, Any]]:
    """
    Update image paths in content_list with new paths.

    Both the ``img_path`` and ``image_path`` fields are rewritten when their
    current value has a non-empty replacement in ``path_updates``. Values
    without a replacement are left untouched.

    Args:
        content_list: Original content list
        path_updates: Mapping of old paths to new paths

    Returns:
        Updated content list (new list, original is not modified). Dict items
        are shallow-copied; non-dict items are carried over unchanged.
    """
    updated_list = []

    for item in content_list:
        if not isinstance(item, dict):
            updated_list.append(item)
            continue

        # Create a copy of the item so the caller's data is not mutated
        updated_item = dict(item)

        for field_name in ("img_path", "image_path"):
            if field_name not in updated_item:
                continue
            try:
                new_path = path_updates.get(updated_item[field_name])
            except TypeError:
                # Unhashable value (e.g. a list) can never be a mapping key;
                # leave the field alone instead of aborting the whole update.
                continue
            if new_path:
                updated_item[field_name] = new_path

        updated_list.append(updated_item)

    return updated_list
228
+
229
+
230
async def cleanup_parser_output_dirs(
    content_list_dir: Path,
    parser_subdirs: List[str] = None,
) -> int:
    """
    Clean up parser output directories after successful migration.

    Only removes the nested parser output directories (auto/, docling/),
    NOT the content_list JSON files at the root level. A per-document
    directory that ends up empty is removed as well. Failures are logged and
    do not stop the cleanup of the remaining directories.

    Args:
        content_list_dir: The content_list directory
        parser_subdirs: List of parser subdirectory names to clean;
            defaults to ["auto", "docling"] when None

    Returns:
        Number of parser output directories removed (emptied per-document
        directories are not counted)
    """
    if parser_subdirs is None:
        parser_subdirs = ["auto", "docling"]

    cleaned_count = 0
    loop = asyncio.get_running_loop()

    # Snapshot the listing first: entries are deleted inside the loop, and
    # mutating a directory while a lazy glob is still scanning it is fragile.
    doc_dirs = [entry for entry in content_list_dir.glob("*") if entry.is_dir()]

    for doc_dir in doc_dirs:
        for parser_subdir in parser_subdirs:
            subdir = doc_dir / parser_subdir
            # Only directories are parser output; rmtree cannot remove files.
            if not subdir.is_dir():
                continue
            try:
                # Run in thread pool to avoid blocking
                await loop.run_in_executor(None, shutil.rmtree, str(subdir))
                cleaned_count += 1
                logger.debug(f"Cleaned up: {subdir}")
            except Exception as e:
                logger.warning(f"Failed to clean up {subdir}: {e}")

        # Remove the doc_dir if it's now empty
        try:
            if doc_dir.exists() and not any(doc_dir.iterdir()):
                doc_dir.rmdir()
                logger.debug(f"Removed empty directory: {doc_dir}")
        except Exception as e:
            logger.debug(f"Could not remove directory {doc_dir}: {e}")

    if cleaned_count > 0:
        logger.info(f"Cleaned up {cleaned_count} parser output directories")

    return cleaned_count
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  """
2
3
  Web Search Service - Pluggable search provider architecture
3
4
 
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  """
2
3
  Web Search Base Provider - Abstract base class for all search providers
3
4
 
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  """
2
3
  Answer Consolidation - Generate answers from raw search results
3
4
 
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  """
2
3
  Web Search Provider Registry
3
4
 
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  """
2
3
  Baidu AI Search Provider
3
4
 
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  """
2
3
  Exa Neural Search Provider
3
4
 
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  """
2
3
  Jina Reader Search Provider
3
4
 
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  """
2
3
  Perplexity AI Search Provider
3
4
 
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  """
2
3
  Serper Google SERP Provider
3
4
 
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  """
2
3
  Tavily Search Provider
3
4
 
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  """
2
3
  Web Search Types - Shared dataclasses and type definitions
3
4
 
@@ -0,0 +1 @@
1
+ """User interface (UI) settings helpers."""
@@ -0,0 +1,78 @@
1
+ """
2
+ Interface (UI) settings reader.
3
+
4
+ This is the canonical backend source for user-selected UI language/theme stored in:
5
+ data/user/settings/interface.json
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ PROJECT_ROOT = Path(__file__).resolve().parents[3]
15
+ INTERFACE_SETTINGS_FILE = PROJECT_ROOT / "data" / "user" / "settings" / "interface.json"
16
+
17
+ DEFAULT_UI_SETTINGS: dict[str, Any] = {
18
+ "theme": "light",
19
+ "language": "en",
20
+ }
21
+
22
+
23
def _normalize_language(language: Any, default: str = "en") -> str:
    """
    Normalize language codes:
    - en/english -> en
    - zh/chinese/cn -> zh

    Matching ignores case and surrounding whitespace. Region-tagged codes are
    reduced to their primary part first, so "zh-CN", "zh_TW" and "en-US" map
    to "zh", "zh" and "en".

    Args:
        language: Value to normalize; None, "" or an unrecognized value falls
            back to ``default``.
        default: Fallback language, normalized by the same rules.

    Returns:
        "en" or "zh". If neither ``language`` nor ``default`` is recognized,
        "en" is returned.
    """
    aliases = {
        "en": "en",
        "english": "en",
        "zh": "zh",
        "chinese": "zh",
        "cn": "zh",
    }

    def canonical(value: Any) -> Any:
        # Returns the canonical code, or None when the value is not recognized.
        if not isinstance(value, str):
            return None
        code = value.strip().lower()
        if code in aliases:
            return aliases[code]
        # Locale tags use "-" or "_" before the region/script part.
        primary = code.replace("_", "-").split("-", 1)[0]
        return aliases.get(primary)

    if language is None or language == "":
        language = default

    return canonical(language) or canonical(default) or "en"
43
+
44
+
45
def get_ui_settings() -> dict[str, Any]:
    """
    Load the UI settings stored in interface.json, layered over the defaults.

    Keys found in the file override DEFAULT_UI_SETTINGS, and the resulting
    "language" entry is passed through _normalize_language. If the file is
    missing, or anything goes wrong while reading or merging it, a copy of
    the defaults is returned instead.

    Returns:
        dict containing at least: {"theme": "...", "language": "..."}
    """
    # No settings file yet: nothing to merge.
    if not INTERFACE_SETTINGS_FILE.exists():
        return DEFAULT_UI_SETTINGS.copy()

    try:
        with open(INTERFACE_SETTINGS_FILE, encoding="utf-8") as handle:
            stored = json.load(handle) or {}
        settings = {**DEFAULT_UI_SETTINGS, **stored}
        settings["language"] = _normalize_language(
            settings.get("language"), DEFAULT_UI_SETTINGS["language"]
        )
    except Exception:
        # Unreadable or malformed file: defaults are the safe answer.
        return DEFAULT_UI_SETTINGS.copy()

    return settings
66
+
67
+
68
def get_ui_language(default: str = "en") -> str:
    """
    Return the normalized UI language currently in effect.

    The language is taken from get_ui_settings() and passed through
    _normalize_language, with ``default`` as the fallback when the stored
    value is not recognized.

    Priority:
    1) interface.json
    2) provided default
    3) 'en'
    """
    configured = get_ui_settings().get("language")
    return _normalize_language(configured, default)
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  """
2
3
  Setup Service
3
4
  =============
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  """
2
3
  TTS Service
3
4
  ===========
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  """
2
3
  TTS Configuration
3
4
  =================