realtimex-deeptutor 0.5.0.post1__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/METADATA +24 -17
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/RECORD +143 -123
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/WHEEL +1 -1
- realtimex_deeptutor-0.5.0.post3.dist-info/entry_points.txt +4 -0
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/top_level.txt +1 -0
- scripts/__init__.py +1 -0
- scripts/audit_prompts.py +179 -0
- scripts/check_install.py +460 -0
- scripts/generate_roster.py +327 -0
- scripts/install_all.py +653 -0
- scripts/migrate_kb.py +655 -0
- scripts/start.py +807 -0
- scripts/start_web.py +632 -0
- scripts/sync_prompts_from_en.py +147 -0
- src/__init__.py +2 -2
- src/agents/ideagen/material_organizer_agent.py +2 -0
- src/agents/solve/__init__.py +6 -0
- src/agents/solve/main_solver.py +9 -0
- src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +9 -7
- src/agents/solve/session_manager.py +345 -0
- src/api/main.py +14 -0
- src/api/routers/chat.py +3 -3
- src/api/routers/co_writer.py +12 -7
- src/api/routers/config.py +1 -0
- src/api/routers/guide.py +3 -1
- src/api/routers/ideagen.py +7 -0
- src/api/routers/knowledge.py +64 -12
- src/api/routers/question.py +2 -0
- src/api/routers/realtimex.py +137 -0
- src/api/routers/research.py +9 -0
- src/api/routers/solve.py +120 -2
- src/cli/__init__.py +13 -0
- src/cli/start.py +209 -0
- src/config/constants.py +11 -9
- src/knowledge/add_documents.py +453 -213
- src/knowledge/extract_numbered_items.py +9 -10
- src/knowledge/initializer.py +102 -101
- src/knowledge/manager.py +251 -74
- src/knowledge/progress_tracker.py +43 -2
- src/knowledge/start_kb.py +11 -2
- src/logging/__init__.py +5 -0
- src/logging/adapters/__init__.py +1 -0
- src/logging/adapters/lightrag.py +25 -18
- src/logging/adapters/llamaindex.py +1 -0
- src/logging/config.py +30 -27
- src/logging/handlers/__init__.py +1 -0
- src/logging/handlers/console.py +7 -50
- src/logging/handlers/file.py +5 -20
- src/logging/handlers/websocket.py +23 -19
- src/logging/logger.py +161 -126
- src/logging/stats/__init__.py +1 -0
- src/logging/stats/llm_stats.py +37 -17
- src/services/__init__.py +17 -1
- src/services/config/__init__.py +1 -0
- src/services/config/knowledge_base_config.py +1 -0
- src/services/config/loader.py +1 -1
- src/services/config/unified_config.py +211 -4
- src/services/embedding/__init__.py +1 -0
- src/services/embedding/adapters/__init__.py +3 -0
- src/services/embedding/adapters/base.py +1 -0
- src/services/embedding/adapters/cohere.py +1 -0
- src/services/embedding/adapters/jina.py +1 -0
- src/services/embedding/adapters/ollama.py +1 -0
- src/services/embedding/adapters/openai_compatible.py +1 -0
- src/services/embedding/adapters/realtimex.py +125 -0
- src/services/embedding/client.py +27 -0
- src/services/embedding/config.py +3 -0
- src/services/embedding/provider.py +1 -0
- src/services/llm/__init__.py +17 -3
- src/services/llm/capabilities.py +47 -0
- src/services/llm/client.py +32 -0
- src/services/llm/cloud_provider.py +21 -4
- src/services/llm/config.py +36 -2
- src/services/llm/error_mapping.py +1 -0
- src/services/llm/exceptions.py +30 -0
- src/services/llm/factory.py +55 -16
- src/services/llm/local_provider.py +1 -0
- src/services/llm/providers/anthropic.py +1 -0
- src/services/llm/providers/base_provider.py +1 -0
- src/services/llm/providers/open_ai.py +1 -0
- src/services/llm/realtimex_provider.py +240 -0
- src/services/llm/registry.py +1 -0
- src/services/llm/telemetry.py +1 -0
- src/services/llm/types.py +1 -0
- src/services/llm/utils.py +1 -0
- src/services/prompt/__init__.py +1 -0
- src/services/prompt/manager.py +3 -2
- src/services/rag/__init__.py +27 -5
- src/services/rag/components/__init__.py +1 -0
- src/services/rag/components/base.py +1 -0
- src/services/rag/components/chunkers/__init__.py +1 -0
- src/services/rag/components/chunkers/base.py +1 -0
- src/services/rag/components/chunkers/fixed.py +1 -0
- src/services/rag/components/chunkers/numbered_item.py +1 -0
- src/services/rag/components/chunkers/semantic.py +1 -0
- src/services/rag/components/embedders/__init__.py +1 -0
- src/services/rag/components/embedders/base.py +1 -0
- src/services/rag/components/embedders/openai.py +1 -0
- src/services/rag/components/indexers/__init__.py +1 -0
- src/services/rag/components/indexers/base.py +1 -0
- src/services/rag/components/indexers/graph.py +5 -44
- src/services/rag/components/indexers/lightrag.py +5 -44
- src/services/rag/components/indexers/vector.py +1 -0
- src/services/rag/components/parsers/__init__.py +1 -0
- src/services/rag/components/parsers/base.py +1 -0
- src/services/rag/components/parsers/markdown.py +1 -0
- src/services/rag/components/parsers/pdf.py +1 -0
- src/services/rag/components/parsers/text.py +1 -0
- src/services/rag/components/retrievers/__init__.py +1 -0
- src/services/rag/components/retrievers/base.py +1 -0
- src/services/rag/components/retrievers/dense.py +1 -0
- src/services/rag/components/retrievers/hybrid.py +5 -44
- src/services/rag/components/retrievers/lightrag.py +5 -44
- src/services/rag/components/routing.py +48 -0
- src/services/rag/factory.py +112 -46
- src/services/rag/pipeline.py +1 -0
- src/services/rag/pipelines/__init__.py +27 -18
- src/services/rag/pipelines/lightrag.py +1 -0
- src/services/rag/pipelines/llamaindex.py +99 -0
- src/services/rag/pipelines/raganything.py +67 -100
- src/services/rag/pipelines/raganything_docling.py +368 -0
- src/services/rag/service.py +5 -12
- src/services/rag/types.py +1 -0
- src/services/rag/utils/__init__.py +17 -0
- src/services/rag/utils/image_migration.py +279 -0
- src/services/search/__init__.py +1 -0
- src/services/search/base.py +1 -0
- src/services/search/consolidation.py +1 -0
- src/services/search/providers/__init__.py +1 -0
- src/services/search/providers/baidu.py +1 -0
- src/services/search/providers/exa.py +1 -0
- src/services/search/providers/jina.py +1 -0
- src/services/search/providers/perplexity.py +1 -0
- src/services/search/providers/serper.py +1 -0
- src/services/search/providers/tavily.py +1 -0
- src/services/search/types.py +1 -0
- src/services/settings/__init__.py +1 -0
- src/services/settings/interface_settings.py +78 -0
- src/services/setup/__init__.py +1 -0
- src/services/tts/__init__.py +1 -0
- src/services/tts/config.py +1 -0
- src/utils/realtimex.py +284 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +0 -2
- src/services/rag/pipelines/academic.py +0 -44
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/licenses/LICENSE +0 -0
|
@@ -19,9 +19,8 @@ from typing import Any
|
|
|
19
19
|
sys.path.append(str(Path(__file__).parent.parent.parent))
|
|
20
20
|
|
|
21
21
|
from dotenv import load_dotenv
|
|
22
|
-
from lightrag.llm.openai import openai_complete_if_cache
|
|
23
22
|
|
|
24
|
-
from src.services.llm import get_llm_config
|
|
23
|
+
from src.services.llm import get_llm_client, get_llm_config
|
|
25
24
|
|
|
26
25
|
load_dotenv(dotenv_path=".env", override=False)
|
|
27
26
|
|
|
@@ -62,18 +61,18 @@ async def _call_llm_async(
|
|
|
62
61
|
temperature: float = 0.1,
|
|
63
62
|
model: str = None,
|
|
64
63
|
) -> str:
|
|
65
|
-
"""Asynchronously call LLM"""
|
|
66
|
-
#
|
|
64
|
+
"""Asynchronously call LLM using unified LLM service"""
|
|
65
|
+
# Get unified LLM client (handles all provider differences and env var setup)
|
|
66
|
+
llm_client = get_llm_client()
|
|
67
|
+
llm_model_func = llm_client.get_model_func()
|
|
68
|
+
|
|
69
|
+
# If model not specified, use configured model
|
|
67
70
|
if model is None:
|
|
68
|
-
|
|
69
|
-
model = llm_cfg.model
|
|
71
|
+
model = llm_client.config.model
|
|
70
72
|
|
|
71
|
-
result =
|
|
72
|
-
model,
|
|
73
|
+
result = llm_model_func(
|
|
73
74
|
prompt,
|
|
74
75
|
system_prompt=system_prompt,
|
|
75
|
-
api_key=api_key,
|
|
76
|
-
base_url=base_url,
|
|
77
76
|
max_tokens=max_tokens,
|
|
78
77
|
temperature=temperature,
|
|
79
78
|
)
|
src/knowledge/initializer.py
CHANGED
|
@@ -21,6 +21,7 @@ import shutil
|
|
|
21
21
|
from src.logging import get_logger
|
|
22
22
|
from src.services.embedding import get_embedding_config
|
|
23
23
|
from src.services.llm import get_llm_config
|
|
24
|
+
from src.services.rag.components.routing import FileTypeRouter
|
|
24
25
|
from src.services.rag.service import RAGService
|
|
25
26
|
|
|
26
27
|
logger = get_logger("KnowledgeInit")
|
|
@@ -59,39 +60,44 @@ class KnowledgeBaseInitializer:
|
|
|
59
60
|
self.rag_provider = rag_provider
|
|
60
61
|
|
|
61
62
|
def _register_to_config(self):
|
|
62
|
-
"""Register KB to kb_config.json
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
63
|
+
"""Register KB to kb_config.json using KnowledgeBaseManager for consistency."""
|
|
64
|
+
try:
|
|
65
|
+
from src.knowledge.manager import KnowledgeBaseManager
|
|
66
|
+
|
|
67
|
+
manager = KnowledgeBaseManager(base_dir=str(self.base_dir))
|
|
68
|
+
|
|
69
|
+
# Check if already registered (reload config to get latest)
|
|
70
|
+
manager.config = manager._load_config()
|
|
71
|
+
if self.kb_name in manager.config.get("knowledge_bases", {}):
|
|
72
|
+
logger.info(" ✓ Already registered in kb_config.json")
|
|
73
|
+
return
|
|
74
|
+
|
|
75
|
+
# Register with initializing status
|
|
76
|
+
manager.update_kb_status(
|
|
77
|
+
name=self.kb_name,
|
|
78
|
+
status="initializing",
|
|
79
|
+
progress={
|
|
80
|
+
"stage": "initializing",
|
|
81
|
+
"message": "Creating directory structure...",
|
|
82
|
+
"percent": 0,
|
|
83
|
+
"current": 0,
|
|
84
|
+
"total": 0,
|
|
85
|
+
},
|
|
86
|
+
)
|
|
87
|
+
logger.info(" ✓ Registered to kb_config.json")
|
|
88
|
+
except Exception as e:
|
|
89
|
+
logger.warning(f"Failed to register to config: {e}")
|
|
80
90
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
"description": f"Knowledge base: {self.kb_name}",
|
|
85
|
-
}
|
|
91
|
+
def _get_file_hash(self, file_path: Path) -> str:
|
|
92
|
+
"""Calculate SHA-256 hash of a file."""
|
|
93
|
+
import hashlib
|
|
86
94
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
else:
|
|
94
|
-
logger.info(" ✓ Already registered in kb_config.json")
|
|
95
|
+
sha256_hash = hashlib.sha256()
|
|
96
|
+
chunk_size = 65536 # 64KB
|
|
97
|
+
with open(file_path, "rb") as f:
|
|
98
|
+
for byte_block in iter(lambda: f.read(chunk_size), b""):
|
|
99
|
+
sha256_hash.update(byte_block)
|
|
100
|
+
return sha256_hash.hexdigest()
|
|
95
101
|
|
|
96
102
|
def _update_metadata_with_provider(self, provider: str):
|
|
97
103
|
"""Update metadata.json and centralized config with the RAG provider used."""
|
|
@@ -106,10 +112,22 @@ class KnowledgeBaseInitializer:
|
|
|
106
112
|
metadata["rag_provider"] = provider
|
|
107
113
|
metadata["last_updated"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
108
114
|
|
|
115
|
+
# Record file hashes for all successfully processed files in raw/
|
|
116
|
+
# This enables incremental add to detect duplicates
|
|
117
|
+
file_hashes = metadata.get("file_hashes", {})
|
|
118
|
+
for raw_file in self.raw_dir.glob("*"):
|
|
119
|
+
if raw_file.is_file():
|
|
120
|
+
try:
|
|
121
|
+
file_hashes[raw_file.name] = self._get_file_hash(raw_file)
|
|
122
|
+
except Exception as hash_err:
|
|
123
|
+
logger.warning(f"Failed to hash {raw_file.name}: {hash_err}")
|
|
124
|
+
metadata["file_hashes"] = file_hashes
|
|
125
|
+
|
|
109
126
|
with open(metadata_file, "w", encoding="utf-8") as f:
|
|
110
127
|
json.dump(metadata, indent=2, ensure_ascii=False, fp=f)
|
|
111
128
|
|
|
112
129
|
logger.info(f" ✓ Updated metadata with RAG provider: {provider}")
|
|
130
|
+
logger.info(f" ✓ Recorded {len(file_hashes)} file hashes for incremental add")
|
|
113
131
|
|
|
114
132
|
# Also save to centralized config file
|
|
115
133
|
try:
|
|
@@ -186,10 +204,11 @@ class KnowledgeBaseInitializer:
|
|
|
186
204
|
total=0,
|
|
187
205
|
)
|
|
188
206
|
|
|
189
|
-
# Get all documents in raw directory
|
|
207
|
+
# Get all documents in raw directory based on provider's supported extensions
|
|
190
208
|
doc_files = []
|
|
191
|
-
|
|
192
|
-
|
|
209
|
+
glob_patterns = FileTypeRouter.get_glob_patterns_for_provider(provider)
|
|
210
|
+
for pattern in glob_patterns:
|
|
211
|
+
doc_files.extend(list(self.raw_dir.glob(pattern)))
|
|
193
212
|
|
|
194
213
|
if not doc_files:
|
|
195
214
|
logger.warning("No documents found to process")
|
|
@@ -274,80 +293,58 @@ class KnowledgeBaseInitializer:
|
|
|
274
293
|
|
|
275
294
|
async def fix_structure(self):
|
|
276
295
|
"""
|
|
277
|
-
|
|
278
|
-
Flattens content_list directories and moves images to the correct location.
|
|
279
|
-
"""
|
|
280
|
-
logger.info("\nFixing directory structure...")
|
|
296
|
+
Clean up parser output directories after image migration.
|
|
281
297
|
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
if not doc_dir.is_dir():
|
|
286
|
-
continue
|
|
298
|
+
NOTE: Image migration and path updates are now handled by the RAG pipeline
|
|
299
|
+
(raganything.py / raganything_docling.py) BEFORE RAG insertion. This ensures
|
|
300
|
+
RAG stores the correct canonical image paths (kb/images/) from the start.
|
|
287
301
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
302
|
+
This method now only:
|
|
303
|
+
1. Checks if there are any leftover nested directories to clean up
|
|
304
|
+
2. Removes empty temporary parser output directories
|
|
291
305
|
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
content_list_moves.append((json_file, target_file))
|
|
306
|
+
Supports both 'auto' (MinerU) and 'docling' parser output directories.
|
|
307
|
+
"""
|
|
308
|
+
logger.info("\nChecking for leftover parser output directories...")
|
|
296
309
|
|
|
297
|
-
#
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
shutil.copy2(source, target)
|
|
301
|
-
logger.info(f" ✓ Moved: {source.name} -> {target.name}")
|
|
302
|
-
except Exception as e:
|
|
303
|
-
logger.error(f" ✗ Error moving {source.name}: {e!s}")
|
|
310
|
+
# Support both 'auto' (MinerU) and 'docling' parser output directories
|
|
311
|
+
parser_subdirs = ["auto", "docling"]
|
|
312
|
+
cleaned_count = 0
|
|
304
313
|
|
|
305
|
-
# Find and
|
|
306
|
-
for doc_dir in self.content_list_dir.glob("*"):
|
|
314
|
+
# Find and remove empty parser output directories
|
|
315
|
+
for doc_dir in list(self.content_list_dir.glob("*")):
|
|
307
316
|
if not doc_dir.is_dir():
|
|
308
317
|
continue
|
|
309
318
|
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
319
|
+
for parser_subdir in parser_subdirs:
|
|
320
|
+
subdir = doc_dir / parser_subdir
|
|
321
|
+
if subdir.exists():
|
|
322
|
+
try:
|
|
323
|
+
# Check if directory is empty or only contains empty subdirs
|
|
324
|
+
has_content = any(
|
|
325
|
+
f.is_file() or (f.is_dir() and any(f.iterdir()))
|
|
326
|
+
for f in subdir.iterdir()
|
|
327
|
+
)
|
|
328
|
+
|
|
329
|
+
if not has_content:
|
|
330
|
+
shutil.rmtree(subdir)
|
|
331
|
+
cleaned_count += 1
|
|
332
|
+
logger.debug(f" Removed empty directory: {subdir}")
|
|
333
|
+
except Exception as e:
|
|
334
|
+
logger.debug(f" Could not clean up {subdir}: {e}")
|
|
335
|
+
|
|
336
|
+
# Remove doc_dir if it's now empty
|
|
337
|
+
try:
|
|
338
|
+
if doc_dir.exists() and not any(doc_dir.iterdir()):
|
|
339
|
+
doc_dir.rmdir()
|
|
340
|
+
logger.debug(f" Removed empty directory: {doc_dir}")
|
|
341
|
+
except Exception:
|
|
342
|
+
pass
|
|
313
343
|
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
self.images_dir.mkdir(parents=True, exist_ok=True)
|
|
319
|
-
|
|
320
|
-
for img_file in images_dir.glob("*"):
|
|
321
|
-
if img_file.is_file() and img_file.exists():
|
|
322
|
-
target_img = self.images_dir / img_file.name
|
|
323
|
-
if not target_img.exists():
|
|
324
|
-
try:
|
|
325
|
-
# Ensure source file exists
|
|
326
|
-
if not img_file.exists():
|
|
327
|
-
logger.warning(f" ⚠ Source image not found: {img_file}")
|
|
328
|
-
continue
|
|
329
|
-
shutil.copy2(img_file, target_img)
|
|
330
|
-
image_count += 1
|
|
331
|
-
except FileNotFoundError:
|
|
332
|
-
logger.error(
|
|
333
|
-
f" ✗ Error moving image {img_file.name}: Source file not found: {img_file}"
|
|
334
|
-
)
|
|
335
|
-
except Exception as e:
|
|
336
|
-
logger.error(f" ✗ Error moving image {img_file.name}: {e!s}")
|
|
337
|
-
|
|
338
|
-
if image_count > 0:
|
|
339
|
-
logger.info(f" ✓ Moved {image_count} images from {doc_dir.name}/auto/images/")
|
|
340
|
-
|
|
341
|
-
# Clean up nested directories
|
|
342
|
-
for doc_dir in self.content_list_dir.glob("*"):
|
|
343
|
-
if doc_dir.is_dir():
|
|
344
|
-
try:
|
|
345
|
-
shutil.rmtree(doc_dir)
|
|
346
|
-
logger.info(f" ✓ Cleaned up: {doc_dir.name}/")
|
|
347
|
-
except Exception as e:
|
|
348
|
-
logger.error(f" ✗ Error removing {doc_dir.name}: {e!s}")
|
|
349
|
-
|
|
350
|
-
logger.info("✓ Structure fixed!")
|
|
344
|
+
if cleaned_count > 0:
|
|
345
|
+
logger.info(f"✓ Cleaned up {cleaned_count} empty parser directories")
|
|
346
|
+
else:
|
|
347
|
+
logger.info("✓ No cleanup needed (structure already organized)")
|
|
351
348
|
|
|
352
349
|
def extract_numbered_items(self, batch_size: int = 20):
|
|
353
350
|
"""
|
|
@@ -563,6 +560,10 @@ Example usage:
|
|
|
563
560
|
return
|
|
564
561
|
|
|
565
562
|
# Collect document files
|
|
563
|
+
# Use provider from env var or default to raganything (most comprehensive)
|
|
564
|
+
provider = os.getenv("RAG_PROVIDER", "raganything")
|
|
565
|
+
glob_patterns = FileTypeRouter.get_glob_patterns_for_provider(provider)
|
|
566
|
+
|
|
566
567
|
doc_files = []
|
|
567
568
|
if args.docs:
|
|
568
569
|
doc_files.extend(args.docs)
|
|
@@ -570,8 +571,8 @@ Example usage:
|
|
|
570
571
|
if args.docs_dir:
|
|
571
572
|
docs_dir = Path(args.docs_dir)
|
|
572
573
|
if docs_dir.exists() and docs_dir.is_dir():
|
|
573
|
-
for
|
|
574
|
-
doc_files.extend([str(f) for f in docs_dir.glob(
|
|
574
|
+
for pattern in glob_patterns:
|
|
575
|
+
doc_files.extend([str(f) for f in docs_dir.glob(pattern)])
|
|
575
576
|
else:
|
|
576
577
|
logger.error(f"Error: Documents directory not found: {args.docs_dir}")
|
|
577
578
|
return
|