realtimex-deeptutor 0.5.0.post1__py3-none-any.whl
This diff represents the content of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects the package files as they appear in their respective public registries.
- realtimex_deeptutor/__init__.py +67 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
- src/__init__.py +40 -0
- src/agents/__init__.py +24 -0
- src/agents/base_agent.py +657 -0
- src/agents/chat/__init__.py +24 -0
- src/agents/chat/chat_agent.py +435 -0
- src/agents/chat/prompts/en/chat_agent.yaml +35 -0
- src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
- src/agents/chat/session_manager.py +311 -0
- src/agents/co_writer/__init__.py +0 -0
- src/agents/co_writer/edit_agent.py +260 -0
- src/agents/co_writer/narrator_agent.py +423 -0
- src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
- src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
- src/agents/guide/__init__.py +16 -0
- src/agents/guide/agents/__init__.py +11 -0
- src/agents/guide/agents/chat_agent.py +104 -0
- src/agents/guide/agents/interactive_agent.py +223 -0
- src/agents/guide/agents/locate_agent.py +149 -0
- src/agents/guide/agents/summary_agent.py +150 -0
- src/agents/guide/guide_manager.py +500 -0
- src/agents/guide/prompts/en/chat_agent.yaml +41 -0
- src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
- src/agents/guide/prompts/en/locate_agent.yaml +68 -0
- src/agents/guide/prompts/en/summary_agent.yaml +157 -0
- src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
- src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
- src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
- src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
- src/agents/ideagen/__init__.py +12 -0
- src/agents/ideagen/idea_generation_workflow.py +426 -0
- src/agents/ideagen/material_organizer_agent.py +173 -0
- src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
- src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
- src/agents/question/__init__.py +24 -0
- src/agents/question/agents/__init__.py +18 -0
- src/agents/question/agents/generate_agent.py +381 -0
- src/agents/question/agents/relevance_analyzer.py +207 -0
- src/agents/question/agents/retrieve_agent.py +239 -0
- src/agents/question/coordinator.py +718 -0
- src/agents/question/example.py +109 -0
- src/agents/question/prompts/en/coordinator.yaml +75 -0
- src/agents/question/prompts/en/generate_agent.yaml +77 -0
- src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
- src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
- src/agents/question/prompts/zh/coordinator.yaml +75 -0
- src/agents/question/prompts/zh/generate_agent.yaml +77 -0
- src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
- src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
- src/agents/research/agents/__init__.py +23 -0
- src/agents/research/agents/decompose_agent.py +507 -0
- src/agents/research/agents/manager_agent.py +228 -0
- src/agents/research/agents/note_agent.py +180 -0
- src/agents/research/agents/rephrase_agent.py +263 -0
- src/agents/research/agents/reporting_agent.py +1333 -0
- src/agents/research/agents/research_agent.py +714 -0
- src/agents/research/data_structures.py +451 -0
- src/agents/research/main.py +188 -0
- src/agents/research/prompts/en/decompose_agent.yaml +89 -0
- src/agents/research/prompts/en/manager_agent.yaml +24 -0
- src/agents/research/prompts/en/note_agent.yaml +121 -0
- src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/en/reporting_agent.yaml +380 -0
- src/agents/research/prompts/en/research_agent.yaml +173 -0
- src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
- src/agents/research/prompts/zh/manager_agent.yaml +24 -0
- src/agents/research/prompts/zh/note_agent.yaml +121 -0
- src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
- src/agents/research/prompts/zh/research_agent.yaml +173 -0
- src/agents/research/research_pipeline.py +1309 -0
- src/agents/research/utils/__init__.py +60 -0
- src/agents/research/utils/citation_manager.py +799 -0
- src/agents/research/utils/json_utils.py +98 -0
- src/agents/research/utils/token_tracker.py +297 -0
- src/agents/solve/__init__.py +80 -0
- src/agents/solve/analysis_loop/__init__.py +14 -0
- src/agents/solve/analysis_loop/investigate_agent.py +414 -0
- src/agents/solve/analysis_loop/note_agent.py +190 -0
- src/agents/solve/main_solver.py +862 -0
- src/agents/solve/memory/__init__.py +34 -0
- src/agents/solve/memory/citation_memory.py +353 -0
- src/agents/solve/memory/investigate_memory.py +226 -0
- src/agents/solve/memory/solve_memory.py +340 -0
- src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
- src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
- src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
- src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
- src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
- src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
- src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
- src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
- src/agents/solve/solve_loop/__init__.py +22 -0
- src/agents/solve/solve_loop/citation_manager.py +74 -0
- src/agents/solve/solve_loop/manager_agent.py +274 -0
- src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
- src/agents/solve/solve_loop/response_agent.py +301 -0
- src/agents/solve/solve_loop/solve_agent.py +325 -0
- src/agents/solve/solve_loop/tool_agent.py +470 -0
- src/agents/solve/utils/__init__.py +64 -0
- src/agents/solve/utils/config_validator.py +313 -0
- src/agents/solve/utils/display_manager.py +223 -0
- src/agents/solve/utils/error_handler.py +363 -0
- src/agents/solve/utils/json_utils.py +98 -0
- src/agents/solve/utils/performance_monitor.py +407 -0
- src/agents/solve/utils/token_tracker.py +541 -0
- src/api/__init__.py +0 -0
- src/api/main.py +240 -0
- src/api/routers/__init__.py +1 -0
- src/api/routers/agent_config.py +69 -0
- src/api/routers/chat.py +296 -0
- src/api/routers/co_writer.py +337 -0
- src/api/routers/config.py +627 -0
- src/api/routers/dashboard.py +18 -0
- src/api/routers/guide.py +337 -0
- src/api/routers/ideagen.py +436 -0
- src/api/routers/knowledge.py +821 -0
- src/api/routers/notebook.py +247 -0
- src/api/routers/question.py +537 -0
- src/api/routers/research.py +394 -0
- src/api/routers/settings.py +164 -0
- src/api/routers/solve.py +305 -0
- src/api/routers/system.py +252 -0
- src/api/run_server.py +61 -0
- src/api/utils/history.py +172 -0
- src/api/utils/log_interceptor.py +21 -0
- src/api/utils/notebook_manager.py +415 -0
- src/api/utils/progress_broadcaster.py +72 -0
- src/api/utils/task_id_manager.py +100 -0
- src/config/__init__.py +0 -0
- src/config/accessors.py +18 -0
- src/config/constants.py +34 -0
- src/config/defaults.py +18 -0
- src/config/schema.py +38 -0
- src/config/settings.py +50 -0
- src/core/errors.py +62 -0
- src/knowledge/__init__.py +23 -0
- src/knowledge/add_documents.py +606 -0
- src/knowledge/config.py +65 -0
- src/knowledge/example_add_documents.py +236 -0
- src/knowledge/extract_numbered_items.py +1039 -0
- src/knowledge/initializer.py +621 -0
- src/knowledge/kb.py +22 -0
- src/knowledge/manager.py +782 -0
- src/knowledge/progress_tracker.py +182 -0
- src/knowledge/start_kb.py +535 -0
- src/logging/__init__.py +103 -0
- src/logging/adapters/__init__.py +17 -0
- src/logging/adapters/lightrag.py +184 -0
- src/logging/adapters/llamaindex.py +141 -0
- src/logging/config.py +80 -0
- src/logging/handlers/__init__.py +20 -0
- src/logging/handlers/console.py +75 -0
- src/logging/handlers/file.py +201 -0
- src/logging/handlers/websocket.py +127 -0
- src/logging/logger.py +709 -0
- src/logging/stats/__init__.py +16 -0
- src/logging/stats/llm_stats.py +179 -0
- src/services/__init__.py +56 -0
- src/services/config/__init__.py +61 -0
- src/services/config/knowledge_base_config.py +210 -0
- src/services/config/loader.py +260 -0
- src/services/config/unified_config.py +603 -0
- src/services/embedding/__init__.py +45 -0
- src/services/embedding/adapters/__init__.py +22 -0
- src/services/embedding/adapters/base.py +106 -0
- src/services/embedding/adapters/cohere.py +127 -0
- src/services/embedding/adapters/jina.py +99 -0
- src/services/embedding/adapters/ollama.py +116 -0
- src/services/embedding/adapters/openai_compatible.py +96 -0
- src/services/embedding/client.py +159 -0
- src/services/embedding/config.py +156 -0
- src/services/embedding/provider.py +119 -0
- src/services/llm/__init__.py +152 -0
- src/services/llm/capabilities.py +313 -0
- src/services/llm/client.py +302 -0
- src/services/llm/cloud_provider.py +530 -0
- src/services/llm/config.py +200 -0
- src/services/llm/error_mapping.py +103 -0
- src/services/llm/exceptions.py +152 -0
- src/services/llm/factory.py +450 -0
- src/services/llm/local_provider.py +347 -0
- src/services/llm/providers/anthropic.py +95 -0
- src/services/llm/providers/base_provider.py +93 -0
- src/services/llm/providers/open_ai.py +83 -0
- src/services/llm/registry.py +71 -0
- src/services/llm/telemetry.py +40 -0
- src/services/llm/types.py +27 -0
- src/services/llm/utils.py +333 -0
- src/services/prompt/__init__.py +25 -0
- src/services/prompt/manager.py +206 -0
- src/services/rag/__init__.py +64 -0
- src/services/rag/components/__init__.py +29 -0
- src/services/rag/components/base.py +59 -0
- src/services/rag/components/chunkers/__init__.py +18 -0
- src/services/rag/components/chunkers/base.py +34 -0
- src/services/rag/components/chunkers/fixed.py +71 -0
- src/services/rag/components/chunkers/numbered_item.py +94 -0
- src/services/rag/components/chunkers/semantic.py +97 -0
- src/services/rag/components/embedders/__init__.py +14 -0
- src/services/rag/components/embedders/base.py +32 -0
- src/services/rag/components/embedders/openai.py +63 -0
- src/services/rag/components/indexers/__init__.py +18 -0
- src/services/rag/components/indexers/base.py +35 -0
- src/services/rag/components/indexers/graph.py +172 -0
- src/services/rag/components/indexers/lightrag.py +156 -0
- src/services/rag/components/indexers/vector.py +146 -0
- src/services/rag/components/parsers/__init__.py +18 -0
- src/services/rag/components/parsers/base.py +35 -0
- src/services/rag/components/parsers/markdown.py +52 -0
- src/services/rag/components/parsers/pdf.py +115 -0
- src/services/rag/components/parsers/text.py +86 -0
- src/services/rag/components/retrievers/__init__.py +18 -0
- src/services/rag/components/retrievers/base.py +34 -0
- src/services/rag/components/retrievers/dense.py +200 -0
- src/services/rag/components/retrievers/hybrid.py +164 -0
- src/services/rag/components/retrievers/lightrag.py +169 -0
- src/services/rag/components/routing.py +286 -0
- src/services/rag/factory.py +234 -0
- src/services/rag/pipeline.py +215 -0
- src/services/rag/pipelines/__init__.py +32 -0
- src/services/rag/pipelines/academic.py +44 -0
- src/services/rag/pipelines/lightrag.py +43 -0
- src/services/rag/pipelines/llamaindex.py +313 -0
- src/services/rag/pipelines/raganything.py +384 -0
- src/services/rag/service.py +244 -0
- src/services/rag/types.py +73 -0
- src/services/search/__init__.py +284 -0
- src/services/search/base.py +87 -0
- src/services/search/consolidation.py +398 -0
- src/services/search/providers/__init__.py +128 -0
- src/services/search/providers/baidu.py +188 -0
- src/services/search/providers/exa.py +194 -0
- src/services/search/providers/jina.py +161 -0
- src/services/search/providers/perplexity.py +153 -0
- src/services/search/providers/serper.py +209 -0
- src/services/search/providers/tavily.py +161 -0
- src/services/search/types.py +114 -0
- src/services/setup/__init__.py +34 -0
- src/services/setup/init.py +285 -0
- src/services/tts/__init__.py +16 -0
- src/services/tts/config.py +99 -0
- src/tools/__init__.py +91 -0
- src/tools/code_executor.py +536 -0
- src/tools/paper_search_tool.py +171 -0
- src/tools/query_item_tool.py +310 -0
- src/tools/question/__init__.py +15 -0
- src/tools/question/exam_mimic.py +616 -0
- src/tools/question/pdf_parser.py +211 -0
- src/tools/question/question_extractor.py +397 -0
- src/tools/rag_tool.py +173 -0
- src/tools/tex_chunker.py +339 -0
- src/tools/tex_downloader.py +253 -0
- src/tools/web_search.py +71 -0
- src/utils/config_manager.py +206 -0
- src/utils/document_validator.py +168 -0
- src/utils/error_rate_tracker.py +111 -0
- src/utils/error_utils.py +82 -0
- src/utils/json_parser.py +110 -0
- src/utils/network/circuit_breaker.py +79 -0
src/tools/rag_tool.py
ADDED
@@ -0,0 +1,173 @@
#!/usr/bin/env python
"""
RAG Query Tool - Pure tool wrapper for RAG operations

This module provides simple function wrappers for RAG operations.
All logic is delegated to RAGService in src/services/rag/service.py.
"""

import asyncio
from pathlib import Path
from typing import Dict, List, Optional

from dotenv import load_dotenv

# Load environment variables
project_root = Path(__file__).parent.parent.parent
load_dotenv(project_root / "DeepTutor.env", override=False)
load_dotenv(project_root / ".env", override=False)

# Import RAGService as the single entry point
from src.services.rag.service import RAGService


async def rag_search(
    query: str,
    kb_name: Optional[str] = None,
    mode: str = "hybrid",
    provider: Optional[str] = None,
    kb_base_dir: Optional[str] = None,
    **kwargs,
) -> dict:
    """
    Query a knowledge base using the configurable RAG pipeline.

    Args:
        query: Query question
        kb_name: Knowledge base name (optional, defaults to the default knowledge base)
        mode: Query mode (e.g., "hybrid", "local", "global", "naive")
        provider: RAG pipeline to use (defaults to the RAG_PROVIDER env var or "lightrag")
        kb_base_dir: Base directory for knowledge bases (for testing)
        **kwargs: Additional parameters passed to the RAG pipeline

    Returns:
        dict: Dictionary containing query results
            {
                "query": str,
                "answer": str,
                "content": str,
                "mode": str,
                "provider": str
            }

    Raises:
        ValueError: If the specified RAG pipeline is not found
        Exception: If the query fails

    Example:
        # Use default provider (from .env)
        result = await rag_search("What is machine learning?", kb_name="textbook")

        # Override provider
        result = await rag_search("What is ML?", kb_name="textbook", provider="lightrag")
    """
    service = RAGService(kb_base_dir=kb_base_dir, provider=provider)

    try:
        return await service.search(query=query, kb_name=kb_name, mode=mode, **kwargs)
    except Exception as e:
        # Chain the original exception so the root cause stays visible
        raise Exception(f"RAG search failed: {e}") from e


async def initialize_rag(
    kb_name: str,
    documents: List[str],
    provider: Optional[str] = None,
    kb_base_dir: Optional[str] = None,
    **kwargs,
) -> bool:
    """
    Initialize RAG with documents.

    Args:
        kb_name: Knowledge base name
        documents: List of document file paths to index
        provider: RAG pipeline to use (defaults to the RAG_PROVIDER env var)
        kb_base_dir: Base directory for knowledge bases (for testing)
        **kwargs: Additional arguments passed to the pipeline

    Returns:
        True if successful

    Example:
        documents = ["doc1.pdf", "doc2.txt"]
        success = await initialize_rag("my_kb", documents)
    """
    service = RAGService(kb_base_dir=kb_base_dir, provider=provider)
    return await service.initialize(kb_name=kb_name, file_paths=documents, **kwargs)


async def delete_rag(
    kb_name: str,
    provider: Optional[str] = None,
    kb_base_dir: Optional[str] = None,
) -> bool:
    """
    Delete a knowledge base.

    Args:
        kb_name: Knowledge base name
        provider: RAG pipeline to use (defaults to the RAG_PROVIDER env var)
        kb_base_dir: Base directory for knowledge bases (for testing)

    Returns:
        True if successful

    Example:
        success = await delete_rag("old_kb")
    """
    service = RAGService(kb_base_dir=kb_base_dir, provider=provider)
    return await service.delete(kb_name=kb_name)


def get_available_providers() -> List[Dict]:
    """
    Get the list of available RAG pipelines.

    Returns:
        List of pipeline information dictionaries

    Example:
        providers = get_available_providers()
        for p in providers:
            print(f"{p['name']}: {p['description']}")
    """
    return RAGService.list_providers()


def get_current_provider() -> str:
    """Get the currently configured RAG provider"""
    return RAGService.get_current_provider()


# Backward compatibility aliases
get_available_plugins = get_available_providers
list_providers = RAGService.list_providers


if __name__ == "__main__":
    import sys

    if sys.platform == "win32":
        import io

        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")

    # List available providers
    print("Available RAG Pipelines:")
    for provider in get_available_providers():
        print(f"  - {provider['id']}: {provider['description']}")
    print(f"\nCurrent provider: {get_current_provider()}\n")

    # Test search (requires an existing knowledge base)
    result = asyncio.run(
        rag_search(
            "What is the lookup table (LUT) in FPGA?",
            kb_name="DE-all",
            mode="naive",
        )
    )

    print(f"Query: {result['query']}")
    print(f"Answer: {result['answer']}")
    print(f"Provider: {result.get('provider', 'unknown')}")
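For orientation, here is a minimal end-to-end sketch of how these wrappers might be combined: build a knowledge base, query it, then delete it. It assumes a working RAG_PROVIDER configuration and API credentials; the knowledge-base name "demo_kb" and the document paths are hypothetical placeholders, not part of the package.

import asyncio

from src.tools.rag_tool import delete_rag, initialize_rag, rag_search


async def main() -> None:
    # Hypothetical kb name and document paths; assumes RAG_PROVIDER is configured.
    await initialize_rag("demo_kb", ["intro.pdf", "notes.txt"])

    # Query the freshly built knowledge base.
    result = await rag_search("What is a lookup table?", kb_name="demo_kb", mode="hybrid")
    print(result["answer"])

    # Clean up the demo knowledge base.
    await delete_rag("demo_kb")


if __name__ == "__main__":
    asyncio.run(main())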
src/tools/tex_chunker.py
ADDED
@@ -0,0 +1,339 @@
# -*- coding: utf-8 -*-
"""
TeX Chunker - LaTeX text chunking tool

Features:
1. Intelligent chunking of LaTeX content (by section or token count)
2. Token estimation (based on the GPT tokenizer)
3. Maintain context coherence (overlap between chunks)

Author: DeepTutor Team
Version: v1.0
Based on: TODO.md specification
"""

import os
import re

import tiktoken


class TexChunker:
    """LaTeX text chunking tool"""

    def __init__(self, model: str | None = None):
        """
        Initialize the chunking tool

        Args:
            model: Model name (for token estimation). If not provided, read from the LLM_MODEL environment variable
        """
        # Read model configuration from environment variables
        if model is None:
            model = os.getenv("LLM_MODEL")

        try:
            if model:
                self.encoder = tiktoken.encoding_for_model(model)
            else:
                # Use cl100k_base as the default encoding if no model is specified
                self.encoder = tiktoken.get_encoding("cl100k_base")
        except Exception:
            # If the model is not supported, use cl100k_base (GPT-4 encoding)
            self.encoder = tiktoken.get_encoding("cl100k_base")

    def estimate_tokens(self, text: str) -> int:
        """
        Estimate the token count of text

        Args:
            text: Input text

        Returns:
            Token count
        """
        try:
            # Clean text: remove overly long repeated characters (may cause token explosion)
            cleaned_text = self._clean_text(text)
            tokens = self.encoder.encode(cleaned_text)
            return len(tokens)
        except Exception as e:
            # If encoding fails, use a rough estimate: 1 token ≈ 4 chars
            print(f"  ⚠️ Token estimation failed, using rough estimate: {e!s}")
            return len(text) // 4

    def _clean_text(self, text: str) -> str:
        """
        Clean text to prevent token estimation anomalies

        - Remove overly long repeated character sequences
        - Limit single line length
        """
        # Collapse runs of 100+ repeated whitespace characters down to 10
        text = re.sub(r"(\s)\1{100,}", r"\1" * 10, text)

        # Truncate overly long single lines (may be erroneous data)
        lines = text.split("\n")
        cleaned_lines = []
        for line in lines:
            if len(line) > 10000:  # A single line over 10k characters may be problematic
                print(f"  ⚠️ Detected overly long line ({len(line)} characters), truncating")
                line = line[:10000] + "...[truncated]"
            cleaned_lines.append(line)

        return "\n".join(cleaned_lines)

    def split_tex_into_chunks(
        self, tex_content: str, max_tokens: int = 8000, overlap: int = 500
    ) -> list[str]:
        r"""
        Split LaTeX content into chunks

        Strategy:
        1. Prioritize splitting by sections (\section, \subsection)
        2. If a single section is too long, split by paragraphs
        3. Maintain overlap tokens to avoid context loss

        Args:
            tex_content: LaTeX source code
            max_tokens: Maximum tokens per chunk (default: 8000)
            overlap: Overlap tokens between chunks (default: 500)

        Returns:
            List of chunks
        """
        total_tokens = self.estimate_tokens(tex_content)

        # If the total length doesn't exceed max_tokens, return directly
        if total_tokens <= max_tokens:
            return [tex_content]

        print(f"  LaTeX content needs chunking: {total_tokens:,} tokens > {max_tokens:,} tokens")
        print(
            f"  File character count: {len(tex_content):,}, line count: {len(tex_content.splitlines()):,}"
        )

        # 1. Try splitting by sections
        sections = self._split_by_sections(tex_content)

        # 2. Merge sections into chunks
        chunks = []
        current_chunk = ""
        current_tokens = 0

        for section in sections:
            section_tokens = self.estimate_tokens(section)

            if section_tokens > max_tokens:
                # Single section too long, needs further splitting
                if current_chunk:
                    chunks.append(current_chunk)
                    current_chunk = ""
                    current_tokens = 0

                # Split the overly long section by paragraphs
                sub_chunks = self._split_by_paragraphs(section, max_tokens, overlap)
                chunks.extend(sub_chunks)
            # Check whether the section can be merged into the current chunk
            elif current_tokens + section_tokens <= max_tokens:
                current_chunk += section
                current_tokens += section_tokens
            else:
                # Save the current chunk, start a new chunk
                if current_chunk:
                    chunks.append(current_chunk)

                # Add overlap (take part of the end of the previous chunk)
                if chunks and overlap > 0:
                    overlap_text = self._get_overlap_text(chunks[-1], overlap)
                    current_chunk = overlap_text + section
                    current_tokens = self.estimate_tokens(current_chunk)
                else:
                    current_chunk = section
                    current_tokens = section_tokens

        # Save the last chunk
        if current_chunk:
            chunks.append(current_chunk)

        print(f"  Chunking completed: {len(chunks)} chunks")
        return chunks

    def _split_by_sections(self, tex_content: str) -> list[str]:
        """
        Split LaTeX content by sections

        Recognizes:
        - \\section{...}
        - \\subsection{...}
        - \\subsubsection{...}

        Returns:
            List of sections
        """
        # Regex to match section markers
        pattern = r"(\\(?:sub)*section\{[^}]*\})"

        # Split the text
        parts = re.split(pattern, tex_content)

        if len(parts) <= 1:
            # No section markers found, split by paragraphs
            return self._split_by_paragraphs(tex_content, max_tokens=10000, overlap=0)

        # Recombine: merge each section marker with its content
        sections = []
        for i in range(1, len(parts), 2):
            section = parts[i]  # Section marker
            if i + 1 < len(parts):
                section += parts[i + 1]  # Section content
            sections.append(section)

        # Add the preamble part (first element)
        if parts[0].strip():
            sections.insert(0, parts[0])

        return sections

    def _split_by_paragraphs(self, text: str, max_tokens: int, overlap: int) -> list[str]:
        """
        Split text by paragraphs (for overly long sections)

        Args:
            text: Input text
            max_tokens: Maximum tokens per chunk
            overlap: Overlap tokens

        Returns:
            List of paragraph chunks
        """
        # Split into paragraphs on blank lines
        paragraphs = re.split(r"\n\n+", text)

        chunks = []
        current_chunk = ""
        current_tokens = 0

        for para in paragraphs:
            para_tokens = self.estimate_tokens(para)

            if para_tokens > max_tokens:
                # Single paragraph too long, split by sentences
                if current_chunk:
                    chunks.append(current_chunk)
                    current_chunk = ""
                    current_tokens = 0

                # Split by sentences (simple method: split on sentence-ending punctuation)
                sentences = re.split(r"(?<=[.!?])\s+", para)
                for sentence in sentences:
                    sentence_tokens = self.estimate_tokens(sentence)
                    if current_tokens + sentence_tokens <= max_tokens:
                        current_chunk += sentence + " "
                        current_tokens += sentence_tokens
                    else:
                        if current_chunk:
                            chunks.append(current_chunk)
                        current_chunk = sentence + " "
                        current_tokens = sentence_tokens
            # Check whether the paragraph can be merged
            elif current_tokens + para_tokens <= max_tokens:
                current_chunk += para + "\n\n"
                current_tokens += para_tokens
            else:
                # Save the current chunk
                if current_chunk:
                    chunks.append(current_chunk)

                # Add overlap
                if chunks and overlap > 0:
                    overlap_text = self._get_overlap_text(chunks[-1], overlap)
                    current_chunk = overlap_text + para + "\n\n"
                    current_tokens = self.estimate_tokens(current_chunk)
                else:
                    current_chunk = para + "\n\n"
                    current_tokens = para_tokens

        # Save the last chunk
        if current_chunk:
            chunks.append(current_chunk)

        return chunks

    def _get_overlap_text(self, previous_chunk: str, overlap_tokens: int) -> str:
        """
        Extract the overlap portion from the end of the previous chunk

        Args:
            previous_chunk: Previous chunk
            overlap_tokens: Number of overlap tokens

        Returns:
            Overlap text
        """
        # Encode the entire chunk
        tokens = self.encoder.encode(previous_chunk)

        # Take the last overlap_tokens tokens
        if len(tokens) <= overlap_tokens:
            return previous_chunk

        overlap_token_ids = tokens[-overlap_tokens:]
        overlap_text = self.encoder.decode(overlap_token_ids)

        return overlap_text


# ========== Usage Example ==========

if __name__ == "__main__":
    # Create the chunking tool
    chunker = TexChunker(model="gpt-4o")

    # Test text
    test_tex = r"""
\section{Introduction}
This is the introduction section with some content that is moderately long.
It contains multiple paragraphs and discusses the background of the research.

The problem we are addressing is important and has wide applications.

\section{Related Work}
Previous work has explored various approaches to this problem.
Some researchers have used method A, while others prefer method B.

Recent advances in deep learning have opened new possibilities.

\subsection{Deep Learning Approaches}
Neural networks have shown promising results in many tasks.
Convolutional networks are particularly effective for image processing.

\section{Methodology}
Our approach combines the best aspects of previous methods.
We propose a novel architecture that addresses the key limitations.

\subsection{Model Architecture}
The model consists of three main components: encoder, processor, and decoder.
Each component is carefully designed to handle specific aspects of the task.

\section{Experiments}
We conducted extensive experiments on multiple datasets.
The results demonstrate the effectiveness of our approach.
"""

    # Estimate tokens
    total_tokens = chunker.estimate_tokens(test_tex)
    print(f"Total tokens: {total_tokens}")

    # Chunk (use a smaller max_tokens for demonstration)
    chunks = chunker.split_tex_into_chunks(tex_content=test_tex, max_tokens=200, overlap=50)

    print(f"\nChunking result: {len(chunks)} chunks\n")

    for i, chunk in enumerate(chunks, 1):
        chunk_tokens = chunker.estimate_tokens(chunk)
        print(f"Chunk {i} ({chunk_tokens} tokens):")
        print(chunk[:200] + "...\n")
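Beyond the built-in demo above, a minimal sketch of chunking a LaTeX file from disk and inspecting per-chunk token budgets might look like the following. The path "paper.tex" is a hypothetical placeholder, and tiktoken must be installed; chunks aim to stay near the max_tokens budget, though adding overlap can push a chunk slightly over it.

from pathlib import Path

from src.tools.tex_chunker import TexChunker

# Falls back to cl100k_base when LLM_MODEL is unset or unrecognized.
chunker = TexChunker()

# "paper.tex" is a hypothetical input file.
tex_source = Path("paper.tex").read_text(encoding="utf-8")
chunks = chunker.split_tex_into_chunks(tex_source, max_tokens=8000, overlap=500)

# Consecutive chunks share up to `overlap` trailing tokens of context.
for i, chunk in enumerate(chunks, 1):
    print(f"chunk {i}: ~{chunker.estimate_tokens(chunk)} tokens")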