realtimex-deeptutor 0.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. realtimex_deeptutor/__init__.py +67 -0
  2. realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
  3. realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
  4. realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
  5. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
  6. realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
  7. realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
  8. src/__init__.py +40 -0
  9. src/agents/__init__.py +24 -0
  10. src/agents/base_agent.py +657 -0
  11. src/agents/chat/__init__.py +24 -0
  12. src/agents/chat/chat_agent.py +435 -0
  13. src/agents/chat/prompts/en/chat_agent.yaml +35 -0
  14. src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
  15. src/agents/chat/session_manager.py +311 -0
  16. src/agents/co_writer/__init__.py +0 -0
  17. src/agents/co_writer/edit_agent.py +260 -0
  18. src/agents/co_writer/narrator_agent.py +423 -0
  19. src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
  20. src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
  21. src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
  22. src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
  23. src/agents/guide/__init__.py +16 -0
  24. src/agents/guide/agents/__init__.py +11 -0
  25. src/agents/guide/agents/chat_agent.py +104 -0
  26. src/agents/guide/agents/interactive_agent.py +223 -0
  27. src/agents/guide/agents/locate_agent.py +149 -0
  28. src/agents/guide/agents/summary_agent.py +150 -0
  29. src/agents/guide/guide_manager.py +500 -0
  30. src/agents/guide/prompts/en/chat_agent.yaml +41 -0
  31. src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
  32. src/agents/guide/prompts/en/locate_agent.yaml +68 -0
  33. src/agents/guide/prompts/en/summary_agent.yaml +157 -0
  34. src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
  35. src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
  36. src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
  37. src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
  38. src/agents/ideagen/__init__.py +12 -0
  39. src/agents/ideagen/idea_generation_workflow.py +426 -0
  40. src/agents/ideagen/material_organizer_agent.py +173 -0
  41. src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
  42. src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
  43. src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
  44. src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
  45. src/agents/question/__init__.py +24 -0
  46. src/agents/question/agents/__init__.py +18 -0
  47. src/agents/question/agents/generate_agent.py +381 -0
  48. src/agents/question/agents/relevance_analyzer.py +207 -0
  49. src/agents/question/agents/retrieve_agent.py +239 -0
  50. src/agents/question/coordinator.py +718 -0
  51. src/agents/question/example.py +109 -0
  52. src/agents/question/prompts/en/coordinator.yaml +75 -0
  53. src/agents/question/prompts/en/generate_agent.yaml +77 -0
  54. src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
  55. src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
  56. src/agents/question/prompts/zh/coordinator.yaml +75 -0
  57. src/agents/question/prompts/zh/generate_agent.yaml +77 -0
  58. src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
  59. src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
  60. src/agents/research/agents/__init__.py +23 -0
  61. src/agents/research/agents/decompose_agent.py +507 -0
  62. src/agents/research/agents/manager_agent.py +228 -0
  63. src/agents/research/agents/note_agent.py +180 -0
  64. src/agents/research/agents/rephrase_agent.py +263 -0
  65. src/agents/research/agents/reporting_agent.py +1333 -0
  66. src/agents/research/agents/research_agent.py +714 -0
  67. src/agents/research/data_structures.py +451 -0
  68. src/agents/research/main.py +188 -0
  69. src/agents/research/prompts/en/decompose_agent.yaml +89 -0
  70. src/agents/research/prompts/en/manager_agent.yaml +24 -0
  71. src/agents/research/prompts/en/note_agent.yaml +121 -0
  72. src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
  73. src/agents/research/prompts/en/reporting_agent.yaml +380 -0
  74. src/agents/research/prompts/en/research_agent.yaml +173 -0
  75. src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
  76. src/agents/research/prompts/zh/manager_agent.yaml +24 -0
  77. src/agents/research/prompts/zh/note_agent.yaml +121 -0
  78. src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
  79. src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
  80. src/agents/research/prompts/zh/research_agent.yaml +173 -0
  81. src/agents/research/research_pipeline.py +1309 -0
  82. src/agents/research/utils/__init__.py +60 -0
  83. src/agents/research/utils/citation_manager.py +799 -0
  84. src/agents/research/utils/json_utils.py +98 -0
  85. src/agents/research/utils/token_tracker.py +297 -0
  86. src/agents/solve/__init__.py +80 -0
  87. src/agents/solve/analysis_loop/__init__.py +14 -0
  88. src/agents/solve/analysis_loop/investigate_agent.py +414 -0
  89. src/agents/solve/analysis_loop/note_agent.py +190 -0
  90. src/agents/solve/main_solver.py +862 -0
  91. src/agents/solve/memory/__init__.py +34 -0
  92. src/agents/solve/memory/citation_memory.py +353 -0
  93. src/agents/solve/memory/investigate_memory.py +226 -0
  94. src/agents/solve/memory/solve_memory.py +340 -0
  95. src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
  96. src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
  97. src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
  98. src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
  99. src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
  100. src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
  101. src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
  102. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
  103. src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
  104. src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
  105. src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
  106. src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
  107. src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
  108. src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
  109. src/agents/solve/solve_loop/__init__.py +22 -0
  110. src/agents/solve/solve_loop/citation_manager.py +74 -0
  111. src/agents/solve/solve_loop/manager_agent.py +274 -0
  112. src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
  113. src/agents/solve/solve_loop/response_agent.py +301 -0
  114. src/agents/solve/solve_loop/solve_agent.py +325 -0
  115. src/agents/solve/solve_loop/tool_agent.py +470 -0
  116. src/agents/solve/utils/__init__.py +64 -0
  117. src/agents/solve/utils/config_validator.py +313 -0
  118. src/agents/solve/utils/display_manager.py +223 -0
  119. src/agents/solve/utils/error_handler.py +363 -0
  120. src/agents/solve/utils/json_utils.py +98 -0
  121. src/agents/solve/utils/performance_monitor.py +407 -0
  122. src/agents/solve/utils/token_tracker.py +541 -0
  123. src/api/__init__.py +0 -0
  124. src/api/main.py +240 -0
  125. src/api/routers/__init__.py +1 -0
  126. src/api/routers/agent_config.py +69 -0
  127. src/api/routers/chat.py +296 -0
  128. src/api/routers/co_writer.py +337 -0
  129. src/api/routers/config.py +627 -0
  130. src/api/routers/dashboard.py +18 -0
  131. src/api/routers/guide.py +337 -0
  132. src/api/routers/ideagen.py +436 -0
  133. src/api/routers/knowledge.py +821 -0
  134. src/api/routers/notebook.py +247 -0
  135. src/api/routers/question.py +537 -0
  136. src/api/routers/research.py +394 -0
  137. src/api/routers/settings.py +164 -0
  138. src/api/routers/solve.py +305 -0
  139. src/api/routers/system.py +252 -0
  140. src/api/run_server.py +61 -0
  141. src/api/utils/history.py +172 -0
  142. src/api/utils/log_interceptor.py +21 -0
  143. src/api/utils/notebook_manager.py +415 -0
  144. src/api/utils/progress_broadcaster.py +72 -0
  145. src/api/utils/task_id_manager.py +100 -0
  146. src/config/__init__.py +0 -0
  147. src/config/accessors.py +18 -0
  148. src/config/constants.py +34 -0
  149. src/config/defaults.py +18 -0
  150. src/config/schema.py +38 -0
  151. src/config/settings.py +50 -0
  152. src/core/errors.py +62 -0
  153. src/knowledge/__init__.py +23 -0
  154. src/knowledge/add_documents.py +606 -0
  155. src/knowledge/config.py +65 -0
  156. src/knowledge/example_add_documents.py +236 -0
  157. src/knowledge/extract_numbered_items.py +1039 -0
  158. src/knowledge/initializer.py +621 -0
  159. src/knowledge/kb.py +22 -0
  160. src/knowledge/manager.py +782 -0
  161. src/knowledge/progress_tracker.py +182 -0
  162. src/knowledge/start_kb.py +535 -0
  163. src/logging/__init__.py +103 -0
  164. src/logging/adapters/__init__.py +17 -0
  165. src/logging/adapters/lightrag.py +184 -0
  166. src/logging/adapters/llamaindex.py +141 -0
  167. src/logging/config.py +80 -0
  168. src/logging/handlers/__init__.py +20 -0
  169. src/logging/handlers/console.py +75 -0
  170. src/logging/handlers/file.py +201 -0
  171. src/logging/handlers/websocket.py +127 -0
  172. src/logging/logger.py +709 -0
  173. src/logging/stats/__init__.py +16 -0
  174. src/logging/stats/llm_stats.py +179 -0
  175. src/services/__init__.py +56 -0
  176. src/services/config/__init__.py +61 -0
  177. src/services/config/knowledge_base_config.py +210 -0
  178. src/services/config/loader.py +260 -0
  179. src/services/config/unified_config.py +603 -0
  180. src/services/embedding/__init__.py +45 -0
  181. src/services/embedding/adapters/__init__.py +22 -0
  182. src/services/embedding/adapters/base.py +106 -0
  183. src/services/embedding/adapters/cohere.py +127 -0
  184. src/services/embedding/adapters/jina.py +99 -0
  185. src/services/embedding/adapters/ollama.py +116 -0
  186. src/services/embedding/adapters/openai_compatible.py +96 -0
  187. src/services/embedding/client.py +159 -0
  188. src/services/embedding/config.py +156 -0
  189. src/services/embedding/provider.py +119 -0
  190. src/services/llm/__init__.py +152 -0
  191. src/services/llm/capabilities.py +313 -0
  192. src/services/llm/client.py +302 -0
  193. src/services/llm/cloud_provider.py +530 -0
  194. src/services/llm/config.py +200 -0
  195. src/services/llm/error_mapping.py +103 -0
  196. src/services/llm/exceptions.py +152 -0
  197. src/services/llm/factory.py +450 -0
  198. src/services/llm/local_provider.py +347 -0
  199. src/services/llm/providers/anthropic.py +95 -0
  200. src/services/llm/providers/base_provider.py +93 -0
  201. src/services/llm/providers/open_ai.py +83 -0
  202. src/services/llm/registry.py +71 -0
  203. src/services/llm/telemetry.py +40 -0
  204. src/services/llm/types.py +27 -0
  205. src/services/llm/utils.py +333 -0
  206. src/services/prompt/__init__.py +25 -0
  207. src/services/prompt/manager.py +206 -0
  208. src/services/rag/__init__.py +64 -0
  209. src/services/rag/components/__init__.py +29 -0
  210. src/services/rag/components/base.py +59 -0
  211. src/services/rag/components/chunkers/__init__.py +18 -0
  212. src/services/rag/components/chunkers/base.py +34 -0
  213. src/services/rag/components/chunkers/fixed.py +71 -0
  214. src/services/rag/components/chunkers/numbered_item.py +94 -0
  215. src/services/rag/components/chunkers/semantic.py +97 -0
  216. src/services/rag/components/embedders/__init__.py +14 -0
  217. src/services/rag/components/embedders/base.py +32 -0
  218. src/services/rag/components/embedders/openai.py +63 -0
  219. src/services/rag/components/indexers/__init__.py +18 -0
  220. src/services/rag/components/indexers/base.py +35 -0
  221. src/services/rag/components/indexers/graph.py +172 -0
  222. src/services/rag/components/indexers/lightrag.py +156 -0
  223. src/services/rag/components/indexers/vector.py +146 -0
  224. src/services/rag/components/parsers/__init__.py +18 -0
  225. src/services/rag/components/parsers/base.py +35 -0
  226. src/services/rag/components/parsers/markdown.py +52 -0
  227. src/services/rag/components/parsers/pdf.py +115 -0
  228. src/services/rag/components/parsers/text.py +86 -0
  229. src/services/rag/components/retrievers/__init__.py +18 -0
  230. src/services/rag/components/retrievers/base.py +34 -0
  231. src/services/rag/components/retrievers/dense.py +200 -0
  232. src/services/rag/components/retrievers/hybrid.py +164 -0
  233. src/services/rag/components/retrievers/lightrag.py +169 -0
  234. src/services/rag/components/routing.py +286 -0
  235. src/services/rag/factory.py +234 -0
  236. src/services/rag/pipeline.py +215 -0
  237. src/services/rag/pipelines/__init__.py +32 -0
  238. src/services/rag/pipelines/academic.py +44 -0
  239. src/services/rag/pipelines/lightrag.py +43 -0
  240. src/services/rag/pipelines/llamaindex.py +313 -0
  241. src/services/rag/pipelines/raganything.py +384 -0
  242. src/services/rag/service.py +244 -0
  243. src/services/rag/types.py +73 -0
  244. src/services/search/__init__.py +284 -0
  245. src/services/search/base.py +87 -0
  246. src/services/search/consolidation.py +398 -0
  247. src/services/search/providers/__init__.py +128 -0
  248. src/services/search/providers/baidu.py +188 -0
  249. src/services/search/providers/exa.py +194 -0
  250. src/services/search/providers/jina.py +161 -0
  251. src/services/search/providers/perplexity.py +153 -0
  252. src/services/search/providers/serper.py +209 -0
  253. src/services/search/providers/tavily.py +161 -0
  254. src/services/search/types.py +114 -0
  255. src/services/setup/__init__.py +34 -0
  256. src/services/setup/init.py +285 -0
  257. src/services/tts/__init__.py +16 -0
  258. src/services/tts/config.py +99 -0
  259. src/tools/__init__.py +91 -0
  260. src/tools/code_executor.py +536 -0
  261. src/tools/paper_search_tool.py +171 -0
  262. src/tools/query_item_tool.py +310 -0
  263. src/tools/question/__init__.py +15 -0
  264. src/tools/question/exam_mimic.py +616 -0
  265. src/tools/question/pdf_parser.py +211 -0
  266. src/tools/question/question_extractor.py +397 -0
  267. src/tools/rag_tool.py +173 -0
  268. src/tools/tex_chunker.py +339 -0
  269. src/tools/tex_downloader.py +253 -0
  270. src/tools/web_search.py +71 -0
  271. src/utils/config_manager.py +206 -0
  272. src/utils/document_validator.py +168 -0
  273. src/utils/error_rate_tracker.py +111 -0
  274. src/utils/error_utils.py +82 -0
  275. src/utils/json_parser.py +110 -0
  276. src/utils/network/circuit_breaker.py +79 -0
src/tools/rag_tool.py ADDED
@@ -0,0 +1,173 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ RAG Query Tool - Pure tool wrapper for RAG operations
4
+
5
+ This module provides simple function wrappers for RAG operations.
6
+ All logic is delegated to RAGService in src/services/rag/service.py.
7
+ """
8
+
9
+ import asyncio
10
+ from pathlib import Path
11
+ from typing import Dict, List, Optional
12
+
13
+ from dotenv import load_dotenv
14
+
15
# Load environment variables.
# project_root resolves three levels up from this file (src/tools/rag_tool.py -> repo root).
project_root = Path(__file__).parent.parent.parent
# override=False: values already present in the process environment win over file values.
load_dotenv(project_root / "DeepTutor.env", override=False)
load_dotenv(project_root / ".env", override=False)
19
+
20
+ # Import RAGService as the single entry point
21
+ from src.services.rag.service import RAGService
22
+
23
+
24
async def rag_search(
    query: str,
    kb_name: Optional[str] = None,
    mode: str = "hybrid",
    provider: Optional[str] = None,
    kb_base_dir: Optional[str] = None,
    **kwargs,
) -> dict:
    """
    Query a knowledge base using the configurable RAG pipeline.

    Args:
        query: Query question
        kb_name: Knowledge base name (optional, defaults to default knowledge base)
        mode: Query mode (e.g., "hybrid", "local", "global", "naive")
        provider: RAG pipeline to use (defaults to RAG_PROVIDER env var or "lightrag")
        kb_base_dir: Base directory for knowledge bases (for testing)
        **kwargs: Additional parameters passed to the RAG pipeline

    Returns:
        dict: Dictionary containing query results
            {
                "query": str,
                "answer": str,
                "content": str,
                "mode": str,
                "provider": str
            }

    Raises:
        RuntimeError: If the query fails (original exception is chained as __cause__)

    Example:
        # Use default provider (from .env)
        result = await rag_search("What is machine learning?", kb_name="textbook")

        # Override provider
        result = await rag_search("What is ML?", kb_name="textbook", provider="lightrag")
    """
    service = RAGService(kb_base_dir=kb_base_dir, provider=provider)

    try:
        return await service.search(query=query, kb_name=kb_name, mode=mode, **kwargs)
    except Exception as e:
        # Fix: chain with `from e` so the original traceback survives, and raise
        # RuntimeError instead of bare Exception. RuntimeError is a subclass of
        # Exception, so existing `except Exception` handlers still match.
        raise RuntimeError(f"RAG search failed: {e}") from e
70
+
71
+
72
async def initialize_rag(
    kb_name: str,
    documents: List[str],
    provider: Optional[str] = None,
    kb_base_dir: Optional[str] = None,
    **kwargs,
) -> bool:
    """
    Build (index) a knowledge base from a list of document files.

    Args:
        kb_name: Name of the knowledge base to create or update
        documents: Paths of the document files to index
        provider: RAG pipeline identifier (falls back to the RAG_PROVIDER env var)
        kb_base_dir: Root directory holding knowledge bases (useful in tests)
        **kwargs: Extra options forwarded verbatim to the pipeline

    Returns:
        True if successful

    Example:
        documents = ["doc1.pdf", "doc2.txt"]
        success = await initialize_rag("my_kb", documents)
    """
    rag_service = RAGService(kb_base_dir=kb_base_dir, provider=provider)
    result = await rag_service.initialize(kb_name=kb_name, file_paths=documents, **kwargs)
    return result
98
+
99
+
100
async def delete_rag(
    kb_name: str,
    provider: Optional[str] = None,
    kb_base_dir: Optional[str] = None,
) -> bool:
    """
    Remove a knowledge base entirely.

    Args:
        kb_name: Name of the knowledge base to delete
        provider: RAG pipeline identifier (falls back to the RAG_PROVIDER env var)
        kb_base_dir: Root directory holding knowledge bases (useful in tests)

    Returns:
        True if successful

    Example:
        success = await delete_rag("old_kb")
    """
    rag_service = RAGService(kb_base_dir=kb_base_dir, provider=provider)
    deleted = await rag_service.delete(kb_name=kb_name)
    return deleted
121
+
122
+
123
def get_available_providers() -> List[Dict]:
    """
    List the RAG pipelines that are available for selection.

    Returns:
        One information dictionary per registered pipeline

    Example:
        providers = get_available_providers()
        for p in providers:
            print(f"{p['name']}: {p['description']}")
    """
    providers = RAGService.list_providers()
    return providers
136
+
137
+
138
def get_current_provider() -> str:
    """Return the name of the RAG provider that is currently configured."""
    provider_name = RAGService.get_current_provider()
    return provider_name
141
+
142
+
143
# Backward compatibility aliases
# Older callers imported these names directly; keep them pointing at the new API
# so existing imports continue to work.
get_available_plugins = get_available_providers
list_providers = RAGService.list_providers
146
+
147
+
148
if __name__ == "__main__":
    import sys

    # Windows consoles may not default to UTF-8; rewrap stdout so the
    # provider descriptions (which can contain non-ASCII) print cleanly.
    if sys.platform == "win32":
        import io

        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")

    # Show every registered pipeline before running the demo query.
    print("Available RAG Pipelines:")
    for entry in get_available_providers():
        print(f"  - {entry['id']}: {entry['description']}")
    print(f"\nCurrent provider: {get_current_provider()}\n")

    # Demo search (requires an existing knowledge base named "DE-all").
    demo_result = asyncio.run(
        rag_search(
            "What is the lookup table (LUT) in FPGA?",
            kb_name="DE-all",
            mode="naive",
        )
    )

    print(f"Query: {demo_result['query']}")
    print(f"Answer: {demo_result['answer']}")
    print(f"Provider: {demo_result.get('provider', 'unknown')}")
@@ -0,0 +1,339 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ TeX Chunker - LaTeX text chunking tool
4
+
5
+ Features:
6
+ 1. Intelligent chunking of LaTeX content (by section or token count)
7
+ 2. Token estimation (based on GPT tokenizer)
8
+ 3. Maintain context coherence (overlap between chunks)
9
+
10
+ Author: DeepTutor Team
11
+ Version: v1.0
12
+ Based on: TODO.md specification
13
+ """
14
+
15
+ import os
16
+ import re
17
+
18
+ import tiktoken
19
+
20
+
21
class TexChunker:
    """LaTeX text chunking tool.

    Splits LaTeX source into token-bounded chunks. Section boundaries are
    preferred; overly long sections fall back to paragraph splits and then
    sentence splits. Consecutive chunks may share an overlap so downstream
    consumers keep some context across chunk boundaries.
    """

    def __init__(self, model: str | None = None):
        """
        Initialize chunking tool.

        Args:
            model: Model name (for token estimation). If not provided, read
                from the LLM_MODEL environment variable.
        """
        # Read model configuration from environment variables
        if model is None:
            model = os.getenv("LLM_MODEL")

        try:
            if model:
                self.encoder = tiktoken.encoding_for_model(model)
            else:
                # Use cl100k_base as default encoding if no model specified
                self.encoder = tiktoken.get_encoding("cl100k_base")
        except Exception:
            # If model not supported, use cl100k_base (GPT-4 encoding)
            self.encoder = tiktoken.get_encoding("cl100k_base")

    def estimate_tokens(self, text: str) -> int:
        """
        Estimate token count of text.

        Args:
            text: Input text

        Returns:
            Token count (falls back to a rough char-based estimate on failure)
        """
        try:
            # Clean text: remove overly long repeated characters (may cause token explosion)
            cleaned_text = self._clean_text(text)
            tokens = self.encoder.encode(cleaned_text)
            return len(tokens)
        except Exception as e:
            # If encoding fails, use rough estimate: 1 token ≈ 4 chars
            print(f" ⚠️ Token estimation failed, using rough estimate: {e!s}")
            return len(text) // 4

    def _clean_text(self, text: str) -> str:
        """
        Clean text to prevent token estimation anomalies.

        - Remove overly long repeated character sequences
        - Limit single line length
        """
        # Fix: removed a redundant local `import re`; `re` is already imported
        # at module level.

        # Remove overly long repeated characters (e.g., consecutive spaces, newlines, etc.)
        text = re.sub(r"(\s)\1{100,}", r"\1" * 10, text)

        # Remove overly long single lines (may be erroneous data)
        lines = text.split("\n")
        cleaned_lines = []
        for line in lines:
            if len(line) > 10000:  # Single line over 10k characters, may be problematic
                print(f" ⚠️ Detected overly long line ({len(line)} characters), truncating")
                line = line[:10000] + "...[truncated]"
            cleaned_lines.append(line)

        return "\n".join(cleaned_lines)

    def split_tex_into_chunks(
        self, tex_content: str, max_tokens: int = 8000, overlap: int = 500
    ) -> list[str]:
        r"""
        Split LaTeX content into chunks.

        Strategy:
        1. Prioritize splitting by sections (\section, \subsection)
        2. If single section is too long, split by paragraphs
        3. Maintain overlap tokens to avoid context loss

        Args:
            tex_content: LaTeX source code
            max_tokens: Maximum tokens per chunk (default: 8000)
            overlap: Overlap tokens between chunks (default: 500)

        Returns:
            List of chunks
        """
        total_tokens = self.estimate_tokens(tex_content)

        # If total length doesn't exceed max_tokens, return directly
        if total_tokens <= max_tokens:
            return [tex_content]

        print(f" LaTeX content needs chunking: {total_tokens:,} tokens > {max_tokens:,} tokens")
        print(
            f" File character count: {len(tex_content):,}, line count: {len(tex_content.splitlines()):,}"
        )

        # 1. Try splitting by sections
        sections = self._split_by_sections(tex_content)

        # 2. Merge sections into chunks
        chunks = []
        current_chunk = ""
        current_tokens = 0

        for section in sections:
            section_tokens = self.estimate_tokens(section)

            if section_tokens > max_tokens:
                # Single section too long, need further splitting
                if current_chunk:
                    chunks.append(current_chunk)
                    current_chunk = ""
                    current_tokens = 0

                # Split overly long section by paragraphs
                sub_chunks = self._split_by_paragraphs(section, max_tokens, overlap)
                chunks.extend(sub_chunks)
            # Check if can merge into current chunk
            elif current_tokens + section_tokens <= max_tokens:
                current_chunk += section
                current_tokens += section_tokens
            else:
                # Save current chunk, start new chunk
                if current_chunk:
                    chunks.append(current_chunk)

                # Add overlap (take part from end of current chunk)
                if chunks and overlap > 0:
                    overlap_text = self._get_overlap_text(chunks[-1], overlap)
                    current_chunk = overlap_text + section
                    current_tokens = self.estimate_tokens(current_chunk)
                else:
                    current_chunk = section
                    current_tokens = section_tokens

        # Save last chunk
        if current_chunk:
            chunks.append(current_chunk)

        print(f" Chunking completed: {len(chunks)} chunks")
        return chunks

    def _split_by_sections(self, tex_content: str) -> list[str]:
        """
        Split LaTeX content by sections.

        Recognizes:
        - \\section{...}
        - \\subsection{...}
        - \\subsubsection{...}

        Returns:
            List of sections
        """
        # Regex match section markers
        pattern = r"(\\(?:sub)*section\{[^}]*\})"

        # Split text (capturing group keeps the markers in the result list)
        parts = re.split(pattern, tex_content)

        if len(parts) <= 1:
            # No section markers found, split by paragraphs
            return self._split_by_paragraphs(tex_content, max_tokens=10000, overlap=0)

        # Recombine: merge section markers and content.
        # re.split with a capture group yields [pre, marker, body, marker, body, ...],
        # so markers live at odd indices.
        sections = []
        for i in range(1, len(parts), 2):
            if i < len(parts):
                section = parts[i]  # Section marker
                if i + 1 < len(parts):
                    section += parts[i + 1]  # Section content
                sections.append(section)

        # Add preamble part (first element)
        if parts[0].strip():
            sections.insert(0, parts[0])

        return sections

    def _split_by_paragraphs(self, text: str, max_tokens: int, overlap: int) -> list[str]:
        """
        Split text by paragraphs (for overly long sections).

        Args:
            text: Input text
            max_tokens: Maximum tokens per chunk
            overlap: Overlap tokens

        Returns:
            List of paragraph chunks
        """
        # Split paragraphs by double newlines
        paragraphs = re.split(r"\n\n+", text)

        chunks = []
        current_chunk = ""
        current_tokens = 0

        for para in paragraphs:
            para_tokens = self.estimate_tokens(para)

            if para_tokens > max_tokens:
                # Single paragraph too long, split by sentences
                if current_chunk:
                    chunks.append(current_chunk)
                    current_chunk = ""
                    current_tokens = 0

                # Split by sentences (simple method: split by periods)
                sentences = re.split(r"(?<=[.!?])\s+", para)
                for sentence in sentences:
                    sentence_tokens = self.estimate_tokens(sentence)
                    if current_tokens + sentence_tokens <= max_tokens:
                        current_chunk += sentence + " "
                        current_tokens += sentence_tokens
                    else:
                        if current_chunk:
                            chunks.append(current_chunk)
                        current_chunk = sentence + " "
                        current_tokens = sentence_tokens
            # Check if can merge
            elif current_tokens + para_tokens <= max_tokens:
                current_chunk += para + "\n\n"
                current_tokens += para_tokens
            else:
                # Save current chunk
                if current_chunk:
                    chunks.append(current_chunk)

                # Add overlap
                if chunks and overlap > 0:
                    overlap_text = self._get_overlap_text(chunks[-1], overlap)
                    current_chunk = overlap_text + para + "\n\n"
                    current_tokens = self.estimate_tokens(current_chunk)
                else:
                    current_chunk = para + "\n\n"
                    current_tokens = para_tokens

        # Save last chunk
        if current_chunk:
            chunks.append(current_chunk)

        return chunks

    def _get_overlap_text(self, previous_chunk: str, overlap_tokens: int) -> str:
        """
        Extract overlap portion from end of previous chunk.

        Args:
            previous_chunk: Previous chunk
            overlap_tokens: Number of overlap tokens

        Returns:
            Overlap text (the whole chunk if it is shorter than the overlap)
        """
        # Encode entire chunk
        tokens = self.encoder.encode(previous_chunk)

        # Take last overlap_tokens tokens
        if len(tokens) <= overlap_tokens:
            return previous_chunk

        overlap_token_ids = tokens[-overlap_tokens:]
        overlap_text = self.encoder.decode(overlap_token_ids)

        return overlap_text
288
+
289
+
290
+ # ========== Usage Example ==========
291
+
292
if __name__ == "__main__":
    # Build the chunker with an explicit model so the tokenizer is deterministic.
    chunker = TexChunker(model="gpt-4o")

    # Sample LaTeX document used to exercise the chunking strategies.
    test_tex = r"""
    \section{Introduction}
    This is the introduction section with some content that is moderately long.
    It contains multiple paragraphs and discusses the background of the research.

    The problem we are addressing is important and has wide applications.

    \section{Related Work}
    Previous work has explored various approaches to this problem.
    Some researchers have used method A, while others prefer method B.

    Recent advances in deep learning have opened new possibilities.

    \subsection{Deep Learning Approaches}
    Neural networks have shown promising results in many tasks.
    Convolutional networks are particularly effective for image processing.

    \section{Methodology}
    Our approach combines the best aspects of previous methods.
    We propose a novel architecture that addresses the key limitations.

    \subsection{Model Architecture}
    The model consists of three main components: encoder, processor, and decoder.
    Each component is carefully designed to handle specific aspects of the task.

    \section{Experiments}
    We conducted extensive experiments on multiple datasets.
    The results demonstrate the effectiveness of our approach.
    """

    # Report the overall token count first.
    total_tokens = chunker.estimate_tokens(test_tex)
    print(f"Total tokens: {total_tokens}")

    # Chunk with deliberately small limits so the demo produces several chunks.
    chunks = chunker.split_tex_into_chunks(tex_content=test_tex, max_tokens=200, overlap=50)

    print(f"\nChunking result: {len(chunks)} chunks\n")

    # Print a short preview of each chunk together with its token count.
    for index, piece in enumerate(chunks, 1):
        piece_tokens = chunker.estimate_tokens(piece)
        print(f"Chunk {index} ({piece_tokens} tokens):")
        print(piece[:200] + "...\n")
+ print(chunk[:200] + "...\n")