realtimex-deeptutor 0.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. realtimex_deeptutor/__init__.py +67 -0
  2. realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
  3. realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
  4. realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
  5. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
  6. realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
  7. realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
  8. src/__init__.py +40 -0
  9. src/agents/__init__.py +24 -0
  10. src/agents/base_agent.py +657 -0
  11. src/agents/chat/__init__.py +24 -0
  12. src/agents/chat/chat_agent.py +435 -0
  13. src/agents/chat/prompts/en/chat_agent.yaml +35 -0
  14. src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
  15. src/agents/chat/session_manager.py +311 -0
  16. src/agents/co_writer/__init__.py +0 -0
  17. src/agents/co_writer/edit_agent.py +260 -0
  18. src/agents/co_writer/narrator_agent.py +423 -0
  19. src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
  20. src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
  21. src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
  22. src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
  23. src/agents/guide/__init__.py +16 -0
  24. src/agents/guide/agents/__init__.py +11 -0
  25. src/agents/guide/agents/chat_agent.py +104 -0
  26. src/agents/guide/agents/interactive_agent.py +223 -0
  27. src/agents/guide/agents/locate_agent.py +149 -0
  28. src/agents/guide/agents/summary_agent.py +150 -0
  29. src/agents/guide/guide_manager.py +500 -0
  30. src/agents/guide/prompts/en/chat_agent.yaml +41 -0
  31. src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
  32. src/agents/guide/prompts/en/locate_agent.yaml +68 -0
  33. src/agents/guide/prompts/en/summary_agent.yaml +157 -0
  34. src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
  35. src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
  36. src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
  37. src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
  38. src/agents/ideagen/__init__.py +12 -0
  39. src/agents/ideagen/idea_generation_workflow.py +426 -0
  40. src/agents/ideagen/material_organizer_agent.py +173 -0
  41. src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
  42. src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
  43. src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
  44. src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
  45. src/agents/question/__init__.py +24 -0
  46. src/agents/question/agents/__init__.py +18 -0
  47. src/agents/question/agents/generate_agent.py +381 -0
  48. src/agents/question/agents/relevance_analyzer.py +207 -0
  49. src/agents/question/agents/retrieve_agent.py +239 -0
  50. src/agents/question/coordinator.py +718 -0
  51. src/agents/question/example.py +109 -0
  52. src/agents/question/prompts/en/coordinator.yaml +75 -0
  53. src/agents/question/prompts/en/generate_agent.yaml +77 -0
  54. src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
  55. src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
  56. src/agents/question/prompts/zh/coordinator.yaml +75 -0
  57. src/agents/question/prompts/zh/generate_agent.yaml +77 -0
  58. src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
  59. src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
  60. src/agents/research/agents/__init__.py +23 -0
  61. src/agents/research/agents/decompose_agent.py +507 -0
  62. src/agents/research/agents/manager_agent.py +228 -0
  63. src/agents/research/agents/note_agent.py +180 -0
  64. src/agents/research/agents/rephrase_agent.py +263 -0
  65. src/agents/research/agents/reporting_agent.py +1333 -0
  66. src/agents/research/agents/research_agent.py +714 -0
  67. src/agents/research/data_structures.py +451 -0
  68. src/agents/research/main.py +188 -0
  69. src/agents/research/prompts/en/decompose_agent.yaml +89 -0
  70. src/agents/research/prompts/en/manager_agent.yaml +24 -0
  71. src/agents/research/prompts/en/note_agent.yaml +121 -0
  72. src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
  73. src/agents/research/prompts/en/reporting_agent.yaml +380 -0
  74. src/agents/research/prompts/en/research_agent.yaml +173 -0
  75. src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
  76. src/agents/research/prompts/zh/manager_agent.yaml +24 -0
  77. src/agents/research/prompts/zh/note_agent.yaml +121 -0
  78. src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
  79. src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
  80. src/agents/research/prompts/zh/research_agent.yaml +173 -0
  81. src/agents/research/research_pipeline.py +1309 -0
  82. src/agents/research/utils/__init__.py +60 -0
  83. src/agents/research/utils/citation_manager.py +799 -0
  84. src/agents/research/utils/json_utils.py +98 -0
  85. src/agents/research/utils/token_tracker.py +297 -0
  86. src/agents/solve/__init__.py +80 -0
  87. src/agents/solve/analysis_loop/__init__.py +14 -0
  88. src/agents/solve/analysis_loop/investigate_agent.py +414 -0
  89. src/agents/solve/analysis_loop/note_agent.py +190 -0
  90. src/agents/solve/main_solver.py +862 -0
  91. src/agents/solve/memory/__init__.py +34 -0
  92. src/agents/solve/memory/citation_memory.py +353 -0
  93. src/agents/solve/memory/investigate_memory.py +226 -0
  94. src/agents/solve/memory/solve_memory.py +340 -0
  95. src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
  96. src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
  97. src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
  98. src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
  99. src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
  100. src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
  101. src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
  102. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
  103. src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
  104. src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
  105. src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
  106. src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
  107. src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
  108. src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
  109. src/agents/solve/solve_loop/__init__.py +22 -0
  110. src/agents/solve/solve_loop/citation_manager.py +74 -0
  111. src/agents/solve/solve_loop/manager_agent.py +274 -0
  112. src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
  113. src/agents/solve/solve_loop/response_agent.py +301 -0
  114. src/agents/solve/solve_loop/solve_agent.py +325 -0
  115. src/agents/solve/solve_loop/tool_agent.py +470 -0
  116. src/agents/solve/utils/__init__.py +64 -0
  117. src/agents/solve/utils/config_validator.py +313 -0
  118. src/agents/solve/utils/display_manager.py +223 -0
  119. src/agents/solve/utils/error_handler.py +363 -0
  120. src/agents/solve/utils/json_utils.py +98 -0
  121. src/agents/solve/utils/performance_monitor.py +407 -0
  122. src/agents/solve/utils/token_tracker.py +541 -0
  123. src/api/__init__.py +0 -0
  124. src/api/main.py +240 -0
  125. src/api/routers/__init__.py +1 -0
  126. src/api/routers/agent_config.py +69 -0
  127. src/api/routers/chat.py +296 -0
  128. src/api/routers/co_writer.py +337 -0
  129. src/api/routers/config.py +627 -0
  130. src/api/routers/dashboard.py +18 -0
  131. src/api/routers/guide.py +337 -0
  132. src/api/routers/ideagen.py +436 -0
  133. src/api/routers/knowledge.py +821 -0
  134. src/api/routers/notebook.py +247 -0
  135. src/api/routers/question.py +537 -0
  136. src/api/routers/research.py +394 -0
  137. src/api/routers/settings.py +164 -0
  138. src/api/routers/solve.py +305 -0
  139. src/api/routers/system.py +252 -0
  140. src/api/run_server.py +61 -0
  141. src/api/utils/history.py +172 -0
  142. src/api/utils/log_interceptor.py +21 -0
  143. src/api/utils/notebook_manager.py +415 -0
  144. src/api/utils/progress_broadcaster.py +72 -0
  145. src/api/utils/task_id_manager.py +100 -0
  146. src/config/__init__.py +0 -0
  147. src/config/accessors.py +18 -0
  148. src/config/constants.py +34 -0
  149. src/config/defaults.py +18 -0
  150. src/config/schema.py +38 -0
  151. src/config/settings.py +50 -0
  152. src/core/errors.py +62 -0
  153. src/knowledge/__init__.py +23 -0
  154. src/knowledge/add_documents.py +606 -0
  155. src/knowledge/config.py +65 -0
  156. src/knowledge/example_add_documents.py +236 -0
  157. src/knowledge/extract_numbered_items.py +1039 -0
  158. src/knowledge/initializer.py +621 -0
  159. src/knowledge/kb.py +22 -0
  160. src/knowledge/manager.py +782 -0
  161. src/knowledge/progress_tracker.py +182 -0
  162. src/knowledge/start_kb.py +535 -0
  163. src/logging/__init__.py +103 -0
  164. src/logging/adapters/__init__.py +17 -0
  165. src/logging/adapters/lightrag.py +184 -0
  166. src/logging/adapters/llamaindex.py +141 -0
  167. src/logging/config.py +80 -0
  168. src/logging/handlers/__init__.py +20 -0
  169. src/logging/handlers/console.py +75 -0
  170. src/logging/handlers/file.py +201 -0
  171. src/logging/handlers/websocket.py +127 -0
  172. src/logging/logger.py +709 -0
  173. src/logging/stats/__init__.py +16 -0
  174. src/logging/stats/llm_stats.py +179 -0
  175. src/services/__init__.py +56 -0
  176. src/services/config/__init__.py +61 -0
  177. src/services/config/knowledge_base_config.py +210 -0
  178. src/services/config/loader.py +260 -0
  179. src/services/config/unified_config.py +603 -0
  180. src/services/embedding/__init__.py +45 -0
  181. src/services/embedding/adapters/__init__.py +22 -0
  182. src/services/embedding/adapters/base.py +106 -0
  183. src/services/embedding/adapters/cohere.py +127 -0
  184. src/services/embedding/adapters/jina.py +99 -0
  185. src/services/embedding/adapters/ollama.py +116 -0
  186. src/services/embedding/adapters/openai_compatible.py +96 -0
  187. src/services/embedding/client.py +159 -0
  188. src/services/embedding/config.py +156 -0
  189. src/services/embedding/provider.py +119 -0
  190. src/services/llm/__init__.py +152 -0
  191. src/services/llm/capabilities.py +313 -0
  192. src/services/llm/client.py +302 -0
  193. src/services/llm/cloud_provider.py +530 -0
  194. src/services/llm/config.py +200 -0
  195. src/services/llm/error_mapping.py +103 -0
  196. src/services/llm/exceptions.py +152 -0
  197. src/services/llm/factory.py +450 -0
  198. src/services/llm/local_provider.py +347 -0
  199. src/services/llm/providers/anthropic.py +95 -0
  200. src/services/llm/providers/base_provider.py +93 -0
  201. src/services/llm/providers/open_ai.py +83 -0
  202. src/services/llm/registry.py +71 -0
  203. src/services/llm/telemetry.py +40 -0
  204. src/services/llm/types.py +27 -0
  205. src/services/llm/utils.py +333 -0
  206. src/services/prompt/__init__.py +25 -0
  207. src/services/prompt/manager.py +206 -0
  208. src/services/rag/__init__.py +64 -0
  209. src/services/rag/components/__init__.py +29 -0
  210. src/services/rag/components/base.py +59 -0
  211. src/services/rag/components/chunkers/__init__.py +18 -0
  212. src/services/rag/components/chunkers/base.py +34 -0
  213. src/services/rag/components/chunkers/fixed.py +71 -0
  214. src/services/rag/components/chunkers/numbered_item.py +94 -0
  215. src/services/rag/components/chunkers/semantic.py +97 -0
  216. src/services/rag/components/embedders/__init__.py +14 -0
  217. src/services/rag/components/embedders/base.py +32 -0
  218. src/services/rag/components/embedders/openai.py +63 -0
  219. src/services/rag/components/indexers/__init__.py +18 -0
  220. src/services/rag/components/indexers/base.py +35 -0
  221. src/services/rag/components/indexers/graph.py +172 -0
  222. src/services/rag/components/indexers/lightrag.py +156 -0
  223. src/services/rag/components/indexers/vector.py +146 -0
  224. src/services/rag/components/parsers/__init__.py +18 -0
  225. src/services/rag/components/parsers/base.py +35 -0
  226. src/services/rag/components/parsers/markdown.py +52 -0
  227. src/services/rag/components/parsers/pdf.py +115 -0
  228. src/services/rag/components/parsers/text.py +86 -0
  229. src/services/rag/components/retrievers/__init__.py +18 -0
  230. src/services/rag/components/retrievers/base.py +34 -0
  231. src/services/rag/components/retrievers/dense.py +200 -0
  232. src/services/rag/components/retrievers/hybrid.py +164 -0
  233. src/services/rag/components/retrievers/lightrag.py +169 -0
  234. src/services/rag/components/routing.py +286 -0
  235. src/services/rag/factory.py +234 -0
  236. src/services/rag/pipeline.py +215 -0
  237. src/services/rag/pipelines/__init__.py +32 -0
  238. src/services/rag/pipelines/academic.py +44 -0
  239. src/services/rag/pipelines/lightrag.py +43 -0
  240. src/services/rag/pipelines/llamaindex.py +313 -0
  241. src/services/rag/pipelines/raganything.py +384 -0
  242. src/services/rag/service.py +244 -0
  243. src/services/rag/types.py +73 -0
  244. src/services/search/__init__.py +284 -0
  245. src/services/search/base.py +87 -0
  246. src/services/search/consolidation.py +398 -0
  247. src/services/search/providers/__init__.py +128 -0
  248. src/services/search/providers/baidu.py +188 -0
  249. src/services/search/providers/exa.py +194 -0
  250. src/services/search/providers/jina.py +161 -0
  251. src/services/search/providers/perplexity.py +153 -0
  252. src/services/search/providers/serper.py +209 -0
  253. src/services/search/providers/tavily.py +161 -0
  254. src/services/search/types.py +114 -0
  255. src/services/setup/__init__.py +34 -0
  256. src/services/setup/init.py +285 -0
  257. src/services/tts/__init__.py +16 -0
  258. src/services/tts/config.py +99 -0
  259. src/tools/__init__.py +91 -0
  260. src/tools/code_executor.py +536 -0
  261. src/tools/paper_search_tool.py +171 -0
  262. src/tools/query_item_tool.py +310 -0
  263. src/tools/question/__init__.py +15 -0
  264. src/tools/question/exam_mimic.py +616 -0
  265. src/tools/question/pdf_parser.py +211 -0
  266. src/tools/question/question_extractor.py +397 -0
  267. src/tools/rag_tool.py +173 -0
  268. src/tools/tex_chunker.py +339 -0
  269. src/tools/tex_downloader.py +253 -0
  270. src/tools/web_search.py +71 -0
  271. src/utils/config_manager.py +206 -0
  272. src/utils/document_validator.py +168 -0
  273. src/utils/error_rate_tracker.py +111 -0
  274. src/utils/error_utils.py +82 -0
  275. src/utils/json_parser.py +110 -0
  276. src/utils/network/circuit_breaker.py +79 -0
@@ -0,0 +1,253 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ TeX Downloader - LaTeX source code download tool
4
+
5
+ Features:
6
+ 1. Download LaTeX source from ArXiv
7
+ 2. Extract and locate main tex file
8
+ 3. Read tex content
9
+
10
+ Author: DeepTutor Team
11
+ Version: v1.0
12
+ Based on: TODO.md specification
13
+ """
14
+
15
+ import os
16
+ from pathlib import Path
17
+ import re
18
+ import shutil
19
+ import tarfile
20
+ import tempfile
21
+ import zipfile
22
+
23
+ import requests
24
+
25
+
26
+ class TexDownloadResult:
27
+ """LaTeX download result"""
28
+
29
+ def __init__(
30
+ self,
31
+ success: bool,
32
+ tex_path: str | None = None,
33
+ tex_content: str | None = None,
34
+ error: str | None = None,
35
+ ):
36
+ self.success = success
37
+ self.tex_path = tex_path
38
+ self.tex_content = tex_content
39
+ self.error = error
40
+
41
+
42
+ class TexDownloader:
43
+ """LaTeX source code download tool"""
44
+
45
+ def __init__(self, workspace_dir: str):
46
+ """
47
+ Initialize downloader
48
+
49
+ Args:
50
+ workspace_dir: Workspace directory (for saving downloaded files)
51
+ """
52
+ self.workspace_dir = Path(workspace_dir)
53
+ self.workspace_dir.mkdir(parents=True, exist_ok=True)
54
+
55
+ def download_arxiv_source(
56
+ self, arxiv_url: str, arxiv_id: str | None = None
57
+ ) -> TexDownloadResult:
58
+ """
59
+ Download LaTeX source from ArXiv
60
+
61
+ Args:
62
+ arxiv_url: ArXiv paper URL
63
+ arxiv_id: ArXiv ID (optional, if not in URL)
64
+
65
+ Returns:
66
+ TexDownloadResult object
67
+ """
68
+ # Extract ArXiv ID
69
+ if not arxiv_id:
70
+ arxiv_id = self._extract_arxiv_id(arxiv_url)
71
+
72
+ if not arxiv_id:
73
+ return TexDownloadResult(success=False, error="Unable to extract ArXiv ID")
74
+
75
+ try:
76
+ # Build source download URL
77
+ source_url = f"https://arxiv.org/e-print/{arxiv_id}"
78
+
79
+ # Download source package
80
+ print(f" Downloading source: {source_url}")
81
+ response = requests.get(source_url, timeout=30)
82
+ response.raise_for_status()
83
+
84
+ # Create temporary directory
85
+ temp_dir = tempfile.mkdtemp(dir=self.workspace_dir)
86
+
87
+ # Save source package
88
+ source_file = Path(temp_dir) / f"{arxiv_id}_source"
89
+ with open(source_file, "wb") as f:
90
+ f.write(response.content)
91
+
92
+ # Extract source package
93
+ extract_dir = Path(temp_dir) / "extracted"
94
+ extract_dir.mkdir(exist_ok=True)
95
+
96
+ if self._is_tar_file(source_file):
97
+ self._extract_tar(source_file, extract_dir)
98
+ elif self._is_zip_file(source_file):
99
+ self._extract_zip(source_file, extract_dir)
100
+ else:
101
+ # Might be a single tex file
102
+ shutil.copy(source_file, extract_dir / f"{arxiv_id}.tex")
103
+
104
+ # Find main tex file
105
+ main_tex = self._find_main_tex(extract_dir)
106
+
107
+ if not main_tex:
108
+ return TexDownloadResult(success=False, error="Main tex file not found")
109
+
110
+ # Read tex content
111
+ tex_content = self._read_tex_file(main_tex)
112
+
113
+ # Move to permanent location
114
+ paper_dir = self.workspace_dir / f"paper_{arxiv_id}"
115
+ paper_dir.mkdir(exist_ok=True)
116
+
117
+ final_tex_path = paper_dir / "main.tex"
118
+ shutil.copy(main_tex, final_tex_path)
119
+
120
+ # Clean up temporary directory
121
+ shutil.rmtree(temp_dir, ignore_errors=True)
122
+
123
+ return TexDownloadResult(
124
+ success=True, tex_path=str(final_tex_path), tex_content=tex_content
125
+ )
126
+
127
+ except requests.exceptions.RequestException as e:
128
+ return TexDownloadResult(success=False, error=f"Download failed: {e!s}")
129
+ except Exception as e:
130
+ return TexDownloadResult(success=False, error=f"Processing failed: {e!s}")
131
+
132
+ def _extract_arxiv_id(self, url: str) -> str | None:
133
+ """Extract ArXiv ID from URL"""
134
+ match = re.search(r"arxiv\.org/(?:abs|pdf)/(\d+\.\d+)", url)
135
+ if match:
136
+ return match.group(1)
137
+ return None
138
+
139
+ def _is_tar_file(self, file_path: Path) -> bool:
140
+ """Check if file is a tar file"""
141
+ try:
142
+ with tarfile.open(file_path, "r:*") as tar:
143
+ return True
144
+ except:
145
+ return False
146
+
147
+ def _is_zip_file(self, file_path: Path) -> bool:
148
+ """Check if file is a zip file"""
149
+ try:
150
+ with zipfile.ZipFile(file_path, "r") as zip_file:
151
+ return True
152
+ except:
153
+ return False
154
+
155
+ def _extract_tar(self, tar_path: Path, extract_dir: Path):
156
+ """Extract tar file safely (prevent ZipSlip/TarSlip)"""
157
+ with tarfile.open(tar_path, "r:*") as tar:
158
+ # Safe extraction filter
159
+ def is_within_directory(directory, target):
160
+ abs_directory = os.path.abspath(directory)
161
+ abs_target = os.path.abspath(target)
162
+ prefix = os.path.commonprefix([abs_directory, abs_target])
163
+ return prefix == abs_directory
164
+
165
+ def safe_members(members):
166
+ for member in members:
167
+ member_path = os.path.join(extract_dir, member.name)
168
+ if not is_within_directory(extract_dir, member_path):
169
+ print(f"Suspicious file path in tar: {member.name}. Skipping.")
170
+ continue
171
+ yield member
172
+
173
+ tar.extractall(extract_dir, members=safe_members(tar))
174
+
175
+ def _extract_zip(self, zip_path: Path, extract_dir: Path):
176
+ """Extract zip file"""
177
+ with zipfile.ZipFile(zip_path, "r") as zip_file:
178
+ zip_file.extractall(extract_dir)
179
+
180
+ def _find_main_tex(self, directory: Path) -> Path | None:
181
+ """
182
+ Find main tex file
183
+
184
+ Priority:
185
+ 1. main.tex
186
+ 2. paper.tex
187
+ 3. Tex file containing \\documentclass
188
+ 4. Largest tex file
189
+ """
190
+ tex_files = list(directory.rglob("*.tex"))
191
+
192
+ if not tex_files:
193
+ return None
194
+
195
+ # 1. Find main.tex or paper.tex
196
+ for name in ["main.tex", "paper.tex", "manuscript.tex"]:
197
+ for tex_file in tex_files:
198
+ if tex_file.name.lower() == name:
199
+ return tex_file
200
+
201
+ # 2. Find file containing \documentclass
202
+ for tex_file in tex_files:
203
+ try:
204
+ content = tex_file.read_text(encoding="utf-8", errors="ignore")
205
+ if r"\documentclass" in content:
206
+ return tex_file
207
+ except:
208
+ continue
209
+
210
+ # 3. Return largest tex file
211
+ largest_tex = max(tex_files, key=lambda f: f.stat().st_size)
212
+ return largest_tex
213
+
214
+ def _read_tex_file(self, tex_path: Path) -> str:
215
+ """Read tex file content"""
216
+ try:
217
+ return tex_path.read_text(encoding="utf-8", errors="ignore")
218
+ except Exception as e:
219
+ raise Exception(f"Failed to read tex file: {e!s}")
220
+
221
+
222
def read_tex_file(tex_path: str) -> str:
    """
    Convenience wrapper: return the text of a tex file.

    The file is decoded as UTF-8; bytes that cannot be decoded are dropped.

    Args:
        tex_path: tex file path

    Returns:
        tex content
    """
    with open(tex_path, encoding="utf-8", errors="ignore") as handle:
        return handle.read()
233
+
234
+
235
+ # ========== Usage Example ==========
236
+
237
if __name__ == "__main__":
    # Manual smoke test: fetch one well-known paper into a local workspace.
    downloader = TexDownloader(workspace_dir="./test_workspace")

    result = downloader.download_arxiv_source(
        arxiv_url="https://arxiv.org/abs/1706.03762",  # Attention is All You Need
        arxiv_id="1706.03762",
    )

    if not result.success:
        print(f"✗ Download failed: {result.error}")
    else:
        print("✓ Download successful!")
        print(f" File path: {result.tex_path}")
        print(f" Content length: {len(result.tex_content)} characters")
        print(f" Content preview: {result.tex_content[:500]}...")
@@ -0,0 +1,71 @@
1
+ """
2
+ Web Search Tool - Simple entry point for agents
3
+
4
+ This module provides a simple interface to the web search service.
5
+ All search logic is implemented in src/services/search/.
6
+
7
+ Usage:
8
+ from src.tools.web_search import web_search
9
+
10
+ # Simple usage
11
+ result = web_search("What is AI?")
12
+
13
+ # With provider
14
+ result = web_search("What is AI?", provider="tavily")
15
+
16
+ Environment Variables:
17
+ - SEARCH_PROVIDER: Default search provider (default: perplexity)
18
+ - SEARCH_API_KEY: Unified API key for all providers
19
+
20
+ Available Providers:
21
+ - perplexity: AI-powered search (default)
22
+ - baidu: Baidu AI Search
23
+ - tavily: Research-focused with optional answers
24
+ - exa: Neural/embeddings search with summaries
25
+ - serper: Google SERP results
26
+ - jina: SERP with full content extraction
27
+ """
28
+
29
+ # Re-export from services layer
30
+ from src.services.search import (
31
+ CONSOLIDATION_TYPES,
32
+ PROVIDER_TEMPLATES,
33
+ SEARCH_API_KEY_ENV,
34
+ AnswerConsolidator,
35
+ BaseSearchProvider,
36
+ Citation,
37
+ SearchProvider,
38
+ SearchResult,
39
+ WebSearchResponse,
40
+ get_available_providers,
41
+ get_current_config,
42
+ get_default_provider,
43
+ get_provider,
44
+ get_providers_info,
45
+ list_providers,
46
+ web_search,
47
+ )
48
+
49
+ __all__ = [
50
+ # Main function
51
+ "web_search",
52
+ "get_current_config",
53
+ # Provider management
54
+ "get_provider",
55
+ "list_providers",
56
+ "get_available_providers",
57
+ "get_default_provider",
58
+ "get_providers_info",
59
+ # Types
60
+ "WebSearchResponse",
61
+ "Citation",
62
+ "SearchResult",
63
+ # Consolidation
64
+ "AnswerConsolidator",
65
+ "CONSOLIDATION_TYPES",
66
+ "PROVIDER_TEMPLATES",
67
+ # Base class
68
+ "BaseSearchProvider",
69
+ "SearchProvider",
70
+ "SEARCH_API_KEY_ENV",
71
+ ]
@@ -0,0 +1,206 @@
1
+ import logging
2
+ import os
3
+ from pathlib import Path
4
+ import tempfile
5
+ from threading import Lock
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ from dotenv import dotenv_values, load_dotenv
9
+ from pydantic import ValidationError
10
+ import yaml
11
+
12
+ from ..config.defaults import DEFAULTS
13
+
14
+ # Use package-relative imports to avoid PYTHONPATH issues
15
+ from ..config.schema import AppConfig, migrate_config
16
+ from ..core.errors import ConfigError
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class ConfigManager:
    """
    Thread-safe manager for reading and writing configuration files.
    Primarily manages config/main.yaml and reads .env.

    Governance additions:
    - Schema validation via pydantic (AppConfig); invalid configs are rejected.
    - Versioned migrations via migrate_config.
    - Atomic writes with temp file and os.replace; creates main.yaml.bak.
    - Single lock guards mtime read, load, and save.
    - Deterministic YAML dumps; returns deep copies.
    - Layered env: .env, then .env.local (override), then process env.

    The lock is a plain (non-reentrant) ``Lock``: public methods acquire it
    once and delegate to ``*_locked`` helpers, which must never re-acquire it.
    """

    _instance: Optional["ConfigManager"] = None
    _lock = Lock()

    def __new__(cls, project_root: Optional[Path] = None):
        """Return the process-wide singleton, creating it on first use."""
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    # Finish setting up the object before publishing it so no
                    # other thread can observe it without `_initialized`.
                    instance = super(ConfigManager, cls).__new__(cls)
                    instance._initialized = False
                    cls._instance = instance
        return cls._instance

    def __init__(self, project_root: Optional[Path] = None):
        """
        Set paths and load the layered .env files (first construction only).

        Args:
            project_root: Directory containing ``config/`` and the .env files;
                defaults to three levels above this module.
        """
        if getattr(self, "_initialized", False):
            return

        self.project_root = project_root or Path(__file__).parent.parent.parent
        self.config_path = self.project_root / "config" / "main.yaml"
        self._config_cache: Dict[str, Any] = {}
        self._last_mtime: float = 0.0
        self._initialized = True

        # Layered env loading
        load_dotenv(dotenv_path=self.project_root / ".env", override=False)
        load_dotenv(dotenv_path=self.project_root / ".env.local", override=True)

    def _load_env_file(self, path: Path) -> Dict[str, str]:
        """Load a .env file and return non-None values as strings."""
        if not path.exists():
            return {}
        return {k: str(v) for k, v in dotenv_values(path).items() if v is not None}

    def _read_yaml(self) -> Dict[str, Any]:
        """Read the main YAML configuration file safely."""
        if not self.config_path.exists():
            return {}
        with open(self.config_path, "r", encoding="utf-8") as f:
            return yaml.safe_load(f) or {}

    def _deep_update(self, target: Dict[str, Any], source: Dict[str, Any]) -> None:
        """
        Recursively merge ``source`` into ``target`` in place.

        Nested dicts from ``source`` are copied structurally rather than
        stored by reference, so a later merge into ``target`` can never
        mutate ``source`` (e.g. the module-level DEFAULTS).
        """
        for key, value in source.items():
            if isinstance(value, dict):
                node = target.get(key)
                if not isinstance(node, dict):
                    # Non-dict (or missing) target value is replaced wholesale.
                    node = target[key] = {}
                self._deep_update(node, value)
            else:
                target[key] = value

    def _validate_and_migrate(self, raw: Dict[str, Any]) -> Dict[str, Any]:
        """
        Overlay ``raw`` on DEFAULTS, run migrations and validate the result.

        Raises:
            ConfigError: If the migrated config fails AppConfig validation.
        """
        merged: Dict[str, Any] = {}
        self._deep_update(merged, DEFAULTS)
        self._deep_update(merged, raw)
        migrated = migrate_config(merged)
        try:
            return AppConfig(**migrated).dict()
        except ValidationError as e:
            raise ConfigError(
                "Config validation failed", details={"errors": e.errors()}
            ) from e

    def load_config(self, force_reload: bool = False) -> Dict[str, Any]:
        """
        Load configuration from main.yaml.
        Uses caching based on file modification time and validates against schema.

        Returns:
            A deep copy of the validated config, or {} when the file is
            missing or cannot be loaded/validated.
        """
        with self._lock:
            return self._load_config_locked(force_reload)

    def _load_config_locked(self, force_reload: bool = False) -> Dict[str, Any]:
        """Implementation of load_config; the caller must already hold ``_lock``."""
        if not self.config_path.exists():
            logger.info("Config not found at %s", self.config_path)
            self._config_cache = {}
            self._last_mtime = 0
            return {}

        current_mtime = self.config_path.stat().st_mtime
        # `!=` rather than `>`: a file restored with an older mtime must also
        # invalidate the cache.
        if not self._config_cache or force_reload or current_mtime != self._last_mtime:
            try:
                raw = self._read_yaml()
                validated = self._validate_and_migrate(raw)
                self._config_cache = validated
                self._last_mtime = current_mtime
            except ConfigError as ce:
                logger.error("%s", ce, extra={"context": getattr(ce, "context", {})})
                return {}
            except Exception as e:
                logger.exception("Error loading config: %s", e)
                return {}

        # deep copy via dump/load for immutability
        return yaml.safe_load(yaml.safe_dump(self._config_cache, sort_keys=False)) or {}

    def save_config(self, config: Dict[str, Any]) -> bool:
        """
        Save configuration to main.yaml.
        Deep-merges provided config with existing one; writes atomically.
        Rejects invalid configs per schema.

        Returns:
            True on success, False when validation or writing fails.
        """
        import shutil

        try:
            with self._lock:
                # Use the lock-free helper: calling load_config() here would
                # try to take the non-reentrant lock again and deadlock.
                current = self._load_config_locked(force_reload=True)
                self._deep_update(current, config)
                validated = self._validate_and_migrate(current)

                self.config_path.parent.mkdir(parents=True, exist_ok=True)
                yaml_str = yaml.safe_dump(
                    validated,
                    default_flow_style=False,
                    allow_unicode=True,
                    sort_keys=False,
                )

                # Atomic write with backup
                fd, tmp_path = tempfile.mkstemp(
                    prefix="main.yaml.", dir=str(self.config_path.parent)
                )
                try:
                    with os.fdopen(fd, "w", encoding="utf-8") as tmp:
                        tmp.write(yaml_str)
                        tmp.flush()
                        os.fsync(tmp.fileno())
                    backup_path = self.config_path.with_suffix(".yaml.bak")
                    if self.config_path.exists():
                        try:
                            # Copy (not move) so main.yaml never disappears
                            # between the backup and the final replace.
                            shutil.copy2(self.config_path, backup_path)
                        except Exception:
                            logger.debug("Backup copy failed; continuing.")
                    os.replace(tmp_path, self.config_path)
                    self._config_cache = validated
                    self._last_mtime = self.config_path.stat().st_mtime
                    return True
                finally:
                    # Only present if the replace did not happen.
                    if os.path.exists(tmp_path):
                        try:
                            os.remove(tmp_path)
                        except Exception:
                            pass
        except ConfigError as ce:
            logger.error(
                "Refusing to save invalid config: %s",
                ce,
                extra={"context": getattr(ce, "context", {})},
            )
            return False
        except Exception as e:
            logger.exception("Error saving config: %s", e)
            return False

    def get_env_info(self) -> Dict[str, str]:
        """
        Read relevant environment variables using layered .env files and process env.
        Returns only non-sensitive metadata.
        """
        env_path = self.project_root / ".env"
        local_path = self.project_root / ".env.local"
        parsed_env = self._load_env_file(env_path)
        parsed_env.update(self._load_env_file(local_path))

        def _get(key: str, default: str = "") -> str:
            # Values from the .env files win; the process env is the fallback.
            return str(parsed_env.get(key) or os.environ.get(key, default))

        return {
            "model": _get("LLM_MODEL", DEFAULTS.get("llm", {}).get("model", "Pro/Flash")),
        }

    def validate_required_env(self, keys: List[str]) -> Dict[str, List[str]]:
        """
        Report which of ``keys`` are unset or empty in both the .env files and
        the process environment.

        Returns:
            ``{"missing": [...]}`` listing the absent keys (possibly empty).
        """
        env_path = self.project_root / ".env"
        local_path = self.project_root / ".env.local"
        parsed_env = self._load_env_file(env_path)
        parsed_env.update(self._load_env_file(local_path))
        missing = [k for k in keys if not (parsed_env.get(k) or os.environ.get(k))]
        if missing:
            logger.warning("Missing required env keys", extra={"missing": missing})
        return {"missing": missing}

    @classmethod
    def reset_for_tests(cls) -> None:
        """Reset singleton to allow re-initialization in tests with a different project_root."""
        with cls._lock:
            cls._instance = None
@@ -0,0 +1,168 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Document Validator - Validation utilities for document uploads
5
+ """
6
+
7
+ import mimetypes
8
+ import os
9
+ import re
10
+ from typing import ClassVar
11
+
12
+
13
+ class DocumentValidator:
14
+ """Document validation utilities"""
15
+
16
+ # Maximum file size in bytes (100MB)
17
+ MAX_FILE_SIZE: ClassVar[int] = 100 * 1024 * 1024
18
+
19
+ # Maximum file size for PDF processing (50MB to prevent resource exhaustion)
20
+ MAX_PDF_SIZE: ClassVar[int] = 50 * 1024 * 1024
21
+
22
+ # Allowed file extensions
23
+ ALLOWED_EXTENSIONS: ClassVar[set[str]] = {
24
+ ".pdf",
25
+ ".txt",
26
+ ".md",
27
+ ".doc",
28
+ ".docx",
29
+ ".rtf",
30
+ ".html",
31
+ ".htm",
32
+ ".xml",
33
+ ".json",
34
+ ".csv",
35
+ ".xlsx",
36
+ ".xls",
37
+ ".pptx",
38
+ ".ppt",
39
+ }
40
+
41
+ # MIME type mapping for additional validation
42
+ ALLOWED_MIME_TYPES: ClassVar[set[str]] = {
43
+ "application/pdf",
44
+ "text/plain",
45
+ "text/markdown",
46
+ "application/msword",
47
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
48
+ "application/rtf",
49
+ "text/html",
50
+ "application/xml",
51
+ "text/xml",
52
+ "application/json",
53
+ "text/csv",
54
+ "application/vnd.ms-excel",
55
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
56
+ "application/vnd.ms-powerpoint",
57
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
58
+ }
59
+
60
+ @staticmethod
61
+ def validate_upload_safety(
62
+ filename: str, file_size: int | None, allowed_extensions: set[str] | None = None
63
+ ) -> str:
64
+ """
65
+ Validate file upload safety
66
+
67
+ Args:
68
+ filename: Name of the file
69
+ file_size: Size of the file in bytes, or None to skip size validation
70
+ allowed_extensions: Optional override for allowed extensions
71
+
72
+ Returns:
73
+ Sanitized filename safe for filesystem use
74
+
75
+ Raises:
76
+ ValueError: If validation fails
77
+ """
78
+ # Check file size (skip if size is None)
79
+ if file_size is not None and file_size > DocumentValidator.MAX_FILE_SIZE:
80
+ raise ValueError(
81
+ f"File too large: {file_size} bytes. Maximum allowed: {DocumentValidator.MAX_FILE_SIZE} bytes"
82
+ )
83
+
84
+ # Additional size check for PDFs to prevent resource exhaustion
85
+ _, ext = os.path.splitext(filename.lower())
86
+ if ext == ".pdf" and file_size is not None and file_size > DocumentValidator.MAX_PDF_SIZE:
87
+ raise ValueError(
88
+ f"PDF file too large: {file_size} bytes. Maximum allowed for PDFs: {DocumentValidator.MAX_PDF_SIZE} bytes"
89
+ )
90
+
91
+ # Sanitize filename - remove path components and dangerous characters
92
+ # Extract just the filename, removing any path components
93
+ safe_name = os.path.basename(filename)
94
+ # Remove null bytes and other control characters
95
+ safe_name = re.sub(r"[\x00-\x1f\x7f]", "", safe_name)
96
+ # Replace problematic characters
97
+ safe_name = re.sub(r'[<>:"/\\|?*]', "_", safe_name)
98
+
99
+ if not safe_name or safe_name in (".", "..") or safe_name.strip("_") == "":
100
+ raise ValueError("Invalid filename")
101
+
102
+ # Check file extension
103
+ exts_to_check = allowed_extensions or DocumentValidator.ALLOWED_EXTENSIONS
104
+ if ext not in exts_to_check:
105
+ raise ValueError(
106
+ f"Unsupported file type: {ext}. Allowed types: {', '.join(exts_to_check)}"
107
+ )
108
+
109
+ # Additional MIME type validation for security
110
+ guessed_mime, _ = mimetypes.guess_type(filename)
111
+ if guessed_mime and guessed_mime not in DocumentValidator.ALLOWED_MIME_TYPES:
112
+ raise ValueError(
113
+ f"MIME type validation failed: {guessed_mime}. File may be malicious or corrupted."
114
+ )
115
+
116
+ return safe_name
117
+
118
+ @staticmethod
119
+ def get_file_info(filename: str, file_size: int) -> dict:
120
+ """
121
+ Get file information
122
+
123
+ Args:
124
+ filename: Name of the file
125
+ file_size: Size of the file in bytes
126
+
127
+ Returns:
128
+ Dictionary with file information
129
+ """
130
+ _, ext = os.path.splitext(filename.lower())
131
+ return {
132
+ "filename": filename,
133
+ "extension": ext,
134
+ "size_bytes": file_size,
135
+ "size_mb": round(file_size / (1024 * 1024), 2),
136
+ "is_allowed": ext in DocumentValidator.ALLOWED_EXTENSIONS,
137
+ }
138
+
139
+ @staticmethod
140
+ def validate_file(path: str) -> dict:
141
+ """
142
+ Validate that a file exists, is readable, and has valid content.
143
+
144
+ Args:
145
+ path: Path to the file to validate
146
+
147
+ Returns:
148
+ File info dictionary
149
+
150
+ Raises:
151
+ ValueError: If file is missing or validation fails
152
+ """
153
+ if not os.path.exists(path):
154
+ raise ValueError(f"File not found: {path}")
155
+
156
+ if not os.path.isfile(path):
157
+ raise ValueError(f"Not a file: {path}")
158
+
159
+ if not os.access(path, os.R_OK):
160
+ raise ValueError(f"File not readable: {path}")
161
+
162
+ size = os.path.getsize(path)
163
+ filename = os.path.basename(path)
164
+
165
+ # Validate using validate_upload_safety
166
+ safe_name = DocumentValidator.validate_upload_safety(filename, size)
167
+
168
+ return DocumentValidator.get_file_info(safe_name, size)