realtimex-deeptutor 0.5.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- realtimex_deeptutor/__init__.py +67 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
- src/__init__.py +40 -0
- src/agents/__init__.py +24 -0
- src/agents/base_agent.py +657 -0
- src/agents/chat/__init__.py +24 -0
- src/agents/chat/chat_agent.py +435 -0
- src/agents/chat/prompts/en/chat_agent.yaml +35 -0
- src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
- src/agents/chat/session_manager.py +311 -0
- src/agents/co_writer/__init__.py +0 -0
- src/agents/co_writer/edit_agent.py +260 -0
- src/agents/co_writer/narrator_agent.py +423 -0
- src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
- src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
- src/agents/guide/__init__.py +16 -0
- src/agents/guide/agents/__init__.py +11 -0
- src/agents/guide/agents/chat_agent.py +104 -0
- src/agents/guide/agents/interactive_agent.py +223 -0
- src/agents/guide/agents/locate_agent.py +149 -0
- src/agents/guide/agents/summary_agent.py +150 -0
- src/agents/guide/guide_manager.py +500 -0
- src/agents/guide/prompts/en/chat_agent.yaml +41 -0
- src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
- src/agents/guide/prompts/en/locate_agent.yaml +68 -0
- src/agents/guide/prompts/en/summary_agent.yaml +157 -0
- src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
- src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
- src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
- src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
- src/agents/ideagen/__init__.py +12 -0
- src/agents/ideagen/idea_generation_workflow.py +426 -0
- src/agents/ideagen/material_organizer_agent.py +173 -0
- src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
- src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
- src/agents/question/__init__.py +24 -0
- src/agents/question/agents/__init__.py +18 -0
- src/agents/question/agents/generate_agent.py +381 -0
- src/agents/question/agents/relevance_analyzer.py +207 -0
- src/agents/question/agents/retrieve_agent.py +239 -0
- src/agents/question/coordinator.py +718 -0
- src/agents/question/example.py +109 -0
- src/agents/question/prompts/en/coordinator.yaml +75 -0
- src/agents/question/prompts/en/generate_agent.yaml +77 -0
- src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
- src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
- src/agents/question/prompts/zh/coordinator.yaml +75 -0
- src/agents/question/prompts/zh/generate_agent.yaml +77 -0
- src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
- src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
- src/agents/research/agents/__init__.py +23 -0
- src/agents/research/agents/decompose_agent.py +507 -0
- src/agents/research/agents/manager_agent.py +228 -0
- src/agents/research/agents/note_agent.py +180 -0
- src/agents/research/agents/rephrase_agent.py +263 -0
- src/agents/research/agents/reporting_agent.py +1333 -0
- src/agents/research/agents/research_agent.py +714 -0
- src/agents/research/data_structures.py +451 -0
- src/agents/research/main.py +188 -0
- src/agents/research/prompts/en/decompose_agent.yaml +89 -0
- src/agents/research/prompts/en/manager_agent.yaml +24 -0
- src/agents/research/prompts/en/note_agent.yaml +121 -0
- src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/en/reporting_agent.yaml +380 -0
- src/agents/research/prompts/en/research_agent.yaml +173 -0
- src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
- src/agents/research/prompts/zh/manager_agent.yaml +24 -0
- src/agents/research/prompts/zh/note_agent.yaml +121 -0
- src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
- src/agents/research/prompts/zh/research_agent.yaml +173 -0
- src/agents/research/research_pipeline.py +1309 -0
- src/agents/research/utils/__init__.py +60 -0
- src/agents/research/utils/citation_manager.py +799 -0
- src/agents/research/utils/json_utils.py +98 -0
- src/agents/research/utils/token_tracker.py +297 -0
- src/agents/solve/__init__.py +80 -0
- src/agents/solve/analysis_loop/__init__.py +14 -0
- src/agents/solve/analysis_loop/investigate_agent.py +414 -0
- src/agents/solve/analysis_loop/note_agent.py +190 -0
- src/agents/solve/main_solver.py +862 -0
- src/agents/solve/memory/__init__.py +34 -0
- src/agents/solve/memory/citation_memory.py +353 -0
- src/agents/solve/memory/investigate_memory.py +226 -0
- src/agents/solve/memory/solve_memory.py +340 -0
- src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
- src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
- src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
- src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
- src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
- src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
- src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
- src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
- src/agents/solve/solve_loop/__init__.py +22 -0
- src/agents/solve/solve_loop/citation_manager.py +74 -0
- src/agents/solve/solve_loop/manager_agent.py +274 -0
- src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
- src/agents/solve/solve_loop/response_agent.py +301 -0
- src/agents/solve/solve_loop/solve_agent.py +325 -0
- src/agents/solve/solve_loop/tool_agent.py +470 -0
- src/agents/solve/utils/__init__.py +64 -0
- src/agents/solve/utils/config_validator.py +313 -0
- src/agents/solve/utils/display_manager.py +223 -0
- src/agents/solve/utils/error_handler.py +363 -0
- src/agents/solve/utils/json_utils.py +98 -0
- src/agents/solve/utils/performance_monitor.py +407 -0
- src/agents/solve/utils/token_tracker.py +541 -0
- src/api/__init__.py +0 -0
- src/api/main.py +240 -0
- src/api/routers/__init__.py +1 -0
- src/api/routers/agent_config.py +69 -0
- src/api/routers/chat.py +296 -0
- src/api/routers/co_writer.py +337 -0
- src/api/routers/config.py +627 -0
- src/api/routers/dashboard.py +18 -0
- src/api/routers/guide.py +337 -0
- src/api/routers/ideagen.py +436 -0
- src/api/routers/knowledge.py +821 -0
- src/api/routers/notebook.py +247 -0
- src/api/routers/question.py +537 -0
- src/api/routers/research.py +394 -0
- src/api/routers/settings.py +164 -0
- src/api/routers/solve.py +305 -0
- src/api/routers/system.py +252 -0
- src/api/run_server.py +61 -0
- src/api/utils/history.py +172 -0
- src/api/utils/log_interceptor.py +21 -0
- src/api/utils/notebook_manager.py +415 -0
- src/api/utils/progress_broadcaster.py +72 -0
- src/api/utils/task_id_manager.py +100 -0
- src/config/__init__.py +0 -0
- src/config/accessors.py +18 -0
- src/config/constants.py +34 -0
- src/config/defaults.py +18 -0
- src/config/schema.py +38 -0
- src/config/settings.py +50 -0
- src/core/errors.py +62 -0
- src/knowledge/__init__.py +23 -0
- src/knowledge/add_documents.py +606 -0
- src/knowledge/config.py +65 -0
- src/knowledge/example_add_documents.py +236 -0
- src/knowledge/extract_numbered_items.py +1039 -0
- src/knowledge/initializer.py +621 -0
- src/knowledge/kb.py +22 -0
- src/knowledge/manager.py +782 -0
- src/knowledge/progress_tracker.py +182 -0
- src/knowledge/start_kb.py +535 -0
- src/logging/__init__.py +103 -0
- src/logging/adapters/__init__.py +17 -0
- src/logging/adapters/lightrag.py +184 -0
- src/logging/adapters/llamaindex.py +141 -0
- src/logging/config.py +80 -0
- src/logging/handlers/__init__.py +20 -0
- src/logging/handlers/console.py +75 -0
- src/logging/handlers/file.py +201 -0
- src/logging/handlers/websocket.py +127 -0
- src/logging/logger.py +709 -0
- src/logging/stats/__init__.py +16 -0
- src/logging/stats/llm_stats.py +179 -0
- src/services/__init__.py +56 -0
- src/services/config/__init__.py +61 -0
- src/services/config/knowledge_base_config.py +210 -0
- src/services/config/loader.py +260 -0
- src/services/config/unified_config.py +603 -0
- src/services/embedding/__init__.py +45 -0
- src/services/embedding/adapters/__init__.py +22 -0
- src/services/embedding/adapters/base.py +106 -0
- src/services/embedding/adapters/cohere.py +127 -0
- src/services/embedding/adapters/jina.py +99 -0
- src/services/embedding/adapters/ollama.py +116 -0
- src/services/embedding/adapters/openai_compatible.py +96 -0
- src/services/embedding/client.py +159 -0
- src/services/embedding/config.py +156 -0
- src/services/embedding/provider.py +119 -0
- src/services/llm/__init__.py +152 -0
- src/services/llm/capabilities.py +313 -0
- src/services/llm/client.py +302 -0
- src/services/llm/cloud_provider.py +530 -0
- src/services/llm/config.py +200 -0
- src/services/llm/error_mapping.py +103 -0
- src/services/llm/exceptions.py +152 -0
- src/services/llm/factory.py +450 -0
- src/services/llm/local_provider.py +347 -0
- src/services/llm/providers/anthropic.py +95 -0
- src/services/llm/providers/base_provider.py +93 -0
- src/services/llm/providers/open_ai.py +83 -0
- src/services/llm/registry.py +71 -0
- src/services/llm/telemetry.py +40 -0
- src/services/llm/types.py +27 -0
- src/services/llm/utils.py +333 -0
- src/services/prompt/__init__.py +25 -0
- src/services/prompt/manager.py +206 -0
- src/services/rag/__init__.py +64 -0
- src/services/rag/components/__init__.py +29 -0
- src/services/rag/components/base.py +59 -0
- src/services/rag/components/chunkers/__init__.py +18 -0
- src/services/rag/components/chunkers/base.py +34 -0
- src/services/rag/components/chunkers/fixed.py +71 -0
- src/services/rag/components/chunkers/numbered_item.py +94 -0
- src/services/rag/components/chunkers/semantic.py +97 -0
- src/services/rag/components/embedders/__init__.py +14 -0
- src/services/rag/components/embedders/base.py +32 -0
- src/services/rag/components/embedders/openai.py +63 -0
- src/services/rag/components/indexers/__init__.py +18 -0
- src/services/rag/components/indexers/base.py +35 -0
- src/services/rag/components/indexers/graph.py +172 -0
- src/services/rag/components/indexers/lightrag.py +156 -0
- src/services/rag/components/indexers/vector.py +146 -0
- src/services/rag/components/parsers/__init__.py +18 -0
- src/services/rag/components/parsers/base.py +35 -0
- src/services/rag/components/parsers/markdown.py +52 -0
- src/services/rag/components/parsers/pdf.py +115 -0
- src/services/rag/components/parsers/text.py +86 -0
- src/services/rag/components/retrievers/__init__.py +18 -0
- src/services/rag/components/retrievers/base.py +34 -0
- src/services/rag/components/retrievers/dense.py +200 -0
- src/services/rag/components/retrievers/hybrid.py +164 -0
- src/services/rag/components/retrievers/lightrag.py +169 -0
- src/services/rag/components/routing.py +286 -0
- src/services/rag/factory.py +234 -0
- src/services/rag/pipeline.py +215 -0
- src/services/rag/pipelines/__init__.py +32 -0
- src/services/rag/pipelines/academic.py +44 -0
- src/services/rag/pipelines/lightrag.py +43 -0
- src/services/rag/pipelines/llamaindex.py +313 -0
- src/services/rag/pipelines/raganything.py +384 -0
- src/services/rag/service.py +244 -0
- src/services/rag/types.py +73 -0
- src/services/search/__init__.py +284 -0
- src/services/search/base.py +87 -0
- src/services/search/consolidation.py +398 -0
- src/services/search/providers/__init__.py +128 -0
- src/services/search/providers/baidu.py +188 -0
- src/services/search/providers/exa.py +194 -0
- src/services/search/providers/jina.py +161 -0
- src/services/search/providers/perplexity.py +153 -0
- src/services/search/providers/serper.py +209 -0
- src/services/search/providers/tavily.py +161 -0
- src/services/search/types.py +114 -0
- src/services/setup/__init__.py +34 -0
- src/services/setup/init.py +285 -0
- src/services/tts/__init__.py +16 -0
- src/services/tts/config.py +99 -0
- src/tools/__init__.py +91 -0
- src/tools/code_executor.py +536 -0
- src/tools/paper_search_tool.py +171 -0
- src/tools/query_item_tool.py +310 -0
- src/tools/question/__init__.py +15 -0
- src/tools/question/exam_mimic.py +616 -0
- src/tools/question/pdf_parser.py +211 -0
- src/tools/question/question_extractor.py +397 -0
- src/tools/rag_tool.py +173 -0
- src/tools/tex_chunker.py +339 -0
- src/tools/tex_downloader.py +253 -0
- src/tools/web_search.py +71 -0
- src/utils/config_manager.py +206 -0
- src/utils/document_validator.py +168 -0
- src/utils/error_rate_tracker.py +111 -0
- src/utils/error_utils.py +82 -0
- src/utils/json_parser.py +110 -0
- src/utils/network/circuit_breaker.py +79 -0
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
TeX Downloader - LaTeX source code download tool
|
|
4
|
+
|
|
5
|
+
Features:
|
|
6
|
+
1. Download LaTeX source from ArXiv
|
|
7
|
+
2. Extract and locate main tex file
|
|
8
|
+
3. Read tex content
|
|
9
|
+
|
|
10
|
+
Author: DeepTutor Team
|
|
11
|
+
Version: v1.0
|
|
12
|
+
Based on: TODO.md specification
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
import re
|
|
18
|
+
import shutil
|
|
19
|
+
import tarfile
|
|
20
|
+
import tempfile
|
|
21
|
+
import zipfile
|
|
22
|
+
|
|
23
|
+
import requests
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TexDownloadResult:
|
|
27
|
+
"""LaTeX download result"""
|
|
28
|
+
|
|
29
|
+
def __init__(
|
|
30
|
+
self,
|
|
31
|
+
success: bool,
|
|
32
|
+
tex_path: str | None = None,
|
|
33
|
+
tex_content: str | None = None,
|
|
34
|
+
error: str | None = None,
|
|
35
|
+
):
|
|
36
|
+
self.success = success
|
|
37
|
+
self.tex_path = tex_path
|
|
38
|
+
self.tex_content = tex_content
|
|
39
|
+
self.error = error
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class TexDownloader:
|
|
43
|
+
"""LaTeX source code download tool"""
|
|
44
|
+
|
|
45
|
+
def __init__(self, workspace_dir: str):
|
|
46
|
+
"""
|
|
47
|
+
Initialize downloader
|
|
48
|
+
|
|
49
|
+
Args:
|
|
50
|
+
workspace_dir: Workspace directory (for saving downloaded files)
|
|
51
|
+
"""
|
|
52
|
+
self.workspace_dir = Path(workspace_dir)
|
|
53
|
+
self.workspace_dir.mkdir(parents=True, exist_ok=True)
|
|
54
|
+
|
|
55
|
+
def download_arxiv_source(
|
|
56
|
+
self, arxiv_url: str, arxiv_id: str | None = None
|
|
57
|
+
) -> TexDownloadResult:
|
|
58
|
+
"""
|
|
59
|
+
Download LaTeX source from ArXiv
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
arxiv_url: ArXiv paper URL
|
|
63
|
+
arxiv_id: ArXiv ID (optional, if not in URL)
|
|
64
|
+
|
|
65
|
+
Returns:
|
|
66
|
+
TexDownloadResult object
|
|
67
|
+
"""
|
|
68
|
+
# Extract ArXiv ID
|
|
69
|
+
if not arxiv_id:
|
|
70
|
+
arxiv_id = self._extract_arxiv_id(arxiv_url)
|
|
71
|
+
|
|
72
|
+
if not arxiv_id:
|
|
73
|
+
return TexDownloadResult(success=False, error="Unable to extract ArXiv ID")
|
|
74
|
+
|
|
75
|
+
try:
|
|
76
|
+
# Build source download URL
|
|
77
|
+
source_url = f"https://arxiv.org/e-print/{arxiv_id}"
|
|
78
|
+
|
|
79
|
+
# Download source package
|
|
80
|
+
print(f" Downloading source: {source_url}")
|
|
81
|
+
response = requests.get(source_url, timeout=30)
|
|
82
|
+
response.raise_for_status()
|
|
83
|
+
|
|
84
|
+
# Create temporary directory
|
|
85
|
+
temp_dir = tempfile.mkdtemp(dir=self.workspace_dir)
|
|
86
|
+
|
|
87
|
+
# Save source package
|
|
88
|
+
source_file = Path(temp_dir) / f"{arxiv_id}_source"
|
|
89
|
+
with open(source_file, "wb") as f:
|
|
90
|
+
f.write(response.content)
|
|
91
|
+
|
|
92
|
+
# Extract source package
|
|
93
|
+
extract_dir = Path(temp_dir) / "extracted"
|
|
94
|
+
extract_dir.mkdir(exist_ok=True)
|
|
95
|
+
|
|
96
|
+
if self._is_tar_file(source_file):
|
|
97
|
+
self._extract_tar(source_file, extract_dir)
|
|
98
|
+
elif self._is_zip_file(source_file):
|
|
99
|
+
self._extract_zip(source_file, extract_dir)
|
|
100
|
+
else:
|
|
101
|
+
# Might be a single tex file
|
|
102
|
+
shutil.copy(source_file, extract_dir / f"{arxiv_id}.tex")
|
|
103
|
+
|
|
104
|
+
# Find main tex file
|
|
105
|
+
main_tex = self._find_main_tex(extract_dir)
|
|
106
|
+
|
|
107
|
+
if not main_tex:
|
|
108
|
+
return TexDownloadResult(success=False, error="Main tex file not found")
|
|
109
|
+
|
|
110
|
+
# Read tex content
|
|
111
|
+
tex_content = self._read_tex_file(main_tex)
|
|
112
|
+
|
|
113
|
+
# Move to permanent location
|
|
114
|
+
paper_dir = self.workspace_dir / f"paper_{arxiv_id}"
|
|
115
|
+
paper_dir.mkdir(exist_ok=True)
|
|
116
|
+
|
|
117
|
+
final_tex_path = paper_dir / "main.tex"
|
|
118
|
+
shutil.copy(main_tex, final_tex_path)
|
|
119
|
+
|
|
120
|
+
# Clean up temporary directory
|
|
121
|
+
shutil.rmtree(temp_dir, ignore_errors=True)
|
|
122
|
+
|
|
123
|
+
return TexDownloadResult(
|
|
124
|
+
success=True, tex_path=str(final_tex_path), tex_content=tex_content
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
except requests.exceptions.RequestException as e:
|
|
128
|
+
return TexDownloadResult(success=False, error=f"Download failed: {e!s}")
|
|
129
|
+
except Exception as e:
|
|
130
|
+
return TexDownloadResult(success=False, error=f"Processing failed: {e!s}")
|
|
131
|
+
|
|
132
|
+
def _extract_arxiv_id(self, url: str) -> str | None:
|
|
133
|
+
"""Extract ArXiv ID from URL"""
|
|
134
|
+
match = re.search(r"arxiv\.org/(?:abs|pdf)/(\d+\.\d+)", url)
|
|
135
|
+
if match:
|
|
136
|
+
return match.group(1)
|
|
137
|
+
return None
|
|
138
|
+
|
|
139
|
+
def _is_tar_file(self, file_path: Path) -> bool:
|
|
140
|
+
"""Check if file is a tar file"""
|
|
141
|
+
try:
|
|
142
|
+
with tarfile.open(file_path, "r:*") as tar:
|
|
143
|
+
return True
|
|
144
|
+
except:
|
|
145
|
+
return False
|
|
146
|
+
|
|
147
|
+
def _is_zip_file(self, file_path: Path) -> bool:
|
|
148
|
+
"""Check if file is a zip file"""
|
|
149
|
+
try:
|
|
150
|
+
with zipfile.ZipFile(file_path, "r") as zip_file:
|
|
151
|
+
return True
|
|
152
|
+
except:
|
|
153
|
+
return False
|
|
154
|
+
|
|
155
|
+
def _extract_tar(self, tar_path: Path, extract_dir: Path):
|
|
156
|
+
"""Extract tar file safely (prevent ZipSlip/TarSlip)"""
|
|
157
|
+
with tarfile.open(tar_path, "r:*") as tar:
|
|
158
|
+
# Safe extraction filter
|
|
159
|
+
def is_within_directory(directory, target):
|
|
160
|
+
abs_directory = os.path.abspath(directory)
|
|
161
|
+
abs_target = os.path.abspath(target)
|
|
162
|
+
prefix = os.path.commonprefix([abs_directory, abs_target])
|
|
163
|
+
return prefix == abs_directory
|
|
164
|
+
|
|
165
|
+
def safe_members(members):
|
|
166
|
+
for member in members:
|
|
167
|
+
member_path = os.path.join(extract_dir, member.name)
|
|
168
|
+
if not is_within_directory(extract_dir, member_path):
|
|
169
|
+
print(f"Suspicious file path in tar: {member.name}. Skipping.")
|
|
170
|
+
continue
|
|
171
|
+
yield member
|
|
172
|
+
|
|
173
|
+
tar.extractall(extract_dir, members=safe_members(tar))
|
|
174
|
+
|
|
175
|
+
def _extract_zip(self, zip_path: Path, extract_dir: Path):
|
|
176
|
+
"""Extract zip file"""
|
|
177
|
+
with zipfile.ZipFile(zip_path, "r") as zip_file:
|
|
178
|
+
zip_file.extractall(extract_dir)
|
|
179
|
+
|
|
180
|
+
def _find_main_tex(self, directory: Path) -> Path | None:
|
|
181
|
+
"""
|
|
182
|
+
Find main tex file
|
|
183
|
+
|
|
184
|
+
Priority:
|
|
185
|
+
1. main.tex
|
|
186
|
+
2. paper.tex
|
|
187
|
+
3. Tex file containing \\documentclass
|
|
188
|
+
4. Largest tex file
|
|
189
|
+
"""
|
|
190
|
+
tex_files = list(directory.rglob("*.tex"))
|
|
191
|
+
|
|
192
|
+
if not tex_files:
|
|
193
|
+
return None
|
|
194
|
+
|
|
195
|
+
# 1. Find main.tex or paper.tex
|
|
196
|
+
for name in ["main.tex", "paper.tex", "manuscript.tex"]:
|
|
197
|
+
for tex_file in tex_files:
|
|
198
|
+
if tex_file.name.lower() == name:
|
|
199
|
+
return tex_file
|
|
200
|
+
|
|
201
|
+
# 2. Find file containing \documentclass
|
|
202
|
+
for tex_file in tex_files:
|
|
203
|
+
try:
|
|
204
|
+
content = tex_file.read_text(encoding="utf-8", errors="ignore")
|
|
205
|
+
if r"\documentclass" in content:
|
|
206
|
+
return tex_file
|
|
207
|
+
except:
|
|
208
|
+
continue
|
|
209
|
+
|
|
210
|
+
# 3. Return largest tex file
|
|
211
|
+
largest_tex = max(tex_files, key=lambda f: f.stat().st_size)
|
|
212
|
+
return largest_tex
|
|
213
|
+
|
|
214
|
+
def _read_tex_file(self, tex_path: Path) -> str:
|
|
215
|
+
"""Read tex file content"""
|
|
216
|
+
try:
|
|
217
|
+
return tex_path.read_text(encoding="utf-8", errors="ignore")
|
|
218
|
+
except Exception as e:
|
|
219
|
+
raise Exception(f"Failed to read tex file: {e!s}")
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def read_tex_file(tex_path: str) -> str:
    """
    Return the text of a tex file (convenience function)

    The file is decoded as UTF-8 and bytes that cannot be decoded are dropped.

    Args:
        tex_path: tex file path

    Returns:
        tex content
    """
    tex_file = Path(tex_path)
    with tex_file.open("r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
# ========== Usage Example ==========
|
|
236
|
+
|
|
237
|
+
if __name__ == "__main__":
    # Manual smoke test: fetch one well-known paper into a scratch workspace
    # and report the outcome on stdout. Requires network access.
    demo_downloader = TexDownloader(workspace_dir="./test_workspace")

    outcome = demo_downloader.download_arxiv_source(
        arxiv_url="https://arxiv.org/abs/1706.03762",  # Attention is All You Need
        arxiv_id="1706.03762",
    )

    if not outcome.success:
        print(f"✗ Download failed: {outcome.error}")
    else:
        tex_text = outcome.tex_content
        print("✓ Download successful!")
        print(f" File path: {outcome.tex_path}")
        print(f" Content length: {len(tex_text)} characters")
        print(f" Content preview: {tex_text[:500]}...")
|
src/tools/web_search.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Web Search Tool - Simple entry point for agents
|
|
3
|
+
|
|
4
|
+
This module provides a simple interface to the web search service.
|
|
5
|
+
All search logic is implemented in src/services/search/.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
from src.tools.web_search import web_search
|
|
9
|
+
|
|
10
|
+
# Simple usage
|
|
11
|
+
result = web_search("What is AI?")
|
|
12
|
+
|
|
13
|
+
# With provider
|
|
14
|
+
result = web_search("What is AI?", provider="tavily")
|
|
15
|
+
|
|
16
|
+
Environment Variables:
|
|
17
|
+
- SEARCH_PROVIDER: Default search provider (default: perplexity)
|
|
18
|
+
- SEARCH_API_KEY: Unified API key for all providers
|
|
19
|
+
|
|
20
|
+
Available Providers:
|
|
21
|
+
- perplexity: AI-powered search (default)
|
|
22
|
+
- baidu: Baidu AI Search
|
|
23
|
+
- tavily: Research-focused with optional answers
|
|
24
|
+
- exa: Neural/embeddings search with summaries
|
|
25
|
+
- serper: Google SERP results
|
|
26
|
+
- jina: SERP with full content extraction
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
# Re-export from services layer
|
|
30
|
+
from src.services.search import (
|
|
31
|
+
CONSOLIDATION_TYPES,
|
|
32
|
+
PROVIDER_TEMPLATES,
|
|
33
|
+
SEARCH_API_KEY_ENV,
|
|
34
|
+
AnswerConsolidator,
|
|
35
|
+
BaseSearchProvider,
|
|
36
|
+
Citation,
|
|
37
|
+
SearchProvider,
|
|
38
|
+
SearchResult,
|
|
39
|
+
WebSearchResponse,
|
|
40
|
+
get_available_providers,
|
|
41
|
+
get_current_config,
|
|
42
|
+
get_default_provider,
|
|
43
|
+
get_provider,
|
|
44
|
+
get_providers_info,
|
|
45
|
+
list_providers,
|
|
46
|
+
web_search,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
# Public surface of this module: every name re-exported from
# src.services.search above, grouped by role.
__all__ = [
    # Search entry point and active configuration
    "web_search",
    "get_current_config",
    # Provider lookup and discovery
    "get_provider",
    "list_providers",
    "get_available_providers",
    "get_default_provider",
    "get_providers_info",
    # Result types
    "WebSearchResponse",
    "Citation",
    "SearchResult",
    # Answer consolidation
    "AnswerConsolidator",
    "CONSOLIDATION_TYPES",
    "PROVIDER_TEMPLATES",
    # Provider base classes and the API-key environment variable name
    "BaseSearchProvider",
    "SearchProvider",
    "SEARCH_API_KEY_ENV",
]
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import tempfile
|
|
5
|
+
from threading import Lock
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
from dotenv import dotenv_values, load_dotenv
|
|
9
|
+
from pydantic import ValidationError
|
|
10
|
+
import yaml
|
|
11
|
+
|
|
12
|
+
from ..config.defaults import DEFAULTS
|
|
13
|
+
|
|
14
|
+
# Use package-relative imports to avoid PYTHONPATH issues
|
|
15
|
+
from ..config.schema import AppConfig, migrate_config
|
|
16
|
+
from ..core.errors import ConfigError
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ConfigManager:
|
|
22
|
+
"""
|
|
23
|
+
Thread-safe manager for reading and writing configuration files.
|
|
24
|
+
Primarily manages config/main.yaml and reads .env.
|
|
25
|
+
|
|
26
|
+
Governance additions:
|
|
27
|
+
- Schema validation via pydantic (AppConfig); invalid configs are rejected.
|
|
28
|
+
- Versioned migrations via migrate_config.
|
|
29
|
+
- Atomic writes with temp file and os.replace; creates main.yaml.bak.
|
|
30
|
+
- Single lock guards mtime read, load, and save.
|
|
31
|
+
- Deterministic YAML dumps; returns deep copies.
|
|
32
|
+
- Layered env: .env, then .env.local (override), then process env.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
_instance: Optional["ConfigManager"] = None
|
|
36
|
+
_lock = Lock()
|
|
37
|
+
|
|
38
|
+
def __new__(cls, project_root: Optional[Path] = None):
|
|
39
|
+
if cls._instance is None:
|
|
40
|
+
with cls._lock:
|
|
41
|
+
if cls._instance is None:
|
|
42
|
+
cls._instance = super(ConfigManager, cls).__new__(cls)
|
|
43
|
+
cls._instance._initialized = False
|
|
44
|
+
return cls._instance
|
|
45
|
+
|
|
46
|
+
def __init__(self, project_root: Optional[Path] = None):
|
|
47
|
+
if getattr(self, "_initialized", False):
|
|
48
|
+
return
|
|
49
|
+
|
|
50
|
+
self.project_root = project_root or Path(__file__).parent.parent.parent
|
|
51
|
+
self.config_path = self.project_root / "config" / "main.yaml"
|
|
52
|
+
self._config_cache: Dict[str, Any] = {}
|
|
53
|
+
self._last_mtime: float = 0.0
|
|
54
|
+
self._initialized = True
|
|
55
|
+
|
|
56
|
+
# Layered env loading
|
|
57
|
+
load_dotenv(dotenv_path=self.project_root / ".env", override=False)
|
|
58
|
+
load_dotenv(dotenv_path=self.project_root / ".env.local", override=True)
|
|
59
|
+
|
|
60
|
+
def _load_env_file(self, path: Path) -> Dict[str, str]:
|
|
61
|
+
"""Load a .env file and return non-None values as strings."""
|
|
62
|
+
if not path.exists():
|
|
63
|
+
return {}
|
|
64
|
+
return {k: str(v) for k, v in dotenv_values(path).items() if v is not None}
|
|
65
|
+
|
|
66
|
+
def _read_yaml(self) -> Dict[str, Any]:
|
|
67
|
+
"""Read the main YAML configuration file safely."""
|
|
68
|
+
if not self.config_path.exists():
|
|
69
|
+
return {}
|
|
70
|
+
with open(self.config_path, "r", encoding="utf-8") as f:
|
|
71
|
+
return yaml.safe_load(f) or {}
|
|
72
|
+
|
|
73
|
+
def _deep_update(self, target: Dict[str, Any], source: Dict[str, Any]) -> None:
|
|
74
|
+
for key, value in source.items():
|
|
75
|
+
if isinstance(value, dict) and isinstance(target.get(key), dict):
|
|
76
|
+
self._deep_update(target[key], value)
|
|
77
|
+
else:
|
|
78
|
+
target[key] = value
|
|
79
|
+
|
|
80
|
+
def _validate_and_migrate(self, raw: Dict[str, Any]) -> Dict[str, Any]:
|
|
81
|
+
merged: Dict[str, Any] = {}
|
|
82
|
+
self._deep_update(merged, DEFAULTS)
|
|
83
|
+
self._deep_update(merged, raw)
|
|
84
|
+
migrated = migrate_config(merged)
|
|
85
|
+
try:
|
|
86
|
+
return AppConfig(**migrated).dict()
|
|
87
|
+
except ValidationError as e:
|
|
88
|
+
raise ConfigError("Config validation failed", details={"errors": e.errors()})
|
|
89
|
+
|
|
90
|
+
def load_config(self, force_reload: bool = False) -> Dict[str, Any]:
|
|
91
|
+
"""
|
|
92
|
+
Load configuration from main.yaml.
|
|
93
|
+
Uses caching based on file modification time and validates against schema.
|
|
94
|
+
"""
|
|
95
|
+
with self._lock:
|
|
96
|
+
if not self.config_path.exists():
|
|
97
|
+
logger.info("Config not found at %s", self.config_path)
|
|
98
|
+
self._config_cache = {}
|
|
99
|
+
self._last_mtime = 0
|
|
100
|
+
return {}
|
|
101
|
+
|
|
102
|
+
current_mtime = self.config_path.stat().st_mtime
|
|
103
|
+
if not self._config_cache or force_reload or current_mtime > self._last_mtime:
|
|
104
|
+
try:
|
|
105
|
+
raw = self._read_yaml()
|
|
106
|
+
validated = self._validate_and_migrate(raw)
|
|
107
|
+
self._config_cache = validated
|
|
108
|
+
self._last_mtime = current_mtime
|
|
109
|
+
except ConfigError as ce:
|
|
110
|
+
logger.error("%s", ce, extra={"context": getattr(ce, "context", {})})
|
|
111
|
+
return {}
|
|
112
|
+
except Exception as e:
|
|
113
|
+
logger.exception("Error loading config: %s", e)
|
|
114
|
+
return {}
|
|
115
|
+
|
|
116
|
+
# deep copy via dump/load for immutability
|
|
117
|
+
return yaml.safe_load(yaml.safe_dump(self._config_cache, sort_keys=False)) or {}
|
|
118
|
+
|
|
119
|
+
def save_config(self, config: Dict[str, Any]) -> bool:
|
|
120
|
+
"""
|
|
121
|
+
Save configuration to main.yaml.
|
|
122
|
+
Deep-merges provided config with existing one; writes atomically.
|
|
123
|
+
Rejects invalid configs per schema.
|
|
124
|
+
"""
|
|
125
|
+
try:
|
|
126
|
+
with self._lock:
|
|
127
|
+
current = self.load_config(force_reload=True)
|
|
128
|
+
self._deep_update(current, config)
|
|
129
|
+
validated = self._validate_and_migrate(current)
|
|
130
|
+
|
|
131
|
+
self.config_path.parent.mkdir(parents=True, exist_ok=True)
|
|
132
|
+
yaml_str = yaml.safe_dump(
|
|
133
|
+
validated,
|
|
134
|
+
default_flow_style=False,
|
|
135
|
+
allow_unicode=True,
|
|
136
|
+
sort_keys=False,
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
# Atomic write with backup
|
|
140
|
+
fd, tmp_path = tempfile.mkstemp(
|
|
141
|
+
prefix="main.yaml.", dir=str(self.config_path.parent)
|
|
142
|
+
)
|
|
143
|
+
try:
|
|
144
|
+
with os.fdopen(fd, "w", encoding="utf-8") as tmp:
|
|
145
|
+
tmp.write(yaml_str)
|
|
146
|
+
tmp.flush()
|
|
147
|
+
os.fsync(tmp.fileno())
|
|
148
|
+
backup_path = self.config_path.with_suffix(".yaml.bak")
|
|
149
|
+
if self.config_path.exists():
|
|
150
|
+
try:
|
|
151
|
+
os.replace(self.config_path, backup_path)
|
|
152
|
+
except Exception:
|
|
153
|
+
logger.debug("Backup replace failed; continuing.")
|
|
154
|
+
os.replace(tmp_path, self.config_path)
|
|
155
|
+
self._config_cache = validated
|
|
156
|
+
self._last_mtime = self.config_path.stat().st_mtime
|
|
157
|
+
return True
|
|
158
|
+
finally:
|
|
159
|
+
if os.path.exists(tmp_path):
|
|
160
|
+
try:
|
|
161
|
+
os.remove(tmp_path)
|
|
162
|
+
except Exception:
|
|
163
|
+
pass
|
|
164
|
+
except ConfigError as ce:
|
|
165
|
+
logger.error(
|
|
166
|
+
"Refusing to save invalid config: %s",
|
|
167
|
+
ce,
|
|
168
|
+
extra={"context": getattr(ce, "context", {})},
|
|
169
|
+
)
|
|
170
|
+
return False
|
|
171
|
+
except Exception as e:
|
|
172
|
+
logger.exception("Error saving config: %s", e)
|
|
173
|
+
return False
|
|
174
|
+
|
|
175
|
+
def get_env_info(self) -> Dict[str, str]:
    """
    Collect non-sensitive environment metadata (currently only the model name).

    Lookup order for a key: the project's ``.env`` file overlaid with
    ``.env.local`` (later file wins), then the process environment, then
    the built-in default. A value that is present but empty in the files
    falls through to the process environment.
    """
    # Layer the two dotenv files; .env.local entries overwrite .env entries.
    layered = self._load_env_file(self.project_root / ".env")
    layered.update(self._load_env_file(self.project_root / ".env.local"))

    fallback_model = DEFAULTS.get("llm", {}).get("model", "Pro/Flash")
    model = layered.get("LLM_MODEL") or os.environ.get("LLM_MODEL", fallback_model)
    return {"model": str(model)}
|
|
191
|
+
|
|
192
|
+
def validate_required_env(self, keys: List[str]) -> Dict[str, List[str]]:
    """
    Report which of the given environment keys have no usable value.

    A key counts as present when it has a truthy value in the layered
    ``.env`` / ``.env.local`` files or in the process environment.

    Args:
        keys: Names of the environment variables that must be set.

    Returns:
        ``{"missing": [...]}`` listing the absent keys in input order
        (an empty list when everything is set). A warning is logged when
        at least one key is missing.
    """
    # Layer the two dotenv files; .env.local entries overwrite .env entries.
    layered = self._load_env_file(self.project_root / ".env")
    layered.update(self._load_env_file(self.project_root / ".env.local"))

    missing: List[str] = []
    for name in keys:
        if layered.get(name) or os.environ.get(name):
            continue
        missing.append(name)

    if missing:
        logger.warning("Missing required env keys", extra={"missing": missing})
    return {"missing": missing}
|
|
201
|
+
|
|
202
|
+
@classmethod
def reset_for_tests(cls) -> None:
    """
    Drop the stored singleton instance.

    Intended for tests that need to re-initialize with a different
    project_root. The reset is performed while holding the class lock.
    """
    with cls._lock:
        cls._instance = None
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Document Validator - Validation utilities for document uploads
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import mimetypes
|
|
8
|
+
import os
|
|
9
|
+
import re
|
|
10
|
+
from typing import ClassVar
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DocumentValidator:
|
|
14
|
+
"""Document validation utilities"""
|
|
15
|
+
|
|
16
|
+
# Maximum file size in bytes (100MB)
|
|
17
|
+
MAX_FILE_SIZE: ClassVar[int] = 100 * 1024 * 1024
|
|
18
|
+
|
|
19
|
+
# Maximum file size for PDF processing (50MB to prevent resource exhaustion)
|
|
20
|
+
MAX_PDF_SIZE: ClassVar[int] = 50 * 1024 * 1024
|
|
21
|
+
|
|
22
|
+
# Allowed file extensions
|
|
23
|
+
ALLOWED_EXTENSIONS: ClassVar[set[str]] = {
|
|
24
|
+
".pdf",
|
|
25
|
+
".txt",
|
|
26
|
+
".md",
|
|
27
|
+
".doc",
|
|
28
|
+
".docx",
|
|
29
|
+
".rtf",
|
|
30
|
+
".html",
|
|
31
|
+
".htm",
|
|
32
|
+
".xml",
|
|
33
|
+
".json",
|
|
34
|
+
".csv",
|
|
35
|
+
".xlsx",
|
|
36
|
+
".xls",
|
|
37
|
+
".pptx",
|
|
38
|
+
".ppt",
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
# MIME type mapping for additional validation
|
|
42
|
+
ALLOWED_MIME_TYPES: ClassVar[set[str]] = {
|
|
43
|
+
"application/pdf",
|
|
44
|
+
"text/plain",
|
|
45
|
+
"text/markdown",
|
|
46
|
+
"application/msword",
|
|
47
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
48
|
+
"application/rtf",
|
|
49
|
+
"text/html",
|
|
50
|
+
"application/xml",
|
|
51
|
+
"text/xml",
|
|
52
|
+
"application/json",
|
|
53
|
+
"text/csv",
|
|
54
|
+
"application/vnd.ms-excel",
|
|
55
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
56
|
+
"application/vnd.ms-powerpoint",
|
|
57
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
@staticmethod
|
|
61
|
+
def validate_upload_safety(
|
|
62
|
+
filename: str, file_size: int | None, allowed_extensions: set[str] | None = None
|
|
63
|
+
) -> str:
|
|
64
|
+
"""
|
|
65
|
+
Validate file upload safety
|
|
66
|
+
|
|
67
|
+
Args:
|
|
68
|
+
filename: Name of the file
|
|
69
|
+
file_size: Size of the file in bytes, or None to skip size validation
|
|
70
|
+
allowed_extensions: Optional override for allowed extensions
|
|
71
|
+
|
|
72
|
+
Returns:
|
|
73
|
+
Sanitized filename safe for filesystem use
|
|
74
|
+
|
|
75
|
+
Raises:
|
|
76
|
+
ValueError: If validation fails
|
|
77
|
+
"""
|
|
78
|
+
# Check file size (skip if size is None)
|
|
79
|
+
if file_size is not None and file_size > DocumentValidator.MAX_FILE_SIZE:
|
|
80
|
+
raise ValueError(
|
|
81
|
+
f"File too large: {file_size} bytes. Maximum allowed: {DocumentValidator.MAX_FILE_SIZE} bytes"
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
# Additional size check for PDFs to prevent resource exhaustion
|
|
85
|
+
_, ext = os.path.splitext(filename.lower())
|
|
86
|
+
if ext == ".pdf" and file_size is not None and file_size > DocumentValidator.MAX_PDF_SIZE:
|
|
87
|
+
raise ValueError(
|
|
88
|
+
f"PDF file too large: {file_size} bytes. Maximum allowed for PDFs: {DocumentValidator.MAX_PDF_SIZE} bytes"
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
# Sanitize filename - remove path components and dangerous characters
|
|
92
|
+
# Extract just the filename, removing any path components
|
|
93
|
+
safe_name = os.path.basename(filename)
|
|
94
|
+
# Remove null bytes and other control characters
|
|
95
|
+
safe_name = re.sub(r"[\x00-\x1f\x7f]", "", safe_name)
|
|
96
|
+
# Replace problematic characters
|
|
97
|
+
safe_name = re.sub(r'[<>:"/\\|?*]', "_", safe_name)
|
|
98
|
+
|
|
99
|
+
if not safe_name or safe_name in (".", "..") or safe_name.strip("_") == "":
|
|
100
|
+
raise ValueError("Invalid filename")
|
|
101
|
+
|
|
102
|
+
# Check file extension
|
|
103
|
+
exts_to_check = allowed_extensions or DocumentValidator.ALLOWED_EXTENSIONS
|
|
104
|
+
if ext not in exts_to_check:
|
|
105
|
+
raise ValueError(
|
|
106
|
+
f"Unsupported file type: {ext}. Allowed types: {', '.join(exts_to_check)}"
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
# Additional MIME type validation for security
|
|
110
|
+
guessed_mime, _ = mimetypes.guess_type(filename)
|
|
111
|
+
if guessed_mime and guessed_mime not in DocumentValidator.ALLOWED_MIME_TYPES:
|
|
112
|
+
raise ValueError(
|
|
113
|
+
f"MIME type validation failed: {guessed_mime}. File may be malicious or corrupted."
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
return safe_name
|
|
117
|
+
|
|
118
|
+
@staticmethod
|
|
119
|
+
def get_file_info(filename: str, file_size: int) -> dict:
|
|
120
|
+
"""
|
|
121
|
+
Get file information
|
|
122
|
+
|
|
123
|
+
Args:
|
|
124
|
+
filename: Name of the file
|
|
125
|
+
file_size: Size of the file in bytes
|
|
126
|
+
|
|
127
|
+
Returns:
|
|
128
|
+
Dictionary with file information
|
|
129
|
+
"""
|
|
130
|
+
_, ext = os.path.splitext(filename.lower())
|
|
131
|
+
return {
|
|
132
|
+
"filename": filename,
|
|
133
|
+
"extension": ext,
|
|
134
|
+
"size_bytes": file_size,
|
|
135
|
+
"size_mb": round(file_size / (1024 * 1024), 2),
|
|
136
|
+
"is_allowed": ext in DocumentValidator.ALLOWED_EXTENSIONS,
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
@staticmethod
|
|
140
|
+
def validate_file(path: str) -> dict:
|
|
141
|
+
"""
|
|
142
|
+
Validate that a file exists, is readable, and has valid content.
|
|
143
|
+
|
|
144
|
+
Args:
|
|
145
|
+
path: Path to the file to validate
|
|
146
|
+
|
|
147
|
+
Returns:
|
|
148
|
+
File info dictionary
|
|
149
|
+
|
|
150
|
+
Raises:
|
|
151
|
+
ValueError: If file is missing or validation fails
|
|
152
|
+
"""
|
|
153
|
+
if not os.path.exists(path):
|
|
154
|
+
raise ValueError(f"File not found: {path}")
|
|
155
|
+
|
|
156
|
+
if not os.path.isfile(path):
|
|
157
|
+
raise ValueError(f"Not a file: {path}")
|
|
158
|
+
|
|
159
|
+
if not os.access(path, os.R_OK):
|
|
160
|
+
raise ValueError(f"File not readable: {path}")
|
|
161
|
+
|
|
162
|
+
size = os.path.getsize(path)
|
|
163
|
+
filename = os.path.basename(path)
|
|
164
|
+
|
|
165
|
+
# Validate using validate_upload_safety
|
|
166
|
+
safe_name = DocumentValidator.validate_upload_safety(filename, size)
|
|
167
|
+
|
|
168
|
+
return DocumentValidator.get_file_info(safe_name, size)
|