realtimex_deeptutor-0.5.0.post1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- realtimex_deeptutor/__init__.py +67 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
- src/__init__.py +40 -0
- src/agents/__init__.py +24 -0
- src/agents/base_agent.py +657 -0
- src/agents/chat/__init__.py +24 -0
- src/agents/chat/chat_agent.py +435 -0
- src/agents/chat/prompts/en/chat_agent.yaml +35 -0
- src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
- src/agents/chat/session_manager.py +311 -0
- src/agents/co_writer/__init__.py +0 -0
- src/agents/co_writer/edit_agent.py +260 -0
- src/agents/co_writer/narrator_agent.py +423 -0
- src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
- src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
- src/agents/guide/__init__.py +16 -0
- src/agents/guide/agents/__init__.py +11 -0
- src/agents/guide/agents/chat_agent.py +104 -0
- src/agents/guide/agents/interactive_agent.py +223 -0
- src/agents/guide/agents/locate_agent.py +149 -0
- src/agents/guide/agents/summary_agent.py +150 -0
- src/agents/guide/guide_manager.py +500 -0
- src/agents/guide/prompts/en/chat_agent.yaml +41 -0
- src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
- src/agents/guide/prompts/en/locate_agent.yaml +68 -0
- src/agents/guide/prompts/en/summary_agent.yaml +157 -0
- src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
- src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
- src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
- src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
- src/agents/ideagen/__init__.py +12 -0
- src/agents/ideagen/idea_generation_workflow.py +426 -0
- src/agents/ideagen/material_organizer_agent.py +173 -0
- src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
- src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
- src/agents/question/__init__.py +24 -0
- src/agents/question/agents/__init__.py +18 -0
- src/agents/question/agents/generate_agent.py +381 -0
- src/agents/question/agents/relevance_analyzer.py +207 -0
- src/agents/question/agents/retrieve_agent.py +239 -0
- src/agents/question/coordinator.py +718 -0
- src/agents/question/example.py +109 -0
- src/agents/question/prompts/en/coordinator.yaml +75 -0
- src/agents/question/prompts/en/generate_agent.yaml +77 -0
- src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
- src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
- src/agents/question/prompts/zh/coordinator.yaml +75 -0
- src/agents/question/prompts/zh/generate_agent.yaml +77 -0
- src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
- src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
- src/agents/research/agents/__init__.py +23 -0
- src/agents/research/agents/decompose_agent.py +507 -0
- src/agents/research/agents/manager_agent.py +228 -0
- src/agents/research/agents/note_agent.py +180 -0
- src/agents/research/agents/rephrase_agent.py +263 -0
- src/agents/research/agents/reporting_agent.py +1333 -0
- src/agents/research/agents/research_agent.py +714 -0
- src/agents/research/data_structures.py +451 -0
- src/agents/research/main.py +188 -0
- src/agents/research/prompts/en/decompose_agent.yaml +89 -0
- src/agents/research/prompts/en/manager_agent.yaml +24 -0
- src/agents/research/prompts/en/note_agent.yaml +121 -0
- src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/en/reporting_agent.yaml +380 -0
- src/agents/research/prompts/en/research_agent.yaml +173 -0
- src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
- src/agents/research/prompts/zh/manager_agent.yaml +24 -0
- src/agents/research/prompts/zh/note_agent.yaml +121 -0
- src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
- src/agents/research/prompts/zh/research_agent.yaml +173 -0
- src/agents/research/research_pipeline.py +1309 -0
- src/agents/research/utils/__init__.py +60 -0
- src/agents/research/utils/citation_manager.py +799 -0
- src/agents/research/utils/json_utils.py +98 -0
- src/agents/research/utils/token_tracker.py +297 -0
- src/agents/solve/__init__.py +80 -0
- src/agents/solve/analysis_loop/__init__.py +14 -0
- src/agents/solve/analysis_loop/investigate_agent.py +414 -0
- src/agents/solve/analysis_loop/note_agent.py +190 -0
- src/agents/solve/main_solver.py +862 -0
- src/agents/solve/memory/__init__.py +34 -0
- src/agents/solve/memory/citation_memory.py +353 -0
- src/agents/solve/memory/investigate_memory.py +226 -0
- src/agents/solve/memory/solve_memory.py +340 -0
- src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
- src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
- src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
- src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
- src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
- src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
- src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
- src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
- src/agents/solve/solve_loop/__init__.py +22 -0
- src/agents/solve/solve_loop/citation_manager.py +74 -0
- src/agents/solve/solve_loop/manager_agent.py +274 -0
- src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
- src/agents/solve/solve_loop/response_agent.py +301 -0
- src/agents/solve/solve_loop/solve_agent.py +325 -0
- src/agents/solve/solve_loop/tool_agent.py +470 -0
- src/agents/solve/utils/__init__.py +64 -0
- src/agents/solve/utils/config_validator.py +313 -0
- src/agents/solve/utils/display_manager.py +223 -0
- src/agents/solve/utils/error_handler.py +363 -0
- src/agents/solve/utils/json_utils.py +98 -0
- src/agents/solve/utils/performance_monitor.py +407 -0
- src/agents/solve/utils/token_tracker.py +541 -0
- src/api/__init__.py +0 -0
- src/api/main.py +240 -0
- src/api/routers/__init__.py +1 -0
- src/api/routers/agent_config.py +69 -0
- src/api/routers/chat.py +296 -0
- src/api/routers/co_writer.py +337 -0
- src/api/routers/config.py +627 -0
- src/api/routers/dashboard.py +18 -0
- src/api/routers/guide.py +337 -0
- src/api/routers/ideagen.py +436 -0
- src/api/routers/knowledge.py +821 -0
- src/api/routers/notebook.py +247 -0
- src/api/routers/question.py +537 -0
- src/api/routers/research.py +394 -0
- src/api/routers/settings.py +164 -0
- src/api/routers/solve.py +305 -0
- src/api/routers/system.py +252 -0
- src/api/run_server.py +61 -0
- src/api/utils/history.py +172 -0
- src/api/utils/log_interceptor.py +21 -0
- src/api/utils/notebook_manager.py +415 -0
- src/api/utils/progress_broadcaster.py +72 -0
- src/api/utils/task_id_manager.py +100 -0
- src/config/__init__.py +0 -0
- src/config/accessors.py +18 -0
- src/config/constants.py +34 -0
- src/config/defaults.py +18 -0
- src/config/schema.py +38 -0
- src/config/settings.py +50 -0
- src/core/errors.py +62 -0
- src/knowledge/__init__.py +23 -0
- src/knowledge/add_documents.py +606 -0
- src/knowledge/config.py +65 -0
- src/knowledge/example_add_documents.py +236 -0
- src/knowledge/extract_numbered_items.py +1039 -0
- src/knowledge/initializer.py +621 -0
- src/knowledge/kb.py +22 -0
- src/knowledge/manager.py +782 -0
- src/knowledge/progress_tracker.py +182 -0
- src/knowledge/start_kb.py +535 -0
- src/logging/__init__.py +103 -0
- src/logging/adapters/__init__.py +17 -0
- src/logging/adapters/lightrag.py +184 -0
- src/logging/adapters/llamaindex.py +141 -0
- src/logging/config.py +80 -0
- src/logging/handlers/__init__.py +20 -0
- src/logging/handlers/console.py +75 -0
- src/logging/handlers/file.py +201 -0
- src/logging/handlers/websocket.py +127 -0
- src/logging/logger.py +709 -0
- src/logging/stats/__init__.py +16 -0
- src/logging/stats/llm_stats.py +179 -0
- src/services/__init__.py +56 -0
- src/services/config/__init__.py +61 -0
- src/services/config/knowledge_base_config.py +210 -0
- src/services/config/loader.py +260 -0
- src/services/config/unified_config.py +603 -0
- src/services/embedding/__init__.py +45 -0
- src/services/embedding/adapters/__init__.py +22 -0
- src/services/embedding/adapters/base.py +106 -0
- src/services/embedding/adapters/cohere.py +127 -0
- src/services/embedding/adapters/jina.py +99 -0
- src/services/embedding/adapters/ollama.py +116 -0
- src/services/embedding/adapters/openai_compatible.py +96 -0
- src/services/embedding/client.py +159 -0
- src/services/embedding/config.py +156 -0
- src/services/embedding/provider.py +119 -0
- src/services/llm/__init__.py +152 -0
- src/services/llm/capabilities.py +313 -0
- src/services/llm/client.py +302 -0
- src/services/llm/cloud_provider.py +530 -0
- src/services/llm/config.py +200 -0
- src/services/llm/error_mapping.py +103 -0
- src/services/llm/exceptions.py +152 -0
- src/services/llm/factory.py +450 -0
- src/services/llm/local_provider.py +347 -0
- src/services/llm/providers/anthropic.py +95 -0
- src/services/llm/providers/base_provider.py +93 -0
- src/services/llm/providers/open_ai.py +83 -0
- src/services/llm/registry.py +71 -0
- src/services/llm/telemetry.py +40 -0
- src/services/llm/types.py +27 -0
- src/services/llm/utils.py +333 -0
- src/services/prompt/__init__.py +25 -0
- src/services/prompt/manager.py +206 -0
- src/services/rag/__init__.py +64 -0
- src/services/rag/components/__init__.py +29 -0
- src/services/rag/components/base.py +59 -0
- src/services/rag/components/chunkers/__init__.py +18 -0
- src/services/rag/components/chunkers/base.py +34 -0
- src/services/rag/components/chunkers/fixed.py +71 -0
- src/services/rag/components/chunkers/numbered_item.py +94 -0
- src/services/rag/components/chunkers/semantic.py +97 -0
- src/services/rag/components/embedders/__init__.py +14 -0
- src/services/rag/components/embedders/base.py +32 -0
- src/services/rag/components/embedders/openai.py +63 -0
- src/services/rag/components/indexers/__init__.py +18 -0
- src/services/rag/components/indexers/base.py +35 -0
- src/services/rag/components/indexers/graph.py +172 -0
- src/services/rag/components/indexers/lightrag.py +156 -0
- src/services/rag/components/indexers/vector.py +146 -0
- src/services/rag/components/parsers/__init__.py +18 -0
- src/services/rag/components/parsers/base.py +35 -0
- src/services/rag/components/parsers/markdown.py +52 -0
- src/services/rag/components/parsers/pdf.py +115 -0
- src/services/rag/components/parsers/text.py +86 -0
- src/services/rag/components/retrievers/__init__.py +18 -0
- src/services/rag/components/retrievers/base.py +34 -0
- src/services/rag/components/retrievers/dense.py +200 -0
- src/services/rag/components/retrievers/hybrid.py +164 -0
- src/services/rag/components/retrievers/lightrag.py +169 -0
- src/services/rag/components/routing.py +286 -0
- src/services/rag/factory.py +234 -0
- src/services/rag/pipeline.py +215 -0
- src/services/rag/pipelines/__init__.py +32 -0
- src/services/rag/pipelines/academic.py +44 -0
- src/services/rag/pipelines/lightrag.py +43 -0
- src/services/rag/pipelines/llamaindex.py +313 -0
- src/services/rag/pipelines/raganything.py +384 -0
- src/services/rag/service.py +244 -0
- src/services/rag/types.py +73 -0
- src/services/search/__init__.py +284 -0
- src/services/search/base.py +87 -0
- src/services/search/consolidation.py +398 -0
- src/services/search/providers/__init__.py +128 -0
- src/services/search/providers/baidu.py +188 -0
- src/services/search/providers/exa.py +194 -0
- src/services/search/providers/jina.py +161 -0
- src/services/search/providers/perplexity.py +153 -0
- src/services/search/providers/serper.py +209 -0
- src/services/search/providers/tavily.py +161 -0
- src/services/search/types.py +114 -0
- src/services/setup/__init__.py +34 -0
- src/services/setup/init.py +285 -0
- src/services/tts/__init__.py +16 -0
- src/services/tts/config.py +99 -0
- src/tools/__init__.py +91 -0
- src/tools/code_executor.py +536 -0
- src/tools/paper_search_tool.py +171 -0
- src/tools/query_item_tool.py +310 -0
- src/tools/question/__init__.py +15 -0
- src/tools/question/exam_mimic.py +616 -0
- src/tools/question/pdf_parser.py +211 -0
- src/tools/question/question_extractor.py +397 -0
- src/tools/rag_tool.py +173 -0
- src/tools/tex_chunker.py +339 -0
- src/tools/tex_downloader.py +253 -0
- src/tools/web_search.py +71 -0
- src/utils/config_manager.py +206 -0
- src/utils/document_validator.py +168 -0
- src/utils/error_rate_tracker.py +111 -0
- src/utils/error_utils.py +82 -0
- src/utils/json_parser.py +110 -0
- src/utils/network/circuit_breaker.py +79 -0
src/tools/question/pdf_parser.py

@@ -0,0 +1,211 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Parse PDF files using MinerU and save results to reference_papers directory
"""

import argparse
from datetime import datetime
from pathlib import Path
import shutil
import subprocess
import sys


def check_mineru_installed():
    """Check if MinerU is installed"""
    try:
        # Security: Using partial path is intentional here - we need to find
        # the command in user's PATH. These are trusted CLI tools, not user input.
        result = subprocess.run(
            ["magic-pdf", "--version"],  # nosec B607
            check=False,
            capture_output=True,
            text=True,
            shell=False,
        )
        if result.returncode == 0:
            return "magic-pdf"
    except FileNotFoundError:
        pass

    try:
        # Security: Same as above - intentionally using PATH lookup for CLI tool.
        result = subprocess.run(
            ["mineru", "--version"],  # nosec B607
            check=False,
            capture_output=True,
            text=True,
            shell=False,
        )
        if result.returncode == 0:
            return "mineru"
    except FileNotFoundError:
        pass

    return None


def parse_pdf_with_mineru(pdf_path: str, output_base_dir: str = None):
    """
    Parse PDF file using MinerU

    Args:
        pdf_path: Path to PDF file
        output_base_dir: Base path for output directory, defaults to reference_papers

    Returns:
        bool: Whether parsing was successful
    """
    mineru_cmd = check_mineru_installed()
    if not mineru_cmd:
        print("Error: MinerU installation not detected")
        print("Please install MinerU first:")
        print("  pip install magic-pdf[full]")
        print("or")
        print("  pip install mineru")
        print("or visit: https://github.com/opendatalab/MinerU")
        return False

    print(f"Detected MinerU command: {mineru_cmd}")

    pdf_path = Path(pdf_path).resolve()
    if not pdf_path.exists():
        print(f"Error: PDF file does not exist: {pdf_path}")
        return False

    if not pdf_path.suffix.lower() == ".pdf":
        print(f"Error: File is not PDF format: {pdf_path}")
        return False

    # Project root is 3 levels up from src/tools/question/
    project_root = Path(__file__).parent.parent.parent.parent
    if output_base_dir is None:
        output_base_dir = project_root / "reference_papers"
    else:
        output_base_dir = Path(output_base_dir)

    output_base_dir.mkdir(parents=True, exist_ok=True)

    pdf_name = pdf_path.stem
    output_dir = output_base_dir / pdf_name

    if output_dir.exists():
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        backup_dir = output_base_dir / f"{pdf_name}_backup_{timestamp}"
        print(f"Directory already exists, backing up to: {backup_dir.name}")
        shutil.move(str(output_dir), str(backup_dir))

    print(f"PDF file: {pdf_path}")
    print(f"Output directory: {output_dir}")
    print("Starting parsing...")

    try:
        temp_output = output_base_dir / "temp_mineru_output"
        temp_output.mkdir(parents=True, exist_ok=True)

        cmd = [mineru_cmd, "-p", str(pdf_path), "-o", str(temp_output)]

        print(f"Executing command: {' '.join(cmd)}")

        result = subprocess.run(cmd, capture_output=True, text=True, check=False, shell=False)

        if result.returncode != 0:
            print("MinerU parsing failed:")
            print(f"Stdout: {result.stdout}")
            print(f"Stderr: {result.stderr}")
            if temp_output.exists():
                shutil.rmtree(temp_output)
            return False

        print("MinerU parsing completed!")

        generated_folders = list(temp_output.iterdir())

        if not generated_folders:
            print("Warning: No generated files found in temp directory")
            if temp_output.exists():
                shutil.rmtree(temp_output)
            return False

        source_folder = generated_folders[0] if generated_folders[0].is_dir() else temp_output

        # Create target directory and move content
        output_dir.mkdir(parents=True, exist_ok=True)

        # Move MinerU-generated content to target directory
        if source_folder.exists() and source_folder.is_dir():
            # If source_folder is the PDF-named directory, move its contents
            for item in source_folder.iterdir():
                dest_item = output_dir / item.name
                if dest_item.exists():
                    if dest_item.is_dir():
                        shutil.rmtree(dest_item)
                    else:
                        dest_item.unlink()
                shutil.move(str(item), str(dest_item))
            print(f"Files saved to: {output_dir}")
        else:
            if output_dir.exists():
                shutil.rmtree(output_dir)
            shutil.move(str(source_folder), str(output_dir))
            print(f"Files saved to: {output_dir}")

        if temp_output.exists():
            shutil.rmtree(temp_output)

        print("\nGenerated files:")
        for item in output_dir.rglob("*"):
            if item.is_file():
                rel_path = item.relative_to(output_dir)
                print(f"  - {rel_path}")

        return True

    except Exception as e:
        print(f"Error occurred during parsing: {e!s}")
        import traceback

        traceback.print_exc()
        return False


def main():
    """Main function"""
    parser = argparse.ArgumentParser(
        description="Parse PDF files using MinerU and save results to reference_papers directory",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Parse a single PDF file
  python pdf_parser.py /path/to/paper.pdf

  # Parse PDF and specify output directory
  python pdf_parser.py /path/to/paper.pdf -o /custom/output/dir
        """,
    )

    parser.add_argument("pdf_path", type=str, help="Path to PDF file")

    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default=None,
        help="Base path for output directory (default: reference_papers)",
    )

    args = parser.parse_args()

    success = parse_pdf_with_mineru(args.pdf_path, args.output)

    if success:
        print("\nParsing completed!")
        sys.exit(0)
    else:
        print("\nParsing failed!")
        sys.exit(1)


if __name__ == "__main__":
    main()
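The module above is a CLI wrapper around two functions that can also be called directly. The snippet below is an illustrative sketch rather than part of the packaged diff; it assumes the wheel's top-level `src` package (listed in the manifest above) is importable from the caller's environment.

# Hypothetical driver -- a sketch, not part of the package contents above.
# Assumes the `src` package shipped in this wheel is importable.
from src.tools.question.pdf_parser import check_mineru_installed, parse_pdf_with_mineru

if check_mineru_installed() is None:
    raise SystemExit("Neither `magic-pdf` nor `mineru` was found on PATH")

# Parses exam.pdf into reference_papers/exam/, backing up any existing output folder.
ok = parse_pdf_with_mineru("exam.pdf", output_base_dir="reference_papers")
print("parsed" if ok else "parse failed")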
src/tools/question/question_extractor.py

@@ -0,0 +1,397 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Extract question information from MinerU-parsed exam papers

This script reads MinerU-parsed markdown files and content_list.json,
uses LLM to analyze and extract all questions, including question content and related images.

Uses the unified LLM Factory for all LLM calls, supporting:
- Cloud providers (OpenAI, Anthropic, DeepSeek, etc.)
- Local providers (Ollama, LM Studio, vLLM, etc.)
- Automatic retry with exponential backoff
"""

import argparse
import asyncio
from datetime import datetime
import json
from pathlib import Path
import sys
from typing import Any

# Project root is 3 levels up from src/tools/question/
project_root = Path(__file__).parent.parent.parent.parent
sys.path.insert(0, str(project_root))

from src.services.config import get_agent_params
from src.services.llm import complete as llm_complete
from src.services.llm.capabilities import supports_response_format
from src.services.llm.config import get_llm_config
from src.utils.json_parser import parse_json_response


def load_parsed_paper(paper_dir: Path) -> tuple[str | None, list[dict] | None, Path]:
    """
    Load MinerU-parsed exam paper files

    Args:
        paper_dir: MinerU output directory (e.g., reference_papers/paper_name_20241129/)

    Returns:
        (markdown_content, content_list, images_dir)
    """
    auto_dir = paper_dir / "auto"
    if not auto_dir.exists():
        auto_dir = paper_dir

    md_files = list(auto_dir.glob("*.md"))
    if not md_files:
        print(f"Error: No markdown file found in {auto_dir}")
        return None, None, auto_dir / "images"

    md_file = md_files[0]
    print(f"Found markdown file: {md_file.name}")

    with open(md_file, encoding="utf-8") as f:
        markdown_content = f.read()

    json_files = list(auto_dir.glob("*_content_list.json"))
    content_list = None
    if json_files:
        json_file = json_files[0]
        print(f"Found content_list file: {json_file.name}")
        with open(json_file, encoding="utf-8") as f:
            content_list = json.load(f)
    else:
        print("Warning: content_list.json file not found, will use markdown content only")

    images_dir = auto_dir / "images"
    if images_dir.exists():
        image_count = len(list(images_dir.glob("*")))
        print(f"Found image directory: {image_count} images")
    else:
        print("Warning: images directory not found")

    return markdown_content, content_list, images_dir


def extract_questions_with_llm(
    markdown_content: str,
    content_list: list[dict] | None,
    images_dir: Path,
    api_key: str,
    base_url: str,
    model: str,
    api_version: str | None = None,
    binding: str | None = None,
) -> list[dict[str, Any]]:
    """
    Use LLM to analyze markdown content and extract questions

    Args:
        markdown_content: Document content in Markdown format
        content_list: MinerU-generated content_list (optional)
        images_dir: Image directory path
        api_key: OpenAI API key
        base_url: API endpoint URL
        model: Model name
        api_version: API version for Azure OpenAI (optional)
        binding: Provider binding type (optional)

    Returns:
        Question list, each question contains:
        {
            "question_number": Question number,
            "question_text": Question text content (multiple choice includes options),
            "images": [List of relative paths to related images]
        }
    """
    import os

    binding = binding or os.getenv("LLM_BINDING", "openai")

    image_list = []
    if images_dir.exists():
        for img_file in sorted(images_dir.glob("*")):
            if img_file.suffix.lower() in [".jpg", ".jpeg", ".png", ".gif", ".webp"]:
                image_list.append(img_file.name)

    system_prompt = """You are a professional exam paper analysis assistant. Your task is to extract all question information from the provided exam paper content.

Please carefully analyze the exam paper content and extract the following information for each question:
1. Question number (e.g., "1.", "Question 1", etc.)
2. Complete question text content (if multiple choice, include all options)
3. Related image file names (if the question references images)

For multiple choice questions, please merge the stem and all options into one complete question text, for example:
"1. Which of the following descriptions about neural networks is correct? ()\nA. Option A content\nB. Option B content\nC. Option C content\nD. Option D content"

Please return results in JSON format as follows:
```json
{
  "questions": [
    {
      "question_number": "1",
      "question_text": "Complete question content (including options)...",
      "images": ["image_001.jpg", "image_002.jpg"]
    },
    {
      "question_number": "2",
      "question_text": "Complete content of another question...",
      "images": []
    }
  ]
}
```

Important Notes:
1. Ensure all questions are extracted, do not miss any
2. Keep the original question text, do not modify or summarize
3. For multiple choice questions, must merge stem and options in question_text
4. If a question has no associated images, set images field to empty array []
5. Image file names should be actual existing file names
6. Ensure the returned format is valid JSON
"""

    user_prompt = f"""Exam paper content (Markdown format):

{markdown_content[:15000]}

Available image files:
{json.dumps(image_list, ensure_ascii=False, indent=2)}

Please analyze the above exam paper content, extract all question information, and return in JSON format.
"""

    print("\nUsing LLM to analyze questions...")
    print(f"Model: {model}")
    print(f"Document length: {len(markdown_content)} characters")
    print(f"Available images: {len(image_list)}")

    # Get agent parameters from unified config
    agent_params = get_agent_params("question")

    # Build kwargs for LLM Factory
    llm_kwargs = {
        "temperature": agent_params["temperature"],
        "max_tokens": agent_params["max_tokens"],
    }

    # Only add response_format if the provider supports it
    if supports_response_format(binding, model):
        llm_kwargs["response_format"] = {"type": "json_object"}

    try:
        # Call LLM via unified Factory (async, so we need to run in event loop)
        loop = asyncio.get_event_loop()
        if loop.is_running():
            # We're in an existing event loop, run in a thread
            import concurrent.futures

            with concurrent.futures.ThreadPoolExecutor() as executor:
                future = executor.submit(
                    asyncio.run,
                    llm_complete(
                        prompt=user_prompt,
                        system_prompt=system_prompt,
                        model=model,
                        api_key=api_key,
                        base_url=base_url,
                        api_version=api_version,
                        binding=binding,
                        **llm_kwargs,
                    ),
                )
                result_text = future.result()
        else:
            # No running loop, use run_until_complete
            result_text = loop.run_until_complete(
                llm_complete(
                    prompt=user_prompt,
                    system_prompt=system_prompt,
                    model=model,
                    api_key=api_key,
                    base_url=base_url,
                    api_version=api_version,
                    binding=binding,
                    **llm_kwargs,
                )
            )
    except RuntimeError as e:
        if "already running" in str(e):
            # Fallback: use asyncio.run in a thread
            import concurrent.futures

            with concurrent.futures.ThreadPoolExecutor() as executor:
                future = executor.submit(
                    asyncio.run,
                    llm_complete(
                        prompt=user_prompt,
                        system_prompt=system_prompt,
                        model=model,
                        api_key=api_key,
                        base_url=base_url,
                        api_version=api_version,
                        binding=binding,
                        **llm_kwargs,
                    ),
                )
                result_text = future.result()
        else:
            raise

    # Parse JSON response
    try:
        if not result_text:
            raise ValueError("LLM returned empty or None response")
        result = parse_json_response(result_text, logger_instance=None, fallback={})
        if result is None:
            raise ValueError("JSON parsing returned None")
    except Exception as e:
        print(f"JSON parsing error: {e!s}")
        print(f"LLM response content: {result_text[:500]}...")
        raise ValueError(
            f"Failed to parse LLM JSON response: {e}. "
            f"Raw response (first 500 chars): {result_text[:500]!r}"
        ) from e

    questions = result.get("questions", [])
    print(f"Successfully extracted {len(questions)} questions")

    return questions


def save_questions_json(questions: list[dict[str, Any]], output_dir: Path, paper_name: str) -> Path:
    """
    Save question information as JSON file

    Args:
        questions: Question list
        output_dir: Output directory
        paper_name: Paper name

    Returns:
        Saved file path
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    output_data = {
        "paper_name": paper_name,
        "extraction_time": datetime.now().isoformat(),
        "total_questions": len(questions),
        "questions": questions,
    }

    output_file = output_dir / f"{paper_name}_{timestamp}_questions.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"Question information saved to: {output_file.name}")

    print("\nQuestion statistics:")
    print(f"  Total questions: {len(questions)}")

    questions_with_images = sum(1 for q in questions if q.get("images"))
    print(f"  Questions with images: {questions_with_images}")

    return output_file


def extract_questions_from_paper(paper_dir: str, output_dir: str | None = None) -> bool:
    """
    Extract questions from parsed exam paper

    Args:
        paper_dir: MinerU-parsed directory path
        output_dir: Output directory (default: paper_dir)

    Returns:
        Whether extraction was successful
    """
    paper_dir = Path(paper_dir).resolve()
    if not paper_dir.exists():
        print(f"Error: Directory does not exist: {paper_dir}")
        return False

    print(f"Paper directory: {paper_dir}")

    markdown_content, content_list, images_dir = load_parsed_paper(paper_dir)

    if not markdown_content:
        print("Error: Unable to load paper content")
        return False

    try:
        llm_config = get_llm_config()
    except ValueError as e:
        print(f"{e!s}")
        print(
            "Tip: Please create .env file in project root and configure LLM-related environment variables"
        )
        return False

    questions = extract_questions_with_llm(
        markdown_content=markdown_content,
        content_list=content_list,
        images_dir=images_dir,
        api_key=llm_config.api_key,
        base_url=llm_config.base_url,
        model=llm_config.model,
        api_version=getattr(llm_config, "api_version", None),
        binding=getattr(llm_config, "binding", None),
    )

    if not questions:
        print("Warning: No questions extracted")
        return False

    if output_dir is None:
        output_dir = paper_dir
    else:
        output_dir = Path(output_dir)

    paper_name = paper_dir.name
    output_file = save_questions_json(questions, output_dir, paper_name)

    print("\nQuestion extraction completed!")
    print(f"View results: {output_file}")

    return True


def main():
    """Main function"""
    parser = argparse.ArgumentParser(
        description="Extract question information from MinerU-parsed exam papers",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Extract questions from parsed exam paper directory
  python question_extractor.py reference_papers/exam_20241129_143052

  # Specify output directory
  python question_extractor.py reference_papers/exam_20241129_143052 -o ./output
        """,
    )

    parser.add_argument("paper_dir", type=str, help="MinerU-parsed exam paper directory path")

    parser.add_argument(
        "-o", "--output", type=str, default=None, help="Output directory (default: paper directory)"
    )

    args = parser.parse_args()

    success = extract_questions_from_paper(args.paper_dir, args.output)

    if success:
        sys.exit(0)
    else:
        sys.exit(1)


if __name__ == "__main__":
    main()
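Together, the two modules form a two-step pipeline: MinerU parsing (pdf_parser.py) followed by LLM-based question extraction (question_extractor.py), with provider settings resolved via get_llm_config(). The end-to-end sketch below is illustrative only and is not part of the packaged diff; it assumes the `src` package is importable and that the LLM environment variables read by get_llm_config() (for example from a .env file) are already configured.

# Hypothetical end-to-end run -- a sketch, not part of the package contents above.
from pathlib import Path

from src.tools.question.pdf_parser import parse_pdf_with_mineru
from src.tools.question.question_extractor import extract_questions_from_paper

pdf = "exam.pdf"
if parse_pdf_with_mineru(pdf, output_base_dir="reference_papers"):
    paper_dir = Path("reference_papers") / Path(pdf).stem
    # Writes <paper_name>_<timestamp>_questions.json into the parsed paper directory.
    extract_questions_from_paper(str(paper_dir))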