realtimex-deeptutor 0.5.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- realtimex_deeptutor/__init__.py +67 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
- src/__init__.py +40 -0
- src/agents/__init__.py +24 -0
- src/agents/base_agent.py +657 -0
- src/agents/chat/__init__.py +24 -0
- src/agents/chat/chat_agent.py +435 -0
- src/agents/chat/prompts/en/chat_agent.yaml +35 -0
- src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
- src/agents/chat/session_manager.py +311 -0
- src/agents/co_writer/__init__.py +0 -0
- src/agents/co_writer/edit_agent.py +260 -0
- src/agents/co_writer/narrator_agent.py +423 -0
- src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
- src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
- src/agents/guide/__init__.py +16 -0
- src/agents/guide/agents/__init__.py +11 -0
- src/agents/guide/agents/chat_agent.py +104 -0
- src/agents/guide/agents/interactive_agent.py +223 -0
- src/agents/guide/agents/locate_agent.py +149 -0
- src/agents/guide/agents/summary_agent.py +150 -0
- src/agents/guide/guide_manager.py +500 -0
- src/agents/guide/prompts/en/chat_agent.yaml +41 -0
- src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
- src/agents/guide/prompts/en/locate_agent.yaml +68 -0
- src/agents/guide/prompts/en/summary_agent.yaml +157 -0
- src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
- src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
- src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
- src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
- src/agents/ideagen/__init__.py +12 -0
- src/agents/ideagen/idea_generation_workflow.py +426 -0
- src/agents/ideagen/material_organizer_agent.py +173 -0
- src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
- src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
- src/agents/question/__init__.py +24 -0
- src/agents/question/agents/__init__.py +18 -0
- src/agents/question/agents/generate_agent.py +381 -0
- src/agents/question/agents/relevance_analyzer.py +207 -0
- src/agents/question/agents/retrieve_agent.py +239 -0
- src/agents/question/coordinator.py +718 -0
- src/agents/question/example.py +109 -0
- src/agents/question/prompts/en/coordinator.yaml +75 -0
- src/agents/question/prompts/en/generate_agent.yaml +77 -0
- src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
- src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
- src/agents/question/prompts/zh/coordinator.yaml +75 -0
- src/agents/question/prompts/zh/generate_agent.yaml +77 -0
- src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
- src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
- src/agents/research/agents/__init__.py +23 -0
- src/agents/research/agents/decompose_agent.py +507 -0
- src/agents/research/agents/manager_agent.py +228 -0
- src/agents/research/agents/note_agent.py +180 -0
- src/agents/research/agents/rephrase_agent.py +263 -0
- src/agents/research/agents/reporting_agent.py +1333 -0
- src/agents/research/agents/research_agent.py +714 -0
- src/agents/research/data_structures.py +451 -0
- src/agents/research/main.py +188 -0
- src/agents/research/prompts/en/decompose_agent.yaml +89 -0
- src/agents/research/prompts/en/manager_agent.yaml +24 -0
- src/agents/research/prompts/en/note_agent.yaml +121 -0
- src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/en/reporting_agent.yaml +380 -0
- src/agents/research/prompts/en/research_agent.yaml +173 -0
- src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
- src/agents/research/prompts/zh/manager_agent.yaml +24 -0
- src/agents/research/prompts/zh/note_agent.yaml +121 -0
- src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
- src/agents/research/prompts/zh/research_agent.yaml +173 -0
- src/agents/research/research_pipeline.py +1309 -0
- src/agents/research/utils/__init__.py +60 -0
- src/agents/research/utils/citation_manager.py +799 -0
- src/agents/research/utils/json_utils.py +98 -0
- src/agents/research/utils/token_tracker.py +297 -0
- src/agents/solve/__init__.py +80 -0
- src/agents/solve/analysis_loop/__init__.py +14 -0
- src/agents/solve/analysis_loop/investigate_agent.py +414 -0
- src/agents/solve/analysis_loop/note_agent.py +190 -0
- src/agents/solve/main_solver.py +862 -0
- src/agents/solve/memory/__init__.py +34 -0
- src/agents/solve/memory/citation_memory.py +353 -0
- src/agents/solve/memory/investigate_memory.py +226 -0
- src/agents/solve/memory/solve_memory.py +340 -0
- src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
- src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
- src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
- src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
- src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
- src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
- src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
- src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
- src/agents/solve/solve_loop/__init__.py +22 -0
- src/agents/solve/solve_loop/citation_manager.py +74 -0
- src/agents/solve/solve_loop/manager_agent.py +274 -0
- src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
- src/agents/solve/solve_loop/response_agent.py +301 -0
- src/agents/solve/solve_loop/solve_agent.py +325 -0
- src/agents/solve/solve_loop/tool_agent.py +470 -0
- src/agents/solve/utils/__init__.py +64 -0
- src/agents/solve/utils/config_validator.py +313 -0
- src/agents/solve/utils/display_manager.py +223 -0
- src/agents/solve/utils/error_handler.py +363 -0
- src/agents/solve/utils/json_utils.py +98 -0
- src/agents/solve/utils/performance_monitor.py +407 -0
- src/agents/solve/utils/token_tracker.py +541 -0
- src/api/__init__.py +0 -0
- src/api/main.py +240 -0
- src/api/routers/__init__.py +1 -0
- src/api/routers/agent_config.py +69 -0
- src/api/routers/chat.py +296 -0
- src/api/routers/co_writer.py +337 -0
- src/api/routers/config.py +627 -0
- src/api/routers/dashboard.py +18 -0
- src/api/routers/guide.py +337 -0
- src/api/routers/ideagen.py +436 -0
- src/api/routers/knowledge.py +821 -0
- src/api/routers/notebook.py +247 -0
- src/api/routers/question.py +537 -0
- src/api/routers/research.py +394 -0
- src/api/routers/settings.py +164 -0
- src/api/routers/solve.py +305 -0
- src/api/routers/system.py +252 -0
- src/api/run_server.py +61 -0
- src/api/utils/history.py +172 -0
- src/api/utils/log_interceptor.py +21 -0
- src/api/utils/notebook_manager.py +415 -0
- src/api/utils/progress_broadcaster.py +72 -0
- src/api/utils/task_id_manager.py +100 -0
- src/config/__init__.py +0 -0
- src/config/accessors.py +18 -0
- src/config/constants.py +34 -0
- src/config/defaults.py +18 -0
- src/config/schema.py +38 -0
- src/config/settings.py +50 -0
- src/core/errors.py +62 -0
- src/knowledge/__init__.py +23 -0
- src/knowledge/add_documents.py +606 -0
- src/knowledge/config.py +65 -0
- src/knowledge/example_add_documents.py +236 -0
- src/knowledge/extract_numbered_items.py +1039 -0
- src/knowledge/initializer.py +621 -0
- src/knowledge/kb.py +22 -0
- src/knowledge/manager.py +782 -0
- src/knowledge/progress_tracker.py +182 -0
- src/knowledge/start_kb.py +535 -0
- src/logging/__init__.py +103 -0
- src/logging/adapters/__init__.py +17 -0
- src/logging/adapters/lightrag.py +184 -0
- src/logging/adapters/llamaindex.py +141 -0
- src/logging/config.py +80 -0
- src/logging/handlers/__init__.py +20 -0
- src/logging/handlers/console.py +75 -0
- src/logging/handlers/file.py +201 -0
- src/logging/handlers/websocket.py +127 -0
- src/logging/logger.py +709 -0
- src/logging/stats/__init__.py +16 -0
- src/logging/stats/llm_stats.py +179 -0
- src/services/__init__.py +56 -0
- src/services/config/__init__.py +61 -0
- src/services/config/knowledge_base_config.py +210 -0
- src/services/config/loader.py +260 -0
- src/services/config/unified_config.py +603 -0
- src/services/embedding/__init__.py +45 -0
- src/services/embedding/adapters/__init__.py +22 -0
- src/services/embedding/adapters/base.py +106 -0
- src/services/embedding/adapters/cohere.py +127 -0
- src/services/embedding/adapters/jina.py +99 -0
- src/services/embedding/adapters/ollama.py +116 -0
- src/services/embedding/adapters/openai_compatible.py +96 -0
- src/services/embedding/client.py +159 -0
- src/services/embedding/config.py +156 -0
- src/services/embedding/provider.py +119 -0
- src/services/llm/__init__.py +152 -0
- src/services/llm/capabilities.py +313 -0
- src/services/llm/client.py +302 -0
- src/services/llm/cloud_provider.py +530 -0
- src/services/llm/config.py +200 -0
- src/services/llm/error_mapping.py +103 -0
- src/services/llm/exceptions.py +152 -0
- src/services/llm/factory.py +450 -0
- src/services/llm/local_provider.py +347 -0
- src/services/llm/providers/anthropic.py +95 -0
- src/services/llm/providers/base_provider.py +93 -0
- src/services/llm/providers/open_ai.py +83 -0
- src/services/llm/registry.py +71 -0
- src/services/llm/telemetry.py +40 -0
- src/services/llm/types.py +27 -0
- src/services/llm/utils.py +333 -0
- src/services/prompt/__init__.py +25 -0
- src/services/prompt/manager.py +206 -0
- src/services/rag/__init__.py +64 -0
- src/services/rag/components/__init__.py +29 -0
- src/services/rag/components/base.py +59 -0
- src/services/rag/components/chunkers/__init__.py +18 -0
- src/services/rag/components/chunkers/base.py +34 -0
- src/services/rag/components/chunkers/fixed.py +71 -0
- src/services/rag/components/chunkers/numbered_item.py +94 -0
- src/services/rag/components/chunkers/semantic.py +97 -0
- src/services/rag/components/embedders/__init__.py +14 -0
- src/services/rag/components/embedders/base.py +32 -0
- src/services/rag/components/embedders/openai.py +63 -0
- src/services/rag/components/indexers/__init__.py +18 -0
- src/services/rag/components/indexers/base.py +35 -0
- src/services/rag/components/indexers/graph.py +172 -0
- src/services/rag/components/indexers/lightrag.py +156 -0
- src/services/rag/components/indexers/vector.py +146 -0
- src/services/rag/components/parsers/__init__.py +18 -0
- src/services/rag/components/parsers/base.py +35 -0
- src/services/rag/components/parsers/markdown.py +52 -0
- src/services/rag/components/parsers/pdf.py +115 -0
- src/services/rag/components/parsers/text.py +86 -0
- src/services/rag/components/retrievers/__init__.py +18 -0
- src/services/rag/components/retrievers/base.py +34 -0
- src/services/rag/components/retrievers/dense.py +200 -0
- src/services/rag/components/retrievers/hybrid.py +164 -0
- src/services/rag/components/retrievers/lightrag.py +169 -0
- src/services/rag/components/routing.py +286 -0
- src/services/rag/factory.py +234 -0
- src/services/rag/pipeline.py +215 -0
- src/services/rag/pipelines/__init__.py +32 -0
- src/services/rag/pipelines/academic.py +44 -0
- src/services/rag/pipelines/lightrag.py +43 -0
- src/services/rag/pipelines/llamaindex.py +313 -0
- src/services/rag/pipelines/raganything.py +384 -0
- src/services/rag/service.py +244 -0
- src/services/rag/types.py +73 -0
- src/services/search/__init__.py +284 -0
- src/services/search/base.py +87 -0
- src/services/search/consolidation.py +398 -0
- src/services/search/providers/__init__.py +128 -0
- src/services/search/providers/baidu.py +188 -0
- src/services/search/providers/exa.py +194 -0
- src/services/search/providers/jina.py +161 -0
- src/services/search/providers/perplexity.py +153 -0
- src/services/search/providers/serper.py +209 -0
- src/services/search/providers/tavily.py +161 -0
- src/services/search/types.py +114 -0
- src/services/setup/__init__.py +34 -0
- src/services/setup/init.py +285 -0
- src/services/tts/__init__.py +16 -0
- src/services/tts/config.py +99 -0
- src/tools/__init__.py +91 -0
- src/tools/code_executor.py +536 -0
- src/tools/paper_search_tool.py +171 -0
- src/tools/query_item_tool.py +310 -0
- src/tools/question/__init__.py +15 -0
- src/tools/question/exam_mimic.py +616 -0
- src/tools/question/pdf_parser.py +211 -0
- src/tools/question/question_extractor.py +397 -0
- src/tools/rag_tool.py +173 -0
- src/tools/tex_chunker.py +339 -0
- src/tools/tex_downloader.py +253 -0
- src/tools/web_search.py +71 -0
- src/utils/config_manager.py +206 -0
- src/utils/document_validator.py +168 -0
- src/utils/error_rate_tracker.py +111 -0
- src/utils/error_utils.py +82 -0
- src/utils/json_parser.py +110 -0
- src/utils/network/circuit_breaker.py +79 -0
|
@@ -0,0 +1,1039 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Extract numbered important content from knowledge base content_list
|
|
5
|
+
Such as: Definition 1.5., Proposition 1.3., Theorem x.x., Equation x.x., Formula x.x., etc.
|
|
6
|
+
|
|
7
|
+
Use LLM to identify these contents and store the mapping between numbers and original text in JSON file
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
import asyncio
|
|
12
|
+
import inspect
|
|
13
|
+
import json
|
|
14
|
+
import os
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
import sys
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
sys.path.append(str(Path(__file__).parent.parent.parent))
|
|
20
|
+
|
|
21
|
+
from dotenv import load_dotenv
|
|
22
|
+
from lightrag.llm.openai import openai_complete_if_cache
|
|
23
|
+
|
|
24
|
+
from src.services.llm import get_llm_config
|
|
25
|
+
|
|
26
|
+
load_dotenv(dotenv_path=".env", override=False)
|
|
27
|
+
|
|
28
|
+
# Use project unified logging system
|
|
29
|
+
import logging as std_logging
|
|
30
|
+
|
|
31
|
+
# Logger can be either custom Logger or standard logging.Logger
|
|
32
|
+
logger: Any # Use Any to allow both types
|
|
33
|
+
|
|
34
|
+
try:
    # Prefer the project's unified logging/config stack when it is importable.
    from pathlib import Path

    from src.logging import get_logger
    from src.services.config import load_config_with_main

    # NOTE(review): four .parent hops climb one level above the directory that
    # the earlier sys.path.append (three hops) registered — confirm intended.
    project_root = Path(__file__).parent.parent.parent.parent
    config = load_config_with_main(
        "solve_config.yaml", project_root
    )  # Use any config to get main.yaml
    # Log directory: user-level override wins, then the logging section's value.
    log_dir = config.get("paths", {}).get("user_log_dir") or config.get("logging", {}).get(
        "log_dir"
    )
    logger = get_logger("Knowledge", log_dir=log_dir)
except ImportError:
    # If the project stack cannot be imported, fall back to stdlib logging so
    # the module stays usable as a standalone script.
    logger = std_logging.getLogger("knowledge_init.extract_items")
    std_logging.basicConfig(
        level=std_logging.INFO, format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s"
    )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
async def _call_llm_async(
    prompt: str,
    system_prompt: str,
    api_key: str,
    base_url: str | None,
    max_tokens: int = 2000,
    temperature: float = 0.1,
    model: str | None = None,
) -> str:
    """Asynchronously call the LLM via lightrag's OpenAI-compatible wrapper.

    Args:
        prompt: User prompt text.
        system_prompt: System prompt text.
        api_key: API key for the OpenAI-compatible endpoint.
        base_url: API base URL, or None to use the provider default.
        max_tokens: Maximum number of completion tokens to request.
        temperature: Sampling temperature.
        model: Model name; when None it is resolved from the project LLM config.

    Returns:
        The completion text.
    """
    # If model not specified, resolve it from the project-wide LLM config.
    if model is None:
        llm_cfg = get_llm_config()
        model = llm_cfg.model

    result = openai_complete_if_cache(
        model,
        prompt,
        system_prompt=system_prompt,
        api_key=api_key,
        base_url=base_url,
        max_tokens=max_tokens,
        temperature=temperature,
    )

    # Depending on the lightrag version, the call may hand back an awaitable
    # or a plain value; support both shapes.
    if inspect.isawaitable(result):
        return await result
    return str(result)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _extract_json_block(text: str) -> str:
|
|
87
|
+
"""Extract JSON block from text"""
|
|
88
|
+
try:
|
|
89
|
+
s = str(text).strip()
|
|
90
|
+
# Remove code block markers
|
|
91
|
+
if s.startswith("```") and s.endswith("```"):
|
|
92
|
+
lines = s.split("\n")
|
|
93
|
+
if lines[0].startswith("```"):
|
|
94
|
+
lines = lines[1:]
|
|
95
|
+
if lines and lines[-1].strip() == "```":
|
|
96
|
+
lines = lines[:-1]
|
|
97
|
+
s = "\n".join(lines).strip()
|
|
98
|
+
|
|
99
|
+
# Try to extract JSON object or array
|
|
100
|
+
if (s.startswith("{") and s.endswith("}")) or (s.startswith("[") and s.endswith("]")):
|
|
101
|
+
return s
|
|
102
|
+
|
|
103
|
+
o_start, o_end = s.find("{"), s.rfind("}")
|
|
104
|
+
a_start, a_end = s.find("["), s.rfind("]")
|
|
105
|
+
|
|
106
|
+
candidates = []
|
|
107
|
+
if o_start != -1 and o_end != -1 and o_end > o_start:
|
|
108
|
+
candidates.append((o_start, s[o_start : o_end + 1]))
|
|
109
|
+
if a_start != -1 and a_end != -1 and a_end > a_start:
|
|
110
|
+
candidates.append((a_start, s[a_start : a_end + 1]))
|
|
111
|
+
|
|
112
|
+
if candidates:
|
|
113
|
+
candidates.sort(key=lambda x: x[0])
|
|
114
|
+
return candidates[0][1]
|
|
115
|
+
|
|
116
|
+
return s
|
|
117
|
+
except Exception:
|
|
118
|
+
return text
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
async def _check_content_belongs_async(
    start_text: str, candidate_text: str, api_key: str, base_url: str | None
) -> bool:
    """
    Use LLM to determine if candidate content belongs to (is part of) the starting content

    Args:
        start_text: Starting content (beginning part of numbered item)
        candidate_text: Candidate content (subsequent content block)
        api_key: OpenAI API key
        base_url: API base URL

    Returns:
        True means candidate content belongs to starting content, False means it's new independent content
    """
    system_prompt = """You are an expert at analyzing the structure of academic mathematical texts.
Your task is to determine if a candidate text block belongs to (is a continuation of) a starting numbered item, or if it's a new independent item.

Numbered items include: Definitions, Propositions, Theorems, Lemmas, Corollaries, Examples, Remarks, Figures, Equations, etc.

Rules:
- Equations, formulas, and images that follow a numbered item usually belong to that item
- Explanatory text that continues the same topic belongs to the item
- A new numbered item (starting with "Definition X.Y", "Theorem X.Y", etc.) is independent
- Text that starts a completely different topic is independent

Return ONLY "YES" if the candidate belongs to the starting item, or "NO" if it's independent."""

    # Truncate both sides to keep the prompt small; the head of each block is
    # enough for the boundary decision.
    user_prompt = f"""Starting item:
{start_text[:500]}

Candidate block:
{candidate_text[:300]}

Does the candidate block belong to (continue) the starting item?
Answer with ONLY "YES" or "NO"."""

    try:
        llm_cfg = get_llm_config()
        response = await _call_llm_async(
            user_prompt,
            system_prompt,
            api_key,
            base_url,
            max_tokens=10,
            temperature=0.0,
            model=llm_cfg.model,
        )
        # Normalize the reply: models occasionally answer '"YES"' or "YES.",
        # which an exact == "YES" comparison would misread as a NO.
        answer = response.strip().upper()
        return answer.lstrip("\"'`").startswith("YES")
    except Exception as e:
        logger.warning(f"LLM judgment failed, default to not include: {e}")
        # Default to conservative strategy: don't include
        return False
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
async def _get_complete_content_async(
    content_items: list[dict[str, Any]],
    start_index: int,
    api_key: str,
    base_url: str | None,
    max_following: int = 5,
) -> tuple[str, list[str]]:
    """
    Get complete content, including subsequent formulas, text, etc., and all related image paths
    Use LLM to determine if subsequent content belongs to current numbered item

    Args:
        content_items: Complete content_list; items are dicts with keys such as
            "type", "text", "text_level", "img_path", "image_caption"
            (presumably the MinerU/content_list schema — TODO confirm)
        start_index: Starting index
        api_key: OpenAI API key
        base_url: API base URL
        max_following: Maximum number of subsequent entries to check

    Returns:
        (Complete text content, image path list)
    """
    complete_text = content_items[start_index].get("text", "")
    img_paths: list[str] = []

    # Collect image paths from starting item
    start_img_path = content_items[start_index].get("img_path", "")
    if start_img_path:
        img_paths.append(start_img_path)

    logger.debug(
        f"Starting to use LLM to determine content boundaries, starting text (first 50 chars): {complete_text[:50]}..."
    )

    # Walk at most max_following entries after the start item, accumulating
    # text/images until an entry clearly begins new content.
    for i in range(1, max_following + 1):
        next_index = start_index + i
        if next_index >= len(content_items):
            break

        next_item = content_items[next_index]
        next_type = next_item.get("type", "")

        # A heading (text_level > 0) always terminates the current item.
        if next_type == "text" and next_item.get("text_level", 0) > 0:
            break

        # Equations are assumed to belong to the current item: append text
        # and any rendered-formula image without consulting the LLM.
        if next_type == "equation":
            equation_text = next_item.get("text", "")
            if equation_text:
                complete_text += " " + equation_text
            # Collect formula image paths
            eq_img_path = next_item.get("img_path", "")
            if eq_img_path:
                img_paths.append(eq_img_path)
        # Images are likewise absorbed: keep the path and fold captions
        # (list or scalar) into the running text.
        elif next_type == "image":
            img_path = next_item.get("img_path", "")
            if img_path:
                img_paths.append(img_path)
            # Can also add image captions to text
            captions = next_item.get("image_caption", [])
            if captions:
                caption_text = " ".join(captions) if isinstance(captions, list) else str(captions)
                complete_text += " " + caption_text
        # Plain body text (text_level == 0) is ambiguous: ask the LLM whether
        # it continues the current numbered item.
        elif next_type == "text" and next_item.get("text_level", 0) == 0:
            next_text = next_item.get("text", "").strip()
            if not next_text:
                continue

            # Use LLM to determine if this text belongs to current numbered item
            belongs = await _check_content_belongs_async(
                complete_text, next_text, api_key, base_url
            )

            if belongs:
                complete_text += " " + next_text
                logger.debug(
                    f"LLM judgment: Subsequent text belongs to current content, added (first 30 chars: {next_text[:30]}...)"
                )
            else:
                # First unrelated text ends the item; stop scanning entirely.
                logger.debug(
                    f"LLM judgment: Subsequent text doesn't belong to current content, stop collecting (first 30 chars: {next_text[:30]}...)"
                )
                break

    return complete_text.strip(), img_paths
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def _get_complete_content(
    content_items: list[dict[str, Any]],
    start_index: int,
    api_key: str,
    base_url: str | None,
    max_following: int = 5,
) -> tuple[str, list[str]]:
    """
    Synchronous wrapper for async function to get complete content.

    Handles three runtime environments:
    - no event loop in this thread: create one (``asyncio.run``)
    - an idle loop: drive it directly with ``run_until_complete``
    - a running loop: uvloop gets a fresh loop on a worker thread (uvloop does
      not support nest_asyncio); standard loops try ``nest_asyncio`` first and
      fall back to the worker thread if patching fails.
    """

    def _make_coro():
        # Build the coroutine lazily so each execution path awaits a fresh one.
        return _get_complete_content_async(
            content_items, start_index, api_key, base_url, max_following
        )

    def _run_in_worker_thread():
        """Run the coroutine on a brand-new event loop in a separate thread."""
        import concurrent.futures

        def run_in_new_loop():
            # Create a new asyncio event loop in the worker thread.
            new_loop = asyncio.new_event_loop()
            asyncio.set_event_loop(new_loop)
            try:
                return new_loop.run_until_complete(_make_coro())
            finally:
                new_loop.close()

        with concurrent.futures.ThreadPoolExecutor() as executor:
            return executor.submit(run_in_new_loop).result()

    try:
        loop = asyncio.get_event_loop()
        if loop.is_running():
            if "uvloop" in type(loop).__name__.lower():
                # uvloop doesn't support nest_asyncio, use threading approach
                return _run_in_worker_thread()
            # Try nest_asyncio for standard event loops
            try:
                import nest_asyncio

                nest_asyncio.apply()
                return loop.run_until_complete(_make_coro())
            except (ValueError, TypeError) as e:
                # nest_asyncio failed, fall back to threading approach
                logger.debug(f"nest_asyncio failed ({e}), using threading fallback")
                return _run_in_worker_thread()
        else:
            return loop.run_until_complete(_make_coro())
    except RuntimeError:
        # No event loop, create new one
        return asyncio.run(_make_coro())
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
async def _process_single_batch(
|
|
351
|
+
batch_idx: int,
|
|
352
|
+
batch: list[dict[str, Any]],
|
|
353
|
+
batch_start: int,
|
|
354
|
+
content_items: list[dict[str, Any]],
|
|
355
|
+
text_item_to_full_index: dict[int, int],
|
|
356
|
+
api_key: str,
|
|
357
|
+
base_url: str | None,
|
|
358
|
+
total_batches: int,
|
|
359
|
+
) -> dict[str, dict[str, Any]]:
|
|
360
|
+
"""Asynchronously process a single batch"""
|
|
361
|
+
numbered_items: dict[str, dict[str, Any]] = {}
|
|
362
|
+
|
|
363
|
+
# Build batch processing text
|
|
364
|
+
batch_texts = []
|
|
365
|
+
for idx, item in enumerate(batch):
|
|
366
|
+
batch_texts.append(f"[{batch_start + idx}] {item.get('text', '')}")
|
|
367
|
+
|
|
368
|
+
combined_text = "\n\n".join(batch_texts)
|
|
369
|
+
|
|
370
|
+
system_prompt = """You are an expert at identifying numbered mathematical and scientific content in academic texts.
|
|
371
|
+
You need to extract items like:
|
|
372
|
+
- Definitions (e.g., "Definition 1.5.", "Definition 1.1")
|
|
373
|
+
- Propositions (e.g., "Proposition 1.3.")
|
|
374
|
+
- Theorems (e.g., "Theorem 2.1.")
|
|
375
|
+
- Lemmas (e.g., "Lemma 3.2.")
|
|
376
|
+
- Corollaries (e.g., "Corollary 1.4.")
|
|
377
|
+
- Examples (e.g., "Example 2.3.")
|
|
378
|
+
- Remarks (e.g., "Remark 1.6.")
|
|
379
|
+
- Figures (e.g., "Figure 1.1", "Fig. 2.3")
|
|
380
|
+
- Equations (formulas with \\tag{x.y.z})
|
|
381
|
+
- Tables (e.g., "Table 1.1")
|
|
382
|
+
|
|
383
|
+
Note: Do NOT extract section titles or headings.
|
|
384
|
+
|
|
385
|
+
IMPORTANT:
|
|
386
|
+
- For equations with tags like \\tag{1.2.1}, extract identifier as "(1.2.1)" (only the number in parentheses)
|
|
387
|
+
- For figures, extract the figure number from the caption
|
|
388
|
+
- Return ONLY a valid JSON array
|
|
389
|
+
- Ensure all backslashes in LaTeX formulas are properly escaped (use \\\\ instead of \\)."""
|
|
390
|
+
|
|
391
|
+
user_prompt = f"""Analyze the following text segments and extract all numbered items (definitions, propositions, theorems, lemmas, corollaries, examples, remarks, figures, equations, tables, etc.).
|
|
392
|
+
|
|
393
|
+
Each segment starts with [N] where N is the segment index number.
|
|
394
|
+
|
|
395
|
+
For each numbered item found, extract:
|
|
396
|
+
1. The index number N from the brackets [N] at the start of that segment
|
|
397
|
+
2. The identifier (e.g., "Definition 1.5", "Figure 1.1", "(1.2.1)")
|
|
398
|
+
3. The item type (e.g., "Definition", "Proposition", "Theorem", "Figure", "Equation", "Table")
|
|
399
|
+
4. The complete text of that item
|
|
400
|
+
|
|
401
|
+
Special cases:
|
|
402
|
+
- For equations with \\tag{{x.y.z}}, extract identifier as "(x.y.z)" - ONLY the number in parentheses, no "Equation" prefix
|
|
403
|
+
- For figures, extract the figure number from captions like "Figure 1.1: ..."
|
|
404
|
+
- For tables, extract table numbers like "Table 2.1"
|
|
405
|
+
|
|
406
|
+
Return a JSON array of objects with this structure:
|
|
407
|
+
[
|
|
408
|
+
{{
|
|
409
|
+
"index": 152,
|
|
410
|
+
"identifier": "Figure 1.1",
|
|
411
|
+
"type": "Figure",
|
|
412
|
+
"full_text": "Figure 1.1: Evolution of phylogenetic intelligence..."
|
|
413
|
+
}},
|
|
414
|
+
{{
|
|
415
|
+
"index": 185,
|
|
416
|
+
"identifier": "(1.2.1)",
|
|
417
|
+
"type": "Equation",
|
|
418
|
+
"full_text": "$$S = 1,2,3,4,5,6,\\\\ldots ,n,n + 1,\\\\ldots \\\\tag{{1.2.1}}$$"
|
|
419
|
+
}},
|
|
420
|
+
...
|
|
421
|
+
]
|
|
422
|
+
|
|
423
|
+
CRITICAL REQUIREMENTS:
|
|
424
|
+
- The "index" field MUST be the number N from [N] in brackets, NOT a relative position
|
|
425
|
+
- For equations, identifier must be ONLY "(x.y.z)" format, not "Equation (x.y.z)"
|
|
426
|
+
- Ensure all backslashes in LaTeX are properly escaped for JSON (double them: \\\\ instead of \\).
|
|
427
|
+
|
|
428
|
+
Text segments:
|
|
429
|
+
{combined_text}
|
|
430
|
+
|
|
431
|
+
Return ONLY the JSON array, no other text. Ensure it is valid JSON."""
|
|
432
|
+
|
|
433
|
+
# Asynchronously call LLM
|
|
434
|
+
try:
|
|
435
|
+
llm_cfg = get_llm_config()
|
|
436
|
+
response = await _call_llm_async(
|
|
437
|
+
user_prompt,
|
|
438
|
+
system_prompt,
|
|
439
|
+
api_key,
|
|
440
|
+
base_url,
|
|
441
|
+
max_tokens=4000,
|
|
442
|
+
temperature=0.1,
|
|
443
|
+
model=llm_cfg.model,
|
|
444
|
+
)
|
|
445
|
+
|
|
446
|
+
# Parse response
|
|
447
|
+
json_str = _extract_json_block(response)
|
|
448
|
+
# Try direct parsing
|
|
449
|
+
try:
|
|
450
|
+
extracted = json.loads(json_str)
|
|
451
|
+
except json.JSONDecodeError as e_first:
|
|
452
|
+
# If parsing fails, try to fix common issues
|
|
453
|
+
logger.warning(f"Batch {batch_idx}: Initial JSON parsing failed, attempting to fix...")
|
|
454
|
+
|
|
455
|
+
# Try 1: Use strict=False
|
|
456
|
+
try:
|
|
457
|
+
from json.decoder import JSONDecoder
|
|
458
|
+
|
|
459
|
+
decoder = JSONDecoder(strict=False)
|
|
460
|
+
extracted = decoder.decode(json_str)
|
|
461
|
+
logger.info(f"Batch {batch_idx}: Parsed successfully using non-strict mode")
|
|
462
|
+
except Exception:
|
|
463
|
+
# Try 2: Use ast.literal_eval
|
|
464
|
+
try:
|
|
465
|
+
import ast
|
|
466
|
+
|
|
467
|
+
extracted = ast.literal_eval(json_str)
|
|
468
|
+
logger.info(f"Batch {batch_idx}: Parsed successfully using literal_eval")
|
|
469
|
+
except Exception:
|
|
470
|
+
# All methods failed, skip this batch
|
|
471
|
+
logger.warning(f"Batch {batch_idx}: All parsing methods failed, skipping batch")
|
|
472
|
+
logger.error(f"Original error: {e_first!s}")
|
|
473
|
+
logger.error(f"Response content (first 500 chars): {response[:500]}")
|
|
474
|
+
return numbered_items
|
|
475
|
+
|
|
476
|
+
if not isinstance(extracted, list):
|
|
477
|
+
logger.warning(f"Batch {batch_idx}: LLM returned non-array")
|
|
478
|
+
return numbered_items
|
|
479
|
+
|
|
480
|
+
# Process extracted results
|
|
481
|
+
for item in extracted:
|
|
482
|
+
index = item.get("index")
|
|
483
|
+
if index is None or index < batch_start or index >= batch_start + len(batch):
|
|
484
|
+
continue
|
|
485
|
+
|
|
486
|
+
# Convert to index relative to batch
|
|
487
|
+
relative_index = index - batch_start
|
|
488
|
+
original_item = batch[relative_index]
|
|
489
|
+
identifier = item.get("identifier", "").strip()
|
|
490
|
+
|
|
491
|
+
if not identifier:
|
|
492
|
+
continue
|
|
493
|
+
|
|
494
|
+
# Get complete content and related images
|
|
495
|
+
# Prefer LLM-extracted full_text (contains complete content)
|
|
496
|
+
llm_extracted_text = item.get("full_text", "").strip()
|
|
497
|
+
img_paths = []
|
|
498
|
+
|
|
499
|
+
# For image or equation types, use LLM-extracted content directly (no need to complete)
|
|
500
|
+
original_type = original_item.get("_original_type", original_item.get("type", ""))
|
|
501
|
+
if original_type in ["image", "equation"]:
|
|
502
|
+
complete_text = llm_extracted_text
|
|
503
|
+
# Collect image path for current item
|
|
504
|
+
img_path = original_item.get("img_path", "")
|
|
505
|
+
if img_path:
|
|
506
|
+
img_paths.append(img_path)
|
|
507
|
+
else:
|
|
508
|
+
# For plain text, get index in full content_items and complete
|
|
509
|
+
full_index = text_item_to_full_index.get(index)
|
|
510
|
+
if full_index is not None:
|
|
511
|
+
# Get complete content (including subsequent equations, etc.) and all related images
|
|
512
|
+
# Use LLM to intelligently determine content boundaries
|
|
513
|
+
complete_text, img_paths = await _get_complete_content_async(
|
|
514
|
+
content_items, full_index, api_key, base_url
|
|
515
|
+
)
|
|
516
|
+
else:
|
|
517
|
+
complete_text = original_item.get("text", "")
|
|
518
|
+
# Collect image path for current item
|
|
519
|
+
img_path = original_item.get("img_path", "")
|
|
520
|
+
if img_path:
|
|
521
|
+
img_paths.append(img_path)
|
|
522
|
+
|
|
523
|
+
# If completed content is shorter than LLM-extracted, use LLM-extracted
|
|
524
|
+
if len(llm_extracted_text) > len(complete_text):
|
|
525
|
+
complete_text = llm_extracted_text
|
|
526
|
+
|
|
527
|
+
numbered_items[identifier] = {
|
|
528
|
+
"text": complete_text,
|
|
529
|
+
"type": item.get("type", "Unknown"),
|
|
530
|
+
"page": original_item.get("page_idx", 0) + 1,
|
|
531
|
+
"img_paths": img_paths if img_paths else [],
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
extracted_count = len([e for e in extracted if e.get("identifier", "").strip()])
|
|
535
|
+
logger.info(
|
|
536
|
+
f" Batch {batch_idx}/{total_batches}: Extracted {extracted_count} numbered items"
|
|
537
|
+
)
|
|
538
|
+
|
|
539
|
+
except Exception as e:
|
|
540
|
+
logger.error(f"Batch {batch_idx}: Processing failed: {e}")
|
|
541
|
+
|
|
542
|
+
return numbered_items
|
|
543
|
+
|
|
544
|
+
|
|
545
|
+
async def extract_numbered_items_with_llm_async(
    content_items: list[dict[str, Any]],
    api_key: str,
    base_url: str | None,
    batch_size: int = 20,
    max_concurrent: int = 5,
) -> dict[str, dict[str, Any]]:
    """
    Use LLM to asynchronously batch extract numbered important content.

    Candidate items are collected from ``content_items`` (plain text, captioned
    images, and tagged equations), split into batches of ``batch_size``, and
    processed concurrently by ``_process_single_batch`` with at most
    ``max_concurrent`` batches in flight at once.

    Args:
        content_items: List of content items from content_list
        api_key: OpenAI API key
        base_url: API base URL
        batch_size: Number of items to process per batch
        max_concurrent: Maximum concurrency

    Returns:
        Dict[identifier, {text: original text, type: type, page: page number}]
    """
    numbered_items: dict[str, dict[str, Any]] = {}

    # Create index mapping: from text_items index to full content_items index.
    # The batches operate on the filtered text_items list, but downstream
    # content completion needs positions in the original content_items.
    text_item_to_full_index: dict[int, int] = {}
    text_items: list[dict[str, Any]] = []

    for idx, item in enumerate(content_items):
        item_type = item.get("type", "")

        # Process plain text (text_level 0 only — headings are excluded)
        if item_type == "text" and item.get("text_level", 0) == 0:
            text_item_to_full_index[len(text_items)] = idx
            text_items.append(item)

        # Process images (extract Figure number from caption)
        elif item_type == "image":
            captions = item.get("image_caption", [])
            if captions:
                # Create a virtual text item so the LLM can read the caption
                # as plain text; _original_type preserves the real kind.
                caption_text = " ".join(captions) if isinstance(captions, list) else str(captions)
                virtual_item = {
                    "type": "image",
                    "text": caption_text,
                    "page_idx": item.get("page_idx", 0),
                    "bbox": item.get("bbox", []),
                    "img_path": item.get("img_path", ""),
                    "_original_type": "image",
                }
                text_item_to_full_index[len(text_items)] = idx
                text_items.append(virtual_item)

        # Process numbered equations (extract from tag)
        elif item_type == "equation":
            equation_text = item.get("text", "")
            # Check if there's a number tag, like \tag{1.2.1} or other forms
            if "\\tag{" in equation_text or "tag{" in equation_text:
                virtual_item = {
                    "type": "equation",
                    "text": equation_text,
                    "page_idx": item.get("page_idx", 0),
                    "bbox": item.get("bbox", []),
                    "img_path": item.get("img_path", ""),
                    "_original_type": "equation",
                }
                text_item_to_full_index[len(text_items)] = idx
                text_items.append(virtual_item)

    # Statistics (recomputed over content_items with the same predicates
    # used above, purely for logging)
    text_count = sum(
        1 for item in content_items if item.get("type") == "text" and item.get("text_level", 0) == 0
    )
    image_count = sum(
        1 for item in content_items if item.get("type") == "image" and item.get("image_caption")
    )
    equation_count = sum(
        1
        for item in content_items
        if item.get("type") == "equation"
        and ("\\tag{" in item.get("text", "") or "tag{" in item.get("text", ""))
    )

    logger.info(f"Total {len(text_items)} items to process")
    logger.info(f"  - Plain text: {text_count}")
    logger.info(f"  - Images with captions: {image_count}")
    logger.info(f"  - Numbered equations: {equation_count}")

    # Prepare all batches as (start offset, slice) pairs; the offset lets
    # each batch report absolute indices back.
    batches = []
    for batch_start in range(0, len(text_items), batch_size):
        batch_end = min(batch_start + batch_size, len(text_items))
        batch = text_items[batch_start:batch_end]
        batches.append((batch_start, batch))

    total_batches = len(batches)
    logger.info(f"Using {max_concurrent} concurrent tasks to process {total_batches} batches")

    # Use semaphore to control concurrency (caps simultaneous LLM calls)
    semaphore = asyncio.Semaphore(max_concurrent)

    async def process_with_semaphore(batch_idx, batch_start, batch):
        async with semaphore:
            return await _process_single_batch(
                batch_idx + 1,  # 1-based batch number for log messages
                batch,
                batch_start,
                content_items,
                text_item_to_full_index,
                api_key,
                base_url,
                total_batches,
            )

    # Create all tasks
    tasks = [
        process_with_semaphore(idx, batch_start, batch)
        for idx, (batch_start, batch) in enumerate(batches)
    ]

    # Execute all batches concurrently
    results = await asyncio.gather(*tasks)

    # Merge all results; later batches overwrite earlier ones on duplicate
    # identifiers (dict.update semantics)
    for result in results:
        numbered_items.update(result)

    # Count results by extracted type, for logging only
    type_stats: dict[str, int] = {}
    for item_data in numbered_items.values():
        item_type = item_data.get("type", "Unknown")
        type_stats[item_type] = type_stats.get(item_type, 0) + 1

    logger.info(f"\nExtraction complete, total {len(numbered_items)} numbered items")
    logger.info("Statistics by type:")
    for item_type, count in sorted(type_stats.items()):
        logger.info(f"  - {item_type}: {count}")

    return numbered_items
|
|
682
|
+
|
|
683
|
+
|
|
684
|
+
def extract_numbered_items_with_llm(
    content_items: list[dict[str, Any]],
    api_key: str,
    base_url: str | None,
    batch_size: int = 20,
    max_concurrent: int = 5,
) -> dict[str, dict[str, Any]]:
    """
    Synchronous wrapper for async extraction function.

    Handles every event-loop situation a caller may be in:
      1. No event loop in this thread -> ``asyncio.run``.
      2. A running uvloop loop (cannot be re-entered, nest_asyncio is not
         supported) -> run the coroutine in a fresh loop on a worker thread.
      3. A running standard loop -> re-enter it via ``nest_asyncio``; if that
         fails, fall back to the worker-thread approach.
      4. An existing but idle loop -> ``run_until_complete`` directly.

    Args and return value are identical to
    ``extract_numbered_items_with_llm_async``.
    """

    def _coro():
        # Build a fresh coroutine object each time; a coroutine can only be
        # awaited once, and multiple branches below may need one.
        return extract_numbered_items_with_llm_async(
            content_items, api_key, base_url, batch_size, max_concurrent
        )

    def _run_in_new_thread():
        # Execute the coroutine on a brand-new event loop inside a worker
        # thread. Needed when the current thread's loop is already running
        # and cannot be re-entered (uvloop, or nest_asyncio failure).
        import concurrent.futures

        def run_in_new_loop():
            new_loop = asyncio.new_event_loop()
            asyncio.set_event_loop(new_loop)
            try:
                return new_loop.run_until_complete(_coro())
            finally:
                new_loop.close()

        with concurrent.futures.ThreadPoolExecutor() as executor:
            return executor.submit(run_in_new_loop).result()

    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # No event loop, create new one
        return asyncio.run(_coro())

    if not loop.is_running():
        return loop.run_until_complete(_coro())

    # A loop is already running in this thread; check whether it's uvloop
    if "uvloop" in type(loop).__name__.lower():
        # uvloop doesn't support nest_asyncio, use threading approach
        return _run_in_new_thread()

    # Try nest_asyncio for standard event loops
    try:
        import nest_asyncio

        nest_asyncio.apply()
        return loop.run_until_complete(_coro())
    except (ValueError, TypeError) as e:
        # nest_asyncio failed, fall back to threading approach
        logger.debug(f"nest_asyncio failed ({e}), using threading fallback")
        return _run_in_new_thread()
|
|
764
|
+
|
|
765
|
+
|
|
766
|
+
def process_content_list(
    content_list_file: Path,
    output_file: Path,
    api_key: str,
    base_url: str | None,
    batch_size: int = 20,
    merge: bool = True,
):
    """
    Process a content_list file and extract numbered items.

    Reads the content list, runs the LLM extraction, optionally merges the
    new results into an existing output file (new entries win on identifier
    collisions), writes the result, and logs per-type statistics.

    Args:
        content_list_file: Path to content_list JSON file
        output_file: Path to output JSON file
        api_key: OpenAI API key
        base_url: API base URL
        batch_size: Batch processing size
        merge: Whether to merge with existing results (default True)

    Returns:
        The final mapping of identifier -> item data that was written.
    """
    logger.info(f"Reading file: {content_list_file}")

    # Read content_list
    content_items = json.loads(content_list_file.read_text(encoding="utf-8"))

    logger.info(f"File contains {len(content_items)} items")

    # Extract numbered items
    logger.info("Starting numbered items extraction...")
    new_items = extract_numbered_items_with_llm(
        content_items,
        api_key,
        base_url,
        batch_size,
        max_concurrent=5,  # Default concurrency
    )

    logger.info(f"Extracted {len(new_items)} numbered items this time")

    numbered_items = new_items
    # If merge is needed and an output file already exists, fold new items in
    if merge and output_file.exists():
        logger.info(f"Existing file detected: {output_file}")
        try:
            existing_items = json.loads(output_file.read_text(encoding="utf-8"))
            logger.info(f"Loaded {len(existing_items)} existing numbered items")

            # Merge (new items will override old items with same identifier)
            merged_count = sum(1 for key in new_items if key in existing_items)
            existing_items.update(new_items)
            numbered_items = existing_items
            logger.info(
                f"Merge complete: Updated {merged_count} existing items, added {len(new_items) - merged_count} new items"
            )
            logger.info(f"Total {len(numbered_items)} numbered items after merge")
        except Exception as e:
            logger.warning(f"Could not read existing file, will create new file: {e}")
            numbered_items = new_items

    # Save results
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text(
        json.dumps(numbered_items, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    logger.info(f"Results saved to: {output_file}")

    # Per-type statistics derived from the identifiers themselves
    type_counts: dict[str, int] = {}
    for identifier in numbered_items:
        # Identify equations: starting with parenthesis, e.g., (1.2.1)
        if identifier.startswith("(") and ")" in identifier:
            kind = "Equation"
        else:
            # Extract type from identifier (e.g., "Definition 1.1" -> "Definition")
            words = identifier.split()
            kind = words[0] if words else "Unknown"
        type_counts[kind] = type_counts.get(kind, 0) + 1

    logger.info("\n=== Extraction Statistics ===")
    for kind, count in sorted(type_counts.items()):
        logger.info(f"  {kind}: {count}")

    return numbered_items
|
|
858
|
+
|
|
859
|
+
|
|
860
|
+
def main():
    """
    CLI entry point: parse arguments, locate content_list files under the
    knowledge base, extract numbered items from each, and report statistics.

    Exits non-zero on missing inputs or processing failure.
    """
    parser = argparse.ArgumentParser(
        description="Extract numbered important content from knowledge base content_list"
    )
    parser.add_argument(
        "--kb", required=True, help="Knowledge base name (under knowledge_bases directory)"
    )
    parser.add_argument(
        "--content-file",
        help="content_list file name (optional, if not specified, automatically process all JSON files)",
        default=None,
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Debug mode: only process first file (for quick testing)",
    )
    parser.add_argument(
        "--output-name",
        help="Output file name (default: numbered_items.json)",
        default="numbered_items.json",
    )
    parser.add_argument(
        "--base-dir",
        help="Data storage base directory (default: ./knowledge_bases)",
        default="./knowledge_bases",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        help="Number of items to process per batch (default: 20)",
        default=20,
    )
    parser.add_argument(
        "--max-concurrent", type=int, help="Maximum concurrent tasks (default: 5)", default=5
    )
    parser.add_argument(
        "--no-merge",
        action="store_true",
        help="Do not merge existing results, directly overwrite (default will merge)",
    )
    parser.add_argument(
        "--api-key",
        default=os.getenv("LLM_API_KEY"),
        help="OpenAI API key (default reads from LLM_API_KEY)",
    )
    parser.add_argument(
        "--base-url",
        default=os.getenv("LLM_HOST"),
        help="OpenAI API Base URL (default reads from LLM_HOST)",
    )

    args = parser.parse_args()

    # Get API configuration
    api_key = args.api_key
    base_url = args.base_url

    # Validate API key
    if not api_key:
        raise SystemExit(
            "Missing API Key: Please set environment variable LLM_API_KEY or pass via --api-key"
        )

    def _count_types(identifiers) -> dict[str, int]:
        # Classify each identifier: "(x.y.z)" -> Equation, otherwise the
        # first word ("Definition 1.1" -> "Definition"), empty -> Unknown.
        counts: dict[str, int] = {}
        for identifier in identifiers:
            if identifier.startswith("(") and ")" in identifier:
                item_type = "Equation"
            else:
                parts = identifier.split()
                item_type = parts[0] if parts else "Unknown"
            counts[item_type] = counts.get(item_type, 0) + 1
        return counts

    # Build paths
    base_dir = Path(args.base_dir)
    kb_dir = base_dir / args.kb
    content_list_dir = kb_dir / "content_list"

    # Check if content_list directory exists
    if not content_list_dir.exists():
        logger.error(f"content_list directory does not exist: {content_list_dir}")
        sys.exit(1)

    # Get list of files to process
    if args.content_file:
        # If file is specified, only process that file
        content_list_files = [content_list_dir / args.content_file]
        if not content_list_files[0].exists():
            logger.error(f"content_list file does not exist: {content_list_files[0]}")
            sys.exit(1)
    else:
        # Otherwise automatically scan all JSON files
        content_list_files = sorted(content_list_dir.glob("*.json"))
        if not content_list_files:
            logger.error(f"No JSON files found in {content_list_dir}")
            sys.exit(1)

    # Debug mode: only process first file
    if args.debug:
        logger.info(f"ā ļø Debug mode: Only processing first file {content_list_files[0].name}")
        content_list_files = content_list_files[:1]

    # Output file fixed as numbered_items.json (shared across entire knowledge base)
    output_file = kb_dir / args.output_name

    # Display configuration information
    logger.info("=" * 60)
    logger.info("š Configuration Information")
    logger.info("=" * 60)
    logger.info(f"Knowledge base: {args.kb}")
    logger.info(f"Content files: {len(content_list_files)} files")
    for f in content_list_files:
        logger.info(f"  - {f.name}")
    logger.info(f"Output file: {output_file}")
    logger.info(f"Batch size: {args.batch_size}")
    logger.info(f"Max concurrent: {args.max_concurrent}")
    logger.info(f"Auto merge: {'Yes' if not args.no_merge else 'No'}")
    logger.info(f"Debug mode: {'Yes' if args.debug else 'No'}")
    logger.info(
        f"API key: {'Set (' + api_key[:8] + '...' + api_key[-4:] + ')' if api_key else 'Not set'}"
    )
    logger.info(f"API base URL: {base_url if base_url else 'Default (https://api.openai.com/v1)'}")
    logger.info("=" * 60)
    logger.info("")

    # Track the merge decision in a local instead of mutating parsed args
    merge_results = not args.no_merge

    try:
        # Process all files
        for idx, content_list_file in enumerate(content_list_files, 1):
            logger.info(f"\n{'=' * 60}")
            logger.info(
                f"Processing file [{idx}/{len(content_list_files)}]: {content_list_file.name}"
            )
            logger.info(f"{'=' * 60}\n")

            process_content_list(
                content_list_file,
                output_file,
                api_key,
                base_url,
                args.batch_size,
                merge=merge_results,  # Auto-merge after first file
            )

            # From second file onwards, force merge mode so results accumulate
            if idx == 1 and len(content_list_files) > 1:
                merge_results = True
                logger.info(f"\nSubsequent files will be automatically merged to {output_file}\n")

        logger.info("\n" + "=" * 60)
        logger.info("ā All files processed!")
        logger.info("=" * 60)

        # Display final statistics
        if output_file.exists():
            with open(output_file, encoding="utf-8") as f:
                final_items = json.load(f)

            logger.info(f"\nFinal result: {output_file}")
            logger.info(f"Total extracted {len(final_items)} numbered items")

            # Statistics by type
            type_counts = _count_types(final_items.keys())

            logger.info("\n=== Final Statistics ===")
            for item_type, count in sorted(type_counts.items()):
                logger.info(f"  {item_type}: {count}")

    except Exception as e:
        logger.error(f"\nā Processing failed: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)
|
|
1036
|
+
|
|
1037
|
+
|
|
1038
|
+
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|