realtimex-deeptutor 0.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. realtimex_deeptutor/__init__.py +67 -0
  2. realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
  3. realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
  4. realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
  5. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
  6. realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
  7. realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
  8. src/__init__.py +40 -0
  9. src/agents/__init__.py +24 -0
  10. src/agents/base_agent.py +657 -0
  11. src/agents/chat/__init__.py +24 -0
  12. src/agents/chat/chat_agent.py +435 -0
  13. src/agents/chat/prompts/en/chat_agent.yaml +35 -0
  14. src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
  15. src/agents/chat/session_manager.py +311 -0
  16. src/agents/co_writer/__init__.py +0 -0
  17. src/agents/co_writer/edit_agent.py +260 -0
  18. src/agents/co_writer/narrator_agent.py +423 -0
  19. src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
  20. src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
  21. src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
  22. src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
  23. src/agents/guide/__init__.py +16 -0
  24. src/agents/guide/agents/__init__.py +11 -0
  25. src/agents/guide/agents/chat_agent.py +104 -0
  26. src/agents/guide/agents/interactive_agent.py +223 -0
  27. src/agents/guide/agents/locate_agent.py +149 -0
  28. src/agents/guide/agents/summary_agent.py +150 -0
  29. src/agents/guide/guide_manager.py +500 -0
  30. src/agents/guide/prompts/en/chat_agent.yaml +41 -0
  31. src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
  32. src/agents/guide/prompts/en/locate_agent.yaml +68 -0
  33. src/agents/guide/prompts/en/summary_agent.yaml +157 -0
  34. src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
  35. src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
  36. src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
  37. src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
  38. src/agents/ideagen/__init__.py +12 -0
  39. src/agents/ideagen/idea_generation_workflow.py +426 -0
  40. src/agents/ideagen/material_organizer_agent.py +173 -0
  41. src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
  42. src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
  43. src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
  44. src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
  45. src/agents/question/__init__.py +24 -0
  46. src/agents/question/agents/__init__.py +18 -0
  47. src/agents/question/agents/generate_agent.py +381 -0
  48. src/agents/question/agents/relevance_analyzer.py +207 -0
  49. src/agents/question/agents/retrieve_agent.py +239 -0
  50. src/agents/question/coordinator.py +718 -0
  51. src/agents/question/example.py +109 -0
  52. src/agents/question/prompts/en/coordinator.yaml +75 -0
  53. src/agents/question/prompts/en/generate_agent.yaml +77 -0
  54. src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
  55. src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
  56. src/agents/question/prompts/zh/coordinator.yaml +75 -0
  57. src/agents/question/prompts/zh/generate_agent.yaml +77 -0
  58. src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
  59. src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
  60. src/agents/research/agents/__init__.py +23 -0
  61. src/agents/research/agents/decompose_agent.py +507 -0
  62. src/agents/research/agents/manager_agent.py +228 -0
  63. src/agents/research/agents/note_agent.py +180 -0
  64. src/agents/research/agents/rephrase_agent.py +263 -0
  65. src/agents/research/agents/reporting_agent.py +1333 -0
  66. src/agents/research/agents/research_agent.py +714 -0
  67. src/agents/research/data_structures.py +451 -0
  68. src/agents/research/main.py +188 -0
  69. src/agents/research/prompts/en/decompose_agent.yaml +89 -0
  70. src/agents/research/prompts/en/manager_agent.yaml +24 -0
  71. src/agents/research/prompts/en/note_agent.yaml +121 -0
  72. src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
  73. src/agents/research/prompts/en/reporting_agent.yaml +380 -0
  74. src/agents/research/prompts/en/research_agent.yaml +173 -0
  75. src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
  76. src/agents/research/prompts/zh/manager_agent.yaml +24 -0
  77. src/agents/research/prompts/zh/note_agent.yaml +121 -0
  78. src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
  79. src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
  80. src/agents/research/prompts/zh/research_agent.yaml +173 -0
  81. src/agents/research/research_pipeline.py +1309 -0
  82. src/agents/research/utils/__init__.py +60 -0
  83. src/agents/research/utils/citation_manager.py +799 -0
  84. src/agents/research/utils/json_utils.py +98 -0
  85. src/agents/research/utils/token_tracker.py +297 -0
  86. src/agents/solve/__init__.py +80 -0
  87. src/agents/solve/analysis_loop/__init__.py +14 -0
  88. src/agents/solve/analysis_loop/investigate_agent.py +414 -0
  89. src/agents/solve/analysis_loop/note_agent.py +190 -0
  90. src/agents/solve/main_solver.py +862 -0
  91. src/agents/solve/memory/__init__.py +34 -0
  92. src/agents/solve/memory/citation_memory.py +353 -0
  93. src/agents/solve/memory/investigate_memory.py +226 -0
  94. src/agents/solve/memory/solve_memory.py +340 -0
  95. src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
  96. src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
  97. src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
  98. src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
  99. src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
  100. src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
  101. src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
  102. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
  103. src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
  104. src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
  105. src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
  106. src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
  107. src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
  108. src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
  109. src/agents/solve/solve_loop/__init__.py +22 -0
  110. src/agents/solve/solve_loop/citation_manager.py +74 -0
  111. src/agents/solve/solve_loop/manager_agent.py +274 -0
  112. src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
  113. src/agents/solve/solve_loop/response_agent.py +301 -0
  114. src/agents/solve/solve_loop/solve_agent.py +325 -0
  115. src/agents/solve/solve_loop/tool_agent.py +470 -0
  116. src/agents/solve/utils/__init__.py +64 -0
  117. src/agents/solve/utils/config_validator.py +313 -0
  118. src/agents/solve/utils/display_manager.py +223 -0
  119. src/agents/solve/utils/error_handler.py +363 -0
  120. src/agents/solve/utils/json_utils.py +98 -0
  121. src/agents/solve/utils/performance_monitor.py +407 -0
  122. src/agents/solve/utils/token_tracker.py +541 -0
  123. src/api/__init__.py +0 -0
  124. src/api/main.py +240 -0
  125. src/api/routers/__init__.py +1 -0
  126. src/api/routers/agent_config.py +69 -0
  127. src/api/routers/chat.py +296 -0
  128. src/api/routers/co_writer.py +337 -0
  129. src/api/routers/config.py +627 -0
  130. src/api/routers/dashboard.py +18 -0
  131. src/api/routers/guide.py +337 -0
  132. src/api/routers/ideagen.py +436 -0
  133. src/api/routers/knowledge.py +821 -0
  134. src/api/routers/notebook.py +247 -0
  135. src/api/routers/question.py +537 -0
  136. src/api/routers/research.py +394 -0
  137. src/api/routers/settings.py +164 -0
  138. src/api/routers/solve.py +305 -0
  139. src/api/routers/system.py +252 -0
  140. src/api/run_server.py +61 -0
  141. src/api/utils/history.py +172 -0
  142. src/api/utils/log_interceptor.py +21 -0
  143. src/api/utils/notebook_manager.py +415 -0
  144. src/api/utils/progress_broadcaster.py +72 -0
  145. src/api/utils/task_id_manager.py +100 -0
  146. src/config/__init__.py +0 -0
  147. src/config/accessors.py +18 -0
  148. src/config/constants.py +34 -0
  149. src/config/defaults.py +18 -0
  150. src/config/schema.py +38 -0
  151. src/config/settings.py +50 -0
  152. src/core/errors.py +62 -0
  153. src/knowledge/__init__.py +23 -0
  154. src/knowledge/add_documents.py +606 -0
  155. src/knowledge/config.py +65 -0
  156. src/knowledge/example_add_documents.py +236 -0
  157. src/knowledge/extract_numbered_items.py +1039 -0
  158. src/knowledge/initializer.py +621 -0
  159. src/knowledge/kb.py +22 -0
  160. src/knowledge/manager.py +782 -0
  161. src/knowledge/progress_tracker.py +182 -0
  162. src/knowledge/start_kb.py +535 -0
  163. src/logging/__init__.py +103 -0
  164. src/logging/adapters/__init__.py +17 -0
  165. src/logging/adapters/lightrag.py +184 -0
  166. src/logging/adapters/llamaindex.py +141 -0
  167. src/logging/config.py +80 -0
  168. src/logging/handlers/__init__.py +20 -0
  169. src/logging/handlers/console.py +75 -0
  170. src/logging/handlers/file.py +201 -0
  171. src/logging/handlers/websocket.py +127 -0
  172. src/logging/logger.py +709 -0
  173. src/logging/stats/__init__.py +16 -0
  174. src/logging/stats/llm_stats.py +179 -0
  175. src/services/__init__.py +56 -0
  176. src/services/config/__init__.py +61 -0
  177. src/services/config/knowledge_base_config.py +210 -0
  178. src/services/config/loader.py +260 -0
  179. src/services/config/unified_config.py +603 -0
  180. src/services/embedding/__init__.py +45 -0
  181. src/services/embedding/adapters/__init__.py +22 -0
  182. src/services/embedding/adapters/base.py +106 -0
  183. src/services/embedding/adapters/cohere.py +127 -0
  184. src/services/embedding/adapters/jina.py +99 -0
  185. src/services/embedding/adapters/ollama.py +116 -0
  186. src/services/embedding/adapters/openai_compatible.py +96 -0
  187. src/services/embedding/client.py +159 -0
  188. src/services/embedding/config.py +156 -0
  189. src/services/embedding/provider.py +119 -0
  190. src/services/llm/__init__.py +152 -0
  191. src/services/llm/capabilities.py +313 -0
  192. src/services/llm/client.py +302 -0
  193. src/services/llm/cloud_provider.py +530 -0
  194. src/services/llm/config.py +200 -0
  195. src/services/llm/error_mapping.py +103 -0
  196. src/services/llm/exceptions.py +152 -0
  197. src/services/llm/factory.py +450 -0
  198. src/services/llm/local_provider.py +347 -0
  199. src/services/llm/providers/anthropic.py +95 -0
  200. src/services/llm/providers/base_provider.py +93 -0
  201. src/services/llm/providers/open_ai.py +83 -0
  202. src/services/llm/registry.py +71 -0
  203. src/services/llm/telemetry.py +40 -0
  204. src/services/llm/types.py +27 -0
  205. src/services/llm/utils.py +333 -0
  206. src/services/prompt/__init__.py +25 -0
  207. src/services/prompt/manager.py +206 -0
  208. src/services/rag/__init__.py +64 -0
  209. src/services/rag/components/__init__.py +29 -0
  210. src/services/rag/components/base.py +59 -0
  211. src/services/rag/components/chunkers/__init__.py +18 -0
  212. src/services/rag/components/chunkers/base.py +34 -0
  213. src/services/rag/components/chunkers/fixed.py +71 -0
  214. src/services/rag/components/chunkers/numbered_item.py +94 -0
  215. src/services/rag/components/chunkers/semantic.py +97 -0
  216. src/services/rag/components/embedders/__init__.py +14 -0
  217. src/services/rag/components/embedders/base.py +32 -0
  218. src/services/rag/components/embedders/openai.py +63 -0
  219. src/services/rag/components/indexers/__init__.py +18 -0
  220. src/services/rag/components/indexers/base.py +35 -0
  221. src/services/rag/components/indexers/graph.py +172 -0
  222. src/services/rag/components/indexers/lightrag.py +156 -0
  223. src/services/rag/components/indexers/vector.py +146 -0
  224. src/services/rag/components/parsers/__init__.py +18 -0
  225. src/services/rag/components/parsers/base.py +35 -0
  226. src/services/rag/components/parsers/markdown.py +52 -0
  227. src/services/rag/components/parsers/pdf.py +115 -0
  228. src/services/rag/components/parsers/text.py +86 -0
  229. src/services/rag/components/retrievers/__init__.py +18 -0
  230. src/services/rag/components/retrievers/base.py +34 -0
  231. src/services/rag/components/retrievers/dense.py +200 -0
  232. src/services/rag/components/retrievers/hybrid.py +164 -0
  233. src/services/rag/components/retrievers/lightrag.py +169 -0
  234. src/services/rag/components/routing.py +286 -0
  235. src/services/rag/factory.py +234 -0
  236. src/services/rag/pipeline.py +215 -0
  237. src/services/rag/pipelines/__init__.py +32 -0
  238. src/services/rag/pipelines/academic.py +44 -0
  239. src/services/rag/pipelines/lightrag.py +43 -0
  240. src/services/rag/pipelines/llamaindex.py +313 -0
  241. src/services/rag/pipelines/raganything.py +384 -0
  242. src/services/rag/service.py +244 -0
  243. src/services/rag/types.py +73 -0
  244. src/services/search/__init__.py +284 -0
  245. src/services/search/base.py +87 -0
  246. src/services/search/consolidation.py +398 -0
  247. src/services/search/providers/__init__.py +128 -0
  248. src/services/search/providers/baidu.py +188 -0
  249. src/services/search/providers/exa.py +194 -0
  250. src/services/search/providers/jina.py +161 -0
  251. src/services/search/providers/perplexity.py +153 -0
  252. src/services/search/providers/serper.py +209 -0
  253. src/services/search/providers/tavily.py +161 -0
  254. src/services/search/types.py +114 -0
  255. src/services/setup/__init__.py +34 -0
  256. src/services/setup/init.py +285 -0
  257. src/services/tts/__init__.py +16 -0
  258. src/services/tts/config.py +99 -0
  259. src/tools/__init__.py +91 -0
  260. src/tools/code_executor.py +536 -0
  261. src/tools/paper_search_tool.py +171 -0
  262. src/tools/query_item_tool.py +310 -0
  263. src/tools/question/__init__.py +15 -0
  264. src/tools/question/exam_mimic.py +616 -0
  265. src/tools/question/pdf_parser.py +211 -0
  266. src/tools/question/question_extractor.py +397 -0
  267. src/tools/rag_tool.py +173 -0
  268. src/tools/tex_chunker.py +339 -0
  269. src/tools/tex_downloader.py +253 -0
  270. src/tools/web_search.py +71 -0
  271. src/utils/config_manager.py +206 -0
  272. src/utils/document_validator.py +168 -0
  273. src/utils/error_rate_tracker.py +111 -0
  274. src/utils/error_utils.py +82 -0
  275. src/utils/json_parser.py +110 -0
  276. src/utils/network/circuit_breaker.py +79 -0
@@ -0,0 +1,1039 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Extract numbered important content from knowledge base content_list
5
+ Such as: Definition 1.5., Proposition 1.3., Theorem x.x., Equation x.x., Formula x.x., etc.
6
+
7
+ Use LLM to identify these contents and store the mapping between numbers and original text in JSON file
8
+ """
9
+
10
+ import argparse
11
+ import asyncio
12
+ import inspect
13
+ import json
14
+ import os
15
+ from pathlib import Path
16
+ import sys
17
+ from typing import Any
18
+
19
+ sys.path.append(str(Path(__file__).parent.parent.parent))
20
+
21
+ from dotenv import load_dotenv
22
+ from lightrag.llm.openai import openai_complete_if_cache
23
+
24
+ from src.services.llm import get_llm_config
25
+
26
+ load_dotenv(dotenv_path=".env", override=False)
27
+
28
+ # Use project unified logging system
29
+ import logging as std_logging
30
+
31
+ # Logger can be either custom Logger or standard logging.Logger
32
+ logger: Any # Use Any to allow both types
33
+
34
+ try:
35
+ from pathlib import Path
36
+
37
+ from src.logging import get_logger
38
+ from src.services.config import load_config_with_main
39
+
40
+ project_root = Path(__file__).parent.parent.parent.parent
41
+ config = load_config_with_main(
42
+ "solve_config.yaml", project_root
43
+ ) # Use any config to get main.yaml
44
+ log_dir = config.get("paths", {}).get("user_log_dir") or config.get("logging", {}).get(
45
+ "log_dir"
46
+ )
47
+ logger = get_logger("Knowledge", log_dir=log_dir)
48
+ except ImportError:
49
+ # If import fails, use basic logging
50
+ logger = std_logging.getLogger("knowledge_init.extract_items")
51
+ std_logging.basicConfig(
52
+ level=std_logging.INFO, format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s"
53
+ )
54
+
55
+
56
+ async def _call_llm_async(
57
+ prompt: str,
58
+ system_prompt: str,
59
+ api_key: str,
60
+ base_url: str | None,
61
+ max_tokens: int = 2000,
62
+ temperature: float = 0.1,
63
+ model: str = None,
64
+ ) -> str:
65
+ """Asynchronously call LLM"""
66
+ # If model not specified, get from env_config
67
+ if model is None:
68
+ llm_cfg = get_llm_config()
69
+ model = llm_cfg.model
70
+
71
+ result = openai_complete_if_cache(
72
+ model,
73
+ prompt,
74
+ system_prompt=system_prompt,
75
+ api_key=api_key,
76
+ base_url=base_url,
77
+ max_tokens=max_tokens,
78
+ temperature=temperature,
79
+ )
80
+
81
+ if inspect.isawaitable(result):
82
+ return await result
83
+ return str(result)
84
+
85
+
86
+ def _extract_json_block(text: str) -> str:
87
+ """Extract JSON block from text"""
88
+ try:
89
+ s = str(text).strip()
90
+ # Remove code block markers
91
+ if s.startswith("```") and s.endswith("```"):
92
+ lines = s.split("\n")
93
+ if lines[0].startswith("```"):
94
+ lines = lines[1:]
95
+ if lines and lines[-1].strip() == "```":
96
+ lines = lines[:-1]
97
+ s = "\n".join(lines).strip()
98
+
99
+ # Try to extract JSON object or array
100
+ if (s.startswith("{") and s.endswith("}")) or (s.startswith("[") and s.endswith("]")):
101
+ return s
102
+
103
+ o_start, o_end = s.find("{"), s.rfind("}")
104
+ a_start, a_end = s.find("["), s.rfind("]")
105
+
106
+ candidates = []
107
+ if o_start != -1 and o_end != -1 and o_end > o_start:
108
+ candidates.append((o_start, s[o_start : o_end + 1]))
109
+ if a_start != -1 and a_end != -1 and a_end > a_start:
110
+ candidates.append((a_start, s[a_start : a_end + 1]))
111
+
112
+ if candidates:
113
+ candidates.sort(key=lambda x: x[0])
114
+ return candidates[0][1]
115
+
116
+ return s
117
+ except Exception:
118
+ return text
119
+
120
+
121
+ async def _check_content_belongs_async(
122
+ start_text: str, candidate_text: str, api_key: str, base_url: str | None
123
+ ) -> bool:
124
+ """
125
+ Use LLM to determine if candidate content belongs to (is part of) the starting content
126
+
127
+ Args:
128
+ start_text: Starting content (beginning part of numbered item)
129
+ candidate_text: Candidate content (subsequent content block)
130
+ api_key: OpenAI API key
131
+ base_url: API base URL
132
+
133
+ Returns:
134
+ True means candidate content belongs to starting content, False means it's new independent content
135
+ """
136
+ system_prompt = """You are an expert at analyzing the structure of academic mathematical texts.
137
+ Your task is to determine if a candidate text block belongs to (is a continuation of) a starting numbered item, or if it's a new independent item.
138
+
139
+ Numbered items include: Definitions, Propositions, Theorems, Lemmas, Corollaries, Examples, Remarks, Figures, Equations, etc.
140
+
141
+ Rules:
142
+ - Equations, formulas, and images that follow a numbered item usually belong to that item
143
+ - Explanatory text that continues the same topic belongs to the item
144
+ - A new numbered item (starting with "Definition X.Y", "Theorem X.Y", etc.) is independent
145
+ - Text that starts a completely different topic is independent
146
+
147
+ Return ONLY "YES" if the candidate belongs to the starting item, or "NO" if it's independent."""
148
+
149
+ user_prompt = f"""Starting item:
150
+ {start_text[:500]}
151
+
152
+ Candidate block:
153
+ {candidate_text[:300]}
154
+
155
+ Does the candidate block belong to (continue) the starting item?
156
+ Answer with ONLY "YES" or "NO"."""
157
+
158
+ try:
159
+ llm_cfg = get_llm_config()
160
+ response = await _call_llm_async(
161
+ user_prompt,
162
+ system_prompt,
163
+ api_key,
164
+ base_url,
165
+ max_tokens=10,
166
+ temperature=0.0,
167
+ model=llm_cfg.model,
168
+ )
169
+ answer = response.strip().upper()
170
+ return answer == "YES"
171
+ except Exception as e:
172
+ logger.warning(f"LLM judgment failed, default to not include: {e}")
173
+ # Default to conservative strategy: don't include
174
+ return False
175
+
176
+
177
+ async def _get_complete_content_async(
178
+ content_items: list[dict[str, Any]],
179
+ start_index: int,
180
+ api_key: str,
181
+ base_url: str | None,
182
+ max_following: int = 5,
183
+ ) -> tuple[str, list[str]]:
184
+ """
185
+ Get complete content, including subsequent formulas, text, etc., and all related image paths
186
+ Use LLM to determine if subsequent content belongs to current numbered item
187
+
188
+ Args:
189
+ content_items: Complete content_list
190
+ start_index: Starting index
191
+ api_key: OpenAI API key
192
+ base_url: API base URL
193
+ max_following: Maximum number of subsequent entries to check
194
+
195
+ Returns:
196
+ (Complete text content, image path list)
197
+ """
198
+ complete_text = content_items[start_index].get("text", "")
199
+ img_paths = []
200
+
201
+ # Collect image paths from starting item
202
+ start_img_path = content_items[start_index].get("img_path", "")
203
+ if start_img_path:
204
+ img_paths.append(start_img_path)
205
+
206
+ logger.debug(
207
+ f"Starting to use LLM to determine content boundaries, starting text (first 50 chars): {complete_text[:50]}..."
208
+ )
209
+
210
+ # Check subsequent entries
211
+ for i in range(1, max_following + 1):
212
+ next_index = start_index + i
213
+ if next_index >= len(content_items):
214
+ break
215
+
216
+ next_item = content_items[next_index]
217
+ next_type = next_item.get("type", "")
218
+
219
+ # If encountering title-level text, definitely stop
220
+ if next_type == "text" and next_item.get("text_level", 0) > 0:
221
+ break
222
+
223
+ # If it's a formula, usually belongs to current content, add directly
224
+ if next_type == "equation":
225
+ equation_text = next_item.get("text", "")
226
+ if equation_text:
227
+ complete_text += " " + equation_text
228
+ # Collect formula image paths
229
+ eq_img_path = next_item.get("img_path", "")
230
+ if eq_img_path:
231
+ img_paths.append(eq_img_path)
232
+ # If it's an image, collect image paths
233
+ elif next_type == "image":
234
+ img_path = next_item.get("img_path", "")
235
+ if img_path:
236
+ img_paths.append(img_path)
237
+ # Can also add image captions to text
238
+ captions = next_item.get("image_caption", [])
239
+ if captions:
240
+ caption_text = " ".join(captions) if isinstance(captions, list) else str(captions)
241
+ complete_text += " " + caption_text
242
+ # If it's regular text, use LLM to judge
243
+ elif next_type == "text" and next_item.get("text_level", 0) == 0:
244
+ next_text = next_item.get("text", "").strip()
245
+ if not next_text:
246
+ continue
247
+
248
+ # Use LLM to determine if this text belongs to current numbered item
249
+ belongs = await _check_content_belongs_async(
250
+ complete_text, next_text, api_key, base_url
251
+ )
252
+
253
+ if belongs:
254
+ complete_text += " " + next_text
255
+ logger.debug(
256
+ f"LLM judgment: Subsequent text belongs to current content, added (first 30 chars: {next_text[:30]}...)"
257
+ )
258
+ else:
259
+ # Doesn't belong to current content, stop collecting
260
+ logger.debug(
261
+ f"LLM judgment: Subsequent text doesn't belong to current content, stop collecting (first 30 chars: {next_text[:30]}...)"
262
+ )
263
+ break
264
+
265
+ return complete_text.strip(), img_paths
266
+
267
+
268
+ def _get_complete_content(
269
+ content_items: list[dict[str, Any]],
270
+ start_index: int,
271
+ api_key: str,
272
+ base_url: str | None,
273
+ max_following: int = 5,
274
+ ) -> tuple[str, list[str]]:
275
+ """
276
+ Synchronous wrapper for async function to get complete content
277
+ """
278
+ try:
279
+ loop = asyncio.get_event_loop()
280
+ if loop.is_running():
281
+ # If event loop is already running, check if it's uvloop
282
+ loop_type = type(loop).__name__
283
+ if "uvloop" in loop_type.lower():
284
+ # uvloop doesn't support nest_asyncio, use threading approach
285
+ import concurrent.futures
286
+
287
+ def run_in_new_loop():
288
+ # Create a new asyncio event loop in a new thread
289
+ new_loop = asyncio.new_event_loop()
290
+ asyncio.set_event_loop(new_loop)
291
+ try:
292
+ return new_loop.run_until_complete(
293
+ _get_complete_content_async(
294
+ content_items, start_index, api_key, base_url, max_following
295
+ )
296
+ )
297
+ finally:
298
+ new_loop.close()
299
+
300
+ # Run in a thread with a new event loop
301
+ with concurrent.futures.ThreadPoolExecutor() as executor:
302
+ future = executor.submit(run_in_new_loop)
303
+ return future.result()
304
+ else:
305
+ # Try nest_asyncio for standard event loops
306
+ try:
307
+ import nest_asyncio
308
+
309
+ nest_asyncio.apply()
310
+ return loop.run_until_complete(
311
+ _get_complete_content_async(
312
+ content_items, start_index, api_key, base_url, max_following
313
+ )
314
+ )
315
+ except (ValueError, TypeError) as e:
316
+ # nest_asyncio failed, fall back to threading approach
317
+ logger.debug(f"nest_asyncio failed ({e}), using threading fallback")
318
+ import concurrent.futures
319
+
320
+ def run_in_new_loop():
321
+ new_loop = asyncio.new_event_loop()
322
+ asyncio.set_event_loop(new_loop)
323
+ try:
324
+ return new_loop.run_until_complete(
325
+ _get_complete_content_async(
326
+ content_items, start_index, api_key, base_url, max_following
327
+ )
328
+ )
329
+ finally:
330
+ new_loop.close()
331
+
332
+ with concurrent.futures.ThreadPoolExecutor() as executor:
333
+ future = executor.submit(run_in_new_loop)
334
+ return future.result()
335
+ else:
336
+ return loop.run_until_complete(
337
+ _get_complete_content_async(
338
+ content_items, start_index, api_key, base_url, max_following
339
+ )
340
+ )
341
+ except RuntimeError:
342
+ # No event loop, create new one
343
+ return asyncio.run(
344
+ _get_complete_content_async(
345
+ content_items, start_index, api_key, base_url, max_following
346
+ )
347
+ )
348
+
349
+
350
+ async def _process_single_batch(
351
+ batch_idx: int,
352
+ batch: list[dict[str, Any]],
353
+ batch_start: int,
354
+ content_items: list[dict[str, Any]],
355
+ text_item_to_full_index: dict[int, int],
356
+ api_key: str,
357
+ base_url: str | None,
358
+ total_batches: int,
359
+ ) -> dict[str, dict[str, Any]]:
360
+ """Asynchronously process a single batch"""
361
+ numbered_items: dict[str, dict[str, Any]] = {}
362
+
363
+ # Build batch processing text
364
+ batch_texts = []
365
+ for idx, item in enumerate(batch):
366
+ batch_texts.append(f"[{batch_start + idx}] {item.get('text', '')}")
367
+
368
+ combined_text = "\n\n".join(batch_texts)
369
+
370
+ system_prompt = """You are an expert at identifying numbered mathematical and scientific content in academic texts.
371
+ You need to extract items like:
372
+ - Definitions (e.g., "Definition 1.5.", "Definition 1.1")
373
+ - Propositions (e.g., "Proposition 1.3.")
374
+ - Theorems (e.g., "Theorem 2.1.")
375
+ - Lemmas (e.g., "Lemma 3.2.")
376
+ - Corollaries (e.g., "Corollary 1.4.")
377
+ - Examples (e.g., "Example 2.3.")
378
+ - Remarks (e.g., "Remark 1.6.")
379
+ - Figures (e.g., "Figure 1.1", "Fig. 2.3")
380
+ - Equations (formulas with \\tag{x.y.z})
381
+ - Tables (e.g., "Table 1.1")
382
+
383
+ Note: Do NOT extract section titles or headings.
384
+
385
+ IMPORTANT:
386
+ - For equations with tags like \\tag{1.2.1}, extract identifier as "(1.2.1)" (only the number in parentheses)
387
+ - For figures, extract the figure number from the caption
388
+ - Return ONLY a valid JSON array
389
+ - Ensure all backslashes in LaTeX formulas are properly escaped (use \\\\ instead of \\)."""
390
+
391
+ user_prompt = f"""Analyze the following text segments and extract all numbered items (definitions, propositions, theorems, lemmas, corollaries, examples, remarks, figures, equations, tables, etc.).
392
+
393
+ Each segment starts with [N] where N is the segment index number.
394
+
395
+ For each numbered item found, extract:
396
+ 1. The index number N from the brackets [N] at the start of that segment
397
+ 2. The identifier (e.g., "Definition 1.5", "Figure 1.1", "(1.2.1)")
398
+ 3. The item type (e.g., "Definition", "Proposition", "Theorem", "Figure", "Equation", "Table")
399
+ 4. The complete text of that item
400
+
401
+ Special cases:
402
+ - For equations with \\tag{{x.y.z}}, extract identifier as "(x.y.z)" - ONLY the number in parentheses, no "Equation" prefix
403
+ - For figures, extract the figure number from captions like "Figure 1.1: ..."
404
+ - For tables, extract table numbers like "Table 2.1"
405
+
406
+ Return a JSON array of objects with this structure:
407
+ [
408
+ {{
409
+ "index": 152,
410
+ "identifier": "Figure 1.1",
411
+ "type": "Figure",
412
+ "full_text": "Figure 1.1: Evolution of phylogenetic intelligence..."
413
+ }},
414
+ {{
415
+ "index": 185,
416
+ "identifier": "(1.2.1)",
417
+ "type": "Equation",
418
+ "full_text": "$$S = 1,2,3,4,5,6,\\\\ldots ,n,n + 1,\\\\ldots \\\\tag{{1.2.1}}$$"
419
+ }},
420
+ ...
421
+ ]
422
+
423
+ CRITICAL REQUIREMENTS:
424
+ - The "index" field MUST be the number N from [N] in brackets, NOT a relative position
425
+ - For equations, identifier must be ONLY "(x.y.z)" format, not "Equation (x.y.z)"
426
+ - Ensure all backslashes in LaTeX are properly escaped for JSON (double them: \\\\ instead of \\).
427
+
428
+ Text segments:
429
+ {combined_text}
430
+
431
+ Return ONLY the JSON array, no other text. Ensure it is valid JSON."""
432
+
433
+ # Asynchronously call LLM
434
+ try:
435
+ llm_cfg = get_llm_config()
436
+ response = await _call_llm_async(
437
+ user_prompt,
438
+ system_prompt,
439
+ api_key,
440
+ base_url,
441
+ max_tokens=4000,
442
+ temperature=0.1,
443
+ model=llm_cfg.model,
444
+ )
445
+
446
+ # Parse response
447
+ json_str = _extract_json_block(response)
448
+ # Try direct parsing
449
+ try:
450
+ extracted = json.loads(json_str)
451
+ except json.JSONDecodeError as e_first:
452
+ # If parsing fails, try to fix common issues
453
+ logger.warning(f"Batch {batch_idx}: Initial JSON parsing failed, attempting to fix...")
454
+
455
+ # Try 1: Use strict=False
456
+ try:
457
+ from json.decoder import JSONDecoder
458
+
459
+ decoder = JSONDecoder(strict=False)
460
+ extracted = decoder.decode(json_str)
461
+ logger.info(f"Batch {batch_idx}: Parsed successfully using non-strict mode")
462
+ except Exception:
463
+ # Try 2: Use ast.literal_eval
464
+ try:
465
+ import ast
466
+
467
+ extracted = ast.literal_eval(json_str)
468
+ logger.info(f"Batch {batch_idx}: Parsed successfully using literal_eval")
469
+ except Exception:
470
+ # All methods failed, skip this batch
471
+ logger.warning(f"Batch {batch_idx}: All parsing methods failed, skipping batch")
472
+ logger.error(f"Original error: {e_first!s}")
473
+ logger.error(f"Response content (first 500 chars): {response[:500]}")
474
+ return numbered_items
475
+
476
+ if not isinstance(extracted, list):
477
+ logger.warning(f"Batch {batch_idx}: LLM returned non-array")
478
+ return numbered_items
479
+
480
+ # Process extracted results
481
+ for item in extracted:
482
+ index = item.get("index")
483
+ if index is None or index < batch_start or index >= batch_start + len(batch):
484
+ continue
485
+
486
+ # Convert to index relative to batch
487
+ relative_index = index - batch_start
488
+ original_item = batch[relative_index]
489
+ identifier = item.get("identifier", "").strip()
490
+
491
+ if not identifier:
492
+ continue
493
+
494
+ # Get complete content and related images
495
+ # Prefer LLM-extracted full_text (contains complete content)
496
+ llm_extracted_text = item.get("full_text", "").strip()
497
+ img_paths = []
498
+
499
+ # For image or equation types, use LLM-extracted content directly (no need to complete)
500
+ original_type = original_item.get("_original_type", original_item.get("type", ""))
501
+ if original_type in ["image", "equation"]:
502
+ complete_text = llm_extracted_text
503
+ # Collect image path for current item
504
+ img_path = original_item.get("img_path", "")
505
+ if img_path:
506
+ img_paths.append(img_path)
507
+ else:
508
+ # For plain text, get index in full content_items and complete
509
+ full_index = text_item_to_full_index.get(index)
510
+ if full_index is not None:
511
+ # Get complete content (including subsequent equations, etc.) and all related images
512
+ # Use LLM to intelligently determine content boundaries
513
+ complete_text, img_paths = await _get_complete_content_async(
514
+ content_items, full_index, api_key, base_url
515
+ )
516
+ else:
517
+ complete_text = original_item.get("text", "")
518
+ # Collect image path for current item
519
+ img_path = original_item.get("img_path", "")
520
+ if img_path:
521
+ img_paths.append(img_path)
522
+
523
+ # If completed content is shorter than LLM-extracted, use LLM-extracted
524
+ if len(llm_extracted_text) > len(complete_text):
525
+ complete_text = llm_extracted_text
526
+
527
+ numbered_items[identifier] = {
528
+ "text": complete_text,
529
+ "type": item.get("type", "Unknown"),
530
+ "page": original_item.get("page_idx", 0) + 1,
531
+ "img_paths": img_paths if img_paths else [],
532
+ }
533
+
534
+ extracted_count = len([e for e in extracted if e.get("identifier", "").strip()])
535
+ logger.info(
536
+ f" Batch {batch_idx}/{total_batches}: Extracted {extracted_count} numbered items"
537
+ )
538
+
539
+ except Exception as e:
540
+ logger.error(f"Batch {batch_idx}: Processing failed: {e}")
541
+
542
+ return numbered_items
543
+
544
+
545
async def extract_numbered_items_with_llm_async(
    content_items: list[dict[str, Any]],
    api_key: str,
    base_url: str | None,
    batch_size: int = 20,
    max_concurrent: int = 5,
) -> dict[str, dict[str, Any]]:
    """
    Use LLM to asynchronously batch extract numbered important content.

    Builds a flat list of candidate items (plain text paragraphs, image
    captions, and equations carrying a \\tag{...} number), splits it into
    batches of ``batch_size``, and processes the batches concurrently via
    ``_process_single_batch`` with at most ``max_concurrent`` in flight.

    Args:
        content_items: List of content items from content_list
        api_key: OpenAI API key
        base_url: API base URL
        batch_size: Number of items to process per batch
        max_concurrent: Maximum concurrency

    Returns:
        Dict[identifier, {text: original text, type: type, page: page number}]
        (later batches overwrite earlier ones on identifier collision)
    """
    numbered_items: dict[str, dict[str, Any]] = {}

    # Create index mapping: from text_items index to full content_items index.
    # _process_single_batch uses this to look up neighbouring items in the
    # full list when completing a text item's content.
    text_item_to_full_index: dict[int, int] = {}
    text_items: list[dict[str, Any]] = []

    for idx, item in enumerate(content_items):
        item_type = item.get("type", "")

        # Process plain text (text_level == 0 filters out headings)
        if item_type == "text" and item.get("text_level", 0) == 0:
            text_item_to_full_index[len(text_items)] = idx
            text_items.append(item)

        # Process images (extract Figure number from caption)
        elif item_type == "image":
            captions = item.get("image_caption", [])
            if captions:
                # Create a virtual text item so the LLM sees the caption as
                # plain text; _original_type lets downstream code treat it
                # specially (no content completion needed).
                caption_text = " ".join(captions) if isinstance(captions, list) else str(captions)
                virtual_item = {
                    "type": "image",
                    "text": caption_text,
                    "page_idx": item.get("page_idx", 0),
                    "bbox": item.get("bbox", []),
                    "img_path": item.get("img_path", ""),
                    "_original_type": "image",
                }
                text_item_to_full_index[len(text_items)] = idx
                text_items.append(virtual_item)

        # Process numbered equations (extract from tag)
        elif item_type == "equation":
            equation_text = item.get("text", "")
            # Check if there's a number tag, like \tag{1.2.1} or other forms
            if "\\tag{" in equation_text or "tag{" in equation_text:
                virtual_item = {
                    "type": "equation",
                    "text": equation_text,
                    "page_idx": item.get("page_idx", 0),
                    "bbox": item.get("bbox", []),
                    "img_path": item.get("img_path", ""),
                    "_original_type": "equation",
                }
                text_item_to_full_index[len(text_items)] = idx
                text_items.append(virtual_item)

    # Statistics (recomputed over content_items with the same predicates as
    # the selection loop above, purely for logging)
    text_count = sum(
        1 for item in content_items if item.get("type") == "text" and item.get("text_level", 0) == 0
    )
    image_count = sum(
        1 for item in content_items if item.get("type") == "image" and item.get("image_caption")
    )
    equation_count = sum(
        1
        for item in content_items
        if item.get("type") == "equation"
        and ("\\tag{" in item.get("text", "") or "tag{" in item.get("text", ""))
    )

    logger.info(f"Total {len(text_items)} items to process")
    logger.info(f" - Plain text: {text_count}")
    logger.info(f" - Images with captions: {image_count}")
    logger.info(f" - Numbered equations: {equation_count}")

    # Prepare all batches as (absolute start offset, slice) pairs; the offset
    # lets _process_single_batch map LLM-reported indices back to items.
    batches = []
    for batch_start in range(0, len(text_items), batch_size):
        batch_end = min(batch_start + batch_size, len(text_items))
        batch = text_items[batch_start:batch_end]
        batches.append((batch_start, batch))

    total_batches = len(batches)
    logger.info(f"Using {max_concurrent} concurrent tasks to process {total_batches} batches")

    # Use semaphore to control concurrency
    semaphore = asyncio.Semaphore(max_concurrent)

    async def process_with_semaphore(batch_idx, batch_start, batch):
        # Bound concurrent LLM calls; batch_idx + 1 gives 1-based logging.
        async with semaphore:
            return await _process_single_batch(
                batch_idx + 1,
                batch,
                batch_start,
                content_items,
                text_item_to_full_index,
                api_key,
                base_url,
                total_batches,
            )

    # Create all tasks
    tasks = [
        process_with_semaphore(idx, batch_start, batch)
        for idx, (batch_start, batch) in enumerate(batches)
    ]

    # Execute all batches concurrently. _process_single_batch handles its own
    # errors (returning partial results), so gather is not expected to raise
    # from per-batch failures.
    results = await asyncio.gather(*tasks)

    # Merge all results
    for result in results:
        numbered_items.update(result)

    # Count results by extracted type for the summary log
    type_stats: dict[str, int] = {}
    for item_data in numbered_items.values():
        item_type = item_data.get("type", "Unknown")
        type_stats[item_type] = type_stats.get(item_type, 0) + 1

    logger.info(f"\nExtraction complete, total {len(numbered_items)} numbered items")
    logger.info("Statistics by type:")
    for item_type, count in sorted(type_stats.items()):
        logger.info(f" - {item_type}: {count}")

    return numbered_items
682
+
683
+
684
def extract_numbered_items_with_llm(
    content_items: list[dict[str, Any]],
    api_key: str,
    base_url: str | None,
    batch_size: int = 20,
    max_concurrent: int = 5,
) -> dict[str, dict[str, Any]]:
    """
    Synchronous wrapper for async extraction function.

    Runs ``extract_numbered_items_with_llm_async`` to completion regardless
    of the caller's event-loop state:

    - No running loop in this thread: ``asyncio.run`` creates one.
    - Running uvloop: uvloop cannot be re-entered with nest_asyncio, so the
      coroutine is driven by a fresh event loop on a worker thread.
    - Running standard loop: try ``nest_asyncio`` to re-enter it; on failure,
      fall back to the worker-thread approach.

    Args:
        content_items: List of content items from content_list
        api_key: OpenAI API key
        base_url: API base URL
        batch_size: Number of items to process per batch
        max_concurrent: Maximum concurrency

    Returns:
        Dict[identifier, item data] as produced by the async implementation.
    """

    def _make_coro():
        # Fresh coroutine per run attempt (a coroutine can only be awaited once).
        return extract_numbered_items_with_llm_async(
            content_items, api_key, base_url, batch_size, max_concurrent
        )

    def _run_in_new_loop():
        # Create and drive a brand-new event loop; intended to run on a
        # worker thread whose thread-local loop slot is free.
        new_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(new_loop)
        try:
            return new_loop.run_until_complete(_make_coro())
        finally:
            new_loop.close()

    def _run_in_thread():
        # Escape the already-running loop by doing the work on another thread.
        import concurrent.futures

        with concurrent.futures.ThreadPoolExecutor() as executor:
            return executor.submit(_run_in_new_loop).result()

    try:
        # Preferred over the deprecated asyncio.get_event_loop(): only
        # reports a loop that is actually running in this thread.
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No running event loop: let asyncio manage one for us.
        return asyncio.run(_make_coro())

    if "uvloop" in type(loop).__name__.lower():
        # uvloop doesn't support nest_asyncio, use threading approach
        return _run_in_thread()

    # Try nest_asyncio for standard event loops
    try:
        import nest_asyncio

        nest_asyncio.apply()
        return loop.run_until_complete(_make_coro())
    except (ValueError, TypeError) as e:
        # nest_asyncio failed, fall back to threading approach
        logger.debug(f"nest_asyncio failed ({e}), using threading fallback")
        return _run_in_thread()
764
+
765
+
766
def process_content_list(
    content_list_file: Path,
    output_file: Path,
    api_key: str,
    base_url: str | None,
    batch_size: int = 20,
    merge: bool = True,
):
    """
    Process a content_list file: extract numbered items and persist them.

    Args:
        content_list_file: Path to content_list JSON file
        output_file: Path to output JSON file
        api_key: OpenAI API key
        base_url: API base URL
        batch_size: Batch processing size
        merge: Whether to merge with existing results (default True)

    Returns:
        The final mapping of identifier -> item data that was written out.
    """
    logger.info(f"Reading file: {content_list_file}")

    # Load the parsed content items.
    with open(content_list_file, encoding="utf-8") as src:
        items = json.load(src)

    logger.info(f"File contains {len(items)} items")

    # Run the LLM-backed extraction.
    logger.info("Starting numbered items extraction...")
    new_items = extract_numbered_items_with_llm(
        items,
        api_key,
        base_url,
        batch_size,
        max_concurrent=5,  # Default concurrency
    )

    logger.info(f"Extracted {len(new_items)} numbered items this time")

    # By default the fresh extraction is the result; overlay onto an existing
    # output file when merging is requested and one is present.
    numbered_items = new_items
    if merge and output_file.exists():
        logger.info(f"Existing file detected: {output_file}")
        try:
            with open(output_file, encoding="utf-8") as prev:
                existing_items = json.load(prev)
            logger.info(f"Loaded {len(existing_items)} existing numbered items")

            # New items overwrite old ones that share an identifier.
            merged_count = 0
            for identifier, data in new_items.items():
                merged_count += 1 if identifier in existing_items else 0
                existing_items[identifier] = data

            numbered_items = existing_items
            logger.info(
                f"Merge complete: Updated {merged_count} existing items, added {len(new_items) - merged_count} new items"
            )
            logger.info(f"Total {len(numbered_items)} numbered items after merge")
        except Exception as e:
            # Unreadable/corrupt existing file: fall back to fresh results.
            logger.warning(f"Could not read existing file, will create new file: {e}")
            numbered_items = new_items

    # Persist the result set.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as dst:
        json.dump(numbered_items, dst, indent=2, ensure_ascii=False)

    logger.info(f"Results saved to: {output_file}")

    # Tally identifiers by category: "(1.2.3)"-style ids are equations,
    # otherwise the leading word ("Definition 1.1" -> "Definition") is used.
    type_counts: dict[str, int] = {}
    for identifier in numbered_items:
        if identifier.startswith("(") and ")" in identifier:
            category = "Equation"
        else:
            tokens = identifier.split()
            category = tokens[0] if tokens else "Unknown"
        type_counts[category] = type_counts.get(category, 0) + 1

    logger.info("\n=== Extraction Statistics ===")
    for category, count in sorted(type_counts.items()):
        logger.info(f" {category}: {count}")

    return numbered_items
858
+
859
+
860
def main():
    """CLI entry point: parse arguments and extract numbered items from one
    or all content_list JSON files of a knowledge base, merging results into
    a single shared output file."""
    parser = argparse.ArgumentParser(
        description="Extract numbered important content from knowledge base content_list"
    )
    parser.add_argument(
        "--kb", required=True, help="Knowledge base name (under knowledge_bases directory)"
    )
    parser.add_argument(
        "--content-file",
        help="content_list file name (optional, if not specified, automatically process all JSON files)",
        default=None,
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Debug mode: only process first file (for quick testing)",
    )
    parser.add_argument(
        "--output-name",
        help="Output file name (default: numbered_items.json)",
        default="numbered_items.json",
    )
    parser.add_argument(
        "--base-dir",
        help="Data storage base directory (default: ./knowledge_bases)",
        default="./knowledge_bases",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        help="Number of items to process per batch (default: 20)",
        default=20,
    )
    parser.add_argument(
        "--max-concurrent", type=int, help="Maximum concurrent tasks (default: 5)", default=5
    )
    parser.add_argument(
        "--no-merge",
        action="store_true",
        help="Do not merge existing results, directly overwrite (default will merge)",
    )
    parser.add_argument(
        "--api-key",
        default=os.getenv("LLM_API_KEY"),
        help="OpenAI API key (default reads from LLM_API_KEY)",
    )
    parser.add_argument(
        "--base-url",
        default=os.getenv("LLM_HOST"),
        help="OpenAI API Base URL (default reads from LLM_HOST)",
    )

    args = parser.parse_args()

    # Get API configuration (CLI flags default to the env vars above)
    api_key = args.api_key
    base_url = args.base_url

    # Validate API key
    if not api_key:
        raise SystemExit(
            "Missing API Key: Please set environment variable LLM_API_KEY or pass via --api-key"
        )

    # Build paths: <base-dir>/<kb>/content_list holds the input JSON files
    base_dir = Path(args.base_dir)
    kb_dir = base_dir / args.kb
    content_list_dir = kb_dir / "content_list"

    # Check if content_list directory exists
    if not content_list_dir.exists():
        logger.error(f"content_list directory does not exist: {content_list_dir}")
        sys.exit(1)

    # Get list of files to process
    if args.content_file:
        # If file is specified, only process that file
        content_list_files = [content_list_dir / args.content_file]
        if not content_list_files[0].exists():
            logger.error(f"content_list file does not exist: {content_list_files[0]}")
            sys.exit(1)
    else:
        # Otherwise automatically scan all JSON files
        content_list_files = sorted(content_list_dir.glob("*.json"))
        if not content_list_files:
            logger.error(f"No JSON files found in {content_list_dir}")
            sys.exit(1)

    # Debug mode: only process first file
    if args.debug:
        logger.info(f"āš ļø Debug mode: Only processing first file {content_list_files[0].name}")
        content_list_files = content_list_files[:1]

    # Output file fixed as numbered_items.json (shared across entire knowledge base)
    output_file = kb_dir / args.output_name

    # Display configuration information
    logger.info("=" * 60)
    logger.info("šŸ“‹ Configuration Information")
    logger.info("=" * 60)
    logger.info(f"Knowledge base: {args.kb}")
    logger.info(f"Content files: {len(content_list_files)} files")
    for f in content_list_files:
        logger.info(f" - {f.name}")
    logger.info(f"Output file: {output_file}")
    logger.info(f"Batch size: {args.batch_size}")
    logger.info(f"Max concurrent: {args.max_concurrent}")
    logger.info(f"Auto merge: {'Yes' if not args.no_merge else 'No'}")
    logger.info(f"Debug mode: {'Yes' if args.debug else 'No'}")
    # Only a redacted prefix/suffix of the key is logged
    logger.info(
        f"API key: {'Set (' + api_key[:8] + '...' + api_key[-4:] + ')' if api_key else 'Not set'}"
    )
    logger.info(f"API base URL: {base_url if base_url else 'Default (https://api.openai.com/v1)'}")
    logger.info("=" * 60)
    logger.info("")

    try:
        # Process all files
        for idx, content_list_file in enumerate(content_list_files, 1):
            logger.info(f"\n{'=' * 60}")
            logger.info(
                f"Processing file [{idx}/{len(content_list_files)}]: {content_list_file.name}"
            )
            logger.info(f"{'=' * 60}\n")

            process_content_list(
                content_list_file,
                output_file,
                api_key,
                base_url,
                args.batch_size,
                merge=not args.no_merge,  # Auto-merge after first file
            )

            # From second file onwards, force merge mode: even with --no-merge,
            # later files must merge into the shared output rather than
            # overwrite what the first file just wrote.
            if idx == 1 and len(content_list_files) > 1:
                args.no_merge = False
                logger.info(f"\nSubsequent files will be automatically merged to {output_file}\n")

        logger.info("\n" + "=" * 60)
        logger.info("āœ“ All files processed!")
        logger.info("=" * 60)

        # Display final statistics by re-reading the merged output file
        if output_file.exists():
            with open(output_file, encoding="utf-8") as f:
                final_items = json.load(f)

            logger.info(f"\nFinal result: {output_file}")
            logger.info(f"Total extracted {len(final_items)} numbered items")

            # Statistics by type
            type_counts: dict[str, int] = {}
            for identifier in final_items.keys():
                # Identify equations: starting with parenthesis, e.g., (1.2.1)
                if identifier.startswith("(") and ")" in identifier:
                    item_type = "Equation"
                else:
                    # Extract type from identifier (e.g., "Definition 1.1" -> "Definition")
                    parts = identifier.split()
                    if parts:
                        item_type = parts[0]
                    else:
                        item_type = "Unknown"
                type_counts[item_type] = type_counts.get(item_type, 0) + 1

            logger.info("\n=== Final Statistics ===")
            for item_type, count in sorted(type_counts.items()):
                logger.info(f" {item_type}: {count}")

    except Exception as e:
        # Catch-all at the CLI boundary: log, print traceback, exit non-zero
        logger.error(f"\nāœ— Processing failed: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)
1036
+
1037
+
1038
# Script entry point when executed directly (not on import).
if __name__ == "__main__":
    main()