botrun-flow-lang 5.12.263__py3-none-any.whl → 5.12.264__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- botrun_flow_lang/api/auth_api.py +39 -39
- botrun_flow_lang/api/auth_utils.py +183 -183
- botrun_flow_lang/api/botrun_back_api.py +65 -65
- botrun_flow_lang/api/flow_api.py +3 -3
- botrun_flow_lang/api/hatch_api.py +508 -508
- botrun_flow_lang/api/langgraph_api.py +811 -811
- botrun_flow_lang/api/line_bot_api.py +1484 -1484
- botrun_flow_lang/api/model_api.py +300 -300
- botrun_flow_lang/api/rate_limit_api.py +32 -32
- botrun_flow_lang/api/routes.py +79 -79
- botrun_flow_lang/api/search_api.py +53 -53
- botrun_flow_lang/api/storage_api.py +395 -395
- botrun_flow_lang/api/subsidy_api.py +290 -290
- botrun_flow_lang/api/subsidy_api_system_prompt.txt +109 -109
- botrun_flow_lang/api/user_setting_api.py +70 -70
- botrun_flow_lang/api/version_api.py +31 -31
- botrun_flow_lang/api/youtube_api.py +26 -26
- botrun_flow_lang/constants.py +13 -13
- botrun_flow_lang/langgraph_agents/agents/agent_runner.py +178 -178
- botrun_flow_lang/langgraph_agents/agents/agent_tools/step_planner.py +77 -77
- botrun_flow_lang/langgraph_agents/agents/checkpointer/firestore_checkpointer.py +666 -666
- botrun_flow_lang/langgraph_agents/agents/gov_researcher/GOV_RESEARCHER_PRD.md +192 -192
- botrun_flow_lang/langgraph_agents/agents/gov_researcher/gemini_subsidy_graph.py +460 -460
- botrun_flow_lang/langgraph_agents/agents/gov_researcher/gov_researcher_2_graph.py +1002 -1002
- botrun_flow_lang/langgraph_agents/agents/gov_researcher/gov_researcher_graph.py +822 -822
- botrun_flow_lang/langgraph_agents/agents/langgraph_react_agent.py +723 -723
- botrun_flow_lang/langgraph_agents/agents/search_agent_graph.py +864 -864
- botrun_flow_lang/langgraph_agents/agents/tools/__init__.py +4 -4
- botrun_flow_lang/langgraph_agents/agents/tools/gemini_code_execution.py +376 -376
- botrun_flow_lang/langgraph_agents/agents/util/gemini_grounding.py +66 -66
- botrun_flow_lang/langgraph_agents/agents/util/html_util.py +316 -316
- botrun_flow_lang/langgraph_agents/agents/util/img_util.py +294 -294
- botrun_flow_lang/langgraph_agents/agents/util/local_files.py +419 -419
- botrun_flow_lang/langgraph_agents/agents/util/mermaid_util.py +86 -86
- botrun_flow_lang/langgraph_agents/agents/util/model_utils.py +143 -143
- botrun_flow_lang/langgraph_agents/agents/util/pdf_analyzer.py +486 -486
- botrun_flow_lang/langgraph_agents/agents/util/pdf_cache.py +250 -250
- botrun_flow_lang/langgraph_agents/agents/util/pdf_processor.py +204 -204
- botrun_flow_lang/langgraph_agents/agents/util/perplexity_search.py +464 -464
- botrun_flow_lang/langgraph_agents/agents/util/plotly_util.py +59 -59
- botrun_flow_lang/langgraph_agents/agents/util/tavily_search.py +199 -199
- botrun_flow_lang/langgraph_agents/agents/util/youtube_util.py +90 -90
- botrun_flow_lang/langgraph_agents/cache/langgraph_botrun_cache.py +197 -197
- botrun_flow_lang/llm_agent/llm_agent.py +19 -19
- botrun_flow_lang/llm_agent/llm_agent_util.py +83 -83
- botrun_flow_lang/log/.gitignore +2 -2
- botrun_flow_lang/main.py +61 -61
- botrun_flow_lang/main_fast.py +51 -51
- botrun_flow_lang/mcp_server/__init__.py +10 -10
- botrun_flow_lang/mcp_server/default_mcp.py +744 -744
- botrun_flow_lang/models/nodes/utils.py +205 -205
- botrun_flow_lang/models/token_usage.py +34 -34
- botrun_flow_lang/requirements.txt +21 -21
- botrun_flow_lang/services/base/firestore_base.py +30 -30
- botrun_flow_lang/services/hatch/hatch_factory.py +11 -11
- botrun_flow_lang/services/hatch/hatch_fs_store.py +419 -419
- botrun_flow_lang/services/storage/storage_cs_store.py +206 -206
- botrun_flow_lang/services/storage/storage_factory.py +12 -12
- botrun_flow_lang/services/storage/storage_store.py +65 -65
- botrun_flow_lang/services/user_setting/user_setting_factory.py +9 -9
- botrun_flow_lang/services/user_setting/user_setting_fs_store.py +66 -66
- botrun_flow_lang/static/docs/tools/index.html +926 -926
- botrun_flow_lang/tests/api_functional_tests.py +1525 -1525
- botrun_flow_lang/tests/api_stress_test.py +357 -357
- botrun_flow_lang/tests/shared_hatch_tests.py +333 -333
- botrun_flow_lang/tests/test_botrun_app.py +46 -46
- botrun_flow_lang/tests/test_html_util.py +31 -31
- botrun_flow_lang/tests/test_img_analyzer.py +190 -190
- botrun_flow_lang/tests/test_img_util.py +39 -39
- botrun_flow_lang/tests/test_local_files.py +114 -114
- botrun_flow_lang/tests/test_mermaid_util.py +103 -103
- botrun_flow_lang/tests/test_pdf_analyzer.py +104 -104
- botrun_flow_lang/tests/test_plotly_util.py +151 -151
- botrun_flow_lang/tests/test_run_workflow_engine.py +65 -65
- botrun_flow_lang/tools/generate_docs.py +133 -133
- botrun_flow_lang/tools/templates/tools.html +153 -153
- botrun_flow_lang/utils/__init__.py +7 -7
- botrun_flow_lang/utils/botrun_logger.py +344 -344
- botrun_flow_lang/utils/clients/rate_limit_client.py +209 -209
- botrun_flow_lang/utils/clients/token_verify_client.py +153 -153
- botrun_flow_lang/utils/google_drive_utils.py +654 -654
- botrun_flow_lang/utils/langchain_utils.py +324 -324
- botrun_flow_lang/utils/yaml_utils.py +9 -9
- {botrun_flow_lang-5.12.263.dist-info → botrun_flow_lang-5.12.264.dist-info}/METADATA +1 -1
- botrun_flow_lang-5.12.264.dist-info/RECORD +102 -0
- botrun_flow_lang-5.12.263.dist-info/RECORD +0 -102
- {botrun_flow_lang-5.12.263.dist-info → botrun_flow_lang-5.12.264.dist-info}/WHEEL +0 -0
|
@@ -1,204 +1,204 @@
|
|
|
1
|
-
"""
|
|
2
|
-
PDF 處理工具模組
|
|
3
|
-
|
|
4
|
-
提供 PDF 切割等功能,用於處理大型 PDF 檔案。
|
|
5
|
-
使用 pypdf(純 Python)實作,避免 C++ 庫的 segfault 問題。
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
import io
|
|
9
|
-
from typing import List, Tuple
|
|
10
|
-
|
|
11
|
-
from pypdf import PdfReader, PdfWriter
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def get_pdf_size(pdf_content: bytes) -> int:
|
|
15
|
-
"""
|
|
16
|
-
取得 PDF 檔案大小(bytes)
|
|
17
|
-
|
|
18
|
-
Args:
|
|
19
|
-
pdf_content: PDF 檔案的二進位內容
|
|
20
|
-
|
|
21
|
-
Returns:
|
|
22
|
-
int: 檔案大小(bytes)
|
|
23
|
-
"""
|
|
24
|
-
return len(pdf_content)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def get_pdf_size_mb(pdf_content: bytes) -> float:
|
|
28
|
-
"""
|
|
29
|
-
取得 PDF 檔案大小(MB)
|
|
30
|
-
|
|
31
|
-
Args:
|
|
32
|
-
pdf_content: PDF 檔案的二進位內容
|
|
33
|
-
|
|
34
|
-
Returns:
|
|
35
|
-
float: 檔案大小(MB)
|
|
36
|
-
"""
|
|
37
|
-
return len(pdf_content) / (1024 * 1024)
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def get_pdf_page_count(pdf_content: bytes) -> int:
|
|
41
|
-
"""
|
|
42
|
-
取得 PDF 總頁數
|
|
43
|
-
|
|
44
|
-
Args:
|
|
45
|
-
pdf_content: PDF 檔案的二進位內容
|
|
46
|
-
|
|
47
|
-
Returns:
|
|
48
|
-
int: 總頁數
|
|
49
|
-
"""
|
|
50
|
-
try:
|
|
51
|
-
reader = PdfReader(io.BytesIO(pdf_content))
|
|
52
|
-
return len(reader.pages)
|
|
53
|
-
except Exception as e:
|
|
54
|
-
print(f"[get_pdf_page_count] 無法讀取 PDF 頁數: {e}")
|
|
55
|
-
return 0
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def split_pdf_by_pages(
|
|
59
|
-
pdf_content: bytes, pages_per_chunk: int = 15
|
|
60
|
-
) -> List[Tuple[bytes, str]]:
|
|
61
|
-
"""
|
|
62
|
-
按頁數切割 PDF
|
|
63
|
-
|
|
64
|
-
Args:
|
|
65
|
-
pdf_content: PDF 檔案的二進位內容
|
|
66
|
-
pages_per_chunk: 每個切片的頁數(預設 15 頁)
|
|
67
|
-
|
|
68
|
-
Returns:
|
|
69
|
-
List[Tuple[bytes, str]]: 切片清單,每個元素為 (切片內容, 頁碼範圍字串)
|
|
70
|
-
例如: [(chunk_bytes, "page-001-015"), (chunk_bytes, "page-016-030"), ...]
|
|
71
|
-
"""
|
|
72
|
-
chunks = []
|
|
73
|
-
|
|
74
|
-
try:
|
|
75
|
-
reader = PdfReader(io.BytesIO(pdf_content))
|
|
76
|
-
total_pages = len(reader.pages)
|
|
77
|
-
|
|
78
|
-
for start_idx in range(0, total_pages, pages_per_chunk):
|
|
79
|
-
end_idx = min(start_idx + pages_per_chunk, total_pages)
|
|
80
|
-
|
|
81
|
-
# 建立新的 PDF 並複製頁面
|
|
82
|
-
writer = PdfWriter()
|
|
83
|
-
for page_idx in range(start_idx, end_idx):
|
|
84
|
-
writer.add_page(reader.pages[page_idx])
|
|
85
|
-
|
|
86
|
-
# 輸出切片
|
|
87
|
-
output = io.BytesIO()
|
|
88
|
-
writer.write(output)
|
|
89
|
-
chunk_bytes = output.getvalue()
|
|
90
|
-
|
|
91
|
-
# 產生頁碼範圍字串(1-indexed)
|
|
92
|
-
page_range = f"page-{start_idx + 1:03d}-{end_idx:03d}"
|
|
93
|
-
|
|
94
|
-
chunks.append((chunk_bytes, page_range))
|
|
95
|
-
|
|
96
|
-
except Exception as e:
|
|
97
|
-
print(f"[split_pdf_by_pages] 切割 PDF 時發生錯誤: {e}")
|
|
98
|
-
# 如果切割失敗,回傳整個 PDF 作為單一切片
|
|
99
|
-
if pdf_content:
|
|
100
|
-
chunks.append((pdf_content, "page-001-all"))
|
|
101
|
-
|
|
102
|
-
return chunks
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
def calculate_optimal_chunk_size(
|
|
106
|
-
pdf_content: bytes,
|
|
107
|
-
target_size_mb: float = 4.0,
|
|
108
|
-
min_pages: int = 5,
|
|
109
|
-
max_pages: int = 30,
|
|
110
|
-
) -> int:
|
|
111
|
-
"""
|
|
112
|
-
計算最佳切割頁數,確保每個切片小於目標大小
|
|
113
|
-
|
|
114
|
-
策略:
|
|
115
|
-
1. 先估算每頁平均大小
|
|
116
|
-
2. 計算達到目標大小需要的頁數
|
|
117
|
-
3. 限制在 min_pages 和 max_pages 之間
|
|
118
|
-
|
|
119
|
-
Args:
|
|
120
|
-
pdf_content: PDF 檔案的二進位內容
|
|
121
|
-
target_size_mb: 目標切片大小(MB),預設 4MB
|
|
122
|
-
min_pages: 最小頁數,預設 5 頁
|
|
123
|
-
max_pages: 最大頁數,預設 30 頁
|
|
124
|
-
|
|
125
|
-
Returns:
|
|
126
|
-
int: 建議的每個切片頁數
|
|
127
|
-
"""
|
|
128
|
-
total_size_mb = get_pdf_size_mb(pdf_content)
|
|
129
|
-
total_pages = get_pdf_page_count(pdf_content)
|
|
130
|
-
|
|
131
|
-
if total_pages == 0:
|
|
132
|
-
return min_pages
|
|
133
|
-
|
|
134
|
-
# 估算每頁平均大小
|
|
135
|
-
avg_page_size_mb = total_size_mb / total_pages
|
|
136
|
-
|
|
137
|
-
# 計算達到目標大小需要的頁數
|
|
138
|
-
if avg_page_size_mb > 0:
|
|
139
|
-
optimal_pages = int(target_size_mb / avg_page_size_mb)
|
|
140
|
-
else:
|
|
141
|
-
optimal_pages = max_pages
|
|
142
|
-
|
|
143
|
-
# 限制在範圍內
|
|
144
|
-
optimal_pages = max(min_pages, min(optimal_pages, max_pages))
|
|
145
|
-
|
|
146
|
-
return optimal_pages
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
def split_pdf_smart(
|
|
150
|
-
pdf_content: bytes, target_size_mb: float = 4.0
|
|
151
|
-
) -> List[Tuple[bytes, str]]:
|
|
152
|
-
"""
|
|
153
|
-
智慧切割 PDF
|
|
154
|
-
|
|
155
|
-
先計算最佳切割頁數,然後進行切割。
|
|
156
|
-
如果切割後某個切片仍超過目標大小,會進一步分割。
|
|
157
|
-
|
|
158
|
-
Args:
|
|
159
|
-
pdf_content: PDF 檔案的二進位內容
|
|
160
|
-
target_size_mb: 目標切片大小(MB),預設 4MB
|
|
161
|
-
|
|
162
|
-
Returns:
|
|
163
|
-
List[Tuple[bytes, str]]: 切片清單,每個元素為 (切片內容, 頁碼範圍字串)
|
|
164
|
-
"""
|
|
165
|
-
# 計算最佳切割頁數
|
|
166
|
-
pages_per_chunk = calculate_optimal_chunk_size(pdf_content, target_size_mb)
|
|
167
|
-
print(f"[split_pdf_smart] 計算最佳切割頁數: {pages_per_chunk} 頁/切片")
|
|
168
|
-
|
|
169
|
-
# 進行初步切割
|
|
170
|
-
chunks = split_pdf_by_pages(pdf_content, pages_per_chunk)
|
|
171
|
-
|
|
172
|
-
# 檢查是否有切片超過目標大小,如果有則進一步分割
|
|
173
|
-
final_chunks = []
|
|
174
|
-
for chunk_bytes, page_range in chunks:
|
|
175
|
-
chunk_size_mb = get_pdf_size_mb(chunk_bytes)
|
|
176
|
-
|
|
177
|
-
if chunk_size_mb > target_size_mb and pages_per_chunk > 5:
|
|
178
|
-
# 這個切片太大,需要進一步分割
|
|
179
|
-
print(
|
|
180
|
-
f"[split_pdf_smart] 切片 {page_range} 大小 {chunk_size_mb:.2f}MB "
|
|
181
|
-
f"超過目標 {target_size_mb}MB,進一步分割"
|
|
182
|
-
)
|
|
183
|
-
|
|
184
|
-
# 取得這個切片的頁碼範圍
|
|
185
|
-
parts = page_range.replace("page-", "").split("-")
|
|
186
|
-
start_page = int(parts[0])
|
|
187
|
-
|
|
188
|
-
# 用更小的頁數重新切割
|
|
189
|
-
smaller_chunks = split_pdf_by_pages(chunk_bytes, pages_per_chunk // 2)
|
|
190
|
-
|
|
191
|
-
# 更新頁碼範圍
|
|
192
|
-
chunk_page_count = get_pdf_page_count(chunk_bytes)
|
|
193
|
-
for i, (sub_chunk, _) in enumerate(smaller_chunks):
|
|
194
|
-
sub_start = start_page + i * (pages_per_chunk // 2)
|
|
195
|
-
sub_end = min(
|
|
196
|
-
sub_start + (pages_per_chunk // 2) - 1,
|
|
197
|
-
start_page + chunk_page_count - 1,
|
|
198
|
-
)
|
|
199
|
-
sub_range = f"page-{sub_start:03d}-{sub_end:03d}"
|
|
200
|
-
final_chunks.append((sub_chunk, sub_range))
|
|
201
|
-
else:
|
|
202
|
-
final_chunks.append((chunk_bytes, page_range))
|
|
203
|
-
|
|
204
|
-
return final_chunks
|
|
1
|
+
"""
|
|
2
|
+
PDF 處理工具模組
|
|
3
|
+
|
|
4
|
+
提供 PDF 切割等功能,用於處理大型 PDF 檔案。
|
|
5
|
+
使用 pypdf(純 Python)實作,避免 C++ 庫的 segfault 問題。
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import io
|
|
9
|
+
from typing import List, Tuple
|
|
10
|
+
|
|
11
|
+
from pypdf import PdfReader, PdfWriter
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_pdf_size(pdf_content: bytes) -> int:
    """Return the size of a PDF payload in bytes.

    Args:
        pdf_content: Raw binary content of the PDF file.

    Returns:
        int: Number of bytes in ``pdf_content``.
    """
    size_in_bytes = len(pdf_content)
    return size_in_bytes
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def get_pdf_size_mb(pdf_content: bytes) -> float:
    """Return the size of a PDF payload in megabytes (1 MB = 1024 * 1024 bytes).

    Args:
        pdf_content: Raw binary content of the PDF file.

    Returns:
        float: Size of ``pdf_content`` expressed in MB.
    """
    bytes_per_mb = 1024 * 1024
    return len(pdf_content) / bytes_per_mb
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def get_pdf_page_count(pdf_content: bytes) -> int:
    """Return the number of pages in a PDF.

    Any failure while opening or reading the document is reported on
    stdout and treated as "no pages".

    Args:
        pdf_content: Raw binary content of the PDF file.

    Returns:
        int: Total page count, or 0 when the PDF cannot be read.
    """
    try:
        page_total = len(PdfReader(io.BytesIO(pdf_content)).pages)
    except Exception as e:
        print(f"[get_pdf_page_count] 無法讀取 PDF 頁數: {e}")
        return 0
    return page_total
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def split_pdf_by_pages(
    pdf_content: bytes, pages_per_chunk: int = 15
) -> List[Tuple[bytes, str]]:
    """Split a PDF into consecutive chunks of at most ``pages_per_chunk`` pages.

    If the document cannot be split for any reason (unreadable PDF, write
    failure, non-positive ``pages_per_chunk``), the error is reported on
    stdout and the whole document is returned as a single chunk labelled
    ``"page-001-all"``. Chunks built before the failure are discarded so
    the result never contains the same pages twice.

    Args:
        pdf_content: Raw binary content of the PDF file.
        pages_per_chunk: Maximum number of pages per chunk (default 15).

    Returns:
        List[Tuple[bytes, str]]: ``(chunk_bytes, page_range)`` pairs, where
        ``page_range`` is a 1-indexed, inclusive label, e.g.
        ``[(chunk_bytes, "page-001-015"), (chunk_bytes, "page-016-030"), ...]``.
        Empty when ``pdf_content`` is empty and nothing could be split.
    """
    chunks = []

    try:
        # A negative step would make range() silently yield nothing and
        # drop the document; route it through the same fallback as zero.
        if pages_per_chunk < 1:
            raise ValueError(
                f"pages_per_chunk must be >= 1, got {pages_per_chunk}"
            )

        reader = PdfReader(io.BytesIO(pdf_content))
        total_pages = len(reader.pages)

        for start_idx in range(0, total_pages, pages_per_chunk):
            end_idx = min(start_idx + pages_per_chunk, total_pages)

            # Copy this page window into a fresh document.
            writer = PdfWriter()
            for page_idx in range(start_idx, end_idx):
                writer.add_page(reader.pages[page_idx])

            output = io.BytesIO()
            writer.write(output)

            # start_idx is 0-based and end_idx is exclusive, so the
            # 1-indexed inclusive range is (start_idx + 1)..end_idx.
            page_range = f"page-{start_idx + 1:03d}-{end_idx:03d}"
            chunks.append((output.getvalue(), page_range))

    except Exception as e:
        print(f"[split_pdf_by_pages] 切割 PDF 時發生錯誤: {e}")
        # Return ONLY the whole document: keeping partially built chunks
        # next to it would hand the caller duplicated pages.
        return [(pdf_content, "page-001-all")] if pdf_content else []

    return chunks
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def calculate_optimal_chunk_size(
    pdf_content: bytes,
    target_size_mb: float = 4.0,
    min_pages: int = 5,
    max_pages: int = 30,
) -> int:
    """Estimate how many pages per chunk keep chunks near a target size.

    The estimate assumes every page has the average size
    (total size / page count), divides the target size by that average,
    and clamps the result to ``[min_pages, max_pages]``. Because of the
    lower clamp, chunks can still exceed ``target_size_mb`` when pages
    are large.

    Args:
        pdf_content: Raw binary content of the PDF file.
        target_size_mb: Desired chunk size in MB (default 4 MB).
        min_pages: Lower bound for the result (default 5).
        max_pages: Upper bound for the result (default 30).

    Returns:
        int: Suggested number of pages per chunk; ``min_pages`` when the
        page count is 0 (including when the PDF cannot be read).
    """
    size_mb = get_pdf_size_mb(pdf_content)
    page_total = get_pdf_page_count(pdf_content)

    # Without a page count there is no per-page average to work from.
    if page_total == 0:
        return min_pages

    mb_per_page = size_mb / page_total
    estimate = int(target_size_mb / mb_per_page) if mb_per_page > 0 else max_pages

    # Apply the upper bound first, then the lower one, so min_pages wins
    # if the two bounds ever conflict.
    capped = min(estimate, max_pages)
    return max(min_pages, capped)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def split_pdf_smart(
    pdf_content: bytes, target_size_mb: float = 4.0
) -> List[Tuple[bytes, str]]:
    """Split a PDF into chunks sized around ``target_size_mb``.

    The pages-per-chunk value comes from ``calculate_optimal_chunk_size``.
    After the first split, any chunk still larger than the target is split
    once more using half as many pages, provided the chunk size is above
    5 pages. Only one extra level of splitting is performed, so
    sub-chunks may still exceed the target.

    Sub-chunk labels are derived from the labels returned by the re-split,
    shifted to the parent chunk's position in the full document. When the
    re-split cannot produce properly numbered pieces (it failed and fell
    back to a whole-document chunk, or returned nothing), the oversized
    parent chunk is kept unchanged with its original label instead of
    being mislabelled or dropped.

    Args:
        pdf_content: Raw binary content of the PDF file.
        target_size_mb: Desired chunk size in MB (default 4 MB).

    Returns:
        List[Tuple[bytes, str]]: ``(chunk_bytes, page_range)`` pairs in
        document order; ``page_range`` is a 1-indexed inclusive label
        such as ``"page-001-015"``.
    """
    pages_per_chunk = calculate_optimal_chunk_size(pdf_content, target_size_mb)
    print(f"[split_pdf_smart] 計算最佳切割頁數: {pages_per_chunk} 頁/切片")

    final_chunks = []
    for chunk_bytes, page_range in split_pdf_by_pages(pdf_content, pages_per_chunk):
        chunk_size_mb = get_pdf_size_mb(chunk_bytes)

        # Small enough, or already at the smallest chunk size: keep as is.
        if chunk_size_mb <= target_size_mb or pages_per_chunk <= 5:
            final_chunks.append((chunk_bytes, page_range))
            continue

        print(
            f"[split_pdf_smart] 切片 {page_range} 大小 {chunk_size_mb:.2f}MB "
            f"超過目標 {target_size_mb}MB,進一步分割"
        )

        # The re-split numbers pages from 1 within the chunk; shift them by
        # the chunk's first page to get positions in the full document.
        first_page = int(page_range.replace("page-", "").split("-")[0])
        offset = first_page - 1

        relabelled = []
        for sub_bytes, sub_label in split_pdf_by_pages(
            chunk_bytes, pages_per_chunk // 2
        ):
            sub_first, _, sub_last = sub_label.replace("page-", "").partition("-")
            if not (sub_first.isdigit() and sub_last.isdigit()):
                # A non-numeric label ("page-001-all") means the re-split
                # fell back to the un-split bytes; give up on this chunk.
                relabelled = []
                break
            relabelled.append(
                (
                    sub_bytes,
                    f"page-{int(sub_first) + offset:03d}-{int(sub_last) + offset:03d}",
                )
            )

        # Never lose content: keep the parent chunk if the re-split
        # produced nothing usable.
        final_chunks.extend(relabelled or [(chunk_bytes, page_range)])

    return final_chunks
|