botrun-flow-lang 5.12.263__py3-none-any.whl → 6.2.21__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (89)
  1. botrun_flow_lang/api/auth_api.py +39 -39
  2. botrun_flow_lang/api/auth_utils.py +183 -183
  3. botrun_flow_lang/api/botrun_back_api.py +65 -65
  4. botrun_flow_lang/api/flow_api.py +3 -3
  5. botrun_flow_lang/api/hatch_api.py +508 -508
  6. botrun_flow_lang/api/langgraph_api.py +816 -811
  7. botrun_flow_lang/api/langgraph_constants.py +11 -0
  8. botrun_flow_lang/api/line_bot_api.py +1484 -1484
  9. botrun_flow_lang/api/model_api.py +300 -300
  10. botrun_flow_lang/api/rate_limit_api.py +32 -32
  11. botrun_flow_lang/api/routes.py +79 -79
  12. botrun_flow_lang/api/search_api.py +53 -53
  13. botrun_flow_lang/api/storage_api.py +395 -395
  14. botrun_flow_lang/api/subsidy_api.py +290 -290
  15. botrun_flow_lang/api/subsidy_api_system_prompt.txt +109 -109
  16. botrun_flow_lang/api/user_setting_api.py +70 -70
  17. botrun_flow_lang/api/version_api.py +31 -31
  18. botrun_flow_lang/api/youtube_api.py +26 -26
  19. botrun_flow_lang/constants.py +13 -13
  20. botrun_flow_lang/langgraph_agents/agents/agent_runner.py +178 -178
  21. botrun_flow_lang/langgraph_agents/agents/agent_tools/step_planner.py +77 -77
  22. botrun_flow_lang/langgraph_agents/agents/checkpointer/firestore_checkpointer.py +666 -666
  23. botrun_flow_lang/langgraph_agents/agents/gov_researcher/GOV_RESEARCHER_PRD.md +192 -192
  24. botrun_flow_lang/langgraph_agents/agents/gov_researcher/gemini_subsidy_graph.py +460 -460
  25. botrun_flow_lang/langgraph_agents/agents/gov_researcher/gov_researcher_2_graph.py +1002 -1002
  26. botrun_flow_lang/langgraph_agents/agents/gov_researcher/gov_researcher_graph.py +822 -822
  27. botrun_flow_lang/langgraph_agents/agents/langgraph_react_agent.py +730 -723
  28. botrun_flow_lang/langgraph_agents/agents/search_agent_graph.py +864 -864
  29. botrun_flow_lang/langgraph_agents/agents/tools/__init__.py +4 -4
  30. botrun_flow_lang/langgraph_agents/agents/tools/gemini_code_execution.py +376 -376
  31. botrun_flow_lang/langgraph_agents/agents/util/gemini_grounding.py +66 -66
  32. botrun_flow_lang/langgraph_agents/agents/util/html_util.py +316 -316
  33. botrun_flow_lang/langgraph_agents/agents/util/img_util.py +336 -294
  34. botrun_flow_lang/langgraph_agents/agents/util/local_files.py +419 -419
  35. botrun_flow_lang/langgraph_agents/agents/util/mermaid_util.py +86 -86
  36. botrun_flow_lang/langgraph_agents/agents/util/model_utils.py +143 -143
  37. botrun_flow_lang/langgraph_agents/agents/util/pdf_analyzer.py +562 -486
  38. botrun_flow_lang/langgraph_agents/agents/util/pdf_cache.py +250 -250
  39. botrun_flow_lang/langgraph_agents/agents/util/pdf_processor.py +204 -204
  40. botrun_flow_lang/langgraph_agents/agents/util/perplexity_search.py +464 -464
  41. botrun_flow_lang/langgraph_agents/agents/util/plotly_util.py +59 -59
  42. botrun_flow_lang/langgraph_agents/agents/util/tavily_search.py +199 -199
  43. botrun_flow_lang/langgraph_agents/agents/util/usage_metadata.py +34 -0
  44. botrun_flow_lang/langgraph_agents/agents/util/youtube_util.py +90 -90
  45. botrun_flow_lang/langgraph_agents/cache/langgraph_botrun_cache.py +197 -197
  46. botrun_flow_lang/llm_agent/llm_agent.py +19 -19
  47. botrun_flow_lang/llm_agent/llm_agent_util.py +83 -83
  48. botrun_flow_lang/log/.gitignore +2 -2
  49. botrun_flow_lang/main.py +61 -61
  50. botrun_flow_lang/main_fast.py +51 -51
  51. botrun_flow_lang/mcp_server/__init__.py +10 -10
  52. botrun_flow_lang/mcp_server/default_mcp.py +854 -744
  53. botrun_flow_lang/models/nodes/utils.py +205 -205
  54. botrun_flow_lang/models/token_usage.py +34 -34
  55. botrun_flow_lang/requirements.txt +21 -21
  56. botrun_flow_lang/services/base/firestore_base.py +30 -30
  57. botrun_flow_lang/services/hatch/hatch_factory.py +11 -11
  58. botrun_flow_lang/services/hatch/hatch_fs_store.py +419 -419
  59. botrun_flow_lang/services/storage/storage_cs_store.py +206 -206
  60. botrun_flow_lang/services/storage/storage_factory.py +12 -12
  61. botrun_flow_lang/services/storage/storage_store.py +65 -65
  62. botrun_flow_lang/services/user_setting/user_setting_factory.py +9 -9
  63. botrun_flow_lang/services/user_setting/user_setting_fs_store.py +66 -66
  64. botrun_flow_lang/static/docs/tools/index.html +926 -926
  65. botrun_flow_lang/tests/api_functional_tests.py +1525 -1525
  66. botrun_flow_lang/tests/api_stress_test.py +357 -357
  67. botrun_flow_lang/tests/shared_hatch_tests.py +333 -333
  68. botrun_flow_lang/tests/test_botrun_app.py +46 -46
  69. botrun_flow_lang/tests/test_html_util.py +31 -31
  70. botrun_flow_lang/tests/test_img_analyzer.py +190 -190
  71. botrun_flow_lang/tests/test_img_util.py +39 -39
  72. botrun_flow_lang/tests/test_local_files.py +114 -114
  73. botrun_flow_lang/tests/test_mermaid_util.py +103 -103
  74. botrun_flow_lang/tests/test_pdf_analyzer.py +104 -104
  75. botrun_flow_lang/tests/test_plotly_util.py +151 -151
  76. botrun_flow_lang/tests/test_run_workflow_engine.py +65 -65
  77. botrun_flow_lang/tools/generate_docs.py +133 -133
  78. botrun_flow_lang/tools/templates/tools.html +153 -153
  79. botrun_flow_lang/utils/__init__.py +7 -7
  80. botrun_flow_lang/utils/botrun_logger.py +344 -344
  81. botrun_flow_lang/utils/clients/rate_limit_client.py +209 -209
  82. botrun_flow_lang/utils/clients/token_verify_client.py +153 -153
  83. botrun_flow_lang/utils/google_drive_utils.py +654 -654
  84. botrun_flow_lang/utils/langchain_utils.py +324 -324
  85. botrun_flow_lang/utils/yaml_utils.py +9 -9
  86. {botrun_flow_lang-5.12.263.dist-info → botrun_flow_lang-6.2.21.dist-info}/METADATA +6 -6
  87. botrun_flow_lang-6.2.21.dist-info/RECORD +104 -0
  88. botrun_flow_lang-5.12.263.dist-info/RECORD +0 -102
  89. {botrun_flow_lang-5.12.263.dist-info → botrun_flow_lang-6.2.21.dist-info}/WHEEL +0 -0
botrun_flow_lang/langgraph_agents/agents/util/pdf_analyzer.py
@@ -1,486 +1,562 @@
- """
- PDF analysis module
-
- Provides PDF file analysis, supporting:
- - Small files (< 5MB): direct multimodal Q&A
- - Large files (>= 5MB): compress → split → parallel multimodal Q&A → LLM merge of results
- """
-
- import anthropic
- import asyncio
- import base64
- import httpx
- import os
- from typing import List, Dict, Any
-
- from dotenv import load_dotenv
- from google.oauth2 import service_account
-
- load_dotenv()
-
- # File size threshold (MB)
- PDF_SIZE_THRESHOLD_MB = 30.0
-
- # Target chunk size (MB)
- PDF_CHUNK_TARGET_SIZE_MB = 30.0
-
- # Maximum number of parallel Q&A calls
- MAX_CONCURRENT_CHUNKS = 5
-
-
- def analyze_pdf_with_claude(
-     pdf_data: str, user_input: str, model_name: str = "claude-sonnet-4-5-20250929"
- ):
-     """
-     Analyze a PDF file using Claude API
-
-     Args:
-         pdf_data: Base64-encoded PDF data
-         user_input: User's query about the PDF content
-
-     Returns:
-         str: Claude's analysis of the PDF content based on the query
-     """
-     # Initialize Anthropic client
-     client = anthropic.Anthropic()
-
-     # Send to Claude
-     message = client.messages.create(
-         model=model_name,
-         max_tokens=4096,  # Increased token limit for detailed analysis
-         messages=[
-             {
-                 "role": "user",
-                 "content": [
-                     {
-                         "type": "document",
-                         "source": {
-                             "type": "base64",
-                             "media_type": "application/pdf",
-                             "data": pdf_data,
-                         },
-                     },
-                     {"type": "text", "text": user_input},
-                 ],
-             }
-         ],
-     )
-
-     print(
-         f"analyze_pdf_with_claude============> input_token: {message.usage.input_tokens} output_token: {message.usage.output_tokens}",
-     )
-     return message.content[0].text
-
-
- def analyze_pdf_with_gemini(
-     pdf_data: str, user_input: str, model_name: str = "gemini-2.5-flash", pdf_url: str = ""
- ):
-     """
-     Analyze a PDF file using Gemini API
-
-     Args:
-         pdf_data: Base64-encoded PDF data
-         user_input: User's query about the PDF content
-         model_name: Gemini model name to use
-
-     Returns:
-         str: Gemini's analysis of the PDF content based on the query
-     """
-     # Import lazily; importing this at module load time is slow
-     from google import genai
-     from google.genai import types
-
-     credentials = service_account.Credentials.from_service_account_file(
-         os.getenv("GOOGLE_APPLICATION_CREDENTIALS_FOR_FASTAPI"),
-         scopes=["https://www.googleapis.com/auth/cloud-platform"],
-     )
-
-     client = genai.Client(
-         credentials=credentials,
-         project="scoop-386004",
-         location="us-central1",
-     )
-     response = client.models.generate_content(
-         model=model_name,
-         contents=[
-             user_input,
-             types.Part(
-                 inline_data={
-                     "mime_type": "application/pdf",
-                     "data": pdf_data,
-                 }
-             ),
-         ],
-     )
-     # Log token usage if available
-     if hasattr(response, "usage_metadata"):
-         print(
-             f"analyze_pdf_with_gemini============> input_token: {response.usage_metadata.prompt_token_count} output_token: {response.usage_metadata.candidates_token_count}",
-         )
-
-     print(f"{pdf_url} success")
-     return response.text
-
-
- def _analyze_single_chunk(
-     chunk_data: str, page_range: str, user_input: str, model_name: str
- ) -> Dict[str, Any]:
-     """
-     Analyze a single PDF chunk
-
-     Args:
-         chunk_data: Base64-encoded PDF chunk data
-         page_range: Page range string (e.g., "page-001-015")
-         user_input: User's question
-         model_name: Model name to use
-
-     Returns:
-         Dict: {"page_range": str, "answer": str, "relevant": bool, "error": str|None}
-     """
-     # Build the chunk-specific prompt
-     chunk_prompt = f"""你正在閱讀一份大型 PDF 文件的其中一部分({page_range})。
-
- 使用者問題:{user_input}
-
- 請根據這個部分的內容回答問題:
- - 如果這個部分包含與問題相關的資訊,請詳細回答
- - 如果這個部分與問題完全無關,請只回答「NOT_RELEVANT」(不要回答其他內容)
- - 回答時請標註資訊來源的頁碼"""
-
-     try:
-         if model_name.startswith("gemini-"):
-             answer = analyze_pdf_with_gemini(chunk_data, chunk_prompt, model_name)
-         elif model_name.startswith("claude-"):
-             answer = analyze_pdf_with_claude(chunk_data, chunk_prompt, model_name)
-         else:
-             return {
-                 "page_range": page_range,
-                 "answer": "",
-                 "relevant": False,
-                 "error": f"Unknown model type: {model_name}",
-             }
-
-         # Determine relevance
-         is_relevant = "NOT_RELEVANT" not in answer.upper()
-
-         return {
-             "page_range": page_range,
-             "answer": answer if is_relevant else "",
-             "relevant": is_relevant,
-             "error": None,
-         }
-
-     except Exception as e:
-         import traceback
-
-         traceback.print_exc()
-         return {
-             "page_range": page_range,
-             "answer": "",
-             "relevant": False,
-             "error": str(e),
-         }
-
-
- async def analyze_pdf_chunks_parallel(
-     chunks: List[tuple], user_input: str, model_name: str, max_concurrent: int = 5
- ) -> List[Dict[str, Any]]:
-     """
-     Run Q&A over multiple PDF chunks in parallel
-
-     Args:
-         chunks: Chunk list [(chunk_bytes, page_range), ...]
-         user_input: User's question
-         model_name: Model name to use
-         max_concurrent: Maximum number of parallel calls
-
-     Returns:
-         List[Dict]: Answer result for each chunk
-     """
-     semaphore = asyncio.Semaphore(max_concurrent)
-
-     async def analyze_with_semaphore(chunk_bytes: bytes, page_range: str):
-         async with semaphore:
-             # Convert bytes to base64
-             chunk_data = base64.standard_b64encode(chunk_bytes).decode("utf-8")
-
-             # Run the synchronous function via run_in_executor
-             loop = asyncio.get_event_loop()
-             return await loop.run_in_executor(
-                 None,
-                 _analyze_single_chunk,
-                 chunk_data,
-                 page_range,
-                 user_input,
-                 model_name,
-             )
-
-     # Create all tasks
-     tasks = [
-         analyze_with_semaphore(chunk_bytes, page_range)
-         for chunk_bytes, page_range in chunks
-     ]
-
-     # Run in parallel
-     results = await asyncio.gather(*tasks, return_exceptions=True)
-
-     # Handle exceptions
-     processed_results = []
-     for i, result in enumerate(results):
-         if isinstance(result, Exception):
-             processed_results.append(
-                 {
-                     "page_range": chunks[i][1],
-                     "answer": "",
-                     "relevant": False,
-                     "error": str(result),
-                 }
-             )
-         else:
-             processed_results.append(result)
-
-     return processed_results
-
-
- def merge_chunk_results(
-     chunk_results: List[Dict[str, Any]],
-     user_input: str,
-     model_name: str = "gemini-2.5-flash",
- ) -> str:
-     """
-     Merge the answers from multiple chunks with an LLM
-
-     Args:
-         chunk_results: List of chunk answer results
-         user_input: Original user question
-         model_name: Model name used for merging
-
-     Returns:
-         str: Merged answer
-     """
-     # Filter to relevant answers
-     relevant_results = [r for r in chunk_results if r.get("relevant", False)]
-
-     if not relevant_results:
-         # No relevant content found
-         error_results = [r for r in chunk_results if r.get("error")]
-         if error_results:
-             error_msgs = [f"{r['page_range']}: {r['error']}" for r in error_results]
-             return f"分析 PDF 時發生錯誤:\n" + "\n".join(error_msgs)
-         return "在 PDF 文件中未找到與您問題相關的內容。"
-
-     # Only one relevant result; return it directly
-     if len(relevant_results) == 1:
-         return relevant_results[0]["answer"]
-
-     # Multiple relevant results; merge them
-     combined_content = "\n\n".join(
-         [
-             f"【{r['page_range']}】\n{r['answer']}"
-             for r in relevant_results
-         ]
-     )
-
-     merge_prompt = f"""以下是從一份大型 PDF 文件的不同部分擷取的回答,請統整這些資訊來回答使用者的問題。
-
- 使用者問題:{user_input}
-
- 各部分的回答:
- {combined_content}
-
- 請統整以上資訊,提供一個完整、連貫的回答。如果不同部分有互補的資訊,請整合在一起。請保留頁碼引用。"""
-
-     try:
-         # Merge with an LLM (text-only; no PDF needs to be sent here)
-         from google import genai
-
-         credentials = service_account.Credentials.from_service_account_file(
-             os.getenv("GOOGLE_APPLICATION_CREDENTIALS_FOR_FASTAPI"),
-             scopes=["https://www.googleapis.com/auth/cloud-platform"],
-         )
-
-         client = genai.Client(
-             credentials=credentials,
-             project="scoop-386004",
-             location="us-central1",
-         )
-
-         response = client.models.generate_content(
-             model=model_name,
-             contents=[merge_prompt],
-         )
-
-         if hasattr(response, "usage_metadata"):
-             print(
-                 f"merge_chunk_results============> input_token: {response.usage_metadata.prompt_token_count} output_token: {response.usage_metadata.candidates_token_count}",
-             )
-
-         return response.text
-
-     except Exception as e:
-         import traceback
-
-         traceback.print_exc()
-         # Merge failed; return the concatenated chunk answers directly
-         return f"統整時發生錯誤,以下是各部分的回答:\n\n{combined_content}"
-
-
- async def analyze_pdf_async(pdf_url: str, user_input: str) -> str:
-     """
-     Analyze a PDF file asynchronously (adaptive processing strategy)
-
-     Chooses a processing strategy automatically by file size:
-     - < 5MB: direct multimodal Q&A
-     - >= 5MB: compress → split → parallel multimodal Q&A → LLM merge of results
-
-     Args:
-         pdf_url: URL of the PDF file
-         user_input: User's question
-
-     Returns:
-         str: Analysis result
-     """
-     try:
-         # 1. Download the PDF
-         print(f"[analyze_pdf_async] 下載 PDF: {pdf_url}")
-         pdf_content = httpx.get(pdf_url, timeout=60.0).content
-         pdf_size_mb = len(pdf_content) / (1024 * 1024)
-         print(f"[analyze_pdf_async] PDF 大小: {pdf_size_mb:.2f} MB")
-
-         # Read model configuration
-         models_str = os.getenv("PDF_ANALYZER_MODEL", "gemini-2.5-flash")
-         print(f"[analyze_pdf_async] 使用模型: {models_str}")
-         models = [model.strip() for model in models_str.split(",")]
-         primary_model = models[0]
-
-         # 2. Choose a processing strategy
-         if pdf_size_mb < PDF_SIZE_THRESHOLD_MB:
-             # Small file: direct multimodal Q&A
-             print(f"[analyze_pdf_async] 小檔模式 (< {PDF_SIZE_THRESHOLD_MB}MB)")
-             pdf_data = base64.standard_b64encode(pdf_content).decode("utf-8")
-
-             # Try every configured model
-             last_error = None
-             for model in models:
-                 try:
-                     if model.startswith("gemini-"):
-                         return analyze_pdf_with_gemini(pdf_data, user_input, model, pdf_url)
-                     elif model.startswith("claude-"):
-                         return analyze_pdf_with_claude(pdf_data, user_input, model)
-                 except Exception as e:
-                     import traceback
-
-                     traceback.print_exc()
-                     last_error = str(e)
-                     continue
-
-             return f"分析 PDF 時所有模型都失敗。最後錯誤: {last_error}"
-
-         # 3. Large file: compress → split → parallel Q&A → merge
-         print(f"[analyze_pdf_async] 大檔模式 (>= {PDF_SIZE_THRESHOLD_MB}MB)")
-
-         # Deferred imports to speed up module loading
-         from botrun_flow_lang.langgraph_agents.agents.util.pdf_processor import (
-             split_pdf_smart,
-             get_pdf_page_count,
-         )
-         from botrun_flow_lang.langgraph_agents.agents.util.pdf_cache import (
-             get_cache_key,
-             check_cache,
-             save_to_cache,
-         )
-
-         # 3.1 Check the cache
-         cache_key = get_cache_key(pdf_url)
-         print(f"[analyze_pdf_async] 檢查快取: {cache_key}")
-         cached_chunks = await check_cache(cache_key)
-
-         if cached_chunks:
-             # Cache hit; use it directly
-             print(f"[analyze_pdf_async] 使用快取: {len(cached_chunks)} 個切片")
-             chunks = cached_chunks
-             total_pages = sum(
-                 int(pr.split("-")[-1]) - int(pr.split("-")[-2]) + 1
-                 for _, pr in chunks
-                 if pr.startswith("page-")
-             ) if chunks else 0
-         else:
-             # Cache miss; split, then store the chunks in the cache
-
-             # 3.2 Split
-             print("[analyze_pdf_async] 切割 PDF...")
-             chunks = split_pdf_smart(pdf_content, target_size_mb=PDF_CHUNK_TARGET_SIZE_MB)
-             total_pages = get_pdf_page_count(pdf_content)
-             print(
-                 f"[analyze_pdf_async] 切割完成: {len(chunks)} 個切片, 共 {total_pages} 頁"
-             )
-
-             # 3.3 Save to cache
-             print("[analyze_pdf_async] 存入快取...")
-             await save_to_cache(
-                 cache_key=cache_key,
-                 chunks=chunks,
-                 original_url=pdf_url,
-                 original_size_mb=pdf_size_mb,
-                 total_pages=total_pages,
-             )
-
-         # 3.4 Parallel Q&A
-         print(f"[analyze_pdf_async] 開始平行問答 (最大並行: {MAX_CONCURRENT_CHUNKS})...")
-         chunk_results = await analyze_pdf_chunks_parallel(
-             chunks, user_input, primary_model, max_concurrent=MAX_CONCURRENT_CHUNKS
-         )
-
-         # Summarize results
-         relevant_count = sum(1 for r in chunk_results if r.get("relevant", False))
-         error_count = sum(1 for r in chunk_results if r.get("error"))
-         print(
-             f"[analyze_pdf_async] 問答完成: {relevant_count}/{len(chunks)} 個切片有相關內容, "
-             f"{error_count} 個錯誤"
-         )
-
-         # 3.5 Merge results
-         print("[analyze_pdf_async] 統整結果...")
-         result = merge_chunk_results(chunk_results, user_input, primary_model)
-         print("[analyze_pdf_async] 完成")
-
-         return result
-
-     except Exception as e:
-         import traceback
-
-         traceback.print_exc()
-         return f"分析 PDF {pdf_url} 時發生錯誤: {str(e)}"
-
-
- def analyze_pdf(pdf_url: str, user_input: str) -> str:
-     """
-     Analyze a PDF file (synchronous wrapper)
-
-     A synchronous function that internally creates an event loop to run the asynchronous analyze_pdf_async.
-     The synchronous interface is kept for backward compatibility.
-
-     Args:
-         pdf_url: URL of the PDF file
-         user_input: User's question
-
-     Returns:
-         str: Analysis result
-     """
-     try:
-         # Try to get an existing event loop
-         loop = asyncio.get_event_loop()
-         if loop.is_running():
-             # Already inside a running loop; run a fresh loop on a worker thread
-             import concurrent.futures
-
-             with concurrent.futures.ThreadPoolExecutor() as executor:
-                 future = executor.submit(
-                     asyncio.run, analyze_pdf_async(pdf_url, user_input)
-                 )
-                 return future.result()
-         else:
-             return loop.run_until_complete(analyze_pdf_async(pdf_url, user_input))
-     except RuntimeError:
-         # No event loop; create a new one
-         return asyncio.run(analyze_pdf_async(pdf_url, user_input))
+ """
+ PDF analysis module
+
+ Provides PDF file analysis, supporting:
+ - Small files (< 5MB): direct multimodal Q&A
+ - Large files (>= 5MB): compress → split → parallel multimodal Q&A → LLM merge of results
+ """
+
+ import anthropic
+ import asyncio
+ import base64
+ import httpx
+ import os
+ from typing import List, Dict, Any, Tuple
+
+ from dotenv import load_dotenv
+ from google.oauth2 import service_account
+
+ from botrun_flow_lang.langgraph_agents.agents.util.usage_metadata import UsageMetadata
+
+ load_dotenv()
+
+ # File size threshold (MB)
+ PDF_SIZE_THRESHOLD_MB = 30.0
+
+ # Target chunk size (MB)
+ PDF_CHUNK_TARGET_SIZE_MB = 30.0
+
+ # Maximum number of parallel Q&A calls
+ MAX_CONCURRENT_CHUNKS = 5
+
+
+ def analyze_pdf_with_claude(
+     pdf_data: str, user_input: str, model_name: str = "claude-sonnet-4-5-20250929"
+ ) -> Tuple[str, UsageMetadata]:
+     """
+     Analyze a PDF file using Claude API
+
+     Args:
+         pdf_data: Base64-encoded PDF data
+         user_input: User's query about the PDF content
+         model_name: Claude model name to use
+
+     Returns:
+         Tuple[str, UsageMetadata]: Claude's analysis and usage metadata
+     """
+     # Initialize Anthropic client
+     client = anthropic.Anthropic()
+
+     # Send to Claude
+     message = client.messages.create(
+         model=model_name,
+         max_tokens=4096,  # Increased token limit for detailed analysis
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "document",
+                         "source": {
+                             "type": "base64",
+                             "media_type": "application/pdf",
+                             "data": pdf_data,
+                         },
+                     },
+                     {"type": "text", "text": user_input},
+                 ],
+             }
+         ],
+     )
+
+     # Extract usage metadata
+     usage = UsageMetadata(
+         prompt_tokens=message.usage.input_tokens,
+         completion_tokens=message.usage.output_tokens,
+         total_tokens=message.usage.input_tokens + message.usage.output_tokens,
+         cache_creation_input_tokens=getattr(message.usage, 'cache_creation_input_tokens', 0) or 0,
+         cache_read_input_tokens=getattr(message.usage, 'cache_read_input_tokens', 0) or 0,
+         model=model_name,
+     )
+
+     print(
+         f"analyze_pdf_with_claude============> input_token: {message.usage.input_tokens} output_token: {message.usage.output_tokens}",
+     )
+     return message.content[0].text, usage
+
+
+ def analyze_pdf_with_gemini(
+     pdf_data: str, user_input: str, model_name: str = "gemini-2.5-flash", pdf_url: str = ""
+ ) -> Tuple[str, UsageMetadata]:
+     """
+     Analyze a PDF file using Gemini API
+
+     Args:
+         pdf_data: Base64-encoded PDF data
+         user_input: User's query about the PDF content
+         model_name: Gemini model name to use
+         pdf_url: Original PDF URL for logging
+
+     Returns:
+         Tuple[str, UsageMetadata]: Gemini's analysis and usage metadata
+     """
+     # Import lazily; importing this at module load time is slow
+     from google import genai
+     from google.genai import types
+
+     credentials = service_account.Credentials.from_service_account_file(
+         os.getenv("GOOGLE_APPLICATION_CREDENTIALS_FOR_FASTAPI"),
+         scopes=["https://www.googleapis.com/auth/cloud-platform"],
+     )
+
+     client = genai.Client(
+         credentials=credentials,
+         project="scoop-386004",
+         location="us-central1",
+     )
+     response = client.models.generate_content(
+         model=model_name,
+         contents=[
+             user_input,
+             types.Part(
+                 inline_data={
+                     "mime_type": "application/pdf",
+                     "data": pdf_data,
+                 }
+             ),
+         ],
+     )
+
+     # Extract usage metadata
+     usage = UsageMetadata(model=model_name)
+     if hasattr(response, "usage_metadata"):
+         usage_meta = response.usage_metadata
+         usage = UsageMetadata(
+             prompt_tokens=getattr(usage_meta, 'prompt_token_count', 0) or 0,
+             completion_tokens=getattr(usage_meta, 'candidates_token_count', 0) or 0,
+             total_tokens=getattr(usage_meta, 'total_token_count', 0) or 0,
+             cache_creation_input_tokens=0,
+             cache_read_input_tokens=getattr(usage_meta, 'cached_content_token_count', 0) or 0,
+             model=model_name,
+         )
+         print(
+             f"analyze_pdf_with_gemini============> input_token: {usage_meta.prompt_token_count} output_token: {usage_meta.candidates_token_count}",
+         )
+
+     print(f"{pdf_url} success")
+     return response.text, usage
+
+
+ def _analyze_single_chunk(
+     chunk_data: str, page_range: str, user_input: str, model_name: str
+ ) -> Dict[str, Any]:
+     """
+     Analyze a single PDF chunk
+
+     Args:
+         chunk_data: Base64-encoded PDF chunk data
+         page_range: Page range string (e.g., "page-001-015")
+         user_input: User's question
+         model_name: Model name to use
+
+     Returns:
+         Dict: {"page_range": str, "answer": str, "relevant": bool, "error": str|None, "usage": UsageMetadata}
+     """
+     # Build the chunk-specific prompt
+     chunk_prompt = f"""你正在閱讀一份大型 PDF 文件的其中一部分({page_range})。
+
+ 使用者問題:{user_input}
+
+ 請根據這個部分的內容回答問題:
+ - 如果這個部分包含與問題相關的資訊,請詳細回答
+ - 如果這個部分與問題完全無關,請只回答「NOT_RELEVANT」(不要回答其他內容)
+ - 回答時請標註資訊來源的頁碼"""
+
+     try:
+         if model_name.startswith("gemini-"):
+             answer, usage = analyze_pdf_with_gemini(chunk_data, chunk_prompt, model_name)
+         elif model_name.startswith("claude-"):
+             answer, usage = analyze_pdf_with_claude(chunk_data, chunk_prompt, model_name)
+         else:
+             return {
+                 "page_range": page_range,
+                 "answer": "",
+                 "relevant": False,
+                 "error": f"Unknown model type: {model_name}",
+                 "usage": UsageMetadata(),
+             }
+
+         # Determine relevance
+         is_relevant = "NOT_RELEVANT" not in answer.upper()
+
+         return {
+             "page_range": page_range,
+             "answer": answer if is_relevant else "",
+             "relevant": is_relevant,
+             "error": None,
+             "usage": usage,
+         }
+
+     except Exception as e:
+         import traceback
+
+         traceback.print_exc()
+         return {
+             "page_range": page_range,
+             "answer": "",
+             "relevant": False,
+             "error": str(e),
+             "usage": UsageMetadata(model=model_name),
+         }
+
+
+ async def analyze_pdf_chunks_parallel(
+     chunks: List[tuple], user_input: str, model_name: str, max_concurrent: int = 5
+ ) -> Tuple[List[Dict[str, Any]], List[UsageMetadata]]:
+     """
+     Run Q&A over multiple PDF chunks in parallel
+
+     Args:
+         chunks: Chunk list [(chunk_bytes, page_range), ...]
+         user_input: User's question
+         model_name: Model name to use
+         max_concurrent: Maximum number of parallel calls
+
+     Returns:
+         Tuple[List[Dict], List[UsageMetadata]]: Answer results per chunk and the usage of each call
+     """
+     semaphore = asyncio.Semaphore(max_concurrent)
+
+     async def analyze_with_semaphore(chunk_bytes: bytes, page_range: str):
+         async with semaphore:
+             # Convert bytes to base64
+             chunk_data = base64.standard_b64encode(chunk_bytes).decode("utf-8")
+
+             # Run the synchronous function via run_in_executor
+             loop = asyncio.get_event_loop()
+             return await loop.run_in_executor(
+                 None,
+                 _analyze_single_chunk,
+                 chunk_data,
+                 page_range,
+                 user_input,
+                 model_name,
+             )
+
+     # Create all tasks
+     tasks = [
+         analyze_with_semaphore(chunk_bytes, page_range)
+         for chunk_bytes, page_range in chunks
+     ]
+
+     # Run in parallel
+     results = await asyncio.gather(*tasks, return_exceptions=True)
+
+     # Handle exceptions and collect the usage list
+     processed_results = []
+     usage_list = []
+     for i, result in enumerate(results):
+         if isinstance(result, Exception):
+             processed_results.append(
+                 {
+                     "page_range": chunks[i][1],
+                     "answer": "",
+                     "relevant": False,
+                     "error": str(result),
+                     "usage": UsageMetadata(model=model_name),
+                 }
+             )
+             usage_list.append(UsageMetadata(model=model_name))
+         else:
+             processed_results.append(result)
+             # Collect usage
+             if "usage" in result and isinstance(result["usage"], UsageMetadata):
+                 usage_list.append(result["usage"])
+
+     return processed_results, usage_list
+
+
+ def merge_chunk_results(
+     chunk_results: List[Dict[str, Any]],
+     user_input: str,
+     model_name: str = "gemini-2.5-flash",
+ ) -> Tuple[str, UsageMetadata]:
+     """
+     Merge the answers from multiple chunks with an LLM
+
+     Args:
+         chunk_results: List of chunk answer results
+         user_input: Original user question
+         model_name: Model name used for merging
+
+     Returns:
+         Tuple[str, UsageMetadata]: Merged answer and usage metadata
+     """
+     # Filter to relevant answers
+     relevant_results = [r for r in chunk_results if r.get("relevant", False)]
+
+     if not relevant_results:
+         # No relevant content found
+         error_results = [r for r in chunk_results if r.get("error")]
+         if error_results:
+             error_msgs = [f"{r['page_range']}: {r['error']}" for r in error_results]
+             return f"分析 PDF 時發生錯誤:\n" + "\n".join(error_msgs), UsageMetadata(model=model_name)
+         return "在 PDF 文件中未找到與您問題相關的內容。", UsageMetadata(model=model_name)
+
+     # Only one relevant result; return it directly (no extra LLM call needed)
+     if len(relevant_results) == 1:
+         return relevant_results[0]["answer"], UsageMetadata(model=model_name)
+
+     # Multiple relevant results; merge them
+     combined_content = "\n\n".join(
+         [
+             f"【{r['page_range']}】\n{r['answer']}"
+             for r in relevant_results
+         ]
+     )
+
+     merge_prompt = f"""以下是從一份大型 PDF 文件的不同部分擷取的回答,請統整這些資訊來回答使用者的問題。
+
+ 使用者問題:{user_input}
+
+ 各部分的回答:
+ {combined_content}
+
+ 請統整以上資訊,提供一個完整、連貫的回答。如果不同部分有互補的資訊,請整合在一起。請保留頁碼引用。"""
+
+     try:
+         # Merge with an LLM (text-only; no PDF needs to be sent here)
+         from google import genai
+
+         credentials = service_account.Credentials.from_service_account_file(
+             os.getenv("GOOGLE_APPLICATION_CREDENTIALS_FOR_FASTAPI"),
+             scopes=["https://www.googleapis.com/auth/cloud-platform"],
+         )
+
+         client = genai.Client(
+             credentials=credentials,
+             project="scoop-386004",
+             location="us-central1",
+         )
+
+         response = client.models.generate_content(
+             model=model_name,
+             contents=[merge_prompt],
+         )
+
+         # Extract usage metadata
+         usage = UsageMetadata(model=model_name)
+         if hasattr(response, "usage_metadata"):
+             usage_meta = response.usage_metadata
+             usage = UsageMetadata(
+                 prompt_tokens=getattr(usage_meta, 'prompt_token_count', 0) or 0,
+                 completion_tokens=getattr(usage_meta, 'candidates_token_count', 0) or 0,
+                 total_tokens=getattr(usage_meta, 'total_token_count', 0) or 0,
+                 cache_creation_input_tokens=0,
+                 cache_read_input_tokens=getattr(usage_meta, 'cached_content_token_count', 0) or 0,
+                 model=model_name,
+             )
+             print(
+                 f"merge_chunk_results============> input_token: {usage_meta.prompt_token_count} output_token: {usage_meta.candidates_token_count}",
+             )
+
+         return response.text, usage
+
+     except Exception as e:
+         import traceback
+
+         traceback.print_exc()
+         # Merge failed; return the concatenated chunk answers directly
+         return f"統整時發生錯誤,以下是各部分的回答:\n\n{combined_content}", UsageMetadata(model=model_name)
+
+
+ async def analyze_pdf_async(pdf_url: str, user_input: str) -> Dict[str, Any]:
+     """
+     Analyze a PDF file asynchronously (adaptive processing strategy)
+
+     Chooses a processing strategy automatically by file size:
+     - < 5MB: direct multimodal Q&A
+     - >= 5MB: compress → split → parallel multimodal Q&A → LLM merge of results
+
+     Args:
+         pdf_url: URL of the PDF file
+         user_input: User's question
+
+     Returns:
+         Dict[str, Any]: {
+             "result": str,  # analysis result
+             "usage_metadata": List[Dict]  # usage info for each LLM call
+         }
+     """
+     usage_list: List[UsageMetadata] = []
+
+     try:
+         # 1. Download the PDF
+         print(f"[analyze_pdf_async] 下載 PDF: {pdf_url}")
+         pdf_content = httpx.get(pdf_url, timeout=60.0).content
+         pdf_size_mb = len(pdf_content) / (1024 * 1024)
+         print(f"[analyze_pdf_async] PDF 大小: {pdf_size_mb:.2f} MB")
+
+         # Read model configuration
+         models_str = os.getenv("PDF_ANALYZER_MODEL", "gemini-2.5-flash")
+         print(f"[analyze_pdf_async] 使用模型: {models_str}")
+         models = [model.strip() for model in models_str.split(",")]
+         primary_model = models[0]
+
+         # 2. Choose a processing strategy
+         if pdf_size_mb < PDF_SIZE_THRESHOLD_MB:
+             # Small file: direct multimodal Q&A
+             print(f"[analyze_pdf_async] 小檔模式 (< {PDF_SIZE_THRESHOLD_MB}MB)")
+             pdf_data = base64.standard_b64encode(pdf_content).decode("utf-8")
+
+             # Try every configured model
+             last_error = None
+             for model in models:
+                 try:
+                     if model.startswith("gemini-"):
+                         result, usage = analyze_pdf_with_gemini(pdf_data, user_input, model, pdf_url)
+                         usage_list.append(usage)
+                         return {
+                             "result": result,
+                             "usage_metadata": [u.to_dict() for u in usage_list],
+                         }
+                     elif model.startswith("claude-"):
+                         result, usage = analyze_pdf_with_claude(pdf_data, user_input, model)
+                         usage_list.append(usage)
+                         return {
+                             "result": result,
+                             "usage_metadata": [u.to_dict() for u in usage_list],
+                         }
+                 except Exception as e:
+                     import traceback
+
+                     traceback.print_exc()
+                     last_error = str(e)
+                     continue
+
+             return {
+                 "result": f"分析 PDF 時所有模型都失敗。最後錯誤: {last_error}",
+                 "usage_metadata": [u.to_dict() for u in usage_list],
+             }
+
+         # 3. Large file: compress → split → parallel Q&A → merge
+         print(f"[analyze_pdf_async] 大檔模式 (>= {PDF_SIZE_THRESHOLD_MB}MB)")
+
+         # Deferred imports to speed up module loading
+         from botrun_flow_lang.langgraph_agents.agents.util.pdf_processor import (
+             split_pdf_smart,
+             get_pdf_page_count,
+         )
+         from botrun_flow_lang.langgraph_agents.agents.util.pdf_cache import (
+             get_cache_key,
+             check_cache,
+             save_to_cache,
+         )
+
+         # 3.1 Check the cache
+         cache_key = get_cache_key(pdf_url)
+         print(f"[analyze_pdf_async] 檢查快取: {cache_key}")
+         cached_chunks = await check_cache(cache_key)
+
+         if cached_chunks:
+             # Cache hit; use it directly
+             print(f"[analyze_pdf_async] 使用快取: {len(cached_chunks)} 個切片")
+             chunks = cached_chunks
+             total_pages = sum(
+                 int(pr.split("-")[-1]) - int(pr.split("-")[-2]) + 1
+                 for _, pr in chunks
+                 if pr.startswith("page-")
+             ) if chunks else 0
+         else:
+             # Cache miss; split, then store the chunks in the cache
+
+             # 3.2 Split
+             print("[analyze_pdf_async] 切割 PDF...")
+             chunks = split_pdf_smart(pdf_content, target_size_mb=PDF_CHUNK_TARGET_SIZE_MB)
+             total_pages = get_pdf_page_count(pdf_content)
+             print(
+                 f"[analyze_pdf_async] 切割完成: {len(chunks)} 個切片, 共 {total_pages} 頁"
+             )
+
+             # 3.3 Save to cache
+             print("[analyze_pdf_async] 存入快取...")
+             await save_to_cache(
+                 cache_key=cache_key,
+                 chunks=chunks,
+                 original_url=pdf_url,
+                 original_size_mb=pdf_size_mb,
+                 total_pages=total_pages,
+             )
+
+         # 3.4 Parallel Q&A
+         print(f"[analyze_pdf_async] 開始平行問答 (最大並行: {MAX_CONCURRENT_CHUNKS})...")
+         chunk_results, chunk_usage_list = await analyze_pdf_chunks_parallel(
+             chunks, user_input, primary_model, max_concurrent=MAX_CONCURRENT_CHUNKS
+         )
+         usage_list.extend(chunk_usage_list)
+
+         # Summarize results
+         relevant_count = sum(1 for r in chunk_results if r.get("relevant", False))
+         error_count = sum(1 for r in chunk_results if r.get("error"))
+         print(
+             f"[analyze_pdf_async] 問答完成: {relevant_count}/{len(chunks)} 個切片有相關內容, "
+             f"{error_count} 個錯誤"
+         )
+
+         # 3.5 Merge results
+         print("[analyze_pdf_async] 統整結果...")
+         result, merge_usage = merge_chunk_results(chunk_results, user_input, primary_model)
+         # Append merge_usage only when it records actual token usage (avoid empty entries)
+         if merge_usage.prompt_tokens > 0 or merge_usage.completion_tokens > 0:
+             usage_list.append(merge_usage)
+         print("[analyze_pdf_async] 完成")
+
+         return {
+             "result": result,
+             "usage_metadata": [u.to_dict() for u in usage_list],
+         }
+
+     except Exception as e:
+         import traceback
+
+         traceback.print_exc()
+         return {
+             "result": f"分析 PDF {pdf_url} 時發生錯誤: {str(e)}",
+             "usage_metadata": [u.to_dict() for u in usage_list],
+         }
+
+
+ def analyze_pdf(pdf_url: str, user_input: str) -> Dict[str, Any]:
+     """
+     Analyze a PDF file (synchronous wrapper)
+
+     A synchronous function that internally creates an event loop to run the asynchronous analyze_pdf_async.
+     The synchronous interface is kept for backward compatibility.
+
+     Args:
+         pdf_url: URL of the PDF file
+         user_input: User's question
+
+     Returns:
+         Dict[str, Any]: {
+             "result": str,  # analysis result
+             "usage_metadata": List[Dict]  # usage info for each LLM call
+         }
+     """
+     try:
+         # Try to get an existing event loop
+         loop = asyncio.get_event_loop()
+         if loop.is_running():
+             # Already inside a running loop; run a fresh loop on a worker thread
+             import concurrent.futures
+
+             with concurrent.futures.ThreadPoolExecutor() as executor:
+                 future = executor.submit(
+                     asyncio.run, analyze_pdf_async(pdf_url, user_input)
+                 )
+                 return future.result()
+         else:
+             return loop.run_until_complete(analyze_pdf_async(pdf_url, user_input))
+     except RuntimeError:
+         # No event loop; create a new one
+         return asyncio.run(analyze_pdf_async(pdf_url, user_input))
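
Note on the API change above: analyze_pdf and analyze_pdf_async returned a plain str in 5.12.263; in 6.2.21 they return a dict carrying the analysis plus one usage record per LLM call. Below is a minimal sketch of how a caller might adapt, together with a stand-in for the UsageMetadata helper added in usage_metadata.py (+34 lines, not shown in this diff). The field names and to_dict() method are inferred from the call sites above; the dataclass defaults and the example URL and question are assumptions for illustration.

from dataclasses import dataclass, asdict
from typing import Any, Dict

from botrun_flow_lang.langgraph_agents.agents.util.pdf_analyzer import analyze_pdf


@dataclass
class UsageMetadata:
    # Assumed shape of the UsageMetadata class in usage_metadata.py;
    # the fields mirror the keyword arguments used in pdf_analyzer.py above.
    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0
    cache_creation_input_tokens: int = 0
    cache_read_input_tokens: int = 0
    model: str = ""

    def to_dict(self) -> Dict[str, Any]:
        # Serialized form collected into the "usage_metadata" list
        return asdict(self)


# Hypothetical caller adapting to the new dict contract:
response = analyze_pdf("https://example.com/big-report.pdf", "What are the key findings?")
print(response["result"])
total_tokens = sum(u["total_tokens"] for u in response["usage_metadata"])
print(f"{len(response['usage_metadata'])} LLM calls, {total_tokens} tokens in total")

One design point visible in the diff: merge_chunk_results returns an empty UsageMetadata when zero or one chunk is relevant, so analyze_pdf_async appends it only when it records actual tokens, keeping usage_metadata a list of real LLM calls.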