botrun-flow-lang 5.11.11__py3-none-any.whl → 5.12.261__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,32 @@
+"""
+PDF analysis module
+
+Provides PDF file analysis with support for:
+- Small files (< 5MB): direct multimodal Q&A
+- Large files (>= 5MB): compress → split → parallel multimodal Q&A → merge the results with an LLM
+"""
+
 import anthropic
+import asyncio
 import base64
 import httpx
-
-
 import os
+from typing import List, Dict, Any
+
 from dotenv import load_dotenv
 from google.oauth2 import service_account
 
 load_dotenv()
 
+# File size threshold (MB)
+PDF_SIZE_THRESHOLD_MB = 30.0
+
+# Target chunk size (MB)
+PDF_CHUNK_TARGET_SIZE_MB = 30.0
+
+# Maximum number of concurrent chunk Q&A calls
+MAX_CONCURRENT_CHUNKS = 5
+
 
 def analyze_pdf_with_claude(
     pdf_data: str, user_input: str, model_name: str = "claude-sonnet-4-5-20250929"
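The three constants above drive the size-based routing implemented by the new analyze_pdf_async later in this diff (note that the module docstring still says 5 MB while the code compares against this 30 MB constant). A minimal sketch of the intended decision, with the constant values copied from the hunk above:

# Sketch only: mirrors the routing the constants above are meant to drive.
PDF_SIZE_THRESHOLD_MB = 30.0       # values as added in this hunk
PDF_CHUNK_TARGET_SIZE_MB = 30.0
MAX_CONCURRENT_CHUNKS = 5


def choose_strategy(pdf_bytes: bytes) -> str:
    """Return which path analyze_pdf_async would take for this payload."""
    size_mb = len(pdf_bytes) / (1024 * 1024)
    if size_mb < PDF_SIZE_THRESHOLD_MB:
        return "single multimodal call"
    # Large file: split into ~PDF_CHUNK_TARGET_SIZE_MB chunks, answer at most
    # MAX_CONCURRENT_CHUNKS chunks in parallel, then merge the answers with an LLM.
    return "split, parallel Q&A, LLM merge"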
@@ -55,7 +73,7 @@ def analyze_pdf_with_claude(
 
 
 def analyze_pdf_with_gemini(
-    pdf_data: str, user_input: str, model_name: str = "gemini-2.5-flash"
+    pdf_data: str, user_input: str, model_name: str = "gemini-2.5-flash", pdf_url: str = ""
 ):
     """
     Analyze a PDF file using Gemini API
@@ -100,61 +118,369 @@ def analyze_pdf_with_gemini(
             f"analyze_pdf_with_gemini============> input_token: {response.usage_metadata.prompt_token_count} output_token: {response.usage_metadata.candidates_token_count}",
         )
 
+    print(f"{pdf_url} success")
     return response.text
 
 
-def analyze_pdf(pdf_url: str, user_input: str):
+def _analyze_single_chunk(
+    chunk_data: str, page_range: str, user_input: str, model_name: str
+) -> Dict[str, Any]:
+    """
+    Analyze a single PDF chunk
+
+    Args:
+        chunk_data: Base64-encoded PDF chunk data
+        page_range: Page range string (e.g., "page-001-015")
+        user_input: The user's question
+        model_name: Name of the model to use
+
+    Returns:
+        Dict: {"page_range": str, "answer": str, "relevant": bool, "error": str|None}
+    """
+    # Build the chunk-specific prompt
+    chunk_prompt = f"""你正在閱讀一份大型 PDF 文件的其中一部分({page_range})。
+
+使用者問題:{user_input}
+
+請根據這個部分的內容回答問題:
+- 如果這個部分包含與問題相關的資訊,請詳細回答
+- 如果這個部分與問題完全無關,請只回答「NOT_RELEVANT」(不要回答其他內容)
+- 回答時請標註資訊來源的頁碼"""
+
+    try:
+        if model_name.startswith("gemini-"):
+            answer = analyze_pdf_with_gemini(chunk_data, chunk_prompt, model_name)
+        elif model_name.startswith("claude-"):
+            answer = analyze_pdf_with_claude(chunk_data, chunk_prompt, model_name)
+        else:
+            return {
+                "page_range": page_range,
+                "answer": "",
+                "relevant": False,
+                "error": f"Unknown model type: {model_name}",
+            }
+
+        # Decide whether this chunk is relevant
+        is_relevant = "NOT_RELEVANT" not in answer.upper()
+
+        return {
+            "page_range": page_range,
+            "answer": answer if is_relevant else "",
+            "relevant": is_relevant,
+            "error": None,
+        }
+
+    except Exception as e:
+        import traceback
+
+        traceback.print_exc()
+        return {
+            "page_range": page_range,
+            "answer": "",
+            "relevant": False,
+            "error": str(e),
+        }
+
+
+async def analyze_pdf_chunks_parallel(
+    chunks: List[tuple], user_input: str, model_name: str, max_concurrent: int = 5
+) -> List[Dict[str, Any]]:
     """
-    Analyze a PDF file using multiple models in order of preference based on PDF_ANALYZER_MODEL env var
+    Run Q&A over multiple PDF chunks in parallel
+
+    Args:
+        chunks: List of chunks [(chunk_bytes, page_range), ...]
+        user_input: The user's question
+        model_name: Name of the model to use
+        max_concurrent: Maximum number of concurrent calls
+
+    Returns:
+        List[Dict]: Q&A result for each chunk
+    """
+    semaphore = asyncio.Semaphore(max_concurrent)
+
+    async def analyze_with_semaphore(chunk_bytes: bytes, page_range: str):
+        async with semaphore:
+            # Convert the bytes to base64
+            chunk_data = base64.standard_b64encode(chunk_bytes).decode("utf-8")
+
+            # Run the synchronous function via run_in_executor
+            loop = asyncio.get_event_loop()
+            return await loop.run_in_executor(
+                None,
+                _analyze_single_chunk,
+                chunk_data,
+                page_range,
+                user_input,
+                model_name,
+            )
+
+    # Create all tasks
+    tasks = [
+        analyze_with_semaphore(chunk_bytes, page_range)
+        for chunk_bytes, page_range in chunks
+    ]
+
+    # Run them in parallel
+    results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    # Handle exceptions
+    processed_results = []
+    for i, result in enumerate(results):
+        if isinstance(result, Exception):
+            processed_results.append(
+                {
+                    "page_range": chunks[i][1],
+                    "answer": "",
+                    "relevant": False,
+                    "error": str(result),
+                }
+            )
+        else:
+            processed_results.append(result)
+
+    return processed_results
 
-    If PDF_ANALYZER_MODEL contains comma-separated models, it will try them in order,
-    falling back to the next one if the previous fails.
+
+def merge_chunk_results(
+    chunk_results: List[Dict[str, Any]],
+    user_input: str,
+    model_name: str = "gemini-2.5-flash",
+) -> str:
+    """
+    Merge the answers from multiple chunks using an LLM
 
     Args:
-        pdf_url: URL to the PDF file
-        user_input: User's query about the PDF content
+        chunk_results: List of chunk Q&A results
+        user_input: The original user question
+        model_name: Name of the model used for merging
+
+    Returns:
+        str: The merged answer
+    """
+    # Keep only the relevant answers
+    relevant_results = [r for r in chunk_results if r.get("relevant", False)]
+
+    if not relevant_results:
+        # No relevant content was found
+        error_results = [r for r in chunk_results if r.get("error")]
+        if error_results:
+            error_msgs = [f"{r['page_range']}: {r['error']}" for r in error_results]
+            return f"分析 PDF 時發生錯誤:\n" + "\n".join(error_msgs)
+        return "在 PDF 文件中未找到與您問題相關的內容。"
+
+    # Only one relevant result: return it directly
+    if len(relevant_results) == 1:
+        return relevant_results[0]["answer"]
+
+    # Multiple relevant results need to be merged
+    combined_content = "\n\n".join(
+        [
+            f"【{r['page_range']}】\n{r['answer']}"
+            for r in relevant_results
+        ]
+    )
+
+    merge_prompt = f"""以下是從一份大型 PDF 文件的不同部分擷取的回答,請統整這些資訊來回答使用者的問題。
+
+使用者問題:{user_input}
+
+各部分的回答:
+{combined_content}
+
+請統整以上資訊,提供一個完整、連貫的回答。如果不同部分有互補的資訊,請整合在一起。請保留頁碼引用。"""
+
+    try:
+        # Merge with an LLM (no PDF is passed here; this is plain-text merging)
+        from google import genai
+
+        credentials = service_account.Credentials.from_service_account_file(
+            os.getenv("GOOGLE_APPLICATION_CREDENTIALS_FOR_FASTAPI"),
+            scopes=["https://www.googleapis.com/auth/cloud-platform"],
+        )
+
+        client = genai.Client(
+            credentials=credentials,
+            project="scoop-386004",
+            location="us-central1",
+        )
+
+        response = client.models.generate_content(
+            model=model_name,
+            contents=[merge_prompt],
+        )
+
+        if hasattr(response, "usage_metadata"):
+            print(
+                f"merge_chunk_results============> input_token: {response.usage_metadata.prompt_token_count} output_token: {response.usage_metadata.candidates_token_count}",
+            )
+
+        return response.text
+
+    except Exception as e:
+        import traceback
+
+        traceback.print_exc()
+        # Merging failed: return the combined content directly
+        return f"統整時發生錯誤,以下是各部分的回答:\n\n{combined_content}"
+
+
+async def analyze_pdf_async(pdf_url: str, user_input: str) -> str:
+    """
+    Analyze a PDF file asynchronously (smart processing strategy)
+
+    Automatically picks a processing strategy based on file size:
+    - < 5MB: direct multimodal Q&A
+    - >= 5MB: compress → split → parallel multimodal Q&A → merge the results with an LLM
+
+    Args:
+        pdf_url: URL of the PDF file
+        user_input: The user's question
 
     Returns:
-        str: Analysis of the PDF content based on the query
+        str: Analysis result
     """
     try:
-        # Download and encode the PDF file from URL
-        pdf_data = base64.standard_b64encode(httpx.get(pdf_url).content).decode("utf-8")
+        # 1. Download the PDF
+        print(f"[analyze_pdf_async] 下載 PDF: {pdf_url}")
+        pdf_content = httpx.get(pdf_url, timeout=60.0).content
+        pdf_size_mb = len(pdf_content) / (1024 * 1024)
+        print(f"[analyze_pdf_async] PDF 大小: {pdf_size_mb:.2f} MB")
 
-        # Get models list from environment variable
+        # Read the model configuration
         models_str = os.getenv("PDF_ANALYZER_MODEL", "gemini-2.5-flash")
-        print(f"[analyze_pdf] 分析PDF使用模型: {models_str}")
+        print(f"[analyze_pdf_async] 使用模型: {models_str}")
         models = [model.strip() for model in models_str.split(",")]
+        primary_model = models[0]
+
+        # 2. Pick the processing strategy
+        if pdf_size_mb < PDF_SIZE_THRESHOLD_MB:
+            # Small file: direct multimodal Q&A
+            print(f"[analyze_pdf_async] 小檔模式 (< {PDF_SIZE_THRESHOLD_MB}MB)")
+            pdf_data = base64.standard_b64encode(pdf_content).decode("utf-8")
 
-        last_error = None
-
-        # Try each model in order
-        for model in models:
-            try:
-                if model.startswith("gemini-"):
-                    print(f"Trying to analyze PDF with Gemini model: {model}")
-                    return analyze_pdf_with_gemini(pdf_data, user_input, model)
-                elif model.startswith("claude-"):
-                    print(f"Trying to analyze PDF with Claude model: {model}")
-                    return analyze_pdf_with_claude(pdf_data, user_input, model)
-                else:
-                    print(f"Unknown model type: {model}, skipping")
+            # Try each model in turn
+            last_error = None
+            for model in models:
+                try:
+                    if model.startswith("gemini-"):
+                        return analyze_pdf_with_gemini(pdf_data, user_input, model, pdf_url)
+                    elif model.startswith("claude-"):
+                        return analyze_pdf_with_claude(pdf_data, user_input, model)
+                except Exception as e:
+                    import traceback
+
+                    traceback.print_exc()
+                    last_error = str(e)
                     continue
-            except Exception as e:
-                import traceback
-
-                traceback.print_exc()
-                error_msg = f"Error analyzing PDF with {model}: {str(e)}"
-                print(error_msg)
-                last_error = error_msg
-                # Continue to the next model in the list
-                continue
-
-        # If we've reached here, all models failed
-        return (
-            f"Error analyzing PDF with all specified models. Last error: {last_error}"
+
+            return f"分析 PDF 時所有模型都失敗。最後錯誤: {last_error}"
+
+        # 3. Large file: compress → split → parallel Q&A → merge
+        print(f"[analyze_pdf_async] 大檔模式 (>= {PDF_SIZE_THRESHOLD_MB}MB)")
+
+        # Defer the imports to speed up module loading
+        from botrun_flow_lang.langgraph_agents.agents.util.pdf_processor import (
+            split_pdf_smart,
+            get_pdf_page_count,
+        )
+        from botrun_flow_lang.langgraph_agents.agents.util.pdf_cache import (
+            get_cache_key,
+            check_cache,
+            save_to_cache,
         )
 
+        # 3.1 Check the cache
+        cache_key = get_cache_key(pdf_url)
+        print(f"[analyze_pdf_async] 檢查快取: {cache_key}")
+        cached_chunks = await check_cache(cache_key)
+
+        if cached_chunks:
+            # Cache hit: use it directly
+            print(f"[analyze_pdf_async] 使用快取: {len(cached_chunks)} 個切片")
+            chunks = cached_chunks
+            total_pages = sum(
+                int(pr.split("-")[-1]) - int(pr.split("-")[-2]) + 1
+                for _, pr in chunks
+                if pr.startswith("page-")
+            ) if chunks else 0
+        else:
+            # Cache miss: split, then store in the cache
+
+            # 3.2 Split
+            print("[analyze_pdf_async] 切割 PDF...")
+            chunks = split_pdf_smart(pdf_content, target_size_mb=PDF_CHUNK_TARGET_SIZE_MB)
+            total_pages = get_pdf_page_count(pdf_content)
+            print(
+                f"[analyze_pdf_async] 切割完成: {len(chunks)} 個切片, 共 {total_pages} 頁"
+            )
+
+            # 3.3 Store in the cache
+            print("[analyze_pdf_async] 存入快取...")
+            await save_to_cache(
+                cache_key=cache_key,
+                chunks=chunks,
+                original_url=pdf_url,
+                original_size_mb=pdf_size_mb,
+                total_pages=total_pages,
+            )
+
+        # 3.3 Parallel Q&A
+        print(f"[analyze_pdf_async] 開始平行問答 (最大並行: {MAX_CONCURRENT_CHUNKS})...")
+        chunk_results = await analyze_pdf_chunks_parallel(
+            chunks, user_input, primary_model, max_concurrent=MAX_CONCURRENT_CHUNKS
+        )
+
+        # Tally the results
+        relevant_count = sum(1 for r in chunk_results if r.get("relevant", False))
+        error_count = sum(1 for r in chunk_results if r.get("error"))
+        print(
+            f"[analyze_pdf_async] 問答完成: {relevant_count}/{len(chunks)} 個切片有相關內容, "
+            f"{error_count} 個錯誤"
+        )
+
+        # 3.4 Merge the results
+        print("[analyze_pdf_async] 統整結果...")
+        result = merge_chunk_results(chunk_results, user_input, primary_model)
+        print("[analyze_pdf_async] 完成")
+
+        return result
+
     except Exception as e:
-        print(f"Error downloading PDF: {str(e)}")
-        return f"Error downloading PDF: {str(e)}"
+        import traceback
+
+        traceback.print_exc()
+        return f"分析 PDF {pdf_url} 時發生錯誤: {str(e)}"
+
+
+def analyze_pdf(pdf_url: str, user_input: str) -> str:
+    """
+    Analyze a PDF file (synchronous wrapper)
+
+    This is a synchronous function; it creates an event loop internally to run the asynchronous analyze_pdf_async.
+    The synchronous interface is kept for backward compatibility.
+
+    Args:
+        pdf_url: URL of the PDF file
+        user_input: The user's question
+
+    Returns:
+        str: Analysis result
+    """
+    try:
+        # Try to get an existing event loop
+        loop = asyncio.get_event_loop()
+        if loop.is_running():
+            # Already inside a running event loop: run asyncio.run in a separate thread
+            import concurrent.futures
+
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future = executor.submit(
+                    asyncio.run, analyze_pdf_async(pdf_url, user_input)
+                )
+                return future.result()
+        else:
+            return loop.run_until_complete(analyze_pdf_async(pdf_url, user_input))
+    except RuntimeError:
+        # No event loop exists: create a new one
+        return asyncio.run(analyze_pdf_async(pdf_url, user_input))
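analyze_pdf keeps its original synchronous signature while analyze_pdf_async becomes the real entry point. A hedged usage sketch follows (it assumes this module lives alongside pdf_processor and pdf_cache as util.pdf_analyzer; the URL and question are placeholders):

# Usage sketch only; the import path, URL, and question are illustrative assumptions.
import asyncio

from botrun_flow_lang.langgraph_agents.agents.util.pdf_analyzer import (
    analyze_pdf,
    analyze_pdf_async,
)

PDF_URL = "https://example.com/annual-report.pdf"
QUESTION = "What are the report's key findings?"

# Synchronous callers keep the old interface; the wrapper drives the event loop itself.
print(analyze_pdf(PDF_URL, QUESTION))


# Async callers (e.g. inside a FastAPI handler or a LangGraph node) can await directly.
async def main() -> None:
    print(await analyze_pdf_async(PDF_URL, QUESTION))


asyncio.run(main())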
@@ -0,0 +1,250 @@
+"""
+PDF cache module
+
+Provides GCS caching of PDF chunks so that the same PDF file is not split repeatedly.
+Cache entries expire automatically after 7 days (via a GCS Lifecycle Rule).
+"""
+
+import hashlib
+import json
+from io import BytesIO
+from typing import List, Tuple, Optional
+from datetime import datetime
+
+from botrun_flow_lang.services.storage.storage_factory import storage_store_factory
+
+# Cache directory prefix
+PDF_CACHE_PREFIX = "pdf-cache"
+
+# Cache expiry in days (used for the lifecycle rule)
+PDF_CACHE_EXPIRY_DAYS = 7
+
+
+def get_cache_key(pdf_url: str) -> str:
+    """
+    Derive the cache key (hash) from the PDF URL
+
+    Args:
+        pdf_url: URL of the PDF file
+
+    Returns:
+        str: 32-character MD5 hash
+    """
+    return hashlib.md5(pdf_url.encode()).hexdigest()
+
+
+def _get_cache_path(cache_key: str) -> str:
+    """
+    Get the cache directory path
+
+    Args:
+        cache_key: The cache key
+
+    Returns:
+        str: GCS path in the form "pdf-cache/{cache_key}"
+    """
+    return f"{PDF_CACHE_PREFIX}/{cache_key}"
+
+
+def _get_metadata_path(cache_key: str) -> str:
+    """Get the metadata file path"""
+    return f"{_get_cache_path(cache_key)}/metadata.json"
+
+
+def _get_chunk_path(cache_key: str, chunk_index: int) -> str:
+    """Get the chunk file path"""
+    return f"{_get_cache_path(cache_key)}/chunk-{chunk_index:03d}.pdf"
+
+
+async def check_cache(cache_key: str) -> Optional[List[Tuple[bytes, str]]]:
+    """
+    Check whether a cached copy exists in GCS
+
+    Args:
+        cache_key: The cache key (from get_cache_key)
+
+    Returns:
+        Optional[List[Tuple[bytes, str]]]: The list of chunks if cached, otherwise None
+    """
+    try:
+        storage = storage_store_factory()
+        metadata_path = _get_metadata_path(cache_key)
+
+        # Check whether the metadata file exists
+        if not await storage.file_exists(metadata_path):
+            print(f"[pdf_cache] 快取不存在: {cache_key}")
+            return None
+
+        # Read the metadata
+        metadata_file = await storage.retrieve_file(metadata_path)
+        if not metadata_file:
+            print(f"[pdf_cache] 無法讀取 metadata: {cache_key}")
+            return None
+
+        metadata = json.loads(metadata_file.getvalue().decode("utf-8"))
+        chunk_count = metadata.get("chunk_count", 0)
+        page_ranges = metadata.get("page_ranges", [])
+
+        if chunk_count == 0:
+            print(f"[pdf_cache] 快取無切片: {cache_key}")
+            return None
+
+        print(f"[pdf_cache] 找到快取: {cache_key}, {chunk_count} 個切片")
+
+        # Read every chunk
+        chunks = []
+        for i in range(chunk_count):
+            chunk_path = _get_chunk_path(cache_key, i)
+            chunk_file = await storage.retrieve_file(chunk_path)
+
+            if not chunk_file:
+                print(f"[pdf_cache] 無法讀取切片 {i}: {cache_key}")
+                return None  # Cache is incomplete; do not use it
+
+            chunk_bytes = chunk_file.getvalue()
+            page_range = page_ranges[i] if i < len(page_ranges) else f"chunk-{i:03d}"
+            chunks.append((chunk_bytes, page_range))
+
+        print(f"[pdf_cache] 成功載入快取: {cache_key}")
+        return chunks
+
+    except Exception as e:
+        print(f"[pdf_cache] 檢查快取時發生錯誤: {e}")
+        return None
+
+
+async def save_to_cache(
+    cache_key: str,
+    chunks: List[Tuple[bytes, str]],
+    original_url: str,
+    original_size_mb: float,
+    total_pages: int,
+) -> bool:
+    """
+    Store the chunks in the GCS cache
+
+    Args:
+        cache_key: The cache key
+        chunks: List of chunks [(chunk_bytes, page_range), ...]
+        original_url: Original PDF URL
+        original_size_mb: Original file size (MB)
+        total_pages: Total number of pages
+
+    Returns:
+        bool: Whether the cache write succeeded
+    """
+    try:
+        storage = storage_store_factory()
+
+        # 1. Store all chunks
+        page_ranges = []
+        for i, (chunk_bytes, page_range) in enumerate(chunks):
+            chunk_path = _get_chunk_path(cache_key, i)
+            chunk_file = BytesIO(chunk_bytes)
+
+            success, _ = await storage.store_file(
+                chunk_path, chunk_file, public=False, content_type="application/pdf"
+            )
+
+            if not success:
+                print(f"[pdf_cache] 無法存入切片 {i}: {cache_key}")
+                return False
+
+            page_ranges.append(page_range)
+
+        # 2. Store the metadata
+        metadata = {
+            "original_url": original_url,
+            "cache_key": cache_key,
+            "chunk_count": len(chunks),
+            "page_ranges": page_ranges,
+            "original_size_mb": original_size_mb,
+            "total_pages": total_pages,
+            "created_at": datetime.utcnow().isoformat(),
+        }
+
+        metadata_path = _get_metadata_path(cache_key)
+        metadata_file = BytesIO(json.dumps(metadata, ensure_ascii=False).encode("utf-8"))
+
+        success, _ = await storage.store_file(
+            metadata_path, metadata_file, public=False, content_type="application/json"
+        )
+
+        if not success:
+            print(f"[pdf_cache] 無法存入 metadata: {cache_key}")
+            return False
+
+        print(
+            f"[pdf_cache] 成功存入快取: {cache_key}, "
+            f"{len(chunks)} 個切片, {total_pages} 頁"
+        )
+        return True
+
+    except Exception as e:
+        print(f"[pdf_cache] 存入快取時發生錯誤: {e}")
+        return False
+
+
+async def get_cache_metadata(cache_key: str) -> Optional[dict]:
+    """
+    Get the cache metadata (without loading the chunk contents)
+
+    Args:
+        cache_key: The cache key
+
+    Returns:
+        Optional[dict]: The metadata dict, or None
+    """
+    try:
+        storage = storage_store_factory()
+        metadata_path = _get_metadata_path(cache_key)
+
+        if not await storage.file_exists(metadata_path):
+            return None
+
+        metadata_file = await storage.retrieve_file(metadata_path)
+        if not metadata_file:
+            return None
+
+        return json.loads(metadata_file.getvalue().decode("utf-8"))
+
+    except Exception as e:
+        print(f"[pdf_cache] 讀取 metadata 時發生錯誤: {e}")
+        return None
+
+
+async def delete_cache(cache_key: str) -> bool:
+    """
+    Delete a cache entry
+
+    Args:
+        cache_key: The cache key
+
+    Returns:
+        bool: Whether deletion succeeded
+    """
+    try:
+        storage = storage_store_factory()
+
+        # Read the metadata first to get the chunk count
+        metadata = await get_cache_metadata(cache_key)
+        if not metadata:
+            return True  # The cache does not exist; treat as success
+
+        chunk_count = metadata.get("chunk_count", 0)
+
+        # Delete all chunks
+        for i in range(chunk_count):
+            chunk_path = _get_chunk_path(cache_key, i)
+            await storage.delete_file(chunk_path)
+
+        # Delete the metadata
+        metadata_path = _get_metadata_path(cache_key)
+        await storage.delete_file(metadata_path)
+
+        print(f"[pdf_cache] 已刪除快取: {cache_key}")
+        return True
+
+    except Exception as e:
+        print(f"[pdf_cache] 刪除快取時發生錯誤: {e}")
+        return False
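The analyzer above only calls get_cache_key, check_cache, and save_to_cache from this new module. A condensed sketch of that round trip (split_pdf_smart and get_pdf_page_count come from the pdf_processor module imported in the analyzer hunk; get_chunks and the 30 MB target are illustrative):

# Sketch of the cache round trip analyze_pdf_async performs for large PDFs.
# get_chunks() is a hypothetical helper; the cache/processor functions are the
# ones added in this diff.
import httpx

from botrun_flow_lang.langgraph_agents.agents.util.pdf_cache import (
    check_cache,
    get_cache_key,
    save_to_cache,
)
from botrun_flow_lang.langgraph_agents.agents.util.pdf_processor import (
    get_pdf_page_count,
    split_pdf_smart,
)


async def get_chunks(pdf_url: str) -> list[tuple[bytes, str]]:
    cache_key = get_cache_key(pdf_url)       # MD5 of the URL
    chunks = await check_cache(cache_key)    # None on a miss or an incomplete cache
    if chunks is None:
        pdf_bytes = httpx.get(pdf_url, timeout=60.0).content
        chunks = split_pdf_smart(pdf_bytes, target_size_mb=30.0)
        await save_to_cache(
            cache_key=cache_key,
            chunks=chunks,
            original_url=pdf_url,
            original_size_mb=len(pdf_bytes) / (1024 * 1024),
            total_pages=get_pdf_page_count(pdf_bytes),
        )
    return chunks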