PyPI - botrun-flow-lang - Versions diffs - 5.11.11__py3-none-any.whl → 5.12.261__py3-none-any.whl - Mend

botrun-flow-lang 5.11.11__py3-none-any.whl → 5.12.261__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

botrun_flow_lang/langgraph_agents/agents/util/pdf_analyzer.py CHANGED Viewed

@@ -1,14 +1,32 @@
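+"""
+PDF analysis module.
+Provides PDF analysis with two strategies, selected by file size:
+- Small files (< PDF_SIZE_THRESHOLD_MB, currently 30 MB): direct multimodal Q&A
+- Large files (>= PDF_SIZE_THRESHOLD_MB): compress → split → parallel multimodal Q&A → LLM merges the results
+"""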
 import anthropic
+import asyncio
 import base64
 import httpx
 import os
+from typing import List, Dict, Any
 from dotenv import load_dotenv
 from google.oauth2 import service_account
 load_dotenv()
+# File-size threshold in MB: PDFs smaller than this are analyzed in a single request
+PDF_SIZE_THRESHOLD_MB = 30.0
+# Target size in MB for each chunk when a large PDF is split
+PDF_CHUNK_TARGET_SIZE_MB = 30.0
+# Maximum number of chunk analyses allowed to run concurrently
+MAX_CONCURRENT_CHUNKS = 5
 def analyze_pdf_with_claude(
     pdf_data: str, user_input: str, model_name: str = "claude-sonnet-4-5-20250929"
@@ -55,7 +73,7 @@ def analyze_pdf_with_claude(
 def analyze_pdf_with_gemini(
-    pdf_data: str, user_input: str, model_name: str = "gemini-2.5-flash"
+    pdf_data: str, user_input: str, model_name: str = "gemini-2.5-flash", pdf_url: str = ""
 ):
     """
     Analyze a PDF file using Gemini API
@@ -100,61 +118,369 @@ def analyze_pdf_with_gemini(
             f"analyze_pdf_with_gemini============> input_token: {response.usage_metadata.prompt_token_count} output_token: {response.usage_metadata.candidates_token_count}",
         )
+    print(f"{pdf_url} success")
     return response.text
-def analyze_pdf(pdf_url: str, user_input: str):
+def _analyze_single_chunk(
+    chunk_data: str, page_range: str, user_input: str, model_name: str
+) -> Dict[str, Any]:
+    """
+    Ask one model about a single PDF chunk and classify the reply.
+    Args:
+        chunk_data: Base64-encoded PDF chunk data
+        page_range: Page-range label of the chunk (e.g., "page-001-015")
+        user_input: The user's question
+        model_name: Name of the model to use; must start with "gemini-" or "claude-"
+    Returns:
+        Dict: {"page_range": str, "answer": str, "relevant": bool, "error": str|None}.
+        "answer" is empty whenever the chunk is not relevant or an error occurred.
+    """
+    def _outcome(answer: str, relevant: bool, error) -> Dict[str, Any]:
+        # Every exit path reports the same four keys for this chunk
+        return {
+            "page_range": page_range,
+            "answer": answer,
+            "relevant": relevant,
+            "error": error,
+        }
+    # Chunk-specific prompt: the model must reply NOT_RELEVANT when the chunk is unrelated
+    chunk_prompt = f"""你正在閱讀一份大型 PDF 文件的其中一部分（{page_range}）。
+使用者問題：{user_input}
+請根據這個部分的內容回答問題：
+- 如果這個部分包含與問題相關的資訊，請詳細回答
+- 如果這個部分與問題完全無關，請只回答「NOT_RELEVANT」（不要回答其他內容）
+- 回答時請標註資訊來源的頁碼"""
+    try:
+        # Pick the backend from the model-name prefix
+        if model_name.startswith("gemini-"):
+            analyzer = analyze_pdf_with_gemini
+        elif model_name.startswith("claude-"):
+            analyzer = analyze_pdf_with_claude
+        else:
+            return _outcome("", False, f"Unknown model type: {model_name}")
+        answer = analyzer(chunk_data, chunk_prompt, model_name)
+        # Any occurrence of the marker (case-insensitive) counts as "not relevant"
+        if "NOT_RELEVANT" in answer.upper():
+            return _outcome("", False, None)
+        return _outcome(answer, True, None)
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        return _outcome("", False, str(e))
+async def analyze_pdf_chunks_parallel(
+    chunks: List[tuple], user_input: str, model_name: str, max_concurrent: int = 5
+) -> List[Dict[str, Any]]:
     """
-    Analyze a PDF file using multiple models in order of preference based on PDF_ANALYZER_MODEL env var
+    Analyze multiple PDF chunks concurrently.
+    Args:
+        chunks: List of chunks [(chunk_bytes, page_range), ...]
+        user_input: The user's question
+        model_name: Name of the model to use
+        max_concurrent: Maximum number of chunks analyzed at the same time
+    Returns:
+        List[Dict]: One result dict per chunk, in the same order as ``chunks``
+    """
+    # The semaphore caps how many chunk analyses are in flight at once
+    semaphore = asyncio.Semaphore(max_concurrent)
+    async def analyze_with_semaphore(chunk_bytes: bytes, page_range: str):
+        async with semaphore:
+            # Convert the raw bytes to base64 text
+            chunk_data = base64.standard_b64encode(chunk_bytes).decode("utf-8")
+            # Run the synchronous analyzer through run_in_executor (default executor)
+            # NOTE(review): inside a coroutine, asyncio.get_running_loop() is the
+            # recommended way to obtain the loop — consider switching
+            loop = asyncio.get_event_loop()
+            return await loop.run_in_executor(
+                None,
+                _analyze_single_chunk,
+                chunk_data,
+                page_range,
+                user_input,
+                model_name,
+            )
+    # Create one task per chunk
+    tasks = [
+        analyze_with_semaphore(chunk_bytes, page_range)
+        for chunk_bytes, page_range in chunks
+    ]
+    # Run them concurrently; exceptions are returned as values instead of raised
+    results = await asyncio.gather(*tasks, return_exceptions=True)
+    # Convert any returned exception into an error result for that chunk
+    processed_results = []
+    for i, result in enumerate(results):
+        if isinstance(result, Exception):
+            processed_results.append(
+                {
+                    "page_range": chunks[i][1],
+                    "answer": "",
+                    "relevant": False,
+                    "error": str(result),
+                }
+            )
+        else:
+            processed_results.append(result)
+    return processed_results
-    If PDF_ANALYZER_MODEL contains comma-separated models, it will try them in order,
-    falling back to the next one if the previous fails.
+def merge_chunk_results(
+    chunk_results: List[Dict[str, Any]],
+    user_input: str,
+    model_name: str = "gemini-2.5-flash",
+) -> str:
+    """
+    Merge the per-chunk answers into a single reply, using an LLM when several chunks are relevant.
     Args:
-        pdf_url: URL to the PDF file
-        user_input: User's query about the PDF content
+        chunk_results: List of per-chunk result dicts
+        user_input: The original user question
+        model_name: Name of the model used for merging
+    Returns:
+        str: The merged answer
+    """
+    # Keep only the results flagged as relevant
+    relevant_results = [r for r in chunk_results if r.get("relevant", False)]
+    if not relevant_results:
+        # Nothing relevant was found: report chunk errors if there were any,
+        # otherwise return the "no relevant content" message
+        error_results = [r for r in chunk_results if r.get("error")]
+        if error_results:
+            error_msgs = [f"{r['page_range']}: {r['error']}" for r in error_results]
+            # NOTE(review): the f-prefix on the literal below has no placeholders
+            return f"分析 PDF 時發生錯誤：\n" + "\n".join(error_msgs)
+        return "在 PDF 文件中未找到與您問題相關的內容。"
+    # Exactly one relevant result: return it as-is
+    if len(relevant_results) == 1:
+        return relevant_results[0]["answer"]
+    # Several relevant results: they need to be merged
+    combined_content = "\n\n".join(
+        [
+            f"【{r['page_range']}】\n{r['answer']}"
+            for r in relevant_results
+        ]
+    )
+    merge_prompt = f"""以下是從一份大型 PDF 文件的不同部分擷取的回答，請統整這些資訊來回答使用者的問題。
+使用者問題：{user_input}
+各部分的回答：
+{combined_content}
+請統整以上資訊，提供一個完整、連貫的回答。如果不同部分有互補的資訊，請整合在一起。請保留頁碼引用。"""
+    try:
+        # Merge with an LLM (text-only merge; the PDF itself is not sent here)
+        # NOTE(review): this path always goes through the google genai client, while
+        # model_name may be a "claude-" model passed by the caller — presumably that
+        # request fails and lands in the fallback below; confirm
+        from google import genai
+        credentials = service_account.Credentials.from_service_account_file(
+            os.getenv("GOOGLE_APPLICATION_CREDENTIALS_FOR_FASTAPI"),
+            scopes=["https://www.googleapis.com/auth/cloud-platform"],
+        )
+        client = genai.Client(
+            credentials=credentials,
+            project="scoop-386004",
+            location="us-central1",
+        )
+        response = client.models.generate_content(
+            model=model_name,
+            contents=[merge_prompt],
+        )
+        if hasattr(response, "usage_metadata"):
+            print(
+                f"merge_chunk_results============> input_token: {response.usage_metadata.prompt_token_count} output_token: {response.usage_metadata.candidates_token_count}",
+            )
+        return response.text
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        # Merging failed: return the concatenated per-chunk answers instead
+        return f"統整時發生錯誤，以下是各部分的回答：\n\n{combined_content}"
+async def analyze_pdf_async(pdf_url: str, user_input: str) -> str:
+    """
+    Asynchronously analyze a PDF, picking a strategy from the downloaded file size.
+    The strategy depends on the size relative to PDF_SIZE_THRESHOLD_MB:
+    - smaller than the threshold: one multimodal question against the whole file
+    - otherwise: split into chunks (reusing cached chunks when available) → parallel multimodal Q&A → LLM merges the results
+    Args:
+        pdf_url: URL of the PDF file
+        user_input: The user's question
     Returns:
-        str: Analysis of the PDF content based on the query
+        str: The analysis result, or an error-message string when analysis fails
     """
     try:
-        # Download and encode the PDF file from URL
-        pdf_data = base64.standard_b64encode(httpx.get(pdf_url).content).decode("utf-8")
+        # 1. Download the PDF
+        # NOTE(review): httpx.get is a synchronous call, so the event loop is blocked
+        # for the duration of the download
+        print(f"[analyze_pdf_async] 下載 PDF: {pdf_url}")
+        pdf_content = httpx.get(pdf_url, timeout=60.0).content
+        pdf_size_mb = len(pdf_content) / (1024 * 1024)
+        print(f"[analyze_pdf_async] PDF 大小: {pdf_size_mb:.2f} MB")
-        # Get models list from environment variable
+        # Read the model configuration (comma-separated list, first entry is the primary model)
         models_str = os.getenv("PDF_ANALYZER_MODEL", "gemini-2.5-flash")
-        print(f"[analyze_pdf] 分析PDF使用模型: {models_str}")
+        print(f"[analyze_pdf_async] 使用模型: {models_str}")
         models = [model.strip() for model in models_str.split(",")]
+        primary_model = models[0]
+        # 2. Choose the processing strategy
+        if pdf_size_mb < PDF_SIZE_THRESHOLD_MB:
+            # Small file: ask the model directly with the whole PDF
+            print(f"[analyze_pdf_async] 小檔模式 (< {PDF_SIZE_THRESHOLD_MB}MB)")
+            pdf_data = base64.standard_b64encode(pdf_content).decode("utf-8")
-        last_error = None
-        # Try each model in order
-        for model in models:
-            try:
-                if model.startswith("gemini-"):
-                    print(f"Trying to analyze PDF with Gemini model: {model}")
-                    return analyze_pdf_with_gemini(pdf_data, user_input, model)
-                elif model.startswith("claude-"):
-                    print(f"Trying to analyze PDF with Claude model: {model}")
-                    return analyze_pdf_with_claude(pdf_data, user_input, model)
-                else:
-                    print(f"Unknown model type: {model}, skipping")
+            # Try every configured model in order; the first success is returned
+            # NOTE(review): a model name with neither prefix matches no branch and is
+            # skipped without any message, leaving last_error unchanged
+            last_error = None
+            for model in models:
+                try:
+                    if model.startswith("gemini-"):
+                        return analyze_pdf_with_gemini(pdf_data, user_input, model, pdf_url)
+                    elif model.startswith("claude-"):
+                        return analyze_pdf_with_claude(pdf_data, user_input, model)
+                except Exception as e:
+                    import traceback
+                    traceback.print_exc()
+                    last_error = str(e)
                     continue
-            except Exception as e:
-                import traceback
-                traceback.print_exc()
-                error_msg = f"Error analyzing PDF with {model}: {str(e)}"
-                print(error_msg)
-                last_error = error_msg
-                # Continue to the next model in the list
-                continue
-        # If we've reached here, all models failed
-        return (
-            f"Error analyzing PDF with all specified models. Last error: {last_error}"
+            return f"分析 PDF 時所有模型都失敗。最後錯誤: {last_error}"
+        # 3. Large file: split → parallel Q&A → merge
+        # NOTE(review): the original comment also listed a compression step; none is
+        # visible in this function — confirm whether split_pdf_smart compresses
+        print(f"[analyze_pdf_async] 大檔模式 (>= {PDF_SIZE_THRESHOLD_MB}MB)")
+        # Deferred imports to speed up module loading
+        from botrun_flow_lang.langgraph_agents.agents.util.pdf_processor import (
+            split_pdf_smart,
+            get_pdf_page_count,
+        )
+        from botrun_flow_lang.langgraph_agents.agents.util.pdf_cache import (
+            get_cache_key,
+            check_cache,
+            save_to_cache,
         )
+        # 3.1 Check the cache
+        cache_key = get_cache_key(pdf_url)
+        print(f"[analyze_pdf_async] 檢查快取: {cache_key}")
+        cached_chunks = await check_cache(cache_key)
+        if cached_chunks:
+            # Cache hit: reuse the cached chunks
+            print(f"[analyze_pdf_async] 使用快取: {len(cached_chunks)} 個切片")
+            chunks = cached_chunks
+            # Derive the page count from the last two dash-separated numbers of each
+            # "page-..." label
+            # NOTE(review): total_pages is not read again on this branch
+            total_pages = sum(
+                int(pr.split("-")[-1]) - int(pr.split("-")[-2]) + 1
+                for _, pr in chunks
+                if pr.startswith("page-")
+            ) if chunks else 0
+        else:
+            # Cache miss: split the PDF, then store the chunks in the cache
+            # 3.2 Split
+            print("[analyze_pdf_async] 切割 PDF...")
+            chunks = split_pdf_smart(pdf_content, target_size_mb=PDF_CHUNK_TARGET_SIZE_MB)
+            total_pages = get_pdf_page_count(pdf_content)
+            print(
+                f"[analyze_pdf_async] 切割完成: {len(chunks)} 個切片, 共 {total_pages} 頁"
+            )
+            # 3.3 Store in the cache (the boolean result is not checked here)
+            print("[analyze_pdf_async] 存入快取...")
+            await save_to_cache(
+                cache_key=cache_key,
+                chunks=chunks,
+                original_url=pdf_url,
+                original_size_mb=pdf_size_mb,
+                total_pages=total_pages,
+            )
+        # 3.4 Parallel Q&A (only the primary model is used; no fallback to other models)
+        print(f"[analyze_pdf_async] 開始平行問答 (最大並行: {MAX_CONCURRENT_CHUNKS})...")
+        chunk_results = await analyze_pdf_chunks_parallel(
+            chunks, user_input, primary_model, max_concurrent=MAX_CONCURRENT_CHUNKS
+        )
+        # Tally the results
+        relevant_count = sum(1 for r in chunk_results if r.get("relevant", False))
+        error_count = sum(1 for r in chunk_results if r.get("error"))
+        print(
+            f"[analyze_pdf_async] 問答完成: {relevant_count}/{len(chunks)} 個切片有相關內容, "
+            f"{error_count} 個錯誤"
+        )
+        # 3.5 Merge the results
+        print("[analyze_pdf_async] 統整結果...")
+        result = merge_chunk_results(chunk_results, user_input, primary_model)
+        print("[analyze_pdf_async] 完成")
+        return result
     except Exception as e:
-        print(f"Error downloading PDF: {str(e)}")
-        return f"Error downloading PDF: {str(e)}"
+        import traceback
+        traceback.print_exc()
+        return f"分析 PDF {pdf_url} 時發生錯誤: {str(e)}"
+def analyze_pdf(pdf_url: str, user_input: str) -> str:
+    """
+    Analyze a PDF file (synchronous wrapper around analyze_pdf_async).
+    Kept as a synchronous entry point for backward compatibility. The coroutine
+    is driven to completion on an event loop owned by this call.
+    Args:
+        pdf_url: URL of the PDF file
+        user_input: The user's question
+    Returns:
+        str: The analysis result
+    """
+    # Only the loop probe sits inside the try, so a RuntimeError raised while the
+    # analysis itself runs can never trigger a second, duplicate run.
+    try:
+        asyncio.get_running_loop()
+    except RuntimeError:
+        # No loop is running in this thread: create one just for this call
+        return asyncio.run(analyze_pdf_async(pdf_url, user_input))
+    # A loop is already running in this thread, where asyncio.run() would raise.
+    # Run the coroutine on a fresh loop in a worker thread and block on the result.
+    import concurrent.futures
+    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+        future = executor.submit(
+            asyncio.run, analyze_pdf_async(pdf_url, user_input)
+        )
+        return future.result()

botrun_flow_lang/langgraph_agents/agents/util/pdf_cache.py ADDED Viewed

@@ -0,0 +1,250 @@
+"""
+PDF 快取模組
+提供 PDF 切片的 GCS 快取功能，避免重複切割相同的 PDF 檔案。
+快取會自動在 7 天後過期（透過 GCS Lifecycle Rule）。
+"""
+import hashlib
+import json
+from io import BytesIO
+from typing import List, Tuple, Optional
+from datetime import datetime
+from botrun_flow_lang.services.storage.storage_factory import storage_store_factory
+# Directory prefix under which cache entries are stored
+PDF_CACHE_PREFIX = "pdf-cache"
+# Cache expiry in days (intended for the lifecycle rule)
+PDF_CACHE_EXPIRY_DAYS = 7
+def get_cache_key(pdf_url: str) -> str:
+    """
+    Derive the cache key for a PDF from its URL.
+    Args:
+        pdf_url: URL of the PDF file
+    Returns:
+        str: The 32-character hexadecimal MD5 digest of the URL
+    """
+    # The digest is used purely as a stable identifier for the URL
+    url_bytes = pdf_url.encode()
+    digest = hashlib.md5(url_bytes)
+    return digest.hexdigest()
+def _get_cache_path(cache_key: str) -> str:
+    """
+    Build the directory path of a cache entry.
+    Args:
+        cache_key: Cache key
+    Returns:
+        str: Storage path of the form "pdf-cache/{cache_key}"
+    """
+    return "{}/{}".format(PDF_CACHE_PREFIX, cache_key)
+def _get_metadata_path(cache_key: str) -> str:
+    """Return the path of the metadata.json file inside a cache entry."""
+    entry_dir = _get_cache_path(cache_key)
+    return entry_dir + "/metadata.json"
+def _get_chunk_path(cache_key: str, chunk_index: int) -> str:
+    """Return the path of one chunk file; the index is zero-padded to three digits."""
+    entry_dir = _get_cache_path(cache_key)
+    file_name = "chunk-{:03d}.pdf".format(chunk_index)
+    return entry_dir + "/" + file_name
+async def check_cache(cache_key: str) -> Optional[List[Tuple[bytes, str]]]:
+    """
+    Load the cached chunks of a PDF when a complete cache entry exists.
+    Args:
+        cache_key: Cache key (as produced by get_cache_key)
+    Returns:
+        Optional[List[Tuple[bytes, str]]]: The (chunk_bytes, page_range) list on a
+        cache hit; None when the entry is missing, empty, incomplete, or an error occurs
+    """
+    try:
+        store = storage_store_factory()
+        meta_path = _get_metadata_path(cache_key)
+        # The metadata file marks the presence of a cache entry
+        has_metadata = await store.file_exists(meta_path)
+        if not has_metadata:
+            print(f"[pdf_cache] 快取不存在: {cache_key}")
+            return None
+        # Read and parse the metadata
+        meta_blob = await store.retrieve_file(meta_path)
+        if not meta_blob:
+            print(f"[pdf_cache] 無法讀取 metadata: {cache_key}")
+            return None
+        meta_text = meta_blob.getvalue().decode("utf-8")
+        meta = json.loads(meta_text)
+        chunk_count = meta.get("chunk_count", 0)
+        labels = meta.get("page_ranges", [])
+        if chunk_count == 0:
+            print(f"[pdf_cache] 快取無切片: {cache_key}")
+            return None
+        print(f"[pdf_cache] 找到快取: {cache_key}, {chunk_count} 個切片")
+        # Load every chunk; a single missing chunk invalidates the whole entry
+        loaded = []
+        for i in range(chunk_count):
+            blob = await store.retrieve_file(_get_chunk_path(cache_key, i))
+            if not blob:
+                print(f"[pdf_cache] 無法讀取切片 {i}: {cache_key}")
+                return None
+            payload = blob.getvalue()
+            # Fall back to a synthetic label when the metadata lists too few ranges
+            if i < len(labels):
+                label = labels[i]
+            else:
+                label = f"chunk-{i:03d}"
+            loaded.append((payload, label))
+        print(f"[pdf_cache] 成功載入快取: {cache_key}")
+        return loaded
+    except Exception as e:
+        print(f"[pdf_cache] 檢查快取時發生錯誤: {e}")
+        return None
+async def save_to_cache(
+    cache_key: str,
+    chunks: List[Tuple[bytes, str]],
+    original_url: str,
+    original_size_mb: float,
+    total_pages: int,
+) -> bool:
+    """
+    Store the chunks of a PDF, followed by a metadata file, in the cache.
+    Args:
+        cache_key: Cache key
+        chunks: List of chunks [(chunk_bytes, page_range), ...]
+        original_url: URL of the original PDF
+        original_size_mb: Size of the original file in MB
+        total_pages: Total number of pages
+    Returns:
+        bool: True when every chunk and the metadata were stored; False on any
+        failure (errors are printed, not raised)
+    """
+    # Local import: only this function needs timezone
+    from datetime import timezone
+    try:
+        storage = storage_store_factory()
+        # 1. Store all chunks first, so metadata only ever describes a complete entry
+        page_ranges = []
+        for i, (chunk_bytes, page_range) in enumerate(chunks):
+            chunk_path = _get_chunk_path(cache_key, i)
+            chunk_file = BytesIO(chunk_bytes)
+            success, _ = await storage.store_file(
+                chunk_path, chunk_file, public=False, content_type="application/pdf"
+            )
+            if not success:
+                print(f"[pdf_cache] 無法存入切片 {i}: {cache_key}")
+                return False
+            page_ranges.append(page_range)
+        # 2. Store the metadata
+        # datetime.utcnow() is deprecated; take an aware UTC timestamp and drop the
+        # tzinfo so the stored "created_at" string keeps its previous naive format.
+        created_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
+        metadata = {
+            "original_url": original_url,
+            "cache_key": cache_key,
+            "chunk_count": len(chunks),
+            "page_ranges": page_ranges,
+            "original_size_mb": original_size_mb,
+            "total_pages": total_pages,
+            "created_at": created_at,
+        }
+        metadata_path = _get_metadata_path(cache_key)
+        metadata_file = BytesIO(json.dumps(metadata, ensure_ascii=False).encode("utf-8"))
+        success, _ = await storage.store_file(
+            metadata_path, metadata_file, public=False, content_type="application/json"
+        )
+        if not success:
+            print(f"[pdf_cache] 無法存入 metadata: {cache_key}")
+            return False
+        print(
+            f"[pdf_cache] 成功存入快取: {cache_key}, "
+            f"{len(chunks)} 個切片, {total_pages} 頁"
+        )
+        return True
+    except Exception as e:
+        print(f"[pdf_cache] 存入快取時發生錯誤: {e}")
+        return False
+async def get_cache_metadata(cache_key: str) -> Optional[dict]:
+    """
+    Read the metadata of a cache entry without loading any chunk content.
+    Args:
+        cache_key: Cache key
+    Returns:
+        Optional[dict]: The parsed metadata dict, or None when it is missing,
+        unreadable, or an error occurs
+    """
+    try:
+        store = storage_store_factory()
+        meta_path = _get_metadata_path(cache_key)
+        found = await store.file_exists(meta_path)
+        if found:
+            meta_blob = await store.retrieve_file(meta_path)
+            if meta_blob:
+                meta_text = meta_blob.getvalue().decode("utf-8")
+                return json.loads(meta_text)
+        return None
+    except Exception as e:
+        print(f"[pdf_cache] 讀取 metadata 時發生錯誤: {e}")
+        return None
+async def delete_cache(cache_key: str) -> bool:
+    """
+    Delete a cache entry: every chunk file first, then the metadata file.
+    Args:
+        cache_key: Cache key
+    Returns:
+        bool: True when the entry was deleted or did not exist; False when an
+        error occurred
+    """
+    try:
+        store = storage_store_factory()
+        # The metadata tells us how many chunk files belong to the entry
+        meta = await get_cache_metadata(cache_key)
+        if not meta:
+            # No metadata means nothing to delete, which counts as success
+            return True
+        # Chunks go first and the metadata last, in that order
+        doomed = [
+            _get_chunk_path(cache_key, i)
+            for i in range(meta.get("chunk_count", 0))
+        ]
+        doomed.append(_get_metadata_path(cache_key))
+        for path in doomed:
+            await store.delete_file(path)
+        print(f"[pdf_cache] 已刪除快取: {cache_key}")
+        return True
+    except Exception as e:
+        print(f"[pdf_cache] 刪除快取時發生錯誤: {e}")
+        return False

botrun-flow-lang 5.11.11__py3-none-any.whl → 5.12.261__py3-none-any.whl

botrun-flow-lang 5.11.11__py3-none-any.whl → 5.12.261__py3-none-any.whl