ai-pipeline-core 0.2.9__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +32 -5
- ai_pipeline_core/debug/__init__.py +26 -0
- ai_pipeline_core/debug/config.py +91 -0
- ai_pipeline_core/debug/content.py +705 -0
- ai_pipeline_core/debug/processor.py +99 -0
- ai_pipeline_core/debug/summary.py +236 -0
- ai_pipeline_core/debug/writer.py +913 -0
- ai_pipeline_core/deployment/__init__.py +46 -0
- ai_pipeline_core/deployment/base.py +681 -0
- ai_pipeline_core/deployment/contract.py +84 -0
- ai_pipeline_core/deployment/helpers.py +98 -0
- ai_pipeline_core/documents/flow_document.py +1 -1
- ai_pipeline_core/documents/task_document.py +1 -1
- ai_pipeline_core/documents/temporary_document.py +1 -1
- ai_pipeline_core/flow/config.py +13 -2
- ai_pipeline_core/flow/options.py +4 -4
- ai_pipeline_core/images/__init__.py +362 -0
- ai_pipeline_core/images/_processing.py +157 -0
- ai_pipeline_core/llm/ai_messages.py +25 -4
- ai_pipeline_core/llm/client.py +15 -19
- ai_pipeline_core/llm/model_response.py +5 -5
- ai_pipeline_core/llm/model_types.py +10 -13
- ai_pipeline_core/logging/logging_mixin.py +2 -2
- ai_pipeline_core/pipeline.py +1 -1
- ai_pipeline_core/progress.py +127 -0
- ai_pipeline_core/prompt_builder/__init__.py +5 -0
- ai_pipeline_core/prompt_builder/documents_prompt.jinja2 +23 -0
- ai_pipeline_core/prompt_builder/global_cache.py +78 -0
- ai_pipeline_core/prompt_builder/new_core_documents_prompt.jinja2 +6 -0
- ai_pipeline_core/prompt_builder/prompt_builder.py +253 -0
- ai_pipeline_core/prompt_builder/system_prompt.jinja2 +41 -0
- ai_pipeline_core/tracing.py +54 -2
- ai_pipeline_core/utils/deploy.py +214 -6
- ai_pipeline_core/utils/remote_deployment.py +37 -187
- {ai_pipeline_core-0.2.9.dist-info → ai_pipeline_core-0.3.3.dist-info}/METADATA +96 -27
- ai_pipeline_core-0.3.3.dist-info/RECORD +57 -0
- {ai_pipeline_core-0.2.9.dist-info → ai_pipeline_core-0.3.3.dist-info}/WHEEL +1 -1
- ai_pipeline_core/simple_runner/__init__.py +0 -14
- ai_pipeline_core/simple_runner/cli.py +0 -254
- ai_pipeline_core/simple_runner/simple_runner.py +0 -247
- ai_pipeline_core-0.2.9.dist-info/RECORD +0 -41
- {ai_pipeline_core-0.2.9.dist-info → ai_pipeline_core-0.3.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Unified pipeline run response contract.
|
|
2
|
+
|
|
3
|
+
@public
|
|
4
|
+
|
|
5
|
+
Single source of truth for the response shape used by both
|
|
6
|
+
webhook push (ai-pipeline-core) and polling pull (unified-middleware).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
from uuid import UUID
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, ConfigDict, Discriminator
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class _RunBase(BaseModel):
    """Common fields on every run response variant.

    Subclasses narrow ``type`` to a ``Literal`` tag so the variants can be
    combined into the ``RunResponse`` discriminated union defined below.
    """

    # Discriminator field; each subclass overrides it with a Literal default.
    type: str
    flow_run_id: UUID
    project_name: str
    state: str  # PENDING, RUNNING, COMPLETED, FAILED, CRASHED, CANCELLED
    timestamp: datetime
    # Empty string when no storage location is associated with the run.
    storage_uri: str = ""

    # Responses are immutable value objects.
    model_config = ConfigDict(frozen=True)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class PendingRun(_RunBase):
    """Pipeline queued or running but no progress reported yet.

    Carries only the common ``_RunBase`` fields; the ``type`` tag selects
    this variant in the ``RunResponse`` discriminated union.
    """

    type: Literal["pending"] = "pending"  # pyright: ignore[reportIncompatibleVariableOverride]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ProgressRun(_RunBase):
    """Pipeline running with step-level progress data."""

    type: Literal["progress"] = "progress"  # pyright: ignore[reportIncompatibleVariableOverride]
    step: int  # current step index -- NOTE(review): 0- or 1-based not visible here, confirm with sender
    total_steps: int
    flow_name: str  # name of the flow reporting progress
    status: str  # "started", "completed", "cached"
    progress: float  # overall 0.0–1.0
    step_progress: float  # within step 0.0–1.0
    message: str  # free-form progress text
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class DeploymentResultData(BaseModel):
    """Typed result payload — always has success + optional error.

    ``extra="allow"`` lets deployments attach additional result fields
    beyond the two declared here without failing validation.
    """

    success: bool
    error: str | None = None

    # Immutable; unknown fields are preserved rather than rejected.
    model_config = ConfigDict(frozen=True, extra="allow")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class CompletedRun(_RunBase):
    """Pipeline finished (Prefect COMPLETED). Check result.success for business outcome."""

    type: Literal["completed"] = "completed"  # pyright: ignore[reportIncompatibleVariableOverride]
    # Business-level outcome; a completed run may still report success=False.
    result: DeploymentResultData
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class FailedRun(_RunBase):
    """Pipeline crashed — execution error, not business logic."""

    type: Literal["failed"] = "failed"  # pyright: ignore[reportIncompatibleVariableOverride]
    error: str  # description of the execution failure
    # Optional result payload, when one is available despite the failure.
    result: DeploymentResultData | None = None
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# Discriminated union over the four run variants, keyed on the "type" field.
# Pydantic uses each variant's Literal tag to select the concrete model
# during validation/deserialization.
RunResponse = Annotated[
    PendingRun | ProgressRun | CompletedRun | FailedRun,
    Discriminator("type"),
]
|
|
76
|
+
|
|
77
|
+
# Public API of this module, alphabetically ordered.
__all__ = [
    "CompletedRun",
    "DeploymentResultData",
    "FailedRun",
    "PendingRun",
    "ProgressRun",
    "RunResponse",
]
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Helper functions for pipeline deployments."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import re
|
|
5
|
+
from typing import Any, Literal, TypedDict
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
from ai_pipeline_core.deployment.contract import CompletedRun, FailedRun, ProgressRun
|
|
10
|
+
from ai_pipeline_core.documents import Document, DocumentList, FlowDocument
|
|
11
|
+
from ai_pipeline_core.logging import get_pipeline_logger
|
|
12
|
+
|
|
13
|
+
logger = get_pipeline_logger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class StatusPayload(TypedDict):
    """Webhook payload for Prefect state transitions (sub-flow level).

    All fields use JSON-native types (strings and ints), so the dict can be
    serialized directly for webhook delivery.
    """

    type: Literal["status"]  # fixed discriminator for status payloads
    flow_run_id: str  # NOTE(review): presumably a UUID rendered as a string — confirm with sender
    project_name: str
    step: int
    total_steps: int
    flow_name: str
    state: str  # RUNNING, COMPLETED, FAILED, CRASHED, CANCELLED
    state_name: str
    timestamp: str  # NOTE(review): presumably ISO-8601 text — confirm with producer
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def class_name_to_deployment_name(class_name: str) -> str:
    """Convert a PascalCase class name to a kebab-case deployment name.

    Example: ``ResearchPipeline`` -> ``research-pipeline``.
    """
    # Insert a dash before every ASCII uppercase letter except the first
    # character, then lowercase the whole result.
    pieces: list[str] = []
    for position, char in enumerate(class_name):
        if position and "A" <= char <= "Z":
            pieces.append("-")
        pieces.append(char)
    return "".join(pieces).lower()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def extract_generic_params(cls: type) -> tuple[type | None, type | None]:
    """Pull the (TOptions, TResult) pair out of a PipelineDeployment subclass.

    Inspects ``__orig_bases__`` for a parameterized ``PipelineDeployment``
    base; returns ``(None, None)`` when none with exactly two type
    arguments is found.
    """
    from ai_pipeline_core.deployment.base import PipelineDeployment  # noqa: PLC0415

    for candidate in getattr(cls, "__orig_bases__", []):
        # Skip bases that are not PipelineDeployment[...] parameterizations.
        if getattr(candidate, "__origin__", None) is not PipelineDeployment:
            continue
        type_args = getattr(candidate, "__args__", ())
        if len(type_args) == 2:
            options_type, result_type = type_args
            return options_type, result_type
    return None, None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
async def download_documents(
    urls: list[str],
    document_type: type[FlowDocument],
) -> DocumentList:
    """Fetch each URL and wrap the response body in *document_type*.

    The document name is taken from the last path segment of the URL with
    any query string stripped, falling back to "document" when empty.
    Raises ``httpx.HTTPStatusError`` on any non-success response.
    """
    fetched: list[Document] = []
    async with httpx.AsyncClient(timeout=60, follow_redirects=True) as http:
        for target in urls:
            reply = await http.get(target)
            reply.raise_for_status()
            last_segment = target.rsplit("/", 1)[-1]
            name = last_segment.split("?", 1)[0] or "document"
            fetched.append(document_type(name=name, content=reply.content))
    return DocumentList(fetched)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
async def upload_documents(documents: DocumentList, url_mapping: dict[str, str]) -> None:
    """PUT each document whose name appears in *url_mapping* to its URL.

    Documents without a mapped URL are skipped. Raises
    ``httpx.HTTPStatusError`` on any non-success response.
    """
    async with httpx.AsyncClient(timeout=60, follow_redirects=True) as http:
        for document in documents:
            destination = url_mapping.get(document.name)
            if destination is None:
                continue
            reply = await http.put(
                destination,
                content=document.content,
                headers={"Content-Type": document.mime_type},
            )
            reply.raise_for_status()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
async def send_webhook(
    url: str,
    payload: ProgressRun | CompletedRun | FailedRun,
    max_retries: int = 3,
    retry_delay: float = 10.0,
) -> None:
    """POST *payload* as JSON to *url*, retrying on any failure.

    Makes up to *max_retries* attempts with *retry_delay* seconds between
    them; intermediate failures are logged as warnings, and the final
    failure is logged as an error and re-raised.
    """
    data: dict[str, Any] = payload.model_dump(mode="json")
    last_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            # A fresh client per attempt keeps each retry fully independent.
            async with httpx.AsyncClient(timeout=30) as client:
                response = await client.post(url, json=data, follow_redirects=True)
                response.raise_for_status()
        except Exception as e:
            if attempt >= last_attempt:
                logger.error(f"Webhook failed after {max_retries} attempts: {e}")
                raise
            logger.warning(f"Webhook retry {attempt + 1}/{max_retries}: {e}")
            await asyncio.sleep(retry_delay)
        else:
            return
|
|
@@ -18,7 +18,7 @@ class FlowDocument(Document):
|
|
|
18
18
|
|
|
19
19
|
FlowDocument is used for data that needs to be saved between pipeline
|
|
20
20
|
steps and across multiple flow executions. These documents are typically
|
|
21
|
-
written to the file system using the
|
|
21
|
+
written to the file system using the deployment utilities.
|
|
22
22
|
|
|
23
23
|
Key characteristics:
|
|
24
24
|
- Persisted to file system between pipeline steps
|
|
@@ -40,7 +40,7 @@ class TaskDocument(Document):
|
|
|
40
40
|
|
|
41
41
|
Note:
|
|
42
42
|
- Cannot instantiate TaskDocument directly - must subclass
|
|
43
|
-
- Not saved by
|
|
43
|
+
- Not saved by deployment utilities
|
|
44
44
|
- Reduces I/O overhead for temporary data
|
|
45
45
|
- No additional abstract methods to implement
|
|
46
46
|
"""
|
|
@@ -23,7 +23,7 @@ class TemporaryDocument(Document):
|
|
|
23
23
|
- Can be instantiated directly (not abstract)
|
|
24
24
|
- Cannot be subclassed (annotated with Python's @final decorator in code)
|
|
25
25
|
- Useful for transient data like API responses or intermediate calculations
|
|
26
|
-
- Ignored by
|
|
26
|
+
- Ignored by deployment save operations
|
|
27
27
|
- Useful for tests and debugging
|
|
28
28
|
|
|
29
29
|
Creating TemporaryDocuments:
|
ai_pipeline_core/flow/config.py
CHANGED
|
@@ -39,11 +39,13 @@ class FlowConfig(ABC):
|
|
|
39
39
|
Class Variables:
|
|
40
40
|
INPUT_DOCUMENT_TYPES: List of FlowDocument types this flow accepts
|
|
41
41
|
OUTPUT_DOCUMENT_TYPE: Single FlowDocument type this flow produces
|
|
42
|
+
WEIGHT: Weight for progress calculation (default 1.0, based on avg duration)
|
|
42
43
|
|
|
43
44
|
Validation Rules:
|
|
44
45
|
- INPUT_DOCUMENT_TYPES and OUTPUT_DOCUMENT_TYPE must be defined
|
|
45
46
|
- OUTPUT_DOCUMENT_TYPE cannot be in INPUT_DOCUMENT_TYPES (prevents cycles)
|
|
46
47
|
- Field names must be exact (common typos are detected)
|
|
48
|
+
- WEIGHT must be a positive number
|
|
47
49
|
|
|
48
50
|
Why this matters:
|
|
49
51
|
Flows connect in pipelines where one flow's output becomes another's input.
|
|
@@ -54,6 +56,7 @@ class FlowConfig(ABC):
|
|
|
54
56
|
>>> class ProcessingFlowConfig(FlowConfig):
|
|
55
57
|
... INPUT_DOCUMENT_TYPES = [RawDataDocument]
|
|
56
58
|
... OUTPUT_DOCUMENT_TYPE = ProcessedDocument # Different type!
|
|
59
|
+
... WEIGHT = 45.0 # Average ~45 minutes
|
|
57
60
|
>>>
|
|
58
61
|
>>> # Use in @pipeline_flow - RECOMMENDED PATTERN
|
|
59
62
|
>>> @pipeline_flow(config=ProcessingFlowConfig, name="processing")
|
|
@@ -72,11 +75,12 @@ class FlowConfig(ABC):
|
|
|
72
75
|
Note:
|
|
73
76
|
- Validation happens at class definition time
|
|
74
77
|
- Helps catch configuration errors early
|
|
75
|
-
- Used by
|
|
78
|
+
- Used by PipelineDeployment to manage document flow
|
|
76
79
|
"""
|
|
77
80
|
|
|
78
81
|
INPUT_DOCUMENT_TYPES: ClassVar[list[type[FlowDocument]]]
|
|
79
82
|
OUTPUT_DOCUMENT_TYPE: ClassVar[type[FlowDocument]]
|
|
83
|
+
WEIGHT: ClassVar[float] = 1.0
|
|
80
84
|
|
|
81
85
|
def __init_subclass__(cls, **kwargs: Any):
|
|
82
86
|
"""Validate flow configuration at subclass definition time.
|
|
@@ -106,7 +110,7 @@ class FlowConfig(ABC):
|
|
|
106
110
|
return
|
|
107
111
|
|
|
108
112
|
# Check for invalid field names (common mistakes)
|
|
109
|
-
allowed_fields = {"INPUT_DOCUMENT_TYPES", "OUTPUT_DOCUMENT_TYPE"}
|
|
113
|
+
allowed_fields = {"INPUT_DOCUMENT_TYPES", "OUTPUT_DOCUMENT_TYPE", "WEIGHT"}
|
|
110
114
|
class_attrs = {name for name in dir(cls) if not name.startswith("_") and name.isupper()}
|
|
111
115
|
|
|
112
116
|
# Find fields that look like they might be mistakes
|
|
@@ -145,6 +149,13 @@ class FlowConfig(ABC):
|
|
|
145
149
|
f"({cls.OUTPUT_DOCUMENT_TYPE.__name__}) cannot be in INPUT_DOCUMENT_TYPES"
|
|
146
150
|
)
|
|
147
151
|
|
|
152
|
+
# Validate WEIGHT
|
|
153
|
+
weight = getattr(cls, "WEIGHT", 1.0)
|
|
154
|
+
if not isinstance(weight, (int, float)) or weight <= 0:
|
|
155
|
+
raise TypeError(
|
|
156
|
+
f"FlowConfig {cls.__name__}: WEIGHT must be a positive number, got {weight}"
|
|
157
|
+
)
|
|
158
|
+
|
|
148
159
|
@classmethod
|
|
149
160
|
def get_input_document_types(cls) -> list[type[FlowDocument]]:
|
|
150
161
|
"""Get the list of input document types this flow accepts.
|
ai_pipeline_core/flow/options.py
CHANGED
|
@@ -41,7 +41,7 @@ class FlowOptions(BaseSettings):
|
|
|
41
41
|
|
|
42
42
|
>>> # Or create programmatically:
|
|
43
43
|
>>> options = MyFlowOptions(
|
|
44
|
-
... core_model="gemini-
|
|
44
|
+
... core_model="gemini-3-pro",
|
|
45
45
|
... temperature=0.9
|
|
46
46
|
... )
|
|
47
47
|
|
|
@@ -53,7 +53,7 @@ class FlowOptions(BaseSettings):
|
|
|
53
53
|
- Frozen (immutable) after creation
|
|
54
54
|
- Extra fields ignored (not strict)
|
|
55
55
|
- Can be populated from environment variables
|
|
56
|
-
- Used by
|
|
56
|
+
- Used by PipelineDeployment.run_cli for command-line parsing
|
|
57
57
|
|
|
58
58
|
Note:
|
|
59
59
|
The base class provides model selection. Subclasses should
|
|
@@ -61,11 +61,11 @@ class FlowOptions(BaseSettings):
|
|
|
61
61
|
"""
|
|
62
62
|
|
|
63
63
|
core_model: ModelName = Field(
|
|
64
|
-
default="gemini-
|
|
64
|
+
default="gemini-3-pro",
|
|
65
65
|
description="Primary model for complex analysis and generation tasks.",
|
|
66
66
|
)
|
|
67
67
|
small_model: ModelName = Field(
|
|
68
|
-
default="grok-4-fast",
|
|
68
|
+
default="grok-4.1-fast",
|
|
69
69
|
description="Fast, cost-effective model for simple tasks and orchestration.",
|
|
70
70
|
)
|
|
71
71
|
|
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
"""Image processing utilities for LLM vision models.
|
|
2
|
+
|
|
3
|
+
@public
|
|
4
|
+
|
|
5
|
+
Splits large images, compresses to JPEG, and respects model-specific constraints.
|
|
6
|
+
Designed for website screenshots, document pages, and other visual content
|
|
7
|
+
sent to vision-capable LLMs.
|
|
8
|
+
|
|
9
|
+
Quick Start:
|
|
10
|
+
>>> from ai_pipeline_core.images import process_image, ImagePreset
|
|
11
|
+
>>>
|
|
12
|
+
>>> result = process_image(screenshot_bytes)
|
|
13
|
+
>>> for part in result:
|
|
14
|
+
... send_to_llm(part.data, context=part.label)
|
|
15
|
+
>>>
|
|
16
|
+
>>> result = process_image(screenshot_bytes, preset=ImagePreset.GEMINI)
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from enum import StrEnum
|
|
20
|
+
|
|
21
|
+
from pydantic import BaseModel, Field
|
|
22
|
+
|
|
23
|
+
from ai_pipeline_core.documents import Document, TemporaryDocument
|
|
24
|
+
|
|
25
|
+
from ._processing import execute_split, load_and_normalize, plan_split
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
"ImagePreset",
|
|
29
|
+
"ImageProcessingConfig",
|
|
30
|
+
"ImagePart",
|
|
31
|
+
"ProcessedImage",
|
|
32
|
+
"ImageProcessingError",
|
|
33
|
+
"process_image",
|
|
34
|
+
"process_image_to_documents",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
# Configuration
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class ImagePreset(StrEnum):
    """Presets for LLM vision model constraints.

    @public

    Each member maps to a frozen ``ImageProcessingConfig`` in the
    module-level preset table (see ``ImageProcessingConfig.for_preset``).
    """

    GEMINI = "gemini"
    CLAUDE = "claude"
    GPT4V = "gpt4v"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class ImageProcessingConfig(BaseModel):
    """Configuration for image processing.

    @public

    Use ``for_preset`` for standard configurations or construct directly for
    custom constraints.

    Example:
        >>> config = ImageProcessingConfig.for_preset(ImagePreset.GEMINI)
        >>> config = ImageProcessingConfig(max_dimension=2000, jpeg_quality=80)
    """

    # Frozen so preset instances in _PRESETS can be shared safely.
    model_config = {"frozen": True}

    max_dimension: int = Field(
        default=3000,
        ge=100,
        le=8192,
        description="Maximum width AND height in pixels",
    )
    max_pixels: int = Field(
        default=9_000_000,
        ge=10_000,
        description="Maximum total pixels per output image part",
    )
    overlap_fraction: float = Field(
        default=0.20,
        ge=0.0,
        le=0.5,
        description="Overlap between adjacent vertical parts (0.0-0.5)",
    )
    max_parts: int = Field(
        default=20,
        ge=1,
        le=100,
        description="Maximum number of output image parts",
    )
    jpeg_quality: int = Field(
        default=60,
        ge=10,
        le=95,
        description="JPEG compression quality (10-95)",
    )

    @classmethod
    def for_preset(cls, preset: ImagePreset) -> "ImageProcessingConfig":
        """Create configuration from a model preset.

        @public

        Returns the shared frozen instance from the module-level preset table.
        """
        return _PRESETS[preset]
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# Preset table backing ImageProcessingConfig.for_preset. Fields not set here
# fall back to the ImageProcessingConfig defaults (overlap_fraction=0.20,
# max_parts=20). NOTE(review): the per-model limits below presumably mirror
# each vendor's documented image constraints — confirm against current docs.
_PRESETS: dict[ImagePreset, ImageProcessingConfig] = {
    ImagePreset.GEMINI: ImageProcessingConfig(
        max_dimension=3000,
        max_pixels=9_000_000,
        jpeg_quality=75,
    ),
    ImagePreset.CLAUDE: ImageProcessingConfig(
        max_dimension=1568,
        max_pixels=1_150_000,
        jpeg_quality=60,
    ),
    ImagePreset.GPT4V: ImageProcessingConfig(
        max_dimension=2048,
        max_pixels=4_000_000,
        jpeg_quality=70,
    ),
}
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# ---------------------------------------------------------------------------
|
|
128
|
+
# Result models
|
|
129
|
+
# ---------------------------------------------------------------------------
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class ImagePart(BaseModel):
    """A single processed image part.

    @public
    """

    model_config = {"frozen": True}

    # Encoded image bytes; repr=False keeps large blobs out of debug output.
    data: bytes = Field(repr=False)
    width: int
    height: int
    index: int = Field(ge=0, description="0-indexed position")
    total: int = Field(ge=1, description="Total number of parts")
    source_y: int = Field(ge=0, description="Y offset in original image")
    source_height: int = Field(ge=1, description="Height of region in original")

    @property
    def label(self) -> str:
        """Human-readable label for LLM context, 1-indexed.

        @public
        """
        # A lone part needs no "Part i/n" numbering.
        if self.total == 1:
            return "Full image"
        return f"Part {self.index + 1}/{self.total}"
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class ProcessedImage(BaseModel):
    """Result of image processing.

    @public

    Iterable: ``for part in result`` iterates over parts.
    """

    model_config = {"frozen": True}

    parts: list[ImagePart]
    original_width: int
    original_height: int
    original_bytes: int  # size of the input, in bytes
    output_bytes: int  # combined size of all encoded parts, in bytes
    was_trimmed: bool = Field(description="True if width was trimmed to fit")
    warnings: list[str] = Field(default_factory=list)

    @property
    def compression_ratio(self) -> float:
        """Output size / input size (lower means more compression).

        @public
        """
        # Guard against division by zero for degenerate inputs.
        if self.original_bytes <= 0:
            return 1.0
        return self.output_bytes / self.original_bytes

    def __len__(self) -> int:
        """Number of parts."""
        return len(self.parts)

    def __iter__(self):  # type: ignore[override]
        """Iterate over the parts in order."""
        return iter(self.parts)

    def __getitem__(self, idx: int) -> ImagePart:
        """Index into the parts list."""
        return self.parts[idx]
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# ---------------------------------------------------------------------------
|
|
198
|
+
# Exceptions
|
|
199
|
+
# ---------------------------------------------------------------------------
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
class ImageProcessingError(Exception):
    """Image processing failed.

    @public

    Raised by ``process_image`` for unsupported input types, empty data,
    or images that cannot be decoded.
    """
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
# ---------------------------------------------------------------------------
|
|
210
|
+
# Public API
|
|
211
|
+
# ---------------------------------------------------------------------------
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def process_image(
    image: bytes | Document,
    preset: ImagePreset = ImagePreset.GEMINI,
    config: ImageProcessingConfig | None = None,
) -> ProcessedImage:
    """Process an image for LLM vision models.

    @public

    Splits tall images vertically with overlap, trims width if needed, and
    compresses to JPEG. The default preset is **GEMINI** (3000 px, 9M pixels).

    Args:
        image: Raw image bytes or a Document whose content is an image.
        preset: Model preset (ignored when *config* is provided).
        config: Custom configuration that overrides the preset.

    Returns:
        A ``ProcessedImage`` containing one or more ``ImagePart`` objects.

    Raises:
        ImageProcessingError: If the image cannot be decoded or processed.

    Example:
        >>> result = process_image(screenshot_bytes)
        >>> for part in result:
        ...     print(part.label, len(part.data))
    """
    settings = config if config is not None else ImageProcessingConfig.for_preset(preset)

    # Normalise the input to raw bytes.
    if isinstance(image, Document):
        payload = image.content
    elif isinstance(image, bytes):  # type: ignore[reportUnnecessaryIsInstance]
        payload = image
    else:
        raise ImageProcessingError(f"Unsupported image input type: {type(image)}")

    if not payload:
        raise ImageProcessingError("Empty image data")

    input_size = len(payload)

    # Decode and normalise; wrap any decoder failure in our exception type.
    try:
        pil_image = load_and_normalize(payload)
    except Exception as exc:
        raise ImageProcessingError(f"Failed to decode image: {exc}") from exc

    src_width, src_height = pil_image.size

    # Decide how to trim/split before touching any pixels.
    split_plan = plan_split(
        width=src_width,
        height=src_height,
        max_dimension=settings.max_dimension,
        max_pixels=settings.max_pixels,
        overlap_fraction=settings.overlap_fraction,
        max_parts=settings.max_parts,
    )

    # Carry out the plan, producing encoded parts with geometry metadata.
    encoded = execute_split(pil_image, split_plan, settings.jpeg_quality)
    part_count = len(encoded)

    parts = [
        ImagePart(
            data=blob,
            width=part_w,
            height=part_h,
            index=position,
            total=part_count,
            source_y=y_offset,
            source_height=region_height,
        )
        for position, (blob, part_w, part_h, y_offset, region_height) in enumerate(encoded)
    ]
    output_size = sum(len(item.data) for item in parts)

    return ProcessedImage(
        parts=parts,
        original_width=src_width,
        original_height=src_height,
        original_bytes=input_size,
        output_bytes=output_size,
        was_trimmed=split_plan.trim_width is not None,
        warnings=split_plan.warnings,
    )
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def process_image_to_documents(
    image: bytes | Document,
    preset: ImagePreset = ImagePreset.GEMINI,
    config: ImageProcessingConfig | None = None,
    name_prefix: str = "image",
    sources: list[str] | None = None,
) -> list[TemporaryDocument]:
    """Process an image and return parts as ``TemporaryDocument`` list.

    @public

    Thin wrapper over ``process_image`` whose output can be passed straight
    to ``AIMessages``.

    Args:
        image: Raw image bytes or a Document.
        preset: Model preset (ignored when *config* is provided).
        config: Custom configuration.
        name_prefix: Prefix for generated document names.
        sources: Optional provenance references attached to each document.

    Returns:
        List of ``TemporaryDocument`` instances with JPEG image data.

    Example:
        >>> docs = process_image_to_documents(screenshot_bytes)
        >>> messages = AIMessages(docs)
    """
    processed = process_image(image, preset=preset, config=config)

    # Provenance: caller-supplied sources plus the input document's hash.
    provenance: list[str] = list(sources or [])
    if isinstance(image, Document):
        provenance.append(image.sha256)

    single_part = len(processed.parts) == 1
    docs: list[TemporaryDocument] = []
    for part in processed.parts:
        # A lone part gets a plain name and no description; multiple parts
        # get zero-padded "NN_of_MM" names plus their human-readable label.
        if single_part:
            doc_name = f"{name_prefix}.jpg"
            description = None
        else:
            doc_name = f"{name_prefix}_{part.index + 1:02d}_of_{part.total:02d}.jpg"
            description = part.label
        docs.append(
            TemporaryDocument.create(
                name=doc_name,
                content=part.data,
                description=description,
                sources=provenance or None,
            )
        )
    return docs
|