ai-pipeline-core 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only.
- ai_pipeline_core/__init__.py +64 -158
- ai_pipeline_core/deployment/__init__.py +6 -18
- ai_pipeline_core/deployment/base.py +392 -212
- ai_pipeline_core/deployment/contract.py +6 -10
- ai_pipeline_core/{utils → deployment}/deploy.py +50 -69
- ai_pipeline_core/deployment/helpers.py +16 -17
- ai_pipeline_core/{progress.py → deployment/progress.py} +23 -24
- ai_pipeline_core/{utils/remote_deployment.py → deployment/remote.py} +11 -14
- ai_pipeline_core/docs_generator/__init__.py +54 -0
- ai_pipeline_core/docs_generator/__main__.py +5 -0
- ai_pipeline_core/docs_generator/cli.py +196 -0
- ai_pipeline_core/docs_generator/extractor.py +324 -0
- ai_pipeline_core/docs_generator/guide_builder.py +644 -0
- ai_pipeline_core/docs_generator/trimmer.py +35 -0
- ai_pipeline_core/docs_generator/validator.py +114 -0
- ai_pipeline_core/document_store/__init__.py +13 -0
- ai_pipeline_core/document_store/_summary.py +9 -0
- ai_pipeline_core/document_store/_summary_worker.py +170 -0
- ai_pipeline_core/document_store/clickhouse.py +492 -0
- ai_pipeline_core/document_store/factory.py +38 -0
- ai_pipeline_core/document_store/local.py +312 -0
- ai_pipeline_core/document_store/memory.py +85 -0
- ai_pipeline_core/document_store/protocol.py +68 -0
- ai_pipeline_core/documents/__init__.py +12 -14
- ai_pipeline_core/documents/_context_vars.py +85 -0
- ai_pipeline_core/documents/_hashing.py +52 -0
- ai_pipeline_core/documents/attachment.py +85 -0
- ai_pipeline_core/documents/context.py +128 -0
- ai_pipeline_core/documents/document.py +318 -1434
- ai_pipeline_core/documents/mime_type.py +11 -84
- ai_pipeline_core/documents/utils.py +4 -12
- ai_pipeline_core/exceptions.py +10 -62
- ai_pipeline_core/images/__init__.py +32 -85
- ai_pipeline_core/images/_processing.py +5 -11
- ai_pipeline_core/llm/__init__.py +6 -4
- ai_pipeline_core/llm/ai_messages.py +102 -90
- ai_pipeline_core/llm/client.py +229 -183
- ai_pipeline_core/llm/model_options.py +12 -84
- ai_pipeline_core/llm/model_response.py +53 -99
- ai_pipeline_core/llm/model_types.py +8 -23
- ai_pipeline_core/logging/__init__.py +2 -7
- ai_pipeline_core/logging/logging.yml +1 -1
- ai_pipeline_core/logging/logging_config.py +27 -37
- ai_pipeline_core/logging/logging_mixin.py +15 -41
- ai_pipeline_core/observability/__init__.py +32 -0
- ai_pipeline_core/observability/_debug/__init__.py +30 -0
- ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
- ai_pipeline_core/{debug/config.py → observability/_debug/_config.py} +11 -7
- ai_pipeline_core/{debug/content.py → observability/_debug/_content.py} +133 -75
- ai_pipeline_core/{debug/processor.py → observability/_debug/_processor.py} +16 -17
- ai_pipeline_core/{debug/summary.py → observability/_debug/_summary.py} +113 -37
- ai_pipeline_core/observability/_debug/_types.py +75 -0
- ai_pipeline_core/{debug/writer.py → observability/_debug/_writer.py} +126 -196
- ai_pipeline_core/observability/_document_tracking.py +146 -0
- ai_pipeline_core/observability/_initialization.py +194 -0
- ai_pipeline_core/observability/_logging_bridge.py +57 -0
- ai_pipeline_core/observability/_summary.py +81 -0
- ai_pipeline_core/observability/_tracking/__init__.py +6 -0
- ai_pipeline_core/observability/_tracking/_client.py +178 -0
- ai_pipeline_core/observability/_tracking/_internal.py +28 -0
- ai_pipeline_core/observability/_tracking/_models.py +138 -0
- ai_pipeline_core/observability/_tracking/_processor.py +158 -0
- ai_pipeline_core/observability/_tracking/_service.py +311 -0
- ai_pipeline_core/observability/_tracking/_writer.py +229 -0
- ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -335
- ai_pipeline_core/pipeline/__init__.py +10 -0
- ai_pipeline_core/pipeline/decorators.py +915 -0
- ai_pipeline_core/pipeline/options.py +16 -0
- ai_pipeline_core/prompt_manager.py +16 -102
- ai_pipeline_core/settings.py +26 -31
- ai_pipeline_core/testing.py +9 -0
- ai_pipeline_core-0.4.0.dist-info/METADATA +807 -0
- ai_pipeline_core-0.4.0.dist-info/RECORD +76 -0
- ai_pipeline_core/debug/__init__.py +0 -26
- ai_pipeline_core/documents/document_list.py +0 -420
- ai_pipeline_core/documents/flow_document.py +0 -112
- ai_pipeline_core/documents/task_document.py +0 -117
- ai_pipeline_core/documents/temporary_document.py +0 -74
- ai_pipeline_core/flow/__init__.py +0 -9
- ai_pipeline_core/flow/config.py +0 -494
- ai_pipeline_core/flow/options.py +0 -75
- ai_pipeline_core/pipeline.py +0 -718
- ai_pipeline_core/prefect.py +0 -63
- ai_pipeline_core/prompt_builder/__init__.py +0 -5
- ai_pipeline_core/prompt_builder/documents_prompt.jinja2 +0 -23
- ai_pipeline_core/prompt_builder/global_cache.py +0 -78
- ai_pipeline_core/prompt_builder/new_core_documents_prompt.jinja2 +0 -6
- ai_pipeline_core/prompt_builder/prompt_builder.py +0 -253
- ai_pipeline_core/prompt_builder/system_prompt.jinja2 +0 -41
- ai_pipeline_core/storage/__init__.py +0 -8
- ai_pipeline_core/storage/storage.py +0 -628
- ai_pipeline_core/utils/__init__.py +0 -8
- ai_pipeline_core-0.3.4.dist-info/METADATA +0 -569
- ai_pipeline_core-0.3.4.dist-info/RECORD +0 -57
- {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/licenses/LICENSE +0 -0

ai_pipeline_core/observability/_document_tracking.py (new file)
@@ -0,0 +1,146 @@
"""Document tracking helpers for pipeline instrumentation.

Emits document lifecycle events and sets OTel span attributes for
document lineage. All functions are no-ops when tracking is not initialized.
"""

from typing import cast

from opentelemetry import trace as otel_trace

from ai_pipeline_core.documents import Document
from ai_pipeline_core.logging import get_pipeline_logger
from ai_pipeline_core.observability._initialization import TrackingServiceProtocol, get_tracking_service
from ai_pipeline_core.observability._tracking._models import ATTR_INPUT_DOCUMENT_SHA256S, ATTR_OUTPUT_DOCUMENT_SHA256S, DocumentEventType

logger = get_pipeline_logger(__name__)


def get_current_span_id() -> str:
    """Return the current OTel span ID as hex, or empty string."""
    span = otel_trace.get_current_span()
    ctx = span.get_span_context()
    if ctx and ctx.span_id:
        return format(ctx.span_id, "016x")
    return ""


def _get_tracking_service() -> TrackingServiceProtocol | None:
    """Return the global tracking service, or None if not initialized."""
    return get_tracking_service()


def track_task_io(task_name: str, args: tuple[object, ...], kwargs: dict[str, object], result: object) -> None:  # noqa: ARG001
    """Track input/output documents for a pipeline task."""
    service = _get_tracking_service()
    if service is None:
        return

    span_id = get_current_span_id()
    input_sha256s: list[str] = []
    output_sha256s: list[str] = []

    # Track input documents
    for arg in (*args, *kwargs.values()):
        if isinstance(arg, Document):
            input_sha256s.append(arg.sha256)
            service.track_document_event(
                document_sha256=arg.sha256,
                span_id=span_id,
                event_type=DocumentEventType.TASK_INPUT,
            )
        elif isinstance(arg, list) and arg and isinstance(arg[0], Document):
            for doc in cast(list[Document], arg):
                input_sha256s.append(doc.sha256)
                service.track_document_event(
                    document_sha256=doc.sha256,
                    span_id=span_id,
                    event_type=DocumentEventType.TASK_INPUT,
                )

    # Track output documents
    if isinstance(result, Document):
        output_sha256s.append(result.sha256)
        service.track_document_event(
            document_sha256=result.sha256,
            span_id=span_id,
            event_type=DocumentEventType.TASK_OUTPUT,
        )
    elif isinstance(result, list) and result and isinstance(result[0], Document):
        for doc in cast(list[Document], result):
            output_sha256s.append(doc.sha256)
            service.track_document_event(
                document_sha256=doc.sha256,
                span_id=span_id,
                event_type=DocumentEventType.TASK_OUTPUT,
            )

    # Set span attributes for TrackingSpanProcessor to populate tracked_spans columns
    if input_sha256s or output_sha256s:
        span = otel_trace.get_current_span()
        if input_sha256s:
            span.set_attribute(ATTR_INPUT_DOCUMENT_SHA256S, input_sha256s)
        if output_sha256s:
            span.set_attribute(ATTR_OUTPUT_DOCUMENT_SHA256S, output_sha256s)


def track_flow_io(flow_name: str, input_documents: list[Document], output_documents: list[Document]) -> None:  # noqa: ARG001
    """Track input/output documents for a pipeline flow."""
    service = _get_tracking_service()
    if service is None:
        return

    span_id = get_current_span_id()
    input_sha256s: list[str] = []
    output_sha256s: list[str] = []

    for doc in input_documents:
        input_sha256s.append(doc.sha256)
        service.track_document_event(
            document_sha256=doc.sha256,
            span_id=span_id,
            event_type=DocumentEventType.FLOW_INPUT,
        )

    for doc in output_documents:
        output_sha256s.append(doc.sha256)
        service.track_document_event(
            document_sha256=doc.sha256,
            span_id=span_id,
            event_type=DocumentEventType.FLOW_OUTPUT,
        )

    if input_sha256s or output_sha256s:
        span = otel_trace.get_current_span()
        if input_sha256s:
            span.set_attribute(ATTR_INPUT_DOCUMENT_SHA256S, input_sha256s)
        if output_sha256s:
            span.set_attribute(ATTR_OUTPUT_DOCUMENT_SHA256S, output_sha256s)


def track_llm_documents(context: object | None, messages: object | None) -> None:
    """Track documents used in LLM calls (context and messages)."""
    service = _get_tracking_service()
    if service is None:
        return

    span_id = get_current_span_id()

    if context is not None:
        _track_docs_from_messages(service, context, span_id, DocumentEventType.LLM_CONTEXT)

    if messages is not None:
        _track_docs_from_messages(service, messages, span_id, DocumentEventType.LLM_MESSAGE)


def _track_docs_from_messages(service: TrackingServiceProtocol, messages: object, span_id: str, event_type: DocumentEventType) -> None:
    """Extract and track documents from AIMessages or similar containers."""
    if not isinstance(messages, list):
        return
    for item in cast(list[object], messages):
        if isinstance(item, Document):
            service.track_document_event(
                document_sha256=item.sha256,
                span_id=span_id,
                event_type=event_type,
            )
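
A minimal usage sketch (not part of the diff; the "tracked" wrapper name and wiring are illustrative assumptions, not the framework's actual decorator):

# Hypothetical wrapper showing where track_task_io fits around a task call.
import functools

from ai_pipeline_core.observability._document_tracking import track_task_io

def tracked(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        result = fn(*args, **kwargs)
        # No-op unless initialize_observability() has set up ClickHouse tracking.
        track_task_io(fn.__name__, args, kwargs, result)
        return result
    return wrapper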

ai_pipeline_core/observability/_initialization.py (new file)
@@ -0,0 +1,194 @@
"""Observability system initialization.

Provides ``initialize_observability()`` as the single entry point for
setting up Laminar and ClickHouse tracking.
"""

import importlib
from typing import Any, Protocol
from uuid import UUID

from lmnr import Laminar
from opentelemetry import trace as otel_trace
from pydantic import BaseModel, ConfigDict

from ai_pipeline_core.logging import get_pipeline_logger
from ai_pipeline_core.observability._tracking._client import ClickHouseClient
from ai_pipeline_core.observability._tracking._models import DocumentEventType, RunStatus
from ai_pipeline_core.observability._tracking._processor import TrackingSpanProcessor
from ai_pipeline_core.observability._tracking._service import TrackingService
from ai_pipeline_core.settings import settings

logger = get_pipeline_logger(__name__)


class TrackingServiceProtocol(Protocol):
    """Protocol for the tracking service methods used by deployment, decorators, and document tracking."""

    # Run lifecycle
    def set_run_context(self, *, run_id: UUID, project_name: str, flow_name: str, run_scope: str = "") -> None:
        """Store run metadata in context vars for downstream span attribution."""
        ...

    def track_run_start(self, *, run_id: UUID, project_name: str, flow_name: str, run_scope: str = "") -> None:
        """Record a pipeline run start event to ClickHouse."""
        ...

    def track_run_end(
        self,
        *,
        run_id: UUID,
        status: RunStatus,
        total_cost: float = ...,
        total_tokens: int = ...,
        metadata: dict[str, object] | None = ...,
    ) -> None:
        """Record a pipeline run completion event with final metrics."""
        ...

    def clear_run_context(self) -> None:
        """Reset run-scoped context vars after a run finishes."""
        ...

    # Document tracking
    def track_document_event(
        self,
        *,
        document_sha256: str,
        span_id: str,
        event_type: DocumentEventType,
        metadata: dict[str, str] | None = ...,
    ) -> None:
        """Record a document lifecycle event (created, read, transformed)."""
        ...

    # Summaries
    def schedule_summary(self, span_id: str, label: str, output_hint: str) -> None:
        """Queue an LLM-generated summary for a span's output."""
        ...

    # Lifecycle
    def flush(self, timeout: float = 30.0) -> None:
        """Flush all pending tracking events to ClickHouse."""
        ...

    def shutdown(self, timeout: float = 30.0) -> None:
        """Flush pending events and release tracking resources."""
        ...


_tracking_service: TrackingServiceProtocol | None = None


def get_tracking_service() -> TrackingServiceProtocol | None:
    """Return the global TrackingService instance, or None if not initialized."""
    return _tracking_service


class ObservabilityConfig(BaseModel):
    """Configuration for the observability system."""

    model_config = ConfigDict(frozen=True)

    # Laminar
    lmnr_project_api_key: str = ""
    lmnr_debug: str = ""

    # ClickHouse tracking
    clickhouse_host: str = ""
    clickhouse_port: int = 8443
    clickhouse_database: str = "default"
    clickhouse_user: str = "default"
    clickhouse_password: str = ""
    clickhouse_secure: bool = True

    # Tracking behavior
    tracking_enabled: bool = True
    tracking_summary_model: str = "gemini-3-flash"

    @property
    def has_clickhouse(self) -> bool:
        """Whether ClickHouse is configured."""
        return bool(self.clickhouse_host)

    @property
    def has_lmnr(self) -> bool:
        """Whether Laminar is configured."""
        return bool(self.lmnr_project_api_key)


def _build_config_from_settings() -> ObservabilityConfig:
    """Build ObservabilityConfig from framework Settings."""
    return ObservabilityConfig(
        lmnr_project_api_key=getattr(settings, "lmnr_project_api_key", ""),
        lmnr_debug=getattr(settings, "lmnr_debug", ""),
        clickhouse_host=getattr(settings, "clickhouse_host", ""),
        clickhouse_port=getattr(settings, "clickhouse_port", 8443),
        clickhouse_database=getattr(settings, "clickhouse_database", "default"),
        clickhouse_user=getattr(settings, "clickhouse_user", "default"),
        clickhouse_password=getattr(settings, "clickhouse_password", ""),
        clickhouse_secure=getattr(settings, "clickhouse_secure", True),
        tracking_enabled=getattr(settings, "tracking_enabled", True),
        tracking_summary_model=getattr(settings, "tracking_summary_model", "gemini-3-flash"),
    )


def _setup_tracking(config: ObservabilityConfig) -> TrackingServiceProtocol | None:
    """Set up ClickHouse tracking if configured. Returns TrackingService or None."""
    if not config.has_clickhouse or not config.tracking_enabled:
        return None

    client = ClickHouseClient(
        host=config.clickhouse_host,
        port=config.clickhouse_port,
        database=config.clickhouse_database,
        username=config.clickhouse_user,
        password=config.clickhouse_password,
        secure=config.clickhouse_secure,
    )
    summary_mod = importlib.import_module("ai_pipeline_core.observability._summary")
    service = TrackingService(
        client,
        summary_model=config.tracking_summary_model,
        span_summary_fn=summary_mod.generate_span_summary,
    )

    # Register span processor with OTel
    try:
        provider: Any = otel_trace.get_tracer_provider()
        if hasattr(provider, "add_span_processor"):
            processor = TrackingSpanProcessor(service)
            provider.add_span_processor(processor)
            logger.info("ClickHouse tracking initialized")
    except Exception as e:
        logger.warning(f"Failed to register TrackingSpanProcessor: {e}")

    return service


def initialize_observability(config: ObservabilityConfig | None = None) -> None:
    """Initialize the full observability stack.

    Call once at pipeline startup. Safe to call multiple times (idempotent
    for Laminar). Reads from Settings if no config provided.
    """
    global _tracking_service  # noqa: PLW0603

    if _tracking_service is not None:
        return  # Already initialized

    if config is None:
        config = _build_config_from_settings()

    # 1. Laminar
    if config.has_lmnr:
        try:
            Laminar.initialize(project_api_key=config.lmnr_project_api_key, export_timeout_seconds=15)
            logger.info("Laminar initialized")
        except Exception as e:
            logger.warning(f"Laminar initialization failed: {e}")

    # 2. ClickHouse tracking
    _tracking_service = _setup_tracking(config)

    # 3. Logging bridge — attached per-logger in get_pipeline_logger(), nothing to do here.
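
A startup sketch (not part of the diff; host and key values are placeholders):

from ai_pipeline_core.observability._initialization import (
    ObservabilityConfig,
    initialize_observability,
)

# Passing no config reads framework Settings instead.
initialize_observability(
    ObservabilityConfig(
        lmnr_project_api_key="lmnr-...",           # enables Laminar when non-empty
        clickhouse_host="clickhouse.example.com",  # enables ClickHouse tracking
        clickhouse_password="...",
    )
)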

ai_pipeline_core/observability/_logging_bridge.py (new file)
@@ -0,0 +1,57 @@
"""Logging bridge — captures Python log records as OTel span events.

Attaches a singleton ``SpanEventLoggingHandler`` to every logger created
via ``get_pipeline_logger()``. The handler is safe to attach eagerly
because ``emit()`` is a no-op when no OTel span is recording.

This is the only module that legitimately needs ``import logging`` directly
to subclass ``logging.Handler``. The ruff ban on ``import logging``
(pyproject.toml) is suppressed with ``# noqa: TID251``.
"""

import contextlib
import logging  # noqa: TID251

from opentelemetry import trace as otel_trace

_MIN_LEVEL = logging.INFO


class SpanEventLoggingHandler(logging.Handler):
    """Logging handler that writes log records as OTel span events.

    Attached to each logger returned by ``get_pipeline_logger()``.
    Only captures records at INFO level and above. Each record becomes
    a span event with ``log.level`` and ``log.message`` attributes.
    """

    def __init__(self) -> None:
        super().__init__(level=_MIN_LEVEL)

    def emit(self, record: logging.LogRecord) -> None:
        """Write a log record as an OTel span event."""
        with contextlib.suppress(Exception):
            # Prevent duplicate events when handler is on both parent and child logger
            if getattr(record, "_span_event_logged", False):
                return
            span = otel_trace.get_current_span()
            if not span.is_recording():
                return
            span.add_event(
                name="log",
                attributes={
                    "log.level": record.levelname,
                    "log.message": self.format(record),
                    "log.logger": record.name,
                },
            )
            record._span_event_logged = True


# Module-level singleton — safe because emit() checks is_recording().
_bridge_handler = SpanEventLoggingHandler()


def get_bridge_handler() -> SpanEventLoggingHandler:
    """Return the singleton bridge handler for attaching to pipeline loggers."""
    return _bridge_handler
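
An attachment sketch (not part of the diff; per the module docstring, get_pipeline_logger() attaches the handler for you, so manual attachment is shown only to illustrate the mechanism):

import logging  # noqa: TID251

from ai_pipeline_core.observability._logging_bridge import get_bridge_handler

log = logging.getLogger("my_pipeline")
handler = get_bridge_handler()
if handler not in log.handlers:  # singleton, so attach at most once
    log.addHandler(handler)

log.info("fetched 3 documents")  # becomes a span event only while a span is recording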

ai_pipeline_core/observability/_summary.py (new file)
@@ -0,0 +1,81 @@
"""LLM-powered summary generation for tracked spans and documents."""

from pydantic import BaseModel, Field

from ai_pipeline_core.llm import generate_structured
from ai_pipeline_core.llm.model_options import ModelOptions
from ai_pipeline_core.logging import get_pipeline_logger
from ai_pipeline_core.observability._tracking._internal import internal_tracking_context

logger = get_pipeline_logger(__name__)

_SPAN_SUMMARY_SYSTEM_PROMPT = (
    "You summarize AI pipeline task results for non-technical users "
    "monitoring a research pipeline.\n"
    "Rules:\n"
    "- Describe the action and outcome, not the content\n"
    "- No internal names, function names, or technical details\n"
    "- No sensitive data (URLs, personal names, company details) from the output\n"
    "- Use present perfect tense"
)

_DOC_SUMMARY_SYSTEM_PROMPT = (
    "You generate metadata for documents in a research pipeline dashboard.\n"
    "Rules:\n"
    "- No sensitive data (URLs, personal names, company details)\n"
    "- Describe purpose and content type, not the content itself\n"
    "- For website documents: short_title must be 'domain.com: Page Title' (shorten title if needed to fit 50 chars)"
)


class SpanSummary(BaseModel):
    """Structured output for span/task summaries."""

    summary: str = Field(description="1-2 sentences (max 50 words) describing what the task accomplished in present perfect tense")


class DocumentSummary(BaseModel):
    """Structured output for document summaries."""

    short_title: str = Field(description="Document title proposition based on content, max 50 characters")
    summary: str = Field(description="1-2 sentences (max 50 words) describing the document's purpose and content type")


async def generate_span_summary(label: str, output_hint: str, model: str = "gemini-3-flash") -> str:
    """Generate a human-readable summary for a span/task output.

    Returns plain summary string (stored in tracked_spans.user_summary).
    """
    try:
        with internal_tracking_context():
            result = await generate_structured(
                model=model,
                response_format=SpanSummary,
                messages=f"Task: {label}\nResult: {output_hint}",
                options=ModelOptions(system_prompt=_SPAN_SUMMARY_SYSTEM_PROMPT, cache_ttl=None, retries=3, timeout=30),
                purpose=f"span_summary: {label}",
            )
        return result.parsed.summary
    except Exception as e:
        logger.warning(f"Span summary failed for '{label}': {e}")
        return ""


async def generate_document_summary(name: str, excerpt: str, model: str = "gemini-3-flash") -> str:
    """Generate structured metadata for a document.

    Returns JSON-serialized DocumentSummary (stored in document_index.summary).
    """
    try:
        with internal_tracking_context():
            result = await generate_structured(
                model=model,
                response_format=DocumentSummary,
                messages=f"Document: {name}\nContent excerpt:\n{excerpt}",
                options=ModelOptions(system_prompt=_DOC_SUMMARY_SYSTEM_PROMPT, cache_ttl=None, retries=3, timeout=30),
                purpose=f"document_summary: {name}",
            )
        return result.parsed.model_dump_json()
    except Exception as e:
        logger.warning(f"Document summary failed for '{name}': {e}")
        return ""
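
A direct-call sketch (not part of the diff; in the framework this path runs via TrackingService.schedule_summary(), and the label/hint strings are placeholders):

import asyncio

from ai_pipeline_core.observability._summary import generate_span_summary

async def main() -> None:
    text = await generate_span_summary(
        label="Competitor scan",
        output_hint="12 pages fetched, 3 summaries produced",
    )
    # On any failure the function logs a warning and returns "".
    print(text or "(no summary)")

asyncio.run(main())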

ai_pipeline_core/observability/_tracking/_client.py (new file)
@@ -0,0 +1,178 @@
"""ClickHouse client with lazy connection and table management."""

import clickhouse_connect
from pydantic import BaseModel

from ai_pipeline_core.logging import get_pipeline_logger

from ._models import (
    TABLE_DOCUMENT_EVENTS,
    TABLE_PIPELINE_RUNS,
    TABLE_SPAN_EVENTS,
    TABLE_TRACKED_SPANS,
    TrackedSpanRow,
)

logger = get_pipeline_logger(__name__)

# SQL statements for table creation
_CREATE_TABLES_SQL = [
    f"""
    CREATE TABLE IF NOT EXISTS {TABLE_PIPELINE_RUNS}
    (
        run_id UUID,
        project_name LowCardinality(String),
        flow_name LowCardinality(String),
        run_scope String DEFAULT '',
        status LowCardinality(String),
        start_time DateTime64(3, 'UTC'),
        end_time Nullable(DateTime64(3, 'UTC')),
        total_cost Float64 DEFAULT 0,
        total_tokens UInt64 DEFAULT 0,
        metadata String DEFAULT '{{}}' CODEC(ZSTD(3)),
        version UInt64 DEFAULT 1
    )
    ENGINE = ReplacingMergeTree(version)
    PARTITION BY toYYYYMM(start_time)
    ORDER BY (run_id)
    SETTINGS index_granularity = 8192
    """,
    f"""
    CREATE TABLE IF NOT EXISTS {TABLE_TRACKED_SPANS}
    (
        span_id String,
        trace_id String,
        run_id UUID,
        parent_span_id Nullable(String),
        name String,
        span_type LowCardinality(String),
        status LowCardinality(String),
        start_time DateTime64(3, 'UTC'),
        end_time Nullable(DateTime64(3, 'UTC')),
        duration_ms UInt64 DEFAULT 0,
        cost Float64 DEFAULT 0,
        tokens_input UInt64 DEFAULT 0,
        tokens_output UInt64 DEFAULT 0,
        llm_model LowCardinality(Nullable(String)),
        user_summary Nullable(String) CODEC(ZSTD(3)),
        user_visible Bool DEFAULT false,
        user_label Nullable(String),
        input_document_sha256s Array(String),
        output_document_sha256s Array(String),
        version UInt64 DEFAULT 1,
        INDEX idx_trace trace_id TYPE bloom_filter GRANULARITY 1
    )
    ENGINE = ReplacingMergeTree(version)
    PARTITION BY toYYYYMM(start_time)
    ORDER BY (run_id, span_id)
    SETTINGS index_granularity = 8192
    """,
    f"""
    CREATE TABLE IF NOT EXISTS {TABLE_DOCUMENT_EVENTS}
    (
        event_id UUID,
        run_id UUID,
        document_sha256 String,
        span_id String,
        event_type LowCardinality(String),
        timestamp DateTime64(3, 'UTC'),
        metadata String DEFAULT '{{}}' CODEC(ZSTD(3))
    )
    ENGINE = MergeTree
    PARTITION BY toYYYYMM(timestamp)
    ORDER BY (run_id, document_sha256, timestamp)
    SETTINGS index_granularity = 8192
    """,
    f"""
    CREATE TABLE IF NOT EXISTS {TABLE_SPAN_EVENTS}
    (
        event_id UUID,
        run_id UUID,
        span_id String,
        name String,
        timestamp DateTime64(3, 'UTC'),
        attributes String DEFAULT '{{}}' CODEC(ZSTD(3)),
        level LowCardinality(Nullable(String))
    )
    ENGINE = MergeTree
    PARTITION BY toYYYYMM(timestamp)
    ORDER BY (run_id, span_id, timestamp)
    SETTINGS index_granularity = 8192
    """,
]


class ClickHouseClient:
    """Synchronous ClickHouse client with lazy connection.

    All methods are synchronous and must be called from the writer background
    thread — never from the async event loop. Connection is deferred to
    ``connect()`` which is called from the writer thread's ``_run()`` startup.
    """

    def __init__(
        self,
        *,
        host: str,
        port: int = 8443,
        database: str = "default",
        username: str = "default",
        password: str = "",
        secure: bool = True,
    ) -> None:
        """Store connection params. Does NOT connect yet."""
        self._params = {
            "host": host,
            "port": port,
            "database": database,
            "username": username,
            "password": password,
            "secure": secure,
        }
        self._client: object | None = None
        self._tables_initialized = False

    def connect(self) -> None:
        """Connect to ClickHouse. Call from writer thread, not async context."""
        self._client = clickhouse_connect.get_client(**self._params)  # pyright: ignore[reportArgumentType, reportUnknownMemberType]
        logger.info(f"Connected to ClickHouse at {self._params['host']}:{self._params['port']}")

    def ensure_tables(self) -> None:
        """Create tables if they don't exist. Call after connect()."""
        if self._client is None:
            raise RuntimeError("Not connected — call connect() first")
        if self._tables_initialized:
            return
        for sql in _CREATE_TABLES_SQL:
            self._client.command(sql)  # type: ignore[union-attr]

        self._tables_initialized = True
        logger.info("ClickHouse tables verified/created")

    def _insert_rows(self, table: str, rows: list[BaseModel]) -> None:
        """Insert rows into a table using columnar format."""
        if not rows or self._client is None:
            return
        column_names = list(type(rows[0]).model_fields.keys())
        data = [[getattr(row, col) for row in rows] for col in column_names]
        self._client.insert(table, data, column_names=column_names, column_oriented=True)  # type: ignore[union-attr]

    def insert_runs(self, rows: list[BaseModel]) -> None:
        """Insert pipeline run rows."""
        self._insert_rows(TABLE_PIPELINE_RUNS, rows)

    def insert_spans(self, rows: list[BaseModel]) -> None:
        """Insert tracked span rows."""
        self._insert_rows(TABLE_TRACKED_SPANS, rows)

    def insert_document_events(self, rows: list[BaseModel]) -> None:
        """Insert document event rows."""
        self._insert_rows(TABLE_DOCUMENT_EVENTS, rows)

    def insert_span_events(self, rows: list[BaseModel]) -> None:
        """Insert span event rows."""
        self._insert_rows(TABLE_SPAN_EVENTS, rows)

    def update_span(self, row: TrackedSpanRow) -> None:
        """Insert a single replacement span row (ReplacingMergeTree update)."""
        self.insert_spans([row])
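
A call-sequence sketch (not part of the diff; host and password are placeholders). Note the update model: update_span() re-inserts a row with a bumped version, and ReplacingMergeTree collapses rows sharing the (run_id, span_id) sort key down to the highest version at merge time, so "updates" never need SQL UPDATE statements.

from ai_pipeline_core.observability._tracking._client import ClickHouseClient

client = ClickHouseClient(host="clickhouse.example.com", password="...")
client.connect()        # lazy: no network I/O before this call
client.ensure_tables()  # idempotent CREATE TABLE IF NOT EXISTS statements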

ai_pipeline_core/observability/_tracking/_internal.py (new file)
@@ -0,0 +1,28 @@
"""Thread-local flag to prevent tracking recursion.

When summary generation calls ``llm.generate()``, the resulting span must NOT
be tracked again (infinite loop). The flag is checked by
``TrackingSpanProcessor.on_end()``.
"""

import threading
from collections.abc import Generator
from contextlib import contextmanager

_internal = threading.local()


def is_internal_tracking() -> bool:
    """Return True if the current thread is inside a tracking-internal LLM call."""
    return getattr(_internal, "active", False)


@contextmanager
def internal_tracking_context() -> Generator[None, None, None]:
    """Mark the current thread as performing internal tracking work."""
    prev = getattr(_internal, "active", False)
    _internal.active = True
    try:
        yield
    finally:
        _internal.active = prev
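
A behavior sketch (not part of the diff):

from ai_pipeline_core.observability._tracking._internal import (
    internal_tracking_context,
    is_internal_tracking,
)

with internal_tracking_context():
    assert is_internal_tracking()   # spans ended here are skipped by on_end()
assert not is_internal_tracking()   # previous value restored on exit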