pixie_qa-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixie/__init__.py +11 -0
- pixie/cli/__init__.py +6 -0
- pixie/cli/dataset_command.py +193 -0
- pixie/cli/main.py +192 -0
- pixie/cli/test_command.py +68 -0
- pixie/config.py +41 -0
- pixie/dataset/__init__.py +11 -0
- pixie/dataset/models.py +21 -0
- pixie/dataset/store.py +212 -0
- pixie/evals/__init__.py +111 -0
- pixie/evals/criteria.py +77 -0
- pixie/evals/eval_utils.py +244 -0
- pixie/evals/evaluation.py +112 -0
- pixie/evals/runner.py +187 -0
- pixie/evals/scorers.py +755 -0
- pixie/evals/trace_capture.py +70 -0
- pixie/evals/trace_helpers.py +57 -0
- pixie/instrumentation/__init__.py +49 -0
- pixie/instrumentation/context.py +86 -0
- pixie/instrumentation/handler.py +72 -0
- pixie/instrumentation/handlers.py +83 -0
- pixie/instrumentation/instrumentors.py +31 -0
- pixie/instrumentation/observation.py +211 -0
- pixie/instrumentation/processor.py +366 -0
- pixie/instrumentation/queue.py +88 -0
- pixie/instrumentation/spans.py +165 -0
- pixie/storage/__init__.py +27 -0
- pixie/storage/evaluable.py +129 -0
- pixie/storage/piccolo_conf.py +10 -0
- pixie/storage/piccolo_migrations/__init__.py +1 -0
- pixie/storage/serialization.py +227 -0
- pixie/storage/store.py +231 -0
- pixie/storage/tables.py +21 -0
- pixie/storage/tree.py +199 -0
- pixie_qa-0.1.0.dist-info/METADATA +162 -0
- pixie_qa-0.1.0.dist-info/RECORD +39 -0
- pixie_qa-0.1.0.dist-info/WHEEL +4 -0
- pixie_qa-0.1.0.dist-info/entry_points.txt +3 -0
- pixie_qa-0.1.0.dist-info/licenses/LICENSE +21 -0
pixie/instrumentation/processor.py
@@ -0,0 +1,366 @@
"""LLMSpanProcessor — converts OpenInference span attributes to LLMSpan."""

from __future__ import annotations

import json
from datetime import datetime, timezone
from typing import Any

from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor
from opentelemetry.trace import StatusCode

from .queue import _DeliveryQueue
from .spans import (
    AssistantMessage,
    ImageContent,
    LLMSpan,
    Message,
    SystemMessage,
    TextContent,
    ToolCall,
    ToolDefinition,
    ToolResultMessage,
    UserMessage,
)


class LLMSpanProcessor(SpanProcessor):
    """OTel SpanProcessor that converts OpenInference LLM spans to typed LLMSpan objects."""

    def __init__(self, delivery_queue: _DeliveryQueue) -> None:
        self._delivery_queue = delivery_queue

    def on_start(self, span: Any, parent_context: Any = None) -> None:
        """No-op — we only process completed spans."""

    def on_end(self, span: ReadableSpan) -> None:
        """Convert completed OpenInference LLM spans to LLMSpan and submit."""
        try:
            attrs = dict(span.attributes) if span.attributes else {}

            # Only process LLM spans
            span_kind = attrs.get("openinference.span.kind")
            if span_kind not in ("LLM", "EMBEDDING"):
                return

            llm_span = self._build_llm_span(span, attrs, str(span_kind))
            self._delivery_queue.submit(llm_span)
        except Exception:
            pass  # Never raise from on_end

    def on_shutdown(self) -> None:
        """No-op."""

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Flush the delivery queue."""
        return self._delivery_queue.flush(timeout_seconds=timeout_millis / 1000)

    def _build_llm_span(
        self,
        span: ReadableSpan,
        attrs: dict[str, Any],
        span_kind: str,
    ) -> LLMSpan:
        """Build a typed LLMSpan from raw OTel span and attributes."""
        # ── Identity / timing
        ctx = span.context
        if ctx is None:
            raise ValueError("No span context")
        span_id = format(ctx.span_id, "016x")
        trace_id = format(ctx.trace_id, "032x")
        parent_span_id = format(span.parent.span_id, "016x") if span.parent else None

        start_ns = span.start_time or 0
        end_ns = span.end_time or 0
        started_at = datetime.fromtimestamp(start_ns / 1e9, tz=timezone.utc)
        ended_at = datetime.fromtimestamp(end_ns / 1e9, tz=timezone.utc)
        duration_ms = (end_ns - start_ns) / 1e6

        # ── Provider / model
        request_model = str(attrs.get("llm.model_name") or attrs.get("gen_ai.request.model", ""))
        response_model_raw = attrs.get("gen_ai.response.model")
        response_model = str(response_model_raw) if response_model_raw is not None else None
        provider = str(attrs.get("gen_ai.system", "")) or _infer_provider(request_model)
        operation = "embedding" if span_kind == "EMBEDDING" else "chat"

        # ── Token usage
        input_tokens = int(attrs.get("llm.token_count.prompt", 0))
        output_tokens = int(attrs.get("llm.token_count.completion", 0))
        cache_read_tokens = int(attrs.get("llm.token_count.cache_read", 0))
        cache_creation_tokens = int(attrs.get("llm.token_count.cache_creation", 0))

        # ── Request parameters
        params = _parse_json(str(attrs.get("llm.invocation_parameters", "{}")))
        request_temperature = _to_float_or_none(params.get("temperature"))
        request_max_tokens = _to_int_or_none(
            params.get("max_tokens") or params.get("max_completion_tokens")
        )
        request_top_p = _to_float_or_none(params.get("top_p"))

        # ── Response / error
        response_id_raw = attrs.get("llm.response_id") or attrs.get("gen_ai.response.id")
        response_id = str(response_id_raw) if response_id_raw is not None else None
        output_type_raw = attrs.get("gen_ai.output.type")
        output_type = str(output_type_raw) if output_type_raw is not None else None
        error_type_raw = attrs.get("error.type")
        if error_type_raw is not None:
            error_type: str | None = str(error_type_raw)
        elif span.status and span.status.status_code == StatusCode.ERROR:
            error_type = "error"
        else:
            error_type = None

        # ── Messages
        input_messages = _parse_input_messages(attrs)
        output_messages = _parse_output_messages(attrs)
        finish_reasons = tuple(msg.finish_reason for msg in output_messages if msg.finish_reason)

        # ── Tool definitions
        tool_definitions = _parse_tool_definitions(attrs)

        return LLMSpan(
            span_id=span_id,
            trace_id=trace_id,
            parent_span_id=parent_span_id,
            started_at=started_at,
            ended_at=ended_at,
            duration_ms=duration_ms,
            operation=operation,
            provider=provider,
            request_model=request_model,
            response_model=response_model,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cache_read_tokens=cache_read_tokens,
            cache_creation_tokens=cache_creation_tokens,
            request_temperature=request_temperature,
            request_max_tokens=request_max_tokens,
            request_top_p=request_top_p,
            finish_reasons=finish_reasons,
            response_id=response_id,
            output_type=output_type,
            error_type=error_type,
            input_messages=tuple(input_messages),
            output_messages=tuple(output_messages),
            tool_definitions=tuple(tool_definitions),
        )


# ── Helper functions ──────────────────────────────────────────────────────────


def _infer_provider(model_name: str) -> str:
    """Infer the LLM provider from the model name."""
    lower = model_name.lower()
    if "gpt" in lower or "o1" in lower or "o3" in lower:
        return "openai"
    if "claude" in lower:
        return "anthropic"
    if "gemini" in lower:
        return "google"
    if "command" in lower or "coral" in lower:
        return "cohere"
    if "llama" in lower or "mixtral" in lower or "mistral" in lower:
        return "meta"
    return "unknown"


def _parse_json(raw: str) -> dict[str, Any]:
    """Parse JSON safely, returning empty dict on failure."""
    try:
        result = json.loads(raw)
        if isinstance(result, dict):
            return result
        return {}
    except (json.JSONDecodeError, TypeError, ValueError):
        return {}


def _to_float_or_none(value: Any) -> float | None:
    """Convert value to float or return None."""
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def _to_int_or_none(value: Any) -> int | None:
    """Convert value to int or return None."""
    if value is None:
        return None
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


def _parse_content_parts(
    attrs: dict[str, Any], prefix: str
) -> tuple[TextContent | ImageContent, ...]:
    """Parse multimodal content parts from OpenInference indexed attributes.

    Falls back to plain .content string as a single TextContent.
    """
    parts: list[TextContent | ImageContent] = []
    j = 0
    while True:
        type_key = f"{prefix}.contents.{j}.message_content.type"
        content_type = attrs.get(type_key)
        if content_type is None:
            break

        if content_type == "text":
            text_key = f"{prefix}.contents.{j}.message_content.text"
            text = str(attrs.get(text_key, ""))
            parts.append(TextContent(text=text))
        elif content_type == "image":
            url_key = f"{prefix}.contents.{j}.message_content.image.url.url"
            detail_key = f"{prefix}.contents.{j}.message_content.image.url.detail"
            url = str(attrs.get(url_key, ""))
            detail_raw = attrs.get(detail_key)
            detail = str(detail_raw) if detail_raw is not None else None
            parts.append(ImageContent(url=url, detail=detail))

        j += 1

    if not parts:
        # Fall back to plain .content string
        content_key = f"{prefix}.content"
        content_raw = attrs.get(content_key)
        if content_raw is not None:
            parts.append(TextContent(text=str(content_raw)))

    return tuple(parts)


def _parse_tool_calls(attrs: dict[str, Any], prefix: str) -> tuple[ToolCall, ...]:
    """Parse tool calls from OpenInference indexed attributes."""
    tool_calls: list[ToolCall] = []
    j = 0
    while True:
        name_key = f"{prefix}.tool_calls.{j}.tool_call.function.name"
        name = attrs.get(name_key)
        if name is None:
            break

        args_key = f"{prefix}.tool_calls.{j}.tool_call.function.arguments"
        args_raw = attrs.get(args_key)
        if isinstance(args_raw, str):
            try:
                arguments = json.loads(args_raw)
            except json.JSONDecodeError:
                arguments = {"_raw": args_raw}
        elif isinstance(args_raw, dict):
            arguments = args_raw
        else:
            arguments = {}

        id_key = f"{prefix}.tool_calls.{j}.tool_call.id"
        call_id_raw = attrs.get(id_key)
        call_id = str(call_id_raw) if call_id_raw is not None else None

        tool_calls.append(ToolCall(name=str(name), arguments=arguments, id=call_id))
        j += 1

    return tuple(tool_calls)


def _parse_input_messages(attrs: dict[str, Any]) -> list[Message]:
    """Parse input messages from OpenInference indexed span attributes."""
    messages: list[Message] = []
    i = 0
    while True:
        prefix = f"llm.input_messages.{i}.message"
        role_key = f"{prefix}.role"
        role = attrs.get(role_key)
        if role is None:
            break

        role = str(role).lower()

        if role == "system":
            content_key = f"{prefix}.content"
            content = str(attrs.get(content_key, ""))
            messages.append(SystemMessage(content=content))
        elif role == "user":
            parts = _parse_content_parts(attrs, prefix)
            messages.append(UserMessage(content=parts))
        elif role == "assistant":
            parts = _parse_content_parts(attrs, prefix)
            tool_calls = _parse_tool_calls(attrs, prefix)
            messages.append(AssistantMessage(content=parts, tool_calls=tool_calls))
        elif role == "tool":
            content_key = f"{prefix}.content"
            content = str(attrs.get(content_key, ""))
            tool_call_id_raw = attrs.get(f"{prefix}.tool_call_id")
            tool_call_id = str(tool_call_id_raw) if tool_call_id_raw is not None else None
            tool_name_raw = attrs.get(f"{prefix}.name")
            tool_name = str(tool_name_raw) if tool_name_raw is not None else None
            messages.append(
                ToolResultMessage(content=content, tool_call_id=tool_call_id, tool_name=tool_name)
            )

        i += 1

    return messages


def _parse_output_messages(attrs: dict[str, Any]) -> list[AssistantMessage]:
    """Parse output messages from OpenInference indexed span attributes."""
    messages: list[AssistantMessage] = []
    i = 0
    while True:
        prefix = f"llm.output_messages.{i}.message"
        role_key = f"{prefix}.role"
        role = attrs.get(role_key)
        if role is None:
            break

        parts = _parse_content_parts(attrs, prefix)
        tool_calls = _parse_tool_calls(attrs, prefix)

        # finish_reason is per-message in OpenInference
        finish_reason_raw = attrs.get(f"{prefix}.finish_reason")
        finish_reason = str(finish_reason_raw) if finish_reason_raw is not None else None

        messages.append(
            AssistantMessage(
                content=parts,
                tool_calls=tool_calls,
                finish_reason=finish_reason,
            )
        )
        i += 1

    return messages


def _parse_tool_definitions(attrs: dict[str, Any]) -> list[ToolDefinition]:
    """Parse tool definitions from OpenInference indexed span attributes."""
    tools: list[ToolDefinition] = []
    i = 0
    while True:
        name_key = f"llm.tools.{i}.tool.name"
        name = attrs.get(name_key)
        if name is None:
            break

        desc_raw = attrs.get(f"llm.tools.{i}.tool.description")
        description = str(desc_raw) if desc_raw is not None else None

        schema_raw = attrs.get(f"llm.tools.{i}.tool.json_schema")
        if isinstance(schema_raw, str):
            parameters = _parse_json(schema_raw) or None
        elif isinstance(schema_raw, dict):
            parameters = schema_raw
        else:
            parameters = None

        tools.append(
            ToolDefinition(name=str(name), description=description, parameters=parameters)
        )
        i += 1

    return tools
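For readers unfamiliar with OpenInference, the parsers above consume a flat attribute namespace in which message structure is encoded as indexed keys. A minimal sketch of that convention, and of wiring the processor into an OpenTelemetry pipeline, follows; the attribute values are made up for illustration, and delivery_queue stands for a _DeliveryQueue built as in queue.py below.

# Illustrative input: a flattened OpenInference attribute dict of the shape
# _parse_input_messages() walks. Values are invented for this example.
attrs = {
    "openinference.span.kind": "LLM",
    "llm.input_messages.0.message.role": "system",
    "llm.input_messages.0.message.content": "You are a helpful assistant.",
    "llm.input_messages.1.message.role": "user",
    "llm.input_messages.1.message.contents.0.message_content.type": "text",
    "llm.input_messages.1.message.contents.0.message_content.text": "Hi!",
}
messages = _parse_input_messages(attrs)
# -> [SystemMessage(content="You are a helpful assistant."),
#     UserMessage(content=(TextContent(text="Hi!"),))]

# Registering the processor on a TracerProvider (standard OTel SDK API).
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider

provider = TracerProvider()
provider.add_span_processor(LLMSpanProcessor(delivery_queue))
trace.set_tracer_provider(provider)

From here on, any OpenInference-instrumented LLM or embedding call is converted to a typed LLMSpan when its span ends.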
pixie/instrumentation/queue.py
@@ -0,0 +1,88 @@
"""_DeliveryQueue — background worker thread for delivering spans to handler."""

from __future__ import annotations

import asyncio
import queue
import threading
from concurrent.futures import Future

from .handler import InstrumentationHandler
from .spans import LLMSpan, ObserveSpan


class _DeliveryQueue:
    """Single queue for both LLMSpan and ObserveSpan.

    A dedicated asyncio event loop runs on a background daemon thread. The
    queue-worker thread picks up each span and schedules an async dispatch
    coroutine on that loop (fire and forget from the worker's perspective).
    ``queue.task_done()`` is called via a ``Future`` done-callback once the
    coroutine finishes, so ``flush()`` (which calls ``queue.join()``) correctly
    waits for all in-flight async processing to complete.
    """

    def __init__(self, handler: InstrumentationHandler, maxsize: int = 1000) -> None:
        self._handler = handler
        self._queue: queue.Queue[LLMSpan | ObserveSpan] = queue.Queue(maxsize=maxsize)
        self._dropped_count = 0

        # Dedicated event loop running on its own daemon thread.
        self._loop = asyncio.new_event_loop()
        self._loop_thread = threading.Thread(
            target=self._loop.run_forever,
            daemon=True,
            name="pixie-asyncio-loop",
        )
        self._loop_thread.start()

        # Queue-consumer thread: picks items and schedules async tasks.
        self._thread = threading.Thread(
            target=self._worker, daemon=True, name="pixie-delivery-worker"
        )
        self._thread.start()

    def submit(self, item: LLMSpan | ObserveSpan) -> None:
        """Submit a span for delivery. Drops silently on full queue."""
        try:
            self._queue.put_nowait(item)
        except queue.Full:
            self._dropped_count += 1

    def flush(self, timeout_seconds: float = 5.0) -> bool:
        """Block until all queued items and their async handlers are done."""
        try:
            self._queue.join()
            return True
        except Exception:
            return False

    def _worker(self) -> None:
        """Queue-consumer: fire-and-forget async dispatch for each span."""
        while True:
            item = self._queue.get()
            try:
                future: Future[None] = asyncio.run_coroutine_threadsafe(
                    self._dispatch(item), self._loop
                )
                # task_done() is deferred until the coroutine finishes so
                # that flush() / queue.join() waits for async handlers too.
                future.add_done_callback(lambda _f: self._queue.task_done())
            except Exception:
                # Scheduling failed — mark done immediately to avoid deadlock.
                self._queue.task_done()

    async def _dispatch(self, item: LLMSpan | ObserveSpan) -> None:
        """Async dispatch: route span to the appropriate handler method."""
        try:
            if isinstance(item, LLMSpan):
                await self._handler.on_llm(item)
            elif isinstance(item, ObserveSpan):
                await self._handler.on_observe(item)
        except Exception:
            pass  # Handler exceptions are silently swallowed

    @property
    def dropped_count(self) -> int:
        """Number of spans dropped due to full queue."""
        return self._dropped_count
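To see the delivery path end to end, here is a small sketch with a stand-in handler. The real InstrumentationHandler base class is defined in pixie/instrumentation/handler.py, which this section does not show; the stub below only mirrors the two awaited methods that _dispatch calls, and some_llm_span stands for any LLMSpan produced by the processor.

class PrintHandler:
    """Stand-in handler: just prints what it receives."""

    async def on_llm(self, span: LLMSpan) -> None:
        print("LLM call:", span.request_model, f"{span.duration_ms:.1f} ms")

    async def on_observe(self, span: ObserveSpan) -> None:
        print("Observed block:", span.name)


dq = _DeliveryQueue(PrintHandler(), maxsize=1000)  # stub duck-types the handler interface
dq.submit(some_llm_span)  # hypothetical LLMSpan instance
dq.flush()                # blocks until the scheduled coroutine has actually run

Deferring task_done() to the future's done-callback is what makes that final flush() meaningful: queue.join() would otherwise return as soon as the worker had merely scheduled the coroutine, not once the handler had run.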
pixie/instrumentation/spans.py
@@ -0,0 +1,165 @@
"""Data model types for pixie instrumentation spans."""

from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime
from typing import Any, Literal  # noqa: UP035

# ── Message content types ─────────────────────────────────────────────────────


@dataclass(frozen=True)
class TextContent:
    """Plain text content part."""

    text: str
    type: Literal["text"] = "text"


@dataclass(frozen=True)
class ImageContent:
    """Image content part (URL or data URI)."""

    url: str  # https:// or data: URI
    detail: str | None = None  # "low" | "high" | "auto" | None
    type: Literal["image"] = "image"


MessageContent = TextContent | ImageContent


# ── Tool types ────────────────────────────────────────────────────────────────


@dataclass(frozen=True)
class ToolCall:
    """Tool invocation requested by the model."""

    name: str
    arguments: dict[str, Any]  # always deserialized, never a raw JSON string
    id: str | None = None


@dataclass(frozen=True)
class ToolDefinition:
    """Tool made available to the model in the request."""

    name: str
    description: str | None = None
    parameters: dict[str, Any] | None = None  # JSON Schema object


# ── Message types ─────────────────────────────────────────────────────────────


@dataclass(frozen=True)
class SystemMessage:
    """System prompt message."""

    content: str
    role: Literal["system"] = "system"


@dataclass(frozen=True)
class UserMessage:
    """User message with multimodal content parts."""

    content: tuple[MessageContent, ...]
    role: Literal["user"] = "user"

    @classmethod
    def from_text(cls, text: str) -> UserMessage:
        """Create a UserMessage with a single TextContent part."""
        return cls(content=(TextContent(text=text),))


@dataclass(frozen=True)
class AssistantMessage:
    """Assistant response message with optional tool calls."""

    content: tuple[MessageContent, ...]
    tool_calls: tuple[ToolCall, ...]
    finish_reason: str | None = None
    role: Literal["assistant"] = "assistant"


@dataclass(frozen=True)
class ToolResultMessage:
    """Tool execution result message."""

    content: str
    tool_call_id: str | None = None
    tool_name: str | None = None
    role: Literal["tool"] = "tool"


Message = SystemMessage | UserMessage | AssistantMessage | ToolResultMessage


# ── Span types ────────────────────────────────────────────────────────────────


@dataclass(frozen=True)
class LLMSpan:
    """One LLM provider call, produced by LLMSpanProcessor from OpenInference attrs."""

    # ── Identity
    span_id: str  # hex, 16 chars
    trace_id: str  # hex, 32 chars
    parent_span_id: str | None  # links to ObserveSpan.span_id when nested

    # ── Timing
    started_at: datetime
    ended_at: datetime
    duration_ms: float

    # ── Provider / model
    operation: str  # "chat" | "embedding"
    provider: str  # "openai" | "anthropic" | "google" | ...
    request_model: str
    response_model: str | None

    # ── Token usage
    input_tokens: int  # default 0
    output_tokens: int  # default 0
    cache_read_tokens: int  # default 0
    cache_creation_tokens: int  # default 0

    # ── Request parameters
    request_temperature: float | None
    request_max_tokens: int | None
    request_top_p: float | None

    # ── Response metadata
    finish_reasons: tuple[str, ...]  # default ()
    response_id: str | None
    output_type: str | None  # "json" | "text" | None
    error_type: str | None

    # ── Content (populated when capture_content=True)
    input_messages: tuple[Message, ...]  # default ()
    output_messages: tuple[AssistantMessage, ...]  # default ()
    tool_definitions: tuple[ToolDefinition, ...]  # always populated when available


@dataclass(frozen=True)
class ObserveSpan:
    """A user-defined instrumented block, produced when a log() block exits."""

    # ── Identity
    span_id: str  # hex, 16 chars
    trace_id: str  # hex, 32 chars
    parent_span_id: str | None

    # ── Timing
    started_at: datetime
    ended_at: datetime
    duration_ms: float

    # ── User-defined fields
    name: str | None  # optional label for the block
    input: Any  # value passed to log(input=...)
    output: Any  # value set via span.set_output(...)
    metadata: dict[str, Any]  # accumulated via span.set_metadata(k, v)
    error: str | None  # exception type if block raised, else None
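Since UserMessage and AssistantMessage carry tuples of content parts while SystemMessage and ToolResultMessage hold plain strings, consumers typically need a small flattening step. A sketch (not part of the package) of rendering a captured conversation as text:

def conversation_text(span: LLMSpan) -> str:
    """Flatten a span's captured messages to plain text, ignoring images."""
    lines: list[str] = []
    for msg in (*span.input_messages, *span.output_messages):
        if isinstance(msg, (SystemMessage, ToolResultMessage)):
            lines.append(f"{msg.role}: {msg.content}")
        else:  # UserMessage / AssistantMessage: join the text parts
            text = " ".join(
                part.text for part in msg.content if isinstance(part, TextContent)
            )
            lines.append(f"{msg.role}: {text}")
    return "\n".join(lines)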
pixie/storage/__init__.py
@@ -0,0 +1,27 @@
"""Observation storage module for persisting and querying LLM application traces.

Provides:
- ``Evaluable`` Pydantic BaseModel for uniform evaluator access
- ``UNSET`` sentinel for distinguishing unset from ``None``
- ``ObservationNode`` tree wrapper with traversal and LLM-friendly serialization
- ``ObservationStore`` for persistence and query via Piccolo ORM / SQLite
"""

from __future__ import annotations

from pixie.storage.evaluable import (
    UNSET,
    Evaluable,
    as_evaluable,
)
from pixie.storage.store import ObservationStore
from pixie.storage.tree import ObservationNode, build_tree

__all__ = [
    "Evaluable",
    "ObservationNode",
    "ObservationStore",
    "UNSET",
    "as_evaluable",
    "build_tree",
]
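The UNSET sentinel named in the docstring is worth a brief illustration. A minimal sketch, assuming UNSET follows the usual identity-comparison convention for sentinels (its actual definition is in pixie/storage/evaluable.py, which this section does not show):

from pixie.storage import UNSET

def describe_output(output: object) -> str:
    # Identity check distinguishes "never set" from an explicit None.
    if output is UNSET:
        return "output was never set"
    if output is None:
        return "output was explicitly set to None"
    return f"output = {output!r}"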