pixie-qa 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pixie/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ """pixie — automated quality assurance for AI applications.
2
+
3
+ Re-exports commonly used public API for convenient top-level access.
4
+ """
5
+
6
+ from pixie.instrumentation.handlers import StorageHandler, enable_storage
7
+
8
+ __all__ = [
9
+ "StorageHandler",
10
+ "enable_storage",
11
+ ]
pixie/cli/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """pixie.cli — command-line interface for pixie.
2
+
3
+ Provides:
4
+ - ``pixie`` — main entry point with dataset management subcommands.
5
+ - ``pixie-test`` — eval test discovery and execution.
6
+ """
@@ -0,0 +1,193 @@
1
+ """``pixie dataset`` CLI commands.
2
+
3
+ Provides operations for managing datasets and saving trace spans as evaluable
4
+ items:
5
+
6
+ - :func:`dataset_create` — create a new empty dataset.
7
+ - :func:`dataset_list` — list datasets with basic information.
8
+ - :func:`dataset_save` — select a span from the latest trace and save it
9
+ to a dataset.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from typing import Any
15
+
16
+ from pydantic import JsonValue
17
+
18
+ from pixie.dataset.models import Dataset
19
+ from pixie.dataset.store import DatasetStore
20
+ from pixie.storage.evaluable import UNSET, Evaluable, _Unset, as_evaluable
21
+ from pixie.storage.store import ObservationStore
22
+
23
+
24
def dataset_create(
    *,
    name: str,
    dataset_store: DatasetStore,
) -> Dataset:
    """Create and return a brand-new, empty dataset called *name*.

    Args:
        name: Unique name for the new dataset.
        dataset_store: Store the dataset is persisted to.

    Returns:
        The freshly created ``Dataset``.

    Raises:
        FileExistsError: If a dataset called *name* is already present.
    """
    # Delegates entirely to the store; uniqueness is enforced there.
    return dataset_store.create(name)
42
+
43
+
44
def dataset_list(
    *,
    dataset_store: DatasetStore,
) -> list[dict[str, Any]]:
    """Return one metadata dict per stored dataset.

    Keys present in each dict:
      - ``name``: dataset name
      - ``row_count``: number of evaluable items
      - ``created_at``: file creation timestamp (ISO 8601)
      - ``updated_at``: file last-modified timestamp (ISO 8601)
    """
    # The store computes all per-dataset details; nothing to post-process.
    return dataset_store.list_details()
57
+
58
+
59
def format_dataset_table(rows: list[dict[str, Any]]) -> str:
    """Render dataset metadata rows as an aligned, plain-text table.

    Args:
        rows: Metadata dicts as produced by :func:`dataset_list`.

    Returns:
        A printable multi-line table, or a placeholder message when
        *rows* is empty.
    """
    if not rows:
        return "No datasets found."

    headers = ["Name", "Rows", "Created", "Updated"]
    body = [
        [row["name"], str(row["row_count"]), row["created_at"], row["updated_at"]]
        for row in rows
    ]

    # Each column is as wide as its widest cell, header included.
    widths = [
        max(len(headers[col]), *(len(cells[col]) for cells in body))
        for col in range(len(headers))
    ]

    def render(cells: list[str]) -> str:
        return "  ".join(cell.ljust(width) for cell, width in zip(cells, widths))

    divider = ["-" * width for width in widths]
    return "\n".join([render(headers), render(divider), *map(render, body)])
89
+
90
+
91
async def dataset_save(
    *,
    name: str,
    observation_store: ObservationStore,
    dataset_store: DatasetStore,
    select: str = "root",
    span_name: str | None = None,
    expected_output: JsonValue | _Unset = UNSET,
    notes: str | None = None,
) -> Dataset:
    """Pick a span from the most recent trace and append it to a dataset.

    The newest trace is fetched from *observation_store*, one of its spans
    is chosen according to *select*, converted into an ``Evaluable``, and
    appended to the dataset called *name*.

    Args:
        name: Name of the dataset to save to (must exist).
        observation_store: Store to read spans from.
        dataset_store: Store to write the updated dataset to.
        select: Selection mode — ``"root"``, ``"last_llm_call"``, or
            ``"by_name"``. Defaults to ``"root"``.
        span_name: Span name to match when *select* is ``"by_name"``.
            Required when *select* is ``"by_name"``.
        expected_output: If provided, set on the evaluable. When
            ``UNSET`` (default), the evaluable's ``expected_output``
            is left as ``UNSET``.
        notes: Optional notes string stored under the ``"notes"`` key of
            the evaluable's ``eval_metadata``.

    Returns:
        The updated ``Dataset``.

    Raises:
        ValueError: If no traces exist, or no matching span found.
        FileNotFoundError: If no dataset with *name* exists.
    """
    recent = await observation_store.list_traces(limit=1)
    if not recent:
        raise ValueError("No traces found in the observation store.")
    trace_id: str = recent[0]["trace_id"]

    chosen = await _select_span(
        observation_store=observation_store,
        trace_id=trace_id,
        select=select,
        span_name=span_name,
    )

    item = as_evaluable(chosen)

    # Overlay the caller-supplied expected output, when one was given.
    if not isinstance(expected_output, _Unset):
        item = Evaluable(
            eval_input=item.eval_input,
            eval_output=item.eval_output,
            eval_metadata=item.eval_metadata,
            expected_output=expected_output,
        )

    # Merge notes into a copy of the metadata (never mutate the original).
    if notes is not None:
        merged_meta = dict(item.eval_metadata) if item.eval_metadata else {}
        merged_meta["notes"] = notes
        item = Evaluable(
            eval_input=item.eval_input,
            eval_output=item.eval_output,
            eval_metadata=merged_meta,
            expected_output=item.expected_output,
        )

    return dataset_store.append(name, item)
163
+
164
+
165
+ async def _select_span(
166
+ *,
167
+ observation_store: ObservationStore,
168
+ trace_id: str,
169
+ select: str,
170
+ span_name: str | None,
171
+ ) -> Any:
172
+ """Select a span from a trace according to the selection mode."""
173
+ if select == "root":
174
+ return await observation_store.get_root(trace_id)
175
+
176
+ if select == "last_llm_call":
177
+ span = await observation_store.get_last_llm(trace_id)
178
+ if span is None:
179
+ raise ValueError(f"No LLM span found in trace {trace_id}.")
180
+ return span
181
+
182
+ if select == "by_name":
183
+ if not span_name:
184
+ raise ValueError("--span-name is required when selection mode is 'by_name'.")
185
+ matches = await observation_store.get_by_name(span_name, trace_id=trace_id)
186
+ if not matches:
187
+ raise ValueError(
188
+ f"No span named {span_name!r} found in trace {trace_id}."
189
+ )
190
+ # Select the latest (last by started_at — get_by_name returns ASC order)
191
+ return matches[-1]
192
+
193
+ raise ValueError(f"Unknown selection mode: {select!r}")
pixie/cli/main.py ADDED
@@ -0,0 +1,192 @@
1
+ """``pixie`` CLI entry point — top-level command with subcommand routing.
2
+
3
+ Usage::
4
+
5
+ pixie dataset create <name>
6
+ pixie dataset list
7
+ pixie dataset save <name> [--select MODE] [--span-name NAME]
8
+ [--expected-output] [--notes TEXT]
9
+
10
+ Reads spans from the observation store (SQLite, configured via ``PIXIE_DB_PATH``)
11
+ and writes evaluable items to the dataset store (JSON files, configured via
12
+ ``PIXIE_DATASET_DIR``).
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import asyncio
19
+ import json
20
+ import sys
21
+ from typing import TextIO
22
+
23
+ from piccolo.engine.sqlite import SQLiteEngine
24
+ from pydantic import JsonValue
25
+
26
+ from pixie.cli.dataset_command import (
27
+ dataset_create,
28
+ dataset_list,
29
+ dataset_save,
30
+ format_dataset_table,
31
+ )
32
+ from pixie.config import get_config
33
+ from pixie.dataset.store import DatasetStore
34
+ from pixie.storage.evaluable import UNSET, _Unset
35
+ from pixie.storage.store import ObservationStore
36
+
37
+
38
+ def _build_parser() -> argparse.ArgumentParser:
39
+ """Build the top-level argument parser with subcommands."""
40
+ parser = argparse.ArgumentParser(
41
+ prog="pixie",
42
+ description="Pixie — automated quality assurance for AI applications",
43
+ )
44
+ subparsers = parser.add_subparsers(dest="command", help="Available commands")
45
+
46
+ # -- pixie dataset -------------------------------------------------------
47
+ dataset_parser = subparsers.add_parser("dataset", help="Dataset management commands")
48
+ dataset_sub = dataset_parser.add_subparsers(
49
+ dest="dataset_action", help="Dataset actions"
50
+ )
51
+
52
+ # pixie dataset create <name>
53
+ create_parser = dataset_sub.add_parser(
54
+ "create", help="Create a new empty dataset"
55
+ )
56
+ create_parser.add_argument("name", help="Name for the new dataset")
57
+
58
+ # pixie dataset list
59
+ dataset_sub.add_parser("list", help="List all datasets")
60
+
61
+ # pixie dataset save <name> [options]
62
+ save_parser = dataset_sub.add_parser(
63
+ "save",
64
+ help="Save a span from the latest trace to a dataset",
65
+ )
66
+ save_parser.add_argument("name", help="Name of the dataset to save to")
67
+ save_parser.add_argument(
68
+ "--select",
69
+ choices=["root", "last_llm_call", "by_name"],
70
+ default="root",
71
+ help="How to select the span from the trace (default: root)",
72
+ )
73
+ save_parser.add_argument(
74
+ "--span-name",
75
+ default=None,
76
+ help="Span name to match (required when --select=by_name)",
77
+ )
78
+ save_parser.add_argument(
79
+ "--expected-output",
80
+ action="store_true",
81
+ default=False,
82
+ help="Read expected output JSON from stdin",
83
+ )
84
+ save_parser.add_argument(
85
+ "--notes",
86
+ default=None,
87
+ help="Optional notes to attach to the evaluable metadata",
88
+ )
89
+
90
+ return parser
91
+
92
+
93
def _run_dataset_create(name: str) -> None:
    """Create a dataset called *name* and report the result on stdout."""
    store = DatasetStore()
    created = dataset_create(name=name, dataset_store=store)
    print(f"Created dataset {created.name!r}.")  # noqa: T201
98
+
99
+
100
def _run_dataset_list() -> None:
    """List all datasets and print them as an aligned table."""
    store = DatasetStore()
    details = dataset_list(dataset_store=store)
    print(format_dataset_table(details))  # noqa: T201
105
+
106
+
107
def _run_dataset_save(
    name: str,
    select: str,
    span_name: str | None,
    expected_output_flag: bool,
    notes: str | None,
    stdin: TextIO | None = None,
) -> None:
    """Wire up the stores, gather stdin input, and invoke dataset_save.

    Args:
        name: Target dataset name.
        select: Span selection mode passed through to dataset_save.
        span_name: Span name for ``by_name`` selection, if any.
        expected_output_flag: When True, expected-output JSON is read
            from *stdin* (or ``sys.stdin`` when *stdin* is None).
        notes: Optional notes attached to the saved evaluable.
        stdin: Override for the input stream — injectable for tests.

    Raises:
        ValueError: If the flag is set but stdin carries no JSON.
    """
    config = get_config()
    obs_store = ObservationStore(engine=SQLiteEngine(path=config.db_path))
    ds_store = DatasetStore()

    expected: JsonValue | _Unset = UNSET
    if expected_output_flag:
        stream = sys.stdin if stdin is None else stdin
        payload = stream.read().strip()
        if not payload:
            raise ValueError("--expected-output flag set but no JSON provided on stdin.")
        expected = json.loads(payload)

    updated = asyncio.run(
        dataset_save(
            name=name,
            observation_store=obs_store,
            dataset_store=ds_store,
            select=select,
            span_name=span_name,
            expected_output=expected,
            notes=notes,
        )
    )
    print(  # noqa: T201
        f"Saved to dataset {updated.name!r} — now {len(updated.items)} item(s)."
    )
143
+
144
+
145
def main(argv: list[str] | None = None) -> int:
    """Entry point for the ``pixie`` command.

    Args:
        argv: Command-line arguments. Defaults to ``sys.argv[1:]``.

    Returns:
        Exit code: 0 on success, 1 on error.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.command is None:
        parser.print_help()
        return 1

    if args.command == "dataset":
        if args.dataset_action is None:
            # Fix: re-parsing ["dataset", "--help"] triggers argparse's
            # built-in help action, which raises SystemExit(0) — so the
            # `return 1` below was unreachable and `pixie dataset` with no
            # action exited 0. Swallow the SystemExit so the help text is
            # still printed while the documented error code (1) is returned.
            try:
                parser.parse_args(["dataset", "--help"])
            except SystemExit:
                pass
            return 1

        try:
            if args.dataset_action == "create":
                _run_dataset_create(args.name)
            elif args.dataset_action == "list":
                _run_dataset_list()
            elif args.dataset_action == "save":
                _run_dataset_save(
                    name=args.name,
                    select=args.select,
                    span_name=args.span_name,
                    expected_output_flag=args.expected_output,
                    notes=args.notes,
                )
        except (
            ValueError,
            FileExistsError,
            FileNotFoundError,
            json.JSONDecodeError,
        ) as exc:
            # All expected failures surface as a one-line stderr message.
            print(f"Error: {exc}", file=sys.stderr)  # noqa: T201
            return 1

    return 0
189
+
190
+
191
+ if __name__ == "__main__":
192
+ sys.exit(main())
@@ -0,0 +1,68 @@
1
+ """``pixie test`` CLI entry point.
2
+
3
+ Usage::
4
+
5
+ pixie test [path] [--filter PATTERN] [--verbose]
6
+
7
+ Discovers and runs eval test functions, reporting pass/fail results.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import sys
14
+
15
+ import pixie.instrumentation as px
16
+ from pixie.evals.runner import discover_tests, format_results, run_tests
17
+
18
+
19
def main(argv: list[str] | None = None) -> int:
    """Entry point for ``pixie test`` command.

    Args:
        argv: Command-line arguments. Defaults to ``sys.argv[1:]``.

    Returns:
        Exit code: 0 if all tests pass, 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        prog="pixie test",
        description="Run pixie eval tests",
    )
    parser.add_argument(
        "path",
        nargs="?",
        default=".",
        help="File or directory to search for tests (default: current directory)",
    )
    parser.add_argument(
        "-k",
        "--filter",
        dest="filter_pattern",
        default=None,
        help="Only run tests whose names contain this substring",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        help="Show detailed evaluation results",
    )

    opts = parser.parse_args(argv)

    # Instrumentation must be initialised before any test function executes.
    px.init()

    discovered = discover_tests(opts.path, filter_pattern=opts.filter_pattern)
    outcomes = run_tests(discovered)
    print(format_results(outcomes, verbose=opts.verbose))  # noqa: T201

    # Non-zero exit as soon as any case did not pass.
    return 0 if all(result.status == "passed" for result in outcomes) else 1
65
+
66
+
67
+ if __name__ == "__main__":
68
+ sys.exit(main())
pixie/config.py ADDED
@@ -0,0 +1,41 @@
1
+ """Centralized configuration with env var overrides and sensible defaults.
2
+
3
+ All environment variables are prefixed with ``PIXIE_``. Values are read at
4
+ call time (inside :func:`get_config`), not at import time, so tests can
5
+ manipulate ``os.environ`` before calling :func:`get_config`.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ from dataclasses import dataclass
12
+
13
+
14
@dataclass(frozen=True)
class PixieConfig:
    """Immutable configuration snapshot.

    Instances are frozen so a snapshot taken by :func:`get_config` cannot
    drift after later environment changes.

    Attributes:
        db_path: Path to the SQLite database file.
        db_engine: Database engine type (currently only ``"sqlite"``).
        dataset_dir: Directory for dataset JSON files.
    """

    # Defaults are relative paths, resolved against the current working
    # directory of whatever process reads the config.
    db_path: str = "pixie_observations.db"
    db_engine: str = "sqlite"
    dataset_dir: str = "pixie_datasets"
27
+
28
+
29
def get_config() -> PixieConfig:
    """Build a :class:`PixieConfig` from the environment, with defaults.

    Environment is consulted at call time (not import time) so tests can
    adjust ``os.environ`` before calling.

    Supported variables:
      - ``PIXIE_DB_PATH`` — overrides :attr:`PixieConfig.db_path`
      - ``PIXIE_DB_ENGINE`` — overrides :attr:`PixieConfig.db_engine`
      - ``PIXIE_DATASET_DIR`` — overrides :attr:`PixieConfig.dataset_dir`
    """
    env = os.environ
    return PixieConfig(
        db_path=env.get("PIXIE_DB_PATH", PixieConfig.db_path),
        db_engine=env.get("PIXIE_DB_ENGINE", PixieConfig.db_engine),
        dataset_dir=env.get("PIXIE_DATASET_DIR", PixieConfig.dataset_dir),
    )
@@ -0,0 +1,11 @@
1
+ """pixie.dataset — named collections of evaluable items.
2
+
3
+ Public API:
4
+ - ``Dataset`` — Pydantic model: name + items
5
+ - ``DatasetStore`` — JSON-file-backed CRUD
6
+ """
7
+
8
+ from pixie.dataset.models import Dataset
9
+ from pixie.dataset.store import DatasetStore
10
+
11
+ __all__ = ["Dataset", "DatasetStore"]
@@ -0,0 +1,21 @@
1
+ """Dataset model — a named collection of evaluable items."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field
6
+
7
+ from pixie.storage.evaluable import Evaluable
8
+
9
+
10
class Dataset(BaseModel):
    """A named collection of evaluable items.

    Attributes:
        name: Unique human-readable name for the dataset.
        items: Ordered list of evaluable entries.
    """

    # Frozen: a Dataset is an immutable snapshot; stores return new copies
    # rather than mutating in place.
    model_config = ConfigDict(frozen=True)

    # Name must be non-empty; uniqueness is enforced by the store, not here.
    name: str = Field(..., min_length=1)
    # A tuple (not a list) keeps the items immutable alongside the frozen config.
    items: tuple[Evaluable, ...] = ()