pixie-qa 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pixie/storage/tree.py ADDED
@@ -0,0 +1,199 @@
1
+ """ObservationNode tree wrapper with traversal and LLM-friendly serialization."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import dataclass, field
7
+ from typing import Any
8
+
9
+ from pixie.instrumentation.spans import (
10
+ AssistantMessage,
11
+ LLMSpan,
12
+ ObserveSpan,
13
+ SystemMessage,
14
+ TextContent,
15
+ ToolResultMessage,
16
+ UserMessage,
17
+ )
18
+
19
+
20
@dataclass
class ObservationNode:
    """Tree node wrapping a span with children for hierarchical traversal."""

    # The wrapped span: a generic observe span or an LLM call span.
    span: ObserveSpan | LLMSpan
    # Child nodes; populated (and sorted by started_at) by ``build_tree``.
    children: list[ObservationNode] = field(default_factory=list)

    # ── Delegated properties ──────────────────────────────────────────────

    @property
    def span_id(self) -> str:
        """Span identifier."""
        return self.span.span_id

    @property
    def trace_id(self) -> str:
        """Trace identifier."""
        return self.span.trace_id

    @property
    def parent_span_id(self) -> str | None:
        """Parent span identifier, or ``None`` for a root span."""
        return self.span.parent_span_id

    @property
    def name(self) -> str:
        """Human-readable name: ``span.name`` for observe, ``request_model`` for LLM."""
        if isinstance(self.span, LLMSpan):
            return self.span.request_model
        return self.span.name or "(unnamed)"

    @property
    def duration_ms(self) -> float:
        """Duration in milliseconds."""
        return self.span.duration_ms

    # ── Search ────────────────────────────────────────────────────────────

    def find(self, name: str) -> list[ObservationNode]:
        """Return all nodes in the subtree where ``node.name == name`` (DFS).

        The subtree includes ``self``; matches are returned in depth-first,
        pre-order traversal order.
        """
        result: list[ObservationNode] = []
        if self.name == name:
            result.append(self)
        for child in self.children:
            result.extend(child.find(name))
        return result

    def find_by_type(self, span_type: type[ObserveSpan] | type[LLMSpan]) -> list[ObservationNode]:
        """Return all nodes in the subtree where ``isinstance(node.span, span_type)``."""
        result: list[ObservationNode] = []
        if isinstance(self.span, span_type):
            result.append(self)
        for child in self.children:
            result.extend(child.find_by_type(span_type))
        return result

    # ── Serialization ─────────────────────────────────────────────────────

    def to_text(self, indent: int = 0) -> str:
        """Serialize the tree to an LLM-friendly indented outline.

        Args:
            indent: Current nesting depth; children render at ``indent + 1``.

        Returns:
            Multi-line outline text for this node and its entire subtree.
        """
        prefix = " " * indent
        if isinstance(self.span, LLMSpan):
            return self._llm_to_text(prefix, indent)
        return self._observe_to_text(prefix, indent)

    def _observe_to_text(self, prefix: str, indent: int) -> str:
        """Render an ``ObserveSpan`` node: header, I/O, error, metadata, children."""
        span: ObserveSpan = self.span  # type: ignore[assignment]
        lines: list[str] = []
        name = span.name or "(unnamed)"
        lines.append(f"{prefix}{name} [{span.duration_ms:.0f}ms]")
        if span.input is not None:
            lines.append(f"{prefix} input: {_format_value(span.input)}")
        if span.output is not None:
            lines.append(f"{prefix} output: {_format_value(span.output)}")
        if span.error is not None:
            lines.append(f"{prefix} <e>{span.error}</e>")
        if span.metadata:
            # default=str keeps serialization from failing on exotic metadata values.
            lines.append(f"{prefix} metadata: {json.dumps(span.metadata, default=str)}")
        for child in self.children:
            lines.append(child.to_text(indent + 1))
        return "\n".join(lines)

    def _llm_to_text(self, prefix: str, indent: int) -> str:
        """Render an ``LLMSpan`` node: model header, messages, tokens, tools, children."""
        span: LLMSpan = self.span  # type: ignore[assignment]
        lines: list[str] = []
        lines.append(f"{prefix}{span.request_model} [{span.provider}, {span.duration_ms:.0f}ms]")

        # Input messages
        if span.input_messages:
            lines.append(f"{prefix} input_messages:")
            for msg in span.input_messages:
                lines.append(f"{prefix} {_format_message(msg)}")

        # Output messages
        if span.output_messages:
            lines.append(f"{prefix} output:")
            for msg in span.output_messages:
                lines.append(f"{prefix} {_format_message(msg)}")

        # Tokens
        token_parts: list[str] = []
        if span.input_tokens > 0 or span.output_tokens > 0:
            token_parts.append(f"{span.input_tokens} in / {span.output_tokens} out")
        if span.cache_read_tokens > 0:
            token_parts.append(f"({span.cache_read_tokens} cache read)")
        if span.cache_creation_tokens > 0:
            token_parts.append(f"({span.cache_creation_tokens} cache creation)")
        # Fix: previously this line was emitted unconditionally, producing a
        # dangling empty "tokens: " entry when the span recorded no usage.
        if token_parts:
            lines.append(f"{prefix} tokens: {' '.join(token_parts)}")

        # Error
        if span.error_type is not None:
            lines.append(f"{prefix} <e>{span.error_type}</e>")

        # Tool definitions
        if span.tool_definitions:
            tool_names = ", ".join(td.name for td in span.tool_definitions)
            lines.append(f"{prefix} tools: [{tool_names}]")

        for child in self.children:
            lines.append(child.to_text(indent + 1))
        return "\n".join(lines)
141
+
142
+
143
+ def _format_value(value: Any) -> str:
144
+ """Format a value for text output."""
145
+ if isinstance(value, (dict, list)):
146
+ return json.dumps(value, default=str)
147
+ return str(value)
148
+
149
+
150
def _format_message(
    msg: SystemMessage | UserMessage | AssistantMessage | ToolResultMessage,
) -> str:
    """Format a single chat message as one line of text output.

    Only ``TextContent`` parts contribute to user/assistant text; an
    assistant message additionally lists its tool-call names, if any.
    Unknown message types fall back to ``str(msg)``.
    """

    def joined_text(content: Any) -> str:
        # Concatenate plain-text parts only; non-text content is skipped.
        return "".join(part.text for part in content if isinstance(part, TextContent))

    if isinstance(msg, SystemMessage):
        return f"system: {msg.content}"
    if isinstance(msg, UserMessage):
        return f"user: {joined_text(msg.content)}"
    if isinstance(msg, AssistantMessage):
        text = f"assistant: {joined_text(msg.content)}"
        if msg.tool_calls:
            call_names = ", ".join(call.name for call in msg.tool_calls)
            text = f"{text} [tool_calls: {call_names}]"
        return text
    if isinstance(msg, ToolResultMessage):
        return f"tool({msg.tool_name}): {msg.content}"
    return str(msg)  # pragma: no cover
169
+
170
+
171
def build_tree(spans: list[ObserveSpan | LLMSpan]) -> list[ObservationNode]:
    """Build a tree from a flat list of spans sharing the same trace.

    Each span is wrapped in an ``ObservationNode`` and linked to its parent
    via ``parent_span_id``. Nodes whose parent id is ``None`` or missing from
    the input become roots. Children of every node, and the roots themselves,
    are sorted by ``started_at`` ascending.

    Args:
        spans: Flat list of spans; input order does not matter.

    Returns:
        The root ``ObservationNode`` objects, sorted by start time.
    """
    by_id = {span.span_id: ObservationNode(span=span) for span in spans}

    roots: list[ObservationNode] = []
    for node in by_id.values():
        pid = node.span.parent_span_id
        parent = by_id.get(pid) if pid is not None else None
        if parent is not None:
            parent.children.append(node)
        else:
            # Orphans (parent absent from this trace) are promoted to roots.
            roots.append(node)

    # Chronological ordering within each sibling group and across roots.
    for node in by_id.values():
        node.children.sort(key=lambda n: n.span.started_at)
    roots.sort(key=lambda n: n.span.started_at)
    return roots
@@ -0,0 +1,162 @@
1
+ Metadata-Version: 2.4
2
+ Name: pixie-qa
3
+ Version: 0.1.0
4
+ Summary: Automated quality assurance for AI applications
5
+ Project-URL: Homepage, https://github.com/yiouli/pixie-qa
6
+ Project-URL: Repository, https://github.com/yiouli/pixie-qa
7
+ Project-URL: Documentation, https://yiouli.github.io/pixie-qa/
8
+ Project-URL: Bug Tracker, https://github.com/yiouli/pixie-qa/issues
9
+ License: MIT License
10
+
11
+ Copyright (c) 2026 Yiou Li
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ License-File: LICENSE
31
+ Keywords: ai,evals,llm,observability,opentelemetry,testing
32
+ Classifier: Development Status :: 4 - Beta
33
+ Classifier: Intended Audience :: Developers
34
+ Classifier: License :: OSI Approved :: MIT License
35
+ Classifier: Programming Language :: Python :: 3
36
+ Classifier: Programming Language :: Python :: 3.11
37
+ Classifier: Programming Language :: Python :: 3.12
38
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
39
+ Classifier: Topic :: Software Development :: Testing
40
+ Requires-Python: >=3.11
41
+ Requires-Dist: autoevals>=0.1.0
42
+ Requires-Dist: jsonpickle>=4.0.0
43
+ Requires-Dist: openinference-instrumentation>=0.1.44
44
+ Requires-Dist: opentelemetry-api>=1.27.0
45
+ Requires-Dist: opentelemetry-sdk>=1.27.0
46
+ Requires-Dist: piccolo[sqlite]>=1.33.0
47
+ Requires-Dist: pydantic>=2.0
48
+ Provides-Extra: all
49
+ Requires-Dist: openinference-instrumentation-anthropic; extra == 'all'
50
+ Requires-Dist: openinference-instrumentation-dspy; extra == 'all'
51
+ Requires-Dist: openinference-instrumentation-google-genai; extra == 'all'
52
+ Requires-Dist: openinference-instrumentation-langchain; extra == 'all'
53
+ Requires-Dist: openinference-instrumentation-openai; extra == 'all'
54
+ Provides-Extra: anthropic
55
+ Requires-Dist: openinference-instrumentation-anthropic; extra == 'anthropic'
56
+ Provides-Extra: dspy
57
+ Requires-Dist: openinference-instrumentation-dspy; extra == 'dspy'
58
+ Provides-Extra: google
59
+ Requires-Dist: openinference-instrumentation-google-genai; extra == 'google'
60
+ Provides-Extra: langchain
61
+ Requires-Dist: openinference-instrumentation-langchain; extra == 'langchain'
62
+ Provides-Extra: openai
63
+ Requires-Dist: openinference-instrumentation-openai; extra == 'openai'
64
+ Description-Content-Type: text/markdown
65
+
66
+ # pixie-qa
67
+
68
+ A Claude skill and Python package for **eval-driven development** of LLM-powered applications.
69
+
70
+ Use this skill to instrument your app, build golden datasets from real runs, write eval-based tests, and catch regressions before they ship — all from a single conversation with Claude.
71
+
72
+ ## What the Skill Does
73
+
74
+ The `eval-driven-dev` skill guides Claude through the full QA loop for LLM applications:
75
+
76
+ 1. **Understand the app** — read the codebase, trace the data flow, learn what the app is supposed to do
77
+ 2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
78
+ 3. **Build a dataset** — save representative traces as test cases with `pixie dataset save`
79
+ 4. **Write eval tests** — generate `test_*.py` files with `assert_dataset_pass` and appropriate evaluators
80
+ 5. **Run the tests** — `pixie-test` to run all evals and report per-case scores
81
+ 6. **Investigate failures** — look up the stored trace for each failure, diagnose, fix, repeat
82
+
83
+ ## Getting Started
84
+
85
+ ### 1. Add the skill to Claude
86
+
87
+ The skill is bundled in this repository. Claude will automatically use it when you ask to evaluate, test, QA, or benchmark an LLM-powered Python project.
88
+
89
+ If you are using an openskills-compatible agent host:
90
+
91
+ ```bash
92
+ npx openskills install anthropics/skills
93
+ ```
94
+
95
+ ### 2. Install the `pixie-qa` package in your project
96
+
97
+ ```bash
98
+ pip install pixie-qa # or: uv add pixie-qa
99
+ ```
100
+
101
+ Provider instrumentation extras:
102
+
103
+ ```bash
104
+ pip install "pixie-qa[openai]" # OpenAI
105
+ pip install "pixie-qa[anthropic]" # Anthropic
106
+ pip install "pixie-qa[langchain]" # LangChain
107
+ pip install "pixie-qa[all]" # all providers
108
+ ```
109
+
110
+ ### 3. Ask Claude to set up evals
111
+
112
+ Open a conversation and describe your project:
113
+
114
+ > "I have a RAG chatbot in `app/chatbot.py`. Help me set up evals to make sure it's giving accurate answers."
115
+
116
+ Claude will read your code, instrument it, build a dataset from a few real runs, write tests, and run them for you.
117
+
118
+ ## Skill Workflow Example
119
+
120
+ Here is a quick summary of what Claude does end-to-end:
121
+
122
+ ```
123
+ # Claude instruments your app entry point
124
+ from pixie import enable_storage
125
+ enable_storage() # one line: creates DB, registers handler
126
+
127
+ # Claude adds @observe on the function to test
128
+ import pixie.instrumentation as px
129
+
130
+ @px.observe(name="answer_question")
131
+ def answer_question(question: str) -> str:
132
+ ...
133
+
134
+ # After running the app with a few real inputs:
135
+ pixie dataset create qa-golden-set
136
+ pixie dataset save qa-golden-set
137
+
138
+ # Claude writes tests/test_qa.py with:
139
+ async def test_factuality():
140
+ await assert_dataset_pass(
141
+ runnable=runnable,
142
+ dataset_name="qa-golden-set",
143
+ evaluators=[FactualityEval()],
144
+ pass_criteria=ScoreThreshold(threshold=0.7, pct=0.8),
145
+ )
146
+
147
+ # Then runs:
148
+ pixie-test -v
149
+ ```
150
+
151
+ ## Repository Structure
152
+
153
+ ```
154
+ pixie/ Python package (instrumentation, storage, evals, dataset, cli)
155
+ specs/ Design specs and architecture docs
156
+ changelogs/ Per-feature change history
157
+ .claude/skills/ Claude skill definitions and benchmarks
158
+ ```
159
+
160
+ ## Python Package
161
+
162
+ The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. For the package API and CLI reference, see [docs/package.md](docs/package.md).
@@ -0,0 +1,39 @@
1
+ pixie/__init__.py,sha256=swukidYwIXnLqDw1LfYz9eIG7-dj2sOfR9AkRgGGsUA,270
2
+ pixie/config.py,sha256=jWqWmBKXazQ0VPzSbHmZH3taN4Sv4fvwYeJmQ5z0LqA,1388
3
+ pixie/cli/__init__.py,sha256=MokbPTq_UfSbFVxYoVkf1kLq89uif310BqLmgXi9yao,192
4
+ pixie/cli/dataset_command.py,sha256=FFYEMyFOJ0iY4UIUa1F0sK1b_tqUuvFXEhBCJUxNxI8,6053
5
+ pixie/cli/main.py,sha256=Hh6sHe1SM5dYuGh5uPDyHCZFBrnuoNlSujmiWcUbngo,5700
6
+ pixie/cli/test_command.py,sha256=PAaY5p8UgI7jFuBybRWPlHTmzAKhXDIJlf9p2Exja4E,1699
7
+ pixie/dataset/__init__.py,sha256=1Vtrli_4WjM8yCFCB1zUfjd6ad0iIskehDuIa1OK6DE,302
8
+ pixie/dataset/models.py,sha256=PsDDTM4TVIDxkGdAQEWgJ0SBXU-RI2Br1ILWtul0uZw,528
9
+ pixie/dataset/store.py,sha256=LEed1a_M6-gjeM_WHFdRPBC6IiD1Qoe0G8sCfwU8c5U,6832
10
+ pixie/evals/__init__.py,sha256=IE3-vHNiG7q0z9dZs4BmCnVOhwJtlgYf7ewFD1tpanU,3654
11
+ pixie/evals/criteria.py,sha256=24yV_lAJg-t34pqhzDt4f5KZ9Dlsyaqg9P6foDzbO4E,2858
12
+ pixie/evals/eval_utils.py,sha256=82E4SU7K9aHCK7pV9fRPdGXTmB5nGKY7AypEw6sIJkI,9206
13
+ pixie/evals/evaluation.py,sha256=22Ml0xYCA11N65tNNgBg1nJra92JQYFSjtDyse594_g,3507
14
+ pixie/evals/runner.py,sha256=Rh3VlKg0JtNwe41A3E-UmKiGqcrSn119AMOudHB-_GM,5557
15
+ pixie/evals/scorers.py,sha256=b1UBYyoRjpY1-BvWNPmM5LKM2EOr-g0XGh5limr8ZnU,22578
16
+ pixie/evals/trace_capture.py,sha256=WZcx_JMLyH2zU6wgsEInU76g2-wttnpUjF66wDTw2lI,2502
17
+ pixie/evals/trace_helpers.py,sha256=mPGTA7ZmX2zahIZ0Lm3ORgut8fryk80jJKth2Eqvtog,1829
18
+ pixie/instrumentation/__init__.py,sha256=U3Q0gbobVoesQu3AUV738_JVg8JFi3IkNMZNqA4EYe4,921
19
+ pixie/instrumentation/context.py,sha256=ehM4Ek5RLnowMnMlphOW_8BjFyOIDazqshkywWCXqdA,3024
20
+ pixie/instrumentation/handler.py,sha256=COvLjWqsm7-KUIMyYV6lY38LHPjC-XYxIi7QTDvXiVU,2583
21
+ pixie/instrumentation/handlers.py,sha256=vYnVT2F5SzxVLbMN5cxrer5IVwsrei0hgMwTYfRosas,2705
22
+ pixie/instrumentation/instrumentors.py,sha256=a1sfHt9qitRw6iThrqIQ3lPNiXKLxbSkeQQBOatTlMg,1211
23
+ pixie/instrumentation/observation.py,sha256=h_oG4K_SDzUiX5_5QTXDYV2Fi06a1go5tC2azcBgQCI,6672
24
+ pixie/instrumentation/processor.py,sha256=wvqefzFnNIu7RFLimsVi7X-l9BGy735l5SyL2gvvP3o,13106
25
+ pixie/instrumentation/queue.py,sha256=7PIsrfW8C7fWbF0H5boRBCs2JKzTqfnzK5_vfnxj5NU,3383
26
+ pixie/instrumentation/spans.py,sha256=nL3siQSO5D2Ia-o2xXqVCTTqbWi37tnDYGl1VMD7LJs,4965
27
+ pixie/storage/__init__.py,sha256=2XutlYhgv2ytKzXmoIoEoFfD_QQsQ6yV6kmZYPY5Bd8,735
28
+ pixie/storage/evaluable.py,sha256=-NS9_ws6ZvOvCPwf-5eg9fQGbZzea6pDxaQcHFKCLkE,4394
29
+ pixie/storage/piccolo_conf.py,sha256=FUZjTSvZ3GbUTKzRQ7q0gzUmflIrSth8rGybFoBLi0k,247
30
+ pixie/storage/serialization.py,sha256=6ybbfneJur8VOnPRT7pDvGZxGh8MwdQEOTtv-9QMZd0,7802
31
+ pixie/storage/store.py,sha256=NOVS8nqHuQw0IvYMiQI1arBvgUeOAwuTz1aAaxgDb2g,9148
32
+ pixie/storage/tables.py,sha256=Y9oTFZAifVvIULY1Uu2wnfe0VfisiD1M2o70ZUXzXP8,734
33
+ pixie/storage/tree.py,sha256=9jZWgTHr9wSDDoDFVH3yOaHhpZiOwfKUgk_mipEV47w,7509
34
+ pixie/storage/piccolo_migrations/__init__.py,sha256=eGjDIi1cgVSySAIZetJxdOeaODoXkISWNYPNcOoJcII,52
35
+ pixie_qa-0.1.0.dist-info/METADATA,sha256=OldsWIEuQAGaftbeAGZr4FBaAfqdDzm4H-Yg_5KRIE8,6577
36
+ pixie_qa-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
37
+ pixie_qa-0.1.0.dist-info/entry_points.txt,sha256=56TDpYuU0wGJd4W0Z56dh5wruEekfWT5XmPCqb5--AU,87
38
+ pixie_qa-0.1.0.dist-info/licenses/LICENSE,sha256=nZoehBpdSXe6iTF2ZWzM-fgXdXECUZ0J8LrW_1tBwyk,1064
39
+ pixie_qa-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ pixie = pixie.cli.main:main
3
+ pixie-test = pixie.cli.test_command:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Yiou Li
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.