pixie-qa 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pixie/storage/tree.py ADDED
@@ -0,0 +1,199 @@
1
+ """ObservationNode tree wrapper with traversal and LLM-friendly serialization."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import dataclass, field
7
+ from typing import Any
8
+
9
+ from pixie.instrumentation.spans import (
10
+ AssistantMessage,
11
+ LLMSpan,
12
+ ObserveSpan,
13
+ SystemMessage,
14
+ TextContent,
15
+ ToolResultMessage,
16
+ UserMessage,
17
+ )
18
+
19
+
20
@dataclass
class ObservationNode:
    """Tree node wrapping a span with children for hierarchical traversal."""

    # The wrapped span: a generic observe span or an LLM call span.
    span: ObserveSpan | LLMSpan
    # Child nodes; populated (and sorted by started_at) by ``build_tree``.
    children: list[ObservationNode] = field(default_factory=list)

    # ── Delegated properties ──────────────────────────────────────────────

    @property
    def span_id(self) -> str:
        """Span identifier."""
        return self.span.span_id

    @property
    def trace_id(self) -> str:
        """Trace identifier."""
        return self.span.trace_id

    @property
    def parent_span_id(self) -> str | None:
        """Parent span identifier, or ``None`` for a root span."""
        return self.span.parent_span_id

    @property
    def name(self) -> str:
        """Human-readable name: ``span.name`` for observe, ``request_model`` for LLM."""
        if isinstance(self.span, LLMSpan):
            return self.span.request_model
        return self.span.name or "(unnamed)"

    @property
    def duration_ms(self) -> float:
        """Duration in milliseconds."""
        return self.span.duration_ms

    # ── Search ────────────────────────────────────────────────────────────

    def find(self, name: str) -> list[ObservationNode]:
        """Return all nodes in the subtree where ``node.name == name`` (DFS).

        The subtree includes ``self``; matches are returned in depth-first,
        pre-order traversal order.
        """
        result: list[ObservationNode] = []
        if self.name == name:
            result.append(self)
        for child in self.children:
            result.extend(child.find(name))
        return result

    def find_by_type(self, span_type: type[ObserveSpan] | type[LLMSpan]) -> list[ObservationNode]:
        """Return all nodes in the subtree where ``isinstance(node.span, span_type)``."""
        result: list[ObservationNode] = []
        if isinstance(self.span, span_type):
            result.append(self)
        for child in self.children:
            result.extend(child.find_by_type(span_type))
        return result

    # ── Serialization ─────────────────────────────────────────────────────

    def to_text(self, indent: int = 0) -> str:
        """Serialize the tree to an LLM-friendly indented outline.

        Args:
            indent: Current nesting depth; children render at ``indent + 1``.

        Returns:
            Multi-line outline text for this node and its entire subtree.
        """
        prefix = " " * indent
        if isinstance(self.span, LLMSpan):
            return self._llm_to_text(prefix, indent)
        return self._observe_to_text(prefix, indent)

    def _observe_to_text(self, prefix: str, indent: int) -> str:
        """Render an ``ObserveSpan`` node: header, I/O, error, metadata, children."""
        span: ObserveSpan = self.span  # type: ignore[assignment]
        lines: list[str] = []
        name = span.name or "(unnamed)"
        lines.append(f"{prefix}{name} [{span.duration_ms:.0f}ms]")
        if span.input is not None:
            lines.append(f"{prefix} input: {_format_value(span.input)}")
        if span.output is not None:
            lines.append(f"{prefix} output: {_format_value(span.output)}")
        if span.error is not None:
            lines.append(f"{prefix} <e>{span.error}</e>")
        if span.metadata:
            # default=str keeps serialization from failing on exotic metadata values.
            lines.append(f"{prefix} metadata: {json.dumps(span.metadata, default=str)}")
        for child in self.children:
            lines.append(child.to_text(indent + 1))
        return "\n".join(lines)

    def _llm_to_text(self, prefix: str, indent: int) -> str:
        """Render an ``LLMSpan`` node: model header, messages, tokens, tools, children."""
        span: LLMSpan = self.span  # type: ignore[assignment]
        lines: list[str] = []
        lines.append(f"{prefix}{span.request_model} [{span.provider}, {span.duration_ms:.0f}ms]")

        # Input messages
        if span.input_messages:
            lines.append(f"{prefix} input_messages:")
            for msg in span.input_messages:
                lines.append(f"{prefix} {_format_message(msg)}")

        # Output messages
        if span.output_messages:
            lines.append(f"{prefix} output:")
            for msg in span.output_messages:
                lines.append(f"{prefix} {_format_message(msg)}")

        # Tokens
        token_parts: list[str] = []
        if span.input_tokens > 0 or span.output_tokens > 0:
            token_parts.append(f"{span.input_tokens} in / {span.output_tokens} out")
        if span.cache_read_tokens > 0:
            token_parts.append(f"({span.cache_read_tokens} cache read)")
        if span.cache_creation_tokens > 0:
            token_parts.append(f"({span.cache_creation_tokens} cache creation)")
        # Fix: previously this line was emitted unconditionally, producing a
        # dangling empty "tokens: " entry when the span recorded no usage.
        if token_parts:
            lines.append(f"{prefix} tokens: {' '.join(token_parts)}")

        # Error
        if span.error_type is not None:
            lines.append(f"{prefix} <e>{span.error_type}</e>")

        # Tool definitions
        if span.tool_definitions:
            tool_names = ", ".join(td.name for td in span.tool_definitions)
            lines.append(f"{prefix} tools: [{tool_names}]")

        for child in self.children:
            lines.append(child.to_text(indent + 1))
        return "\n".join(lines)
141
+
142
+
143
+ def _format_value(value: Any) -> str:
144
+ """Format a value for text output."""
145
+ if isinstance(value, (dict, list)):
146
+ return json.dumps(value, default=str)
147
+ return str(value)
148
+
149
+
150
def _format_message(
    msg: SystemMessage | UserMessage | AssistantMessage | ToolResultMessage,
) -> str:
    """Format a single chat message as one line of text output.

    Only ``TextContent`` parts contribute to user/assistant text; an
    assistant message additionally lists its tool-call names, if any.
    Unknown message types fall back to ``str(msg)``.
    """

    def joined_text(content: Any) -> str:
        # Concatenate plain-text parts only; non-text content is skipped.
        return "".join(part.text for part in content if isinstance(part, TextContent))

    if isinstance(msg, SystemMessage):
        return f"system: {msg.content}"
    if isinstance(msg, UserMessage):
        return f"user: {joined_text(msg.content)}"
    if isinstance(msg, AssistantMessage):
        text = f"assistant: {joined_text(msg.content)}"
        if msg.tool_calls:
            call_names = ", ".join(call.name for call in msg.tool_calls)
            text = f"{text} [tool_calls: {call_names}]"
        return text
    if isinstance(msg, ToolResultMessage):
        return f"tool({msg.tool_name}): {msg.content}"
    return str(msg)  # pragma: no cover
169
+
170
+
171
def build_tree(spans: list[ObserveSpan | LLMSpan]) -> list[ObservationNode]:
    """Build a tree from a flat list of spans sharing the same trace.

    Each span is wrapped in an ``ObservationNode`` and linked to its parent
    via ``parent_span_id``. Nodes whose parent id is ``None`` or missing from
    the input become roots. Children of every node, and the roots themselves,
    are sorted by ``started_at`` ascending.

    Args:
        spans: Flat list of spans; input order does not matter.

    Returns:
        The root ``ObservationNode`` objects, sorted by start time.
    """
    by_id = {span.span_id: ObservationNode(span=span) for span in spans}

    roots: list[ObservationNode] = []
    for node in by_id.values():
        pid = node.span.parent_span_id
        parent = by_id.get(pid) if pid is not None else None
        if parent is not None:
            parent.children.append(node)
        else:
            # Orphans (parent absent from this trace) are promoted to roots.
            roots.append(node)

    # Chronological ordering within each sibling group and across roots.
    for node in by_id.values():
        node.children.sort(key=lambda n: n.span.started_at)
    roots.sort(key=lambda n: n.span.started_at)
    return roots
@@ -0,0 +1,162 @@
1
+ Metadata-Version: 2.4
2
+ Name: pixie-qa
3
+ Version: 0.1.0
4
+ Summary: Automated quality assurance for AI applications
5
+ Project-URL: Homepage, https://github.com/yiouli/pixie-qa
6
+ Project-URL: Repository, https://github.com/yiouli/pixie-qa
7
+ Project-URL: Documentation, https://yiouli.github.io/pixie-qa/
8
+ Project-URL: Bug Tracker, https://github.com/yiouli/pixie-qa/issues
9
+ License: MIT License
10
+
11
+ Copyright (c) 2026 Yiou Li
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ License-File: LICENSE
31
+ Keywords: ai,evals,llm,observability,opentelemetry,testing
32
+ Classifier: Development Status :: 4 - Beta
33
+ Classifier: Intended Audience :: Developers
34
+ Classifier: License :: OSI Approved :: MIT License
35
+ Classifier: Programming Language :: Python :: 3
36
+ Classifier: Programming Language :: Python :: 3.11
37
+ Classifier: Programming Language :: Python :: 3.12
38
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
39
+ Classifier: Topic :: Software Development :: Testing
40
+ Requires-Python: >=3.11
41
+ Requires-Dist: autoevals>=0.1.0
42
+ Requires-Dist: jsonpickle>=4.0.0
43
+ Requires-Dist: openinference-instrumentation>=0.1.44
44
+ Requires-Dist: opentelemetry-api>=1.27.0
45
+ Requires-Dist: opentelemetry-sdk>=1.27.0
46
+ Requires-Dist: piccolo[sqlite]>=1.33.0
47
+ Requires-Dist: pydantic>=2.0
48
+ Provides-Extra: all
49
+ Requires-Dist: openinference-instrumentation-anthropic; extra == 'all'
50
+ Requires-Dist: openinference-instrumentation-dspy; extra == 'all'
51
+ Requires-Dist: openinference-instrumentation-google-genai; extra == 'all'
52
+ Requires-Dist: openinference-instrumentation-langchain; extra == 'all'
53
+ Requires-Dist: openinference-instrumentation-openai; extra == 'all'
54
+ Provides-Extra: anthropic
55
+ Requires-Dist: openinference-instrumentation-anthropic; extra == 'anthropic'
56
+ Provides-Extra: dspy
57
+ Requires-Dist: openinference-instrumentation-dspy; extra == 'dspy'
58
+ Provides-Extra: google
59
+ Requires-Dist: openinference-instrumentation-google-genai; extra == 'google'
60
+ Provides-Extra: langchain
61
+ Requires-Dist: openinference-instrumentation-langchain; extra == 'langchain'
62
+ Provides-Extra: openai
63
+ Requires-Dist: openinference-instrumentation-openai; extra == 'openai'
64
+ Description-Content-Type: text/markdown
65
+
66
+ # pixie-qa
67
+
68
+ A Claude skill and Python package for **eval-driven development** of LLM-powered applications.
69
+
70
+ Use this skill to instrument your app, build golden datasets from real runs, write eval-based tests, and catch regressions before they ship — all from a single conversation with Claude.
71
+
72
+ ## What the Skill Does
73
+
74
+ The `eval-driven-dev` skill guides Claude through the full QA loop for LLM applications:
75
+
76
+ 1. **Understand the app** — read the codebase, trace the data flow, learn what the app is supposed to do
77
+ 2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
78
+ 3. **Build a dataset** — save representative traces as test cases with `pixie dataset save`
79
+ 4. **Write eval tests** — generate `test_*.py` files with `assert_dataset_pass` and appropriate evaluators
80
+ 5. **Run the tests** — `pixie-test` to run all evals and report per-case scores
81
+ 6. **Investigate failures** — look up the stored trace for each failure, diagnose, fix, repeat
82
+
83
+ ## Getting Started
84
+
85
+ ### 1. Add the skill to Claude
86
+
87
+ The skill is bundled in this repository. Claude will automatically use it when you ask to evaluate, test, QA, or benchmark an LLM-powered Python project.
88
+
89
+ If you are using an openskills-compatible agent host:
90
+
91
+ ```bash
92
+ npx openskills install anthropics/skills
93
+ ```
94
+
95
+ ### 2. Install the `pixie-qa` package in your project
96
+
97
+ ```bash
98
+ pip install pixie-qa # or: uv add pixie-qa
99
+ ```
100
+
101
+ Provider instrumentation extras:
102
+
103
+ ```bash
104
+ pip install "pixie-qa[openai]" # OpenAI
105
+ pip install "pixie-qa[anthropic]" # Anthropic
106
+ pip install "pixie-qa[langchain]" # LangChain
107
+ pip install "pixie-qa[all]" # all providers
108
+ ```
109
+
110
+ ### 3. Ask Claude to set up evals
111
+
112
+ Open a conversation and describe your project:
113
+
114
+ > "I have a RAG chatbot in `app/chatbot.py`. Help me set up evals to make sure it's giving accurate answers."
115
+
116
+ Claude will read your code, instrument it, build a dataset from a few real runs, write tests, and run them for you.
117
+
118
+ ## Skill Workflow Example
119
+
120
+ Here is a quick summary of what Claude does end-to-end:
121
+
122
+ ```
123
+ # Claude instruments your app entry point
124
+ from pixie import enable_storage
125
+ enable_storage() # one line: creates DB, registers handler
126
+
127
+ # Claude adds @observe on the function to test
128
+ import pixie.instrumentation as px
129
+
130
+ @px.observe(name="answer_question")
131
+ def answer_question(question: str) -> str:
132
+ ...
133
+
134
+ # After running the app with a few real inputs:
135
+ pixie dataset create qa-golden-set
136
+ pixie dataset save qa-golden-set
137
+
138
+ # Claude writes tests/test_qa.py with:
139
+ async def test_factuality():
140
+ await assert_dataset_pass(
141
+ runnable=runnable,
142
+ dataset_name="qa-golden-set",
143
+ evaluators=[FactualityEval()],
144
+ pass_criteria=ScoreThreshold(threshold=0.7, pct=0.8),
145
+ )
146
+
147
+ # Then runs:
148
+ pixie-test -v
149
+ ```
150
+
151
+ ## Repository Structure
152
+
153
+ ```
154
+ pixie/ Python package (instrumentation, storage, evals, dataset, cli)
155
+ specs/ Design specs and architecture docs
156
+ changelogs/ Per-feature change history
157
+ .claude/skills/ Claude skill definitions and benchmarks
158
+ ```
159
+
160
+ ## Python Package
161
+
162
+ The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. For the package API and CLI reference, see [docs/package.md](docs/package.md).
@@ -0,0 +1,39 @@
1
+ pixie/__init__.py,sha256=swukidYwIXnLqDw1LfYz9eIG7-dj2sOfR9AkRgGGsUA,270
2
+ pixie/config.py,sha256=jWqWmBKXazQ0VPzSbHmZH3taN4Sv4fvwYeJmQ5z0LqA,1388
3
+ pixie/cli/__init__.py,sha256=MokbPTq_UfSbFVxYoVkf1kLq89uif310BqLmgXi9yao,192
4
+ pixie/cli/dataset_command.py,sha256=FFYEMyFOJ0iY4UIUa1F0sK1b_tqUuvFXEhBCJUxNxI8,6053
5
+ pixie/cli/main.py,sha256=Hh6sHe1SM5dYuGh5uPDyHCZFBrnuoNlSujmiWcUbngo,5700
6
+ pixie/cli/test_command.py,sha256=PAaY5p8UgI7jFuBybRWPlHTmzAKhXDIJlf9p2Exja4E,1699
7
+ pixie/dataset/__init__.py,sha256=1Vtrli_4WjM8yCFCB1zUfjd6ad0iIskehDuIa1OK6DE,302
8
+ pixie/dataset/models.py,sha256=PsDDTM4TVIDxkGdAQEWgJ0SBXU-RI2Br1ILWtul0uZw,528
9
+ pixie/dataset/store.py,sha256=LEed1a_M6-gjeM_WHFdRPBC6IiD1Qoe0G8sCfwU8c5U,6832
10
+ pixie/evals/__init__.py,sha256=IE3-vHNiG7q0z9dZs4BmCnVOhwJtlgYf7ewFD1tpanU,3654
11
+ pixie/evals/criteria.py,sha256=24yV_lAJg-t34pqhzDt4f5KZ9Dlsyaqg9P6foDzbO4E,2858
12
+ pixie/evals/eval_utils.py,sha256=82E4SU7K9aHCK7pV9fRPdGXTmB5nGKY7AypEw6sIJkI,9206
13
+ pixie/evals/evaluation.py,sha256=22Ml0xYCA11N65tNNgBg1nJra92JQYFSjtDyse594_g,3507
14
+ pixie/evals/runner.py,sha256=Rh3VlKg0JtNwe41A3E-UmKiGqcrSn119AMOudHB-_GM,5557
15
+ pixie/evals/scorers.py,sha256=b1UBYyoRjpY1-BvWNPmM5LKM2EOr-g0XGh5limr8ZnU,22578
16
+ pixie/evals/trace_capture.py,sha256=WZcx_JMLyH2zU6wgsEInU76g2-wttnpUjF66wDTw2lI,2502
17
+ pixie/evals/trace_helpers.py,sha256=mPGTA7ZmX2zahIZ0Lm3ORgut8fryk80jJKth2Eqvtog,1829
18
+ pixie/instrumentation/__init__.py,sha256=U3Q0gbobVoesQu3AUV738_JVg8JFi3IkNMZNqA4EYe4,921
19
+ pixie/instrumentation/context.py,sha256=ehM4Ek5RLnowMnMlphOW_8BjFyOIDazqshkywWCXqdA,3024
20
+ pixie/instrumentation/handler.py,sha256=COvLjWqsm7-KUIMyYV6lY38LHPjC-XYxIi7QTDvXiVU,2583
21
+ pixie/instrumentation/handlers.py,sha256=vYnVT2F5SzxVLbMN5cxrer5IVwsrei0hgMwTYfRosas,2705
22
+ pixie/instrumentation/instrumentors.py,sha256=a1sfHt9qitRw6iThrqIQ3lPNiXKLxbSkeQQBOatTlMg,1211
23
+ pixie/instrumentation/observation.py,sha256=h_oG4K_SDzUiX5_5QTXDYV2Fi06a1go5tC2azcBgQCI,6672
24
+ pixie/instrumentation/processor.py,sha256=wvqefzFnNIu7RFLimsVi7X-l9BGy735l5SyL2gvvP3o,13106
25
+ pixie/instrumentation/queue.py,sha256=7PIsrfW8C7fWbF0H5boRBCs2JKzTqfnzK5_vfnxj5NU,3383
26
+ pixie/instrumentation/spans.py,sha256=nL3siQSO5D2Ia-o2xXqVCTTqbWi37tnDYGl1VMD7LJs,4965
27
+ pixie/storage/__init__.py,sha256=2XutlYhgv2ytKzXmoIoEoFfD_QQsQ6yV6kmZYPY5Bd8,735
28
+ pixie/storage/evaluable.py,sha256=-NS9_ws6ZvOvCPwf-5eg9fQGbZzea6pDxaQcHFKCLkE,4394
29
+ pixie/storage/piccolo_conf.py,sha256=FUZjTSvZ3GbUTKzRQ7q0gzUmflIrSth8rGybFoBLi0k,247
30
+ pixie/storage/serialization.py,sha256=6ybbfneJur8VOnPRT7pDvGZxGh8MwdQEOTtv-9QMZd0,7802
31
+ pixie/storage/store.py,sha256=NOVS8nqHuQw0IvYMiQI1arBvgUeOAwuTz1aAaxgDb2g,9148
32
+ pixie/storage/tables.py,sha256=Y9oTFZAifVvIULY1Uu2wnfe0VfisiD1M2o70ZUXzXP8,734
33
+ pixie/storage/tree.py,sha256=9jZWgTHr9wSDDoDFVH3yOaHhpZiOwfKUgk_mipEV47w,7509
34
+ pixie/storage/piccolo_migrations/__init__.py,sha256=eGjDIi1cgVSySAIZetJxdOeaODoXkISWNYPNcOoJcII,52
35
+ pixie_qa-0.1.0.dist-info/METADATA,sha256=OldsWIEuQAGaftbeAGZr4FBaAfqdDzm4H-Yg_5KRIE8,6577
36
+ pixie_qa-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
37
+ pixie_qa-0.1.0.dist-info/entry_points.txt,sha256=56TDpYuU0wGJd4W0Z56dh5wruEekfWT5XmPCqb5--AU,87
38
+ pixie_qa-0.1.0.dist-info/licenses/LICENSE,sha256=nZoehBpdSXe6iTF2ZWzM-fgXdXECUZ0J8LrW_1tBwyk,1064
39
+ pixie_qa-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ pixie = pixie.cli.main:main
3
+ pixie-test = pixie.cli.test_command:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Yiou Li
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.