pixie-qa 0.4.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pixie_qa-0.4.0 → pixie_qa-0.5.0}/PKG-INFO +15 -11
- {pixie_qa-0.4.0 → pixie_qa-0.5.0}/README.md +14 -9
- pixie_qa-0.5.0/pixie/__init__.py +90 -0
- pixie_qa-0.5.0/pixie/assets/webui.html +64 -0
- pixie_qa-0.5.0/pixie/cli/__init__.py +11 -0
- {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/cli/analyze_command.py +3 -3
- pixie_qa-0.5.0/pixie/cli/format_command.py +223 -0
- pixie_qa-0.5.0/pixie/cli/main.py +202 -0
- pixie_qa-0.5.0/pixie/cli/test_command.py +178 -0
- pixie_qa-0.5.0/pixie/cli/trace_command.py +128 -0
- {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/config.py +8 -8
- pixie_qa-0.5.0/pixie/eval/__init__.py +143 -0
- pixie_qa-0.5.0/pixie/eval/evaluable.py +100 -0
- {pixie_qa-0.4.0/pixie/evals → pixie_qa-0.5.0/pixie/eval}/evaluation.py +8 -14
- {pixie_qa-0.4.0/pixie/evals → pixie_qa-0.5.0/pixie/eval}/llm_evaluator.py +35 -19
- {pixie_qa-0.4.0/pixie/evals → pixie_qa-0.5.0/pixie/eval}/rate_limiter.py +1 -1
- {pixie_qa-0.4.0/pixie/evals → pixie_qa-0.5.0/pixie/eval}/scorers.py +53 -50
- pixie_qa-0.5.0/pixie/harness/__init__.py +8 -0
- pixie_qa-0.5.0/pixie/harness/runnable.py +133 -0
- pixie_qa-0.5.0/pixie/harness/runner.py +813 -0
- pixie_qa-0.5.0/pixie/instrumentation/__init__.py +99 -0
- pixie_qa-0.5.0/pixie/instrumentation/llm_tracing.py +818 -0
- pixie_qa-0.5.0/pixie/instrumentation/wrap.py +323 -0
- pixie_qa-0.5.0/pixie/web/__init__.py +7 -0
- {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/web/app.py +15 -4
- {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/web/watcher.py +6 -3
- {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pyproject.toml +3 -4
- pixie_qa-0.4.0/pixie/__init__.py +0 -113
- pixie_qa-0.4.0/pixie/assets/webui.html +0 -64
- pixie_qa-0.4.0/pixie/cli/__init__.py +0 -6
- pixie_qa-0.4.0/pixie/cli/dag_command.py +0 -75
- pixie_qa-0.4.0/pixie/cli/dataset_command.py +0 -193
- pixie_qa-0.4.0/pixie/cli/main.py +0 -456
- pixie_qa-0.4.0/pixie/cli/test_command.py +0 -257
- pixie_qa-0.4.0/pixie/cli/trace_command.py +0 -294
- pixie_qa-0.4.0/pixie/dag/__init__.py +0 -400
- pixie_qa-0.4.0/pixie/dag/trace_check.py +0 -183
- pixie_qa-0.4.0/pixie/dataset/__init__.py +0 -11
- pixie_qa-0.4.0/pixie/dataset/models.py +0 -21
- pixie_qa-0.4.0/pixie/dataset/store.py +0 -212
- pixie_qa-0.4.0/pixie/evals/__init__.py +0 -184
- pixie_qa-0.4.0/pixie/evals/criteria.py +0 -61
- pixie_qa-0.4.0/pixie/evals/dataset_runner.py +0 -495
- pixie_qa-0.4.0/pixie/evals/eval_utils.py +0 -334
- pixie_qa-0.4.0/pixie/evals/scorecard.py +0 -252
- pixie_qa-0.4.0/pixie/evals/trace_capture.py +0 -70
- pixie_qa-0.4.0/pixie/evals/trace_helpers.py +0 -57
- pixie_qa-0.4.0/pixie/instrumentation/__init__.py +0 -80
- pixie_qa-0.4.0/pixie/instrumentation/context.py +0 -86
- pixie_qa-0.4.0/pixie/instrumentation/handler.py +0 -72
- pixie_qa-0.4.0/pixie/instrumentation/handlers.py +0 -105
- pixie_qa-0.4.0/pixie/instrumentation/instrumentors.py +0 -47
- pixie_qa-0.4.0/pixie/instrumentation/observation.py +0 -217
- pixie_qa-0.4.0/pixie/instrumentation/processor.py +0 -366
- pixie_qa-0.4.0/pixie/instrumentation/queue.py +0 -88
- pixie_qa-0.4.0/pixie/instrumentation/spans.py +0 -165
- pixie_qa-0.4.0/pixie/storage/__init__.py +0 -27
- pixie_qa-0.4.0/pixie/storage/evaluable.py +0 -140
- pixie_qa-0.4.0/pixie/storage/piccolo_conf.py +0 -10
- pixie_qa-0.4.0/pixie/storage/piccolo_migrations/__init__.py +0 -1
- pixie_qa-0.4.0/pixie/storage/serialization.py +0 -227
- pixie_qa-0.4.0/pixie/storage/store.py +0 -231
- pixie_qa-0.4.0/pixie/storage/tables.py +0 -21
- pixie_qa-0.4.0/pixie/storage/tree.py +0 -199
- pixie_qa-0.4.0/pixie/web/__init__.py +0 -1
- {pixie_qa-0.4.0 → pixie_qa-0.5.0}/.gitignore +0 -0
- {pixie_qa-0.4.0 → pixie_qa-0.5.0}/LICENSE +0 -0
- {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/assets/mock-data.json +0 -0
- {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/cli/init_command.py +0 -0
- {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/cli/start_command.py +0 -0
- {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/favicon.png +0 -0
- {pixie_qa-0.4.0/pixie/evals/test_result.py → pixie_qa-0.5.0/pixie/harness/run_result.py} +0 -0
- {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/web/server.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pixie-qa
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Automated quality assurance for AI applications
|
|
5
5
|
Project-URL: Homepage, https://github.com/yiouli/pixie-qa
|
|
6
6
|
Project-URL: Repository, https://github.com/yiouli/pixie-qa
|
|
@@ -44,7 +44,6 @@ Requires-Dist: openai>=2.29.0
|
|
|
44
44
|
Requires-Dist: openinference-instrumentation>=0.1.44
|
|
45
45
|
Requires-Dist: opentelemetry-api>=1.27.0
|
|
46
46
|
Requires-Dist: opentelemetry-sdk>=1.27.0
|
|
47
|
-
Requires-Dist: piccolo[sqlite]>=1.33.0
|
|
48
47
|
Requires-Dist: pydantic>=2.0
|
|
49
48
|
Requires-Dist: python-dotenv>=1.2.2
|
|
50
49
|
Requires-Dist: starlette>=1.0.0
|
|
@@ -70,20 +69,19 @@ Description-Content-Type: text/markdown
|
|
|
70
69
|
|
|
71
70
|
# pixie-qa
|
|
72
71
|
|
|
73
|
-
An agent skill that
|
|
72
|
+
An agent skill that makes coding agents the QA engineer for LLM applications.
|
|
74
73
|
|
|
75
74
|
## What the Skill Does
|
|
76
75
|
|
|
77
76
|
The `qa-eval` skill guides your coding agent through the full eval-based QA loop for LLM applications:
|
|
78
77
|
|
|
79
78
|
1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
|
|
80
|
-
2. **Instrument it** —
|
|
81
|
-
3. **Build a dataset** —
|
|
82
|
-
4. **Write eval tests** — generate `test_*.py` files with
|
|
83
|
-
5. **
|
|
84
|
-
6. **
|
|
85
|
-
7. **
|
|
86
|
-
8. **Investigate failures** — look up the stored trace for each failure, diagnose, fix, repeat
|
|
79
|
+
2. **Instrument it** — use `wrap()` for data-object tracing and OpenInference auto-instrumentation for LLM span capture
|
|
80
|
+
3. **Build a dataset** — create JSON datasets of representative inputs and expected outputs
|
|
81
|
+
4. **Write eval tests** — generate `test_*.py` files with appropriate evaluators
|
|
82
|
+
5. **Run the tests** — `pixie test` to run all evals and report per-case scores
|
|
83
|
+
6. **Analyse results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
|
|
84
|
+
7. **Investigate failures** — diagnose failures, fix, repeat
|
|
87
85
|
|
|
88
86
|
## Getting Started
|
|
89
87
|
|
|
@@ -105,7 +103,13 @@ Your coding agent will read your code, instrument it, build a dataset from a few
|
|
|
105
103
|
|
|
106
104
|
## Python Package
|
|
107
105
|
|
|
108
|
-
The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project.
|
|
106
|
+
The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. API docs are auto-generated by `pdoc3` into [docs/pixie/index.md](docs/pixie/index.md) via pre-commit. The markdown renderer uses [scripts/pdoc_templates/text.mako](scripts/pdoc_templates/text.mako) so async functions and methods are explicitly shown as `async def` in signatures.
|
|
107
|
+
|
|
108
|
+
Install hooks once per clone:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
uv run pre-commit install
|
|
112
|
+
```
|
|
109
113
|
|
|
110
114
|
## Web UI
|
|
111
115
|
|
|
@@ -1,19 +1,18 @@
|
|
|
1
1
|
# pixie-qa
|
|
2
2
|
|
|
3
|
-
An agent skill that
|
|
3
|
+
An agent skill that makes coding agents the QA engineer for LLM applications.
|
|
4
4
|
|
|
5
5
|
## What the Skill Does
|
|
6
6
|
|
|
7
7
|
The `qa-eval` skill guides your coding agent through the full eval-based QA loop for LLM applications:
|
|
8
8
|
|
|
9
9
|
1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
|
|
10
|
-
2. **Instrument it** —
|
|
11
|
-
3. **Build a dataset** —
|
|
12
|
-
4. **Write eval tests** — generate `test_*.py` files with
|
|
13
|
-
5. **
|
|
14
|
-
6. **
|
|
15
|
-
7. **
|
|
16
|
-
8. **Investigate failures** — look up the stored trace for each failure, diagnose, fix, repeat
|
|
10
|
+
2. **Instrument it** — use `wrap()` for data-object tracing and OpenInference auto-instrumentation for LLM span capture
|
|
11
|
+
3. **Build a dataset** — create JSON datasets of representative inputs and expected outputs
|
|
12
|
+
4. **Write eval tests** — generate `test_*.py` files with appropriate evaluators
|
|
13
|
+
5. **Run the tests** — `pixie test` to run all evals and report per-case scores
|
|
14
|
+
6. **Analyse results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
|
|
15
|
+
7. **Investigate failures** — diagnose failures, fix, repeat
|
|
17
16
|
|
|
18
17
|
## Getting Started
|
|
19
18
|
|
|
@@ -35,7 +34,13 @@ Your coding agent will read your code, instrument it, build a dataset from a few
|
|
|
35
34
|
|
|
36
35
|
## Python Package
|
|
37
36
|
|
|
38
|
-
The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project.
|
|
37
|
+
The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. API docs are auto-generated by `pdoc3` into [docs/pixie/index.md](docs/pixie/index.md) via pre-commit. The markdown renderer uses [scripts/pdoc_templates/text.mako](scripts/pdoc_templates/text.mako) so async functions and methods are explicitly shown as `async def` in signatures.
|
|
38
|
+
|
|
39
|
+
Install hooks once per clone:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
uv run pre-commit install
|
|
43
|
+
```
|
|
39
44
|
|
|
40
45
|
## Web UI
|
|
41
46
|
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""pixie — automated quality assurance for AI applications.
|
|
2
|
+
|
|
3
|
+
Re-exports the full public API so users can ``from pixie import ...``
|
|
4
|
+
for every commonly used symbol without needing submodule paths.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from pixie.eval.evaluable import Evaluable, TestCase
|
|
8
|
+
from pixie.eval.evaluation import Evaluation, Evaluator, evaluate
|
|
9
|
+
from pixie.eval.llm_evaluator import create_llm_evaluator
|
|
10
|
+
from pixie.eval.scorers import (
|
|
11
|
+
AnswerCorrectness,
|
|
12
|
+
AnswerRelevancy,
|
|
13
|
+
AutoevalsAdapter,
|
|
14
|
+
Battle,
|
|
15
|
+
ClosedQA,
|
|
16
|
+
ContextRelevancy,
|
|
17
|
+
EmbeddingSimilarity,
|
|
18
|
+
ExactMatch,
|
|
19
|
+
Factuality,
|
|
20
|
+
Faithfulness,
|
|
21
|
+
Humor,
|
|
22
|
+
JSONDiff,
|
|
23
|
+
LevenshteinMatch,
|
|
24
|
+
ListContains,
|
|
25
|
+
Moderation,
|
|
26
|
+
NumericDiff,
|
|
27
|
+
Possible,
|
|
28
|
+
Security,
|
|
29
|
+
Sql,
|
|
30
|
+
Summary,
|
|
31
|
+
Translation,
|
|
32
|
+
ValidJSON,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
# -- Harness ------------------------------------------------------------------
|
|
36
|
+
from pixie.harness.runnable import Runnable
|
|
37
|
+
|
|
38
|
+
# -- Instrumentation ----------------------------------------------------------
|
|
39
|
+
from pixie.instrumentation.llm_tracing import (
|
|
40
|
+
add_handler,
|
|
41
|
+
enable_llm_tracing,
|
|
42
|
+
flush,
|
|
43
|
+
remove_handler,
|
|
44
|
+
)
|
|
45
|
+
from pixie.instrumentation.wrap import (
|
|
46
|
+
WrappedData,
|
|
47
|
+
wrap,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
__all__ = [
|
|
51
|
+
# Instrumentation
|
|
52
|
+
"WrappedData",
|
|
53
|
+
"flush",
|
|
54
|
+
"enable_llm_tracing",
|
|
55
|
+
"add_handler",
|
|
56
|
+
"remove_handler",
|
|
57
|
+
"wrap",
|
|
58
|
+
# Harness
|
|
59
|
+
"Runnable",
|
|
60
|
+
# Eval data models
|
|
61
|
+
"Evaluable",
|
|
62
|
+
"TestCase",
|
|
63
|
+
"Evaluation",
|
|
64
|
+
"Evaluator",
|
|
65
|
+
"evaluate",
|
|
66
|
+
"create_llm_evaluator",
|
|
67
|
+
# Pre-made evaluators (autoevals adapters)
|
|
68
|
+
"AnswerCorrectness",
|
|
69
|
+
"AnswerRelevancy",
|
|
70
|
+
"AutoevalsAdapter",
|
|
71
|
+
"Battle",
|
|
72
|
+
"ClosedQA",
|
|
73
|
+
"ContextRelevancy",
|
|
74
|
+
"EmbeddingSimilarity",
|
|
75
|
+
"ExactMatch",
|
|
76
|
+
"Factuality",
|
|
77
|
+
"Faithfulness",
|
|
78
|
+
"Humor",
|
|
79
|
+
"JSONDiff",
|
|
80
|
+
"LevenshteinMatch",
|
|
81
|
+
"ListContains",
|
|
82
|
+
"Moderation",
|
|
83
|
+
"NumericDiff",
|
|
84
|
+
"Possible",
|
|
85
|
+
"Security",
|
|
86
|
+
"Sql",
|
|
87
|
+
"Summary",
|
|
88
|
+
"Translation",
|
|
89
|
+
"ValidJSON",
|
|
90
|
+
]
|