pixie-qa 0.2.2__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixie_qa-0.5.0/.gitignore +8 -0
- {pixie_qa-0.2.2 → pixie_qa-0.5.0}/PKG-INFO +41 -8
- pixie_qa-0.5.0/README.md +67 -0
- pixie_qa-0.5.0/pixie/__init__.py +90 -0
- pixie_qa-0.5.0/pixie/assets/mock-data.json +407 -0
- pixie_qa-0.5.0/pixie/assets/webui.html +64 -0
- pixie_qa-0.5.0/pixie/cli/__init__.py +11 -0
- pixie_qa-0.5.0/pixie/cli/analyze_command.py +156 -0
- pixie_qa-0.5.0/pixie/cli/format_command.py +223 -0
- pixie_qa-0.5.0/pixie/cli/init_command.py +55 -0
- pixie_qa-0.5.0/pixie/cli/main.py +202 -0
- pixie_qa-0.5.0/pixie/cli/start_command.py +43 -0
- pixie_qa-0.5.0/pixie/cli/test_command.py +178 -0
- pixie_qa-0.5.0/pixie/cli/trace_command.py +128 -0
- pixie_qa-0.5.0/pixie/config.py +130 -0
- pixie_qa-0.5.0/pixie/eval/__init__.py +143 -0
- pixie_qa-0.5.0/pixie/eval/evaluable.py +100 -0
- {pixie_qa-0.2.2/pixie/evals → pixie_qa-0.5.0/pixie/eval}/evaluation.py +15 -11
- {pixie_qa-0.2.2/pixie/evals → pixie_qa-0.5.0/pixie/eval}/llm_evaluator.py +35 -19
- pixie_qa-0.5.0/pixie/eval/rate_limiter.py +140 -0
- {pixie_qa-0.2.2/pixie/evals → pixie_qa-0.5.0/pixie/eval}/scorers.py +268 -123
- pixie_qa-0.5.0/pixie/harness/__init__.py +8 -0
- pixie_qa-0.5.0/pixie/harness/run_result.py +239 -0
- pixie_qa-0.5.0/pixie/harness/runnable.py +133 -0
- pixie_qa-0.5.0/pixie/harness/runner.py +813 -0
- pixie_qa-0.5.0/pixie/instrumentation/__init__.py +99 -0
- pixie_qa-0.5.0/pixie/instrumentation/llm_tracing.py +818 -0
- pixie_qa-0.5.0/pixie/instrumentation/wrap.py +323 -0
- pixie_qa-0.5.0/pixie/web/__init__.py +7 -0
- pixie_qa-0.5.0/pixie/web/app.py +266 -0
- pixie_qa-0.5.0/pixie/web/server.py +369 -0
- pixie_qa-0.5.0/pixie/web/watcher.py +102 -0
- {pixie_qa-0.2.2 → pixie_qa-0.5.0}/pyproject.toml +24 -4
- pixie_qa-0.2.2/.github/copilot-instructions.md +0 -632
- pixie_qa-0.2.2/.github/workflows/publish.yml +0 -80
- pixie_qa-0.2.2/.gitignore +0 -4
- pixie_qa-0.2.2/README.md +0 -36
- pixie_qa-0.2.2/changelogs/async-handler-processing.md +0 -96
- pixie_qa-0.2.2/changelogs/autoevals-adapters.md +0 -39
- pixie_qa-0.2.2/changelogs/cli-dataset-commands.md +0 -37
- pixie_qa-0.2.2/changelogs/dataset-management.md +0 -91
- pixie_qa-0.2.2/changelogs/deep-research-demo.md +0 -43
- pixie_qa-0.2.2/changelogs/eval-harness.md +0 -128
- pixie_qa-0.2.2/changelogs/expected-output-in-evals.md +0 -42
- pixie_qa-0.2.2/changelogs/instrumentation-module-implementation.md +0 -55
- pixie_qa-0.2.2/changelogs/loud-failure-mode.md +0 -58
- pixie_qa-0.2.2/changelogs/manual-instrumentation-usability.md +0 -56
- pixie_qa-0.2.2/changelogs/observation-store-implementation.md +0 -53
- pixie_qa-0.2.2/changelogs/observe-sensitive-field-stripping.md +0 -22
- pixie_qa-0.2.2/changelogs/pixie-directory-and-skill-improvements.md +0 -63
- pixie_qa-0.2.2/changelogs/pixie-test-e2e-suite.md +0 -69
- pixie_qa-0.2.2/changelogs/root-package-exports-and-trace-id.md +0 -58
- pixie_qa-0.2.2/changelogs/scorecard-branding-and-skill-version-check.md +0 -41
- pixie_qa-0.2.2/changelogs/scorecard-eval-detail-dialog.md +0 -28
- pixie_qa-0.2.2/changelogs/skill-v2-and-rootdir-discovery.md +0 -76
- pixie_qa-0.2.2/changelogs/test-scorecard.md +0 -54
- pixie_qa-0.2.2/changelogs/usability-utils.md +0 -60
- pixie_qa-0.2.2/docs/package.md +0 -233
- pixie_qa-0.2.2/pixie/__init__.py +0 -110
- pixie_qa-0.2.2/pixie/cli/__init__.py +0 -6
- pixie_qa-0.2.2/pixie/cli/dataset_command.py +0 -193
- pixie_qa-0.2.2/pixie/cli/main.py +0 -307
- pixie_qa-0.2.2/pixie/cli/test_command.py +0 -120
- pixie_qa-0.2.2/pixie/cli/trace_command.py +0 -186
- pixie_qa-0.2.2/pixie/config.py +0 -54
- pixie_qa-0.2.2/pixie/dataset/__init__.py +0 -11
- pixie_qa-0.2.2/pixie/dataset/models.py +0 -21
- pixie_qa-0.2.2/pixie/dataset/store.py +0 -212
- pixie_qa-0.2.2/pixie/evals/__init__.py +0 -121
- pixie_qa-0.2.2/pixie/evals/criteria.py +0 -77
- pixie_qa-0.2.2/pixie/evals/eval_utils.py +0 -358
- pixie_qa-0.2.2/pixie/evals/runner.py +0 -278
- pixie_qa-0.2.2/pixie/evals/scorecard.py +0 -916
- pixie_qa-0.2.2/pixie/evals/trace_capture.py +0 -70
- pixie_qa-0.2.2/pixie/evals/trace_helpers.py +0 -57
- pixie_qa-0.2.2/pixie/instrumentation/__init__.py +0 -49
- pixie_qa-0.2.2/pixie/instrumentation/context.py +0 -86
- pixie_qa-0.2.2/pixie/instrumentation/handler.py +0 -72
- pixie_qa-0.2.2/pixie/instrumentation/handlers.py +0 -105
- pixie_qa-0.2.2/pixie/instrumentation/instrumentors.py +0 -31
- pixie_qa-0.2.2/pixie/instrumentation/observation.py +0 -217
- pixie_qa-0.2.2/pixie/instrumentation/processor.py +0 -366
- pixie_qa-0.2.2/pixie/instrumentation/queue.py +0 -88
- pixie_qa-0.2.2/pixie/instrumentation/spans.py +0 -165
- pixie_qa-0.2.2/pixie/storage/__init__.py +0 -27
- pixie_qa-0.2.2/pixie/storage/evaluable.py +0 -138
- pixie_qa-0.2.2/pixie/storage/piccolo_conf.py +0 -10
- pixie_qa-0.2.2/pixie/storage/piccolo_migrations/__init__.py +0 -1
- pixie_qa-0.2.2/pixie/storage/serialization.py +0 -227
- pixie_qa-0.2.2/pixie/storage/store.py +0 -231
- pixie_qa-0.2.2/pixie/storage/tables.py +0 -21
- pixie_qa-0.2.2/pixie/storage/tree.py +0 -199
- pixie_qa-0.2.2/skills/eval-driven-dev/SKILL.md +0 -378
- pixie_qa-0.2.2/skills/eval-driven-dev/references/dataset-generation.md +0 -235
- pixie_qa-0.2.2/skills/eval-driven-dev/references/eval-tests.md +0 -241
- pixie_qa-0.2.2/skills/eval-driven-dev/references/instrumentation.md +0 -174
- pixie_qa-0.2.2/skills/eval-driven-dev/references/investigation.md +0 -146
- pixie_qa-0.2.2/skills/eval-driven-dev/references/pixie-api.md +0 -257
- pixie_qa-0.2.2/skills/eval-driven-dev/references/run-harness-patterns.md +0 -281
- pixie_qa-0.2.2/skills/eval-driven-dev/references/understanding-app.md +0 -201
- pixie_qa-0.2.2/specs/agent-skill-1.md +0 -25
- pixie_qa-0.2.2/specs/agent-skill.md +0 -71
- pixie_qa-0.2.2/specs/autoevals-adapters.md +0 -301
- pixie_qa-0.2.2/specs/dataset-management.md +0 -727
- pixie_qa-0.2.2/specs/evals-harness.md +0 -649
- pixie_qa-0.2.2/specs/expected-output-in-evals.md +0 -141
- pixie_qa-0.2.2/specs/instrumentation.md +0 -726
- pixie_qa-0.2.2/specs/manual-instrumentation-usability.md +0 -767
- pixie_qa-0.2.2/specs/storage.md +0 -473
- pixie_qa-0.2.2/specs/usability-utils.md +0 -327
- pixie_qa-0.2.2/tests/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/cli/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/cli/conftest.py +0 -15
- pixie_qa-0.2.2/tests/pixie/cli/e2e_cases.json +0 -183
- pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/conftest.py +0 -9
- pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json +0 -45
- pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/mock_evaluators.py +0 -156
- pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/test_customer_faq.py +0 -106
- pixie_qa-0.2.2/tests/pixie/cli/test_dataset_command.py +0 -412
- pixie_qa-0.2.2/tests/pixie/cli/test_e2e_pixie_test.py +0 -343
- pixie_qa-0.2.2/tests/pixie/cli/test_main.py +0 -261
- pixie_qa-0.2.2/tests/pixie/cli/test_trace_command.py +0 -324
- pixie_qa-0.2.2/tests/pixie/dataset/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/dataset/test_models.py +0 -64
- pixie_qa-0.2.2/tests/pixie/dataset/test_store.py +0 -222
- pixie_qa-0.2.2/tests/pixie/evals/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/evals/test_criteria.py +0 -116
- pixie_qa-0.2.2/tests/pixie/evals/test_eval_utils.py +0 -666
- pixie_qa-0.2.2/tests/pixie/evals/test_evaluation.py +0 -186
- pixie_qa-0.2.2/tests/pixie/evals/test_llm_evaluator.py +0 -235
- pixie_qa-0.2.2/tests/pixie/evals/test_runner.py +0 -452
- pixie_qa-0.2.2/tests/pixie/evals/test_scorecard.py +0 -487
- pixie_qa-0.2.2/tests/pixie/evals/test_scorers.py +0 -558
- pixie_qa-0.2.2/tests/pixie/evals/test_trace_capture.py +0 -205
- pixie_qa-0.2.2/tests/pixie/evals/test_trace_helpers.py +0 -154
- pixie_qa-0.2.2/tests/pixie/instrumentation/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/instrumentation/conftest.py +0 -35
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_context.py +0 -157
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_handler.py +0 -192
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_integration.py +0 -208
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_observation.py +0 -196
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_processor.py +0 -560
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_queue.py +0 -223
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_spans.py +0 -254
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_storage_handler.py +0 -108
- pixie_qa-0.2.2/tests/pixie/observation_store/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/observation_store/conftest.py +0 -231
- pixie_qa-0.2.2/tests/pixie/observation_store/test_evaluable.py +0 -191
- pixie_qa-0.2.2/tests/pixie/observation_store/test_serialization.py +0 -156
- pixie_qa-0.2.2/tests/pixie/observation_store/test_store.py +0 -289
- pixie_qa-0.2.2/tests/pixie/observation_store/test_tree.py +0 -248
- pixie_qa-0.2.2/tests/pixie/test_config.py +0 -73
- pixie_qa-0.2.2/tests/pixie/test_init.py +0 -157
- {pixie_qa-0.2.2 → pixie_qa-0.5.0}/LICENSE +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.5.0}/pixie/favicon.png +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pixie-qa
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Automated quality assurance for AI applications
|
|
5
5
|
Project-URL: Homepage, https://github.com/yiouli/pixie-qa
|
|
6
6
|
Project-URL: Repository, https://github.com/yiouli/pixie-qa
|
|
@@ -44,9 +44,11 @@ Requires-Dist: openai>=2.29.0
|
|
|
44
44
|
Requires-Dist: openinference-instrumentation>=0.1.44
|
|
45
45
|
Requires-Dist: opentelemetry-api>=1.27.0
|
|
46
46
|
Requires-Dist: opentelemetry-sdk>=1.27.0
|
|
47
|
-
Requires-Dist: piccolo[sqlite]>=1.33.0
|
|
48
47
|
Requires-Dist: pydantic>=2.0
|
|
49
48
|
Requires-Dist: python-dotenv>=1.2.2
|
|
49
|
+
Requires-Dist: starlette>=1.0.0
|
|
50
|
+
Requires-Dist: uvicorn>=0.42.0
|
|
51
|
+
Requires-Dist: watchfiles>=1.1.1
|
|
50
52
|
Provides-Extra: all
|
|
51
53
|
Requires-Dist: openinference-instrumentation-anthropic; extra == 'all'
|
|
52
54
|
Requires-Dist: openinference-instrumentation-dspy; extra == 'all'
|
|
@@ -67,18 +69,19 @@ Description-Content-Type: text/markdown
|
|
|
67
69
|
|
|
68
70
|
# pixie-qa
|
|
69
71
|
|
|
70
|
-
An agent skill that
|
|
72
|
+
An agent skill that makes coding agents the QA engineer for LLM applications.
|
|
71
73
|
|
|
72
74
|
## What the Skill Does
|
|
73
75
|
|
|
74
76
|
The `qa-eval` skill guides your coding agent through the full eval-based QA loop for LLM applications:
|
|
75
77
|
|
|
76
78
|
1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
|
|
77
|
-
2. **Instrument it** —
|
|
78
|
-
3. **Build a dataset** —
|
|
79
|
-
4. **Write eval tests** — generate `test_*.py` files with
|
|
79
|
+
2. **Instrument it** — use `wrap()` for data-object tracing and OpenInference auto-instrumentation for LLM span capture
|
|
80
|
+
3. **Build a dataset** — create JSON datasets of representative inputs and expected outputs
|
|
81
|
+
4. **Write eval tests** — generate `test_*.py` files with appropriate evaluators
|
|
80
82
|
5. **Run the tests** — `pixie test` to run all evals and report per-case scores
|
|
81
|
-
6. **
|
|
83
|
+
6. **Analyse results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
|
|
84
|
+
7. **Investigate failures** — diagnose failures, fix, repeat
|
|
82
85
|
|
|
83
86
|
## Getting Started
|
|
84
87
|
|
|
@@ -100,4 +103,34 @@ Your coding agent will read your code, instrument it, build a dataset from a few
|
|
|
100
103
|
|
|
101
104
|
## Python Package
|
|
102
105
|
|
|
103
|
-
The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project.
|
|
106
|
+
The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. API docs are auto-generated by `pdoc3` into [docs/pixie/index.md](docs/pixie/index.md) via pre-commit. The markdown renderer uses [scripts/pdoc_templates/text.mako](scripts/pdoc_templates/text.mako) so async functions and methods are explicitly shown as `async def` in signatures.
|
|
107
|
+
|
|
108
|
+
Install hooks once per clone:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
uv run pre-commit install
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Web UI
|
|
115
|
+
|
|
116
|
+
View all eval artifacts (results, markdown docs, datasets, and legacy scorecards) in a live-updating local web UI:
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
pixie start # initializes pixie_qa/ (if needed) and opens http://localhost:7118
|
|
120
|
+
pixie start my_dir # use a custom artifact root
|
|
121
|
+
pixie init # scaffolds pixie_qa/ without starting the server
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
The web UI provides tabbed navigation for results, scorecards (legacy), datasets, and markdown files. Changes to artifacts are pushed to the browser in real time via SSE.
|
|
125
|
+
|
|
126
|
+
The server writes a `server.lock` file to the artifact root directory on startup (containing the port number) and removes it on shutdown, allowing other processes to discover whether the server is already running.
|
|
127
|
+
|
|
128
|
+
## Configuration
|
|
129
|
+
|
|
130
|
+
Pixie reads configuration from environment variables and a local `.env` file through a single central config layer. Existing process env vars win over `.env` values.
|
|
131
|
+
|
|
132
|
+
Useful settings include:
|
|
133
|
+
|
|
134
|
+
- `PIXIE_ROOT` to move all generated artifacts under a different root directory
|
|
135
|
+
- `PIXIE_RATE_LIMIT_ENABLED=true` to enable evaluator throttling for `pixie test`
|
|
136
|
+
- `PIXIE_RATE_LIMIT_RPS`, `PIXIE_RATE_LIMIT_RPM`, `PIXIE_RATE_LIMIT_TPS`, and `PIXIE_RATE_LIMIT_TPM` to tune request and token throughput for LLM-as-judge evaluators
|
pixie_qa-0.5.0/README.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# pixie-qa
|
|
2
|
+
|
|
3
|
+
An agent skill that makes coding agents the QA engineer for LLM applications.
|
|
4
|
+
|
|
5
|
+
## What the Skill Does
|
|
6
|
+
|
|
7
|
+
The `qa-eval` skill guides your coding agent through the full eval-based QA loop for LLM applications:
|
|
8
|
+
|
|
9
|
+
1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
|
|
10
|
+
2. **Instrument it** — use `wrap()` for data-object tracing and OpenInference auto-instrumentation for LLM span capture
|
|
11
|
+
3. **Build a dataset** — create JSON datasets of representative inputs and expected outputs
|
|
12
|
+
4. **Write eval tests** — generate `test_*.py` files with appropriate evaluators
|
|
13
|
+
5. **Run the tests** — `pixie test` to run all evals and report per-case scores
|
|
14
|
+
6. **Analyse results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
|
|
15
|
+
7. **Investigate failures** — diagnose failures, fix, repeat
|
|
16
|
+
|
|
17
|
+
## Getting Started
|
|
18
|
+
|
|
19
|
+
### 1. Add the skill to your coding agent
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
npx skills add yiouli/pixie-qa
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
The accompanying Python package will be installed automatically by the skill when it is used.
|
|
26
|
+
|
|
27
|
+
### 2. Ask coding agent to set up evals
|
|
28
|
+
|
|
29
|
+
When developing a Python-based AI project, open a conversation and say something like:
|
|
30
|
+
|
|
31
|
+
> "setup QA for my agent"
|
|
32
|
+
|
|
33
|
+
Your coding agent will read your code, instrument it, build a dataset from a few real runs, write and run eval-based tests, investigate failures and fix.
|
|
34
|
+
|
|
35
|
+
## Python Package
|
|
36
|
+
|
|
37
|
+
The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. API docs are auto-generated by `pdoc3` into [docs/pixie/index.md](docs/pixie/index.md) via pre-commit. The markdown renderer uses [scripts/pdoc_templates/text.mako](scripts/pdoc_templates/text.mako) so async functions and methods are explicitly shown as `async def` in signatures.
|
|
38
|
+
|
|
39
|
+
Install hooks once per clone:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
uv run pre-commit install
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Web UI
|
|
46
|
+
|
|
47
|
+
View all eval artifacts (results, markdown docs, datasets, and legacy scorecards) in a live-updating local web UI:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pixie start # initializes pixie_qa/ (if needed) and opens http://localhost:7118
|
|
51
|
+
pixie start my_dir # use a custom artifact root
|
|
52
|
+
pixie init # scaffolds pixie_qa/ without starting the server
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
The web UI provides tabbed navigation for results, scorecards (legacy), datasets, and markdown files. Changes to artifacts are pushed to the browser in real time via SSE.
|
|
56
|
+
|
|
57
|
+
The server writes a `server.lock` file to the artifact root directory on startup (containing the port number) and removes it on shutdown, allowing other processes to discover whether the server is already running.
|
|
58
|
+
|
|
59
|
+
## Configuration
|
|
60
|
+
|
|
61
|
+
Pixie reads configuration from environment variables and a local `.env` file through a single central config layer. Existing process env vars win over `.env` values.
|
|
62
|
+
|
|
63
|
+
Useful settings include:
|
|
64
|
+
|
|
65
|
+
- `PIXIE_ROOT` to move all generated artifacts under a different root directory
|
|
66
|
+
- `PIXIE_RATE_LIMIT_ENABLED=true` to enable evaluator throttling for `pixie test`
|
|
67
|
+
- `PIXIE_RATE_LIMIT_RPS`, `PIXIE_RATE_LIMIT_RPM`, `PIXIE_RATE_LIMIT_TPS`, and `PIXIE_RATE_LIMIT_TPM` to tune request and token throughput for LLM-as-judge evaluators
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""pixie — automated quality assurance for AI applications.
|
|
2
|
+
|
|
3
|
+
Re-exports the full public API so users can ``from pixie import ...``
|
|
4
|
+
for every commonly used symbol without needing submodule paths.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from pixie.eval.evaluable import Evaluable, TestCase
|
|
8
|
+
from pixie.eval.evaluation import Evaluation, Evaluator, evaluate
|
|
9
|
+
from pixie.eval.llm_evaluator import create_llm_evaluator
|
|
10
|
+
from pixie.eval.scorers import (
|
|
11
|
+
AnswerCorrectness,
|
|
12
|
+
AnswerRelevancy,
|
|
13
|
+
AutoevalsAdapter,
|
|
14
|
+
Battle,
|
|
15
|
+
ClosedQA,
|
|
16
|
+
ContextRelevancy,
|
|
17
|
+
EmbeddingSimilarity,
|
|
18
|
+
ExactMatch,
|
|
19
|
+
Factuality,
|
|
20
|
+
Faithfulness,
|
|
21
|
+
Humor,
|
|
22
|
+
JSONDiff,
|
|
23
|
+
LevenshteinMatch,
|
|
24
|
+
ListContains,
|
|
25
|
+
Moderation,
|
|
26
|
+
NumericDiff,
|
|
27
|
+
Possible,
|
|
28
|
+
Security,
|
|
29
|
+
Sql,
|
|
30
|
+
Summary,
|
|
31
|
+
Translation,
|
|
32
|
+
ValidJSON,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
# -- Harness ------------------------------------------------------------------
|
|
36
|
+
from pixie.harness.runnable import Runnable
|
|
37
|
+
|
|
38
|
+
# -- Instrumentation ----------------------------------------------------------
|
|
39
|
+
from pixie.instrumentation.llm_tracing import (
|
|
40
|
+
add_handler,
|
|
41
|
+
enable_llm_tracing,
|
|
42
|
+
flush,
|
|
43
|
+
remove_handler,
|
|
44
|
+
)
|
|
45
|
+
from pixie.instrumentation.wrap import (
|
|
46
|
+
WrappedData,
|
|
47
|
+
wrap,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
__all__ = [
|
|
51
|
+
# Instrumentation
|
|
52
|
+
"WrappedData",
|
|
53
|
+
"flush",
|
|
54
|
+
"enable_llm_tracing",
|
|
55
|
+
"add_handler",
|
|
56
|
+
"remove_handler",
|
|
57
|
+
"wrap",
|
|
58
|
+
# Harness
|
|
59
|
+
"Runnable",
|
|
60
|
+
# Eval data models
|
|
61
|
+
"Evaluable",
|
|
62
|
+
"TestCase",
|
|
63
|
+
"Evaluation",
|
|
64
|
+
"Evaluator",
|
|
65
|
+
"evaluate",
|
|
66
|
+
"create_llm_evaluator",
|
|
67
|
+
# Pre-made evaluators (autoevals adapters)
|
|
68
|
+
"AnswerCorrectness",
|
|
69
|
+
"AnswerRelevancy",
|
|
70
|
+
"AutoevalsAdapter",
|
|
71
|
+
"Battle",
|
|
72
|
+
"ClosedQA",
|
|
73
|
+
"ContextRelevancy",
|
|
74
|
+
"EmbeddingSimilarity",
|
|
75
|
+
"ExactMatch",
|
|
76
|
+
"Factuality",
|
|
77
|
+
"Faithfulness",
|
|
78
|
+
"Humor",
|
|
79
|
+
"JSONDiff",
|
|
80
|
+
"LevenshteinMatch",
|
|
81
|
+
"ListContains",
|
|
82
|
+
"Moderation",
|
|
83
|
+
"NumericDiff",
|
|
84
|
+
"Possible",
|
|
85
|
+
"Security",
|
|
86
|
+
"Sql",
|
|
87
|
+
"Summary",
|
|
88
|
+
"Translation",
|
|
89
|
+
"ValidJSON",
|
|
90
|
+
]
|
|
@@ -0,0 +1,407 @@
|
|
|
1
|
+
{
|
|
2
|
+
"command_args": "pixie test tests/",
|
|
3
|
+
"timestamp": "2025-06-15 12:00:00 UTC",
|
|
4
|
+
"pixie_repo_url": "https://github.com/yiouli/pixie-qa",
|
|
5
|
+
"feedback_url": "https://feedback.gopixie.ai/feedback",
|
|
6
|
+
"brand_icon_url": "https://github.com/user-attachments/assets/76c18199-f00a-4fb3-a12f-ce6c173727af",
|
|
7
|
+
"test_records": [
|
|
8
|
+
{
|
|
9
|
+
"name": "test_customer_faq.py::test_faq_factuality",
|
|
10
|
+
"status": "passed",
|
|
11
|
+
"message": null,
|
|
12
|
+
"asserts": [
|
|
13
|
+
{
|
|
14
|
+
"evaluator_names": [
|
|
15
|
+
"MockFactuality"
|
|
16
|
+
],
|
|
17
|
+
"input_labels": [
|
|
18
|
+
"What is your return policy?",
|
|
19
|
+
"How do I track my order?",
|
|
20
|
+
"Do you offer international shipping?",
|
|
21
|
+
"What payment methods do you accept?",
|
|
22
|
+
"How can I contact support?"
|
|
23
|
+
],
|
|
24
|
+
"results": [
|
|
25
|
+
[
|
|
26
|
+
[
|
|
27
|
+
{
|
|
28
|
+
"score": 0.85,
|
|
29
|
+
"reasoning": "High string similarity between expected and actual output.",
|
|
30
|
+
"details": {}
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
[
|
|
34
|
+
{
|
|
35
|
+
"score": 0.72,
|
|
36
|
+
"reasoning": "Moderate string similarity.",
|
|
37
|
+
"details": {}
|
|
38
|
+
}
|
|
39
|
+
],
|
|
40
|
+
[
|
|
41
|
+
{
|
|
42
|
+
"score": 0.91,
|
|
43
|
+
"reasoning": "Very high similarity match.",
|
|
44
|
+
"details": {}
|
|
45
|
+
}
|
|
46
|
+
],
|
|
47
|
+
[
|
|
48
|
+
{
|
|
49
|
+
"score": 0.68,
|
|
50
|
+
"reasoning": "Reasonable similarity.",
|
|
51
|
+
"details": {}
|
|
52
|
+
}
|
|
53
|
+
],
|
|
54
|
+
[
|
|
55
|
+
{
|
|
56
|
+
"score": 0.77,
|
|
57
|
+
"reasoning": "Good similarity match.",
|
|
58
|
+
"details": {}
|
|
59
|
+
}
|
|
60
|
+
]
|
|
61
|
+
]
|
|
62
|
+
],
|
|
63
|
+
"passed": true,
|
|
64
|
+
"criteria_message": "Pass: 4/5 inputs passed (threshold 0.6, required 80%)",
|
|
65
|
+
"scoring_strategy": "Each evaluator score must be ≥ 0.6. At least 80% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
|
|
66
|
+
"evaluable_dicts": [
|
|
67
|
+
{
|
|
68
|
+
"input": "What is your return policy?",
|
|
69
|
+
"expected_output": "You can return items within 30 days of purchase for a full refund.",
|
|
70
|
+
"actual_output": "Items can be returned within 30 days for a full refund.",
|
|
71
|
+
"metadata": {}
|
|
72
|
+
},
|
|
73
|
+
{
|
|
74
|
+
"input": "How do I track my order?",
|
|
75
|
+
"expected_output": "You can track your order using the tracking link sent to your email.",
|
|
76
|
+
"actual_output": "Check the tracking link in your confirmation email to track your order.",
|
|
77
|
+
"metadata": {}
|
|
78
|
+
},
|
|
79
|
+
{
|
|
80
|
+
"input": "Do you offer international shipping?",
|
|
81
|
+
"expected_output": "Yes, we ship to over 50 countries worldwide.",
|
|
82
|
+
"actual_output": "Yes, we offer international shipping to over 50 countries.",
|
|
83
|
+
"metadata": {}
|
|
84
|
+
},
|
|
85
|
+
{
|
|
86
|
+
"input": "What payment methods do you accept?",
|
|
87
|
+
"expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
|
|
88
|
+
"actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
|
|
89
|
+
"metadata": {}
|
|
90
|
+
},
|
|
91
|
+
{
|
|
92
|
+
"input": "How can I contact support?",
|
|
93
|
+
"expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
|
|
94
|
+
"actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
|
|
95
|
+
"metadata": {}
|
|
96
|
+
}
|
|
97
|
+
]
|
|
98
|
+
}
|
|
99
|
+
]
|
|
100
|
+
},
|
|
101
|
+
{
|
|
102
|
+
"name": "test_customer_faq.py::test_faq_multi_evaluator",
|
|
103
|
+
"status": "failed",
|
|
104
|
+
"message": "AssertionError: 3/5 inputs failed on at least one evaluator",
|
|
105
|
+
"asserts": [
|
|
106
|
+
{
|
|
107
|
+
"evaluator_names": [
|
|
108
|
+
"MockFactuality",
|
|
109
|
+
"MockClosedQA"
|
|
110
|
+
],
|
|
111
|
+
"input_labels": [
|
|
112
|
+
"What is your return policy?",
|
|
113
|
+
"How do I track my order?",
|
|
114
|
+
"Do you offer international shipping?",
|
|
115
|
+
"What payment methods do you accept?",
|
|
116
|
+
"How can I contact support?"
|
|
117
|
+
],
|
|
118
|
+
"results": [
|
|
119
|
+
[
|
|
120
|
+
[
|
|
121
|
+
{
|
|
122
|
+
"score": 0.85,
|
|
123
|
+
"reasoning": "High similarity.",
|
|
124
|
+
"details": {}
|
|
125
|
+
},
|
|
126
|
+
{
|
|
127
|
+
"score": 0.3,
|
|
128
|
+
"reasoning": "Low keyword overlap.",
|
|
129
|
+
"details": {}
|
|
130
|
+
}
|
|
131
|
+
],
|
|
132
|
+
[
|
|
133
|
+
{
|
|
134
|
+
"score": 0.72,
|
|
135
|
+
"reasoning": "Moderate similarity.",
|
|
136
|
+
"details": {}
|
|
137
|
+
},
|
|
138
|
+
{
|
|
139
|
+
"score": 0.45,
|
|
140
|
+
"reasoning": "Below threshold keyword overlap.",
|
|
141
|
+
"details": {}
|
|
142
|
+
}
|
|
143
|
+
],
|
|
144
|
+
[
|
|
145
|
+
{
|
|
146
|
+
"score": 0.91,
|
|
147
|
+
"reasoning": "Very high similarity.",
|
|
148
|
+
"details": {}
|
|
149
|
+
},
|
|
150
|
+
{
|
|
151
|
+
"score": 0.6,
|
|
152
|
+
"reasoning": "Acceptable keyword overlap.",
|
|
153
|
+
"details": {}
|
|
154
|
+
}
|
|
155
|
+
],
|
|
156
|
+
[
|
|
157
|
+
{
|
|
158
|
+
"score": 0.68,
|
|
159
|
+
"reasoning": "Reasonable similarity.",
|
|
160
|
+
"details": {}
|
|
161
|
+
},
|
|
162
|
+
{
|
|
163
|
+
"score": 0.25,
|
|
164
|
+
"reasoning": "Poor keyword match.",
|
|
165
|
+
"details": {}
|
|
166
|
+
}
|
|
167
|
+
],
|
|
168
|
+
[
|
|
169
|
+
{
|
|
170
|
+
"score": 0.77,
|
|
171
|
+
"reasoning": "Good similarity.",
|
|
172
|
+
"details": {}
|
|
173
|
+
},
|
|
174
|
+
{
|
|
175
|
+
"score": 0.55,
|
|
176
|
+
"reasoning": "Marginal keyword overlap.",
|
|
177
|
+
"details": {}
|
|
178
|
+
}
|
|
179
|
+
]
|
|
180
|
+
]
|
|
181
|
+
],
|
|
182
|
+
"passed": false,
|
|
183
|
+
"criteria_message": "Fail: only 2/5 inputs passed on all evaluators (required 100%)",
|
|
184
|
+
"scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
|
|
185
|
+
"evaluable_dicts": [
|
|
186
|
+
{
|
|
187
|
+
"input": "What is your return policy?",
|
|
188
|
+
"expected_output": "You can return items within 30 days of purchase for a full refund.",
|
|
189
|
+
"actual_output": "Items can be returned within 30 days for a full refund.",
|
|
190
|
+
"metadata": {}
|
|
191
|
+
},
|
|
192
|
+
{
|
|
193
|
+
"input": "How do I track my order?",
|
|
194
|
+
"expected_output": "You can track your order using the tracking link sent to your email.",
|
|
195
|
+
"actual_output": "Check the tracking link in your confirmation email to track your order.",
|
|
196
|
+
"metadata": {}
|
|
197
|
+
},
|
|
198
|
+
{
|
|
199
|
+
"input": "Do you offer international shipping?",
|
|
200
|
+
"expected_output": "Yes, we ship to over 50 countries worldwide.",
|
|
201
|
+
"actual_output": "Yes, we offer international shipping to over 50 countries.",
|
|
202
|
+
"metadata": {}
|
|
203
|
+
},
|
|
204
|
+
{
|
|
205
|
+
"input": "What payment methods do you accept?",
|
|
206
|
+
"expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
|
|
207
|
+
"actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
|
|
208
|
+
"metadata": {}
|
|
209
|
+
},
|
|
210
|
+
{
|
|
211
|
+
"input": "How can I contact support?",
|
|
212
|
+
"expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
|
|
213
|
+
"actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
|
|
214
|
+
"metadata": {}
|
|
215
|
+
}
|
|
216
|
+
]
|
|
217
|
+
}
|
|
218
|
+
]
|
|
219
|
+
},
|
|
220
|
+
{
|
|
221
|
+
"name": "test_customer_faq.py::test_faq_no_hallucinations",
|
|
222
|
+
"status": "passed",
|
|
223
|
+
"message": null,
|
|
224
|
+
"asserts": [
|
|
225
|
+
{
|
|
226
|
+
"evaluator_names": [
|
|
227
|
+
"MockHallucination"
|
|
228
|
+
],
|
|
229
|
+
"input_labels": [
|
|
230
|
+
"What is your return policy?",
|
|
231
|
+
"How do I track my order?",
|
|
232
|
+
"Do you offer international shipping?",
|
|
233
|
+
"What payment methods do you accept?",
|
|
234
|
+
"How can I contact support?"
|
|
235
|
+
],
|
|
236
|
+
"results": [
|
|
237
|
+
[
|
|
238
|
+
[
|
|
239
|
+
{
|
|
240
|
+
"score": 0.95,
|
|
241
|
+
"reasoning": "No hallucination detected.",
|
|
242
|
+
"details": {}
|
|
243
|
+
}
|
|
244
|
+
],
|
|
245
|
+
[
|
|
246
|
+
{
|
|
247
|
+
"score": 0.95,
|
|
248
|
+
"reasoning": "No hallucination detected.",
|
|
249
|
+
"details": {}
|
|
250
|
+
}
|
|
251
|
+
],
|
|
252
|
+
[
|
|
253
|
+
{
|
|
254
|
+
"score": 0.95,
|
|
255
|
+
"reasoning": "No hallucination detected.",
|
|
256
|
+
"details": {}
|
|
257
|
+
}
|
|
258
|
+
],
|
|
259
|
+
[
|
|
260
|
+
{
|
|
261
|
+
"score": 0.95,
|
|
262
|
+
"reasoning": "No hallucination detected.",
|
|
263
|
+
"details": {}
|
|
264
|
+
}
|
|
265
|
+
],
|
|
266
|
+
[
|
|
267
|
+
{
|
|
268
|
+
"score": 0.95,
|
|
269
|
+
"reasoning": "No hallucination detected.",
|
|
270
|
+
"details": {}
|
|
271
|
+
}
|
|
272
|
+
]
|
|
273
|
+
]
|
|
274
|
+
],
|
|
275
|
+
"passed": true,
|
|
276
|
+
"criteria_message": "Pass: 5/5 inputs passed (threshold 0.5, required 100%)",
|
|
277
|
+
"scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
|
|
278
|
+
"evaluable_dicts": [
|
|
279
|
+
{
|
|
280
|
+
"input": "What is your return policy?",
|
|
281
|
+
"expected_output": "You can return items within 30 days of purchase for a full refund.",
|
|
282
|
+
"actual_output": "Items can be returned within 30 days for a full refund.",
|
|
283
|
+
"metadata": {}
|
|
284
|
+
},
|
|
285
|
+
{
|
|
286
|
+
"input": "How do I track my order?",
|
|
287
|
+
"expected_output": "You can track your order using the tracking link sent to your email.",
|
|
288
|
+
"actual_output": "Check the tracking link in your confirmation email to track your order.",
|
|
289
|
+
"metadata": {}
|
|
290
|
+
},
|
|
291
|
+
{
|
|
292
|
+
"input": "Do you offer international shipping?",
|
|
293
|
+
"expected_output": "Yes, we ship to over 50 countries worldwide.",
|
|
294
|
+
"actual_output": "Yes, we offer international shipping to over 50 countries.",
|
|
295
|
+
"metadata": {}
|
|
296
|
+
},
|
|
297
|
+
{
|
|
298
|
+
"input": "What payment methods do you accept?",
|
|
299
|
+
"expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
|
|
300
|
+
"actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
|
|
301
|
+
"metadata": {}
|
|
302
|
+
},
|
|
303
|
+
{
|
|
304
|
+
"input": "How can I contact support?",
|
|
305
|
+
"expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
|
|
306
|
+
"actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
|
|
307
|
+
"metadata": {}
|
|
308
|
+
}
|
|
309
|
+
]
|
|
310
|
+
}
|
|
311
|
+
]
|
|
312
|
+
},
|
|
313
|
+
{
|
|
314
|
+
"name": "test_customer_faq.py::test_faq_tone_check",
|
|
315
|
+
"status": "failed",
|
|
316
|
+
"message": "AssertionError: all 5 inputs failed",
|
|
317
|
+
"asserts": [
|
|
318
|
+
{
|
|
319
|
+
"evaluator_names": [
|
|
320
|
+
"MockStrictTone"
|
|
321
|
+
],
|
|
322
|
+
"input_labels": [
|
|
323
|
+
"What is your return policy?",
|
|
324
|
+
"How do I track my order?",
|
|
325
|
+
"Do you offer international shipping?",
|
|
326
|
+
"What payment methods do you accept?",
|
|
327
|
+
"How can I contact support?"
|
|
328
|
+
],
|
|
329
|
+
"results": [
|
|
330
|
+
[
|
|
331
|
+
[
|
|
332
|
+
{
|
|
333
|
+
"score": 0.2,
|
|
334
|
+
"reasoning": "Tone does not meet strict requirements.",
|
|
335
|
+
"details": {}
|
|
336
|
+
}
|
|
337
|
+
],
|
|
338
|
+
[
|
|
339
|
+
{
|
|
340
|
+
"score": 0.2,
|
|
341
|
+
"reasoning": "Tone does not meet strict requirements.",
|
|
342
|
+
"details": {}
|
|
343
|
+
}
|
|
344
|
+
],
|
|
345
|
+
[
|
|
346
|
+
{
|
|
347
|
+
"score": 0.2,
|
|
348
|
+
"reasoning": "Tone does not meet strict requirements.",
|
|
349
|
+
"details": {}
|
|
350
|
+
}
|
|
351
|
+
],
|
|
352
|
+
[
|
|
353
|
+
{
|
|
354
|
+
"score": 0.2,
|
|
355
|
+
"reasoning": "Tone does not meet strict requirements.",
|
|
356
|
+
"details": {}
|
|
357
|
+
}
|
|
358
|
+
],
|
|
359
|
+
[
|
|
360
|
+
{
|
|
361
|
+
"score": 0.2,
|
|
362
|
+
"reasoning": "Tone does not meet strict requirements.",
|
|
363
|
+
"details": {}
|
|
364
|
+
}
|
|
365
|
+
]
|
|
366
|
+
]
|
|
367
|
+
],
|
|
368
|
+
"passed": false,
|
|
369
|
+
"criteria_message": "Fail: 0/5 inputs passed (threshold 0.5, required 100%)",
|
|
370
|
+
"scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
|
|
371
|
+
"evaluable_dicts": [
|
|
372
|
+
{
|
|
373
|
+
"input": "What is your return policy?",
|
|
374
|
+
"expected_output": "You can return items within 30 days of purchase for a full refund.",
|
|
375
|
+
"actual_output": "Items can be returned within 30 days for a full refund.",
|
|
376
|
+
"metadata": {}
|
|
377
|
+
},
|
|
378
|
+
{
|
|
379
|
+
"input": "How do I track my order?",
|
|
380
|
+
"expected_output": "You can track your order using the tracking link sent to your email.",
|
|
381
|
+
"actual_output": "Check the tracking link in your confirmation email to track your order.",
|
|
382
|
+
"metadata": {}
|
|
383
|
+
},
|
|
384
|
+
{
|
|
385
|
+
"input": "Do you offer international shipping?",
|
|
386
|
+
"expected_output": "Yes, we ship to over 50 countries worldwide.",
|
|
387
|
+
"actual_output": "Yes, we offer international shipping to over 50 countries.",
|
|
388
|
+
"metadata": {}
|
|
389
|
+
},
|
|
390
|
+
{
|
|
391
|
+
"input": "What payment methods do you accept?",
|
|
392
|
+
"expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
|
|
393
|
+
"actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
|
|
394
|
+
"metadata": {}
|
|
395
|
+
},
|
|
396
|
+
{
|
|
397
|
+
"input": "How can I contact support?",
|
|
398
|
+
"expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
|
|
399
|
+
"actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
|
|
400
|
+
"metadata": {}
|
|
401
|
+
}
|
|
402
|
+
]
|
|
403
|
+
}
|
|
404
|
+
]
|
|
405
|
+
}
|
|
406
|
+
]
|
|
407
|
+
}
|