pixie-qa 0.2.2__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixie_qa-0.4.0/.gitignore +8 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/PKG-INFO +32 -3
- pixie_qa-0.4.0/README.md +62 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/__init__.py +43 -40
- pixie_qa-0.4.0/pixie/assets/mock-data.json +407 -0
- pixie_qa-0.4.0/pixie/assets/webui.html +64 -0
- pixie_qa-0.4.0/pixie/cli/analyze_command.py +156 -0
- pixie_qa-0.4.0/pixie/cli/dag_command.py +75 -0
- pixie_qa-0.4.0/pixie/cli/init_command.py +55 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/cli/main.py +161 -12
- pixie_qa-0.4.0/pixie/cli/start_command.py +43 -0
- pixie_qa-0.4.0/pixie/cli/test_command.py +257 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/cli/trace_command.py +108 -0
- pixie_qa-0.4.0/pixie/config.py +130 -0
- pixie_qa-0.4.0/pixie/dag/__init__.py +400 -0
- pixie_qa-0.4.0/pixie/dag/trace_check.py +183 -0
- pixie_qa-0.4.0/pixie/evals/__init__.py +184 -0
- pixie_qa-0.4.0/pixie/evals/criteria.py +61 -0
- pixie_qa-0.4.0/pixie/evals/dataset_runner.py +495 -0
- pixie_qa-0.4.0/pixie/evals/eval_utils.py +334 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/evals/evaluation.py +10 -0
- pixie_qa-0.4.0/pixie/evals/rate_limiter.py +140 -0
- pixie_qa-0.4.0/pixie/evals/scorecard.py +252 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/evals/scorers.py +252 -110
- pixie_qa-0.4.0/pixie/evals/test_result.py +239 -0
- pixie_qa-0.4.0/pixie/instrumentation/__init__.py +80 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/instrumentors.py +18 -2
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/evaluable.py +2 -0
- pixie_qa-0.4.0/pixie/web/__init__.py +1 -0
- pixie_qa-0.4.0/pixie/web/app.py +255 -0
- pixie_qa-0.4.0/pixie/web/server.py +369 -0
- pixie_qa-0.4.0/pixie/web/watcher.py +99 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pyproject.toml +23 -2
- pixie_qa-0.2.2/.github/copilot-instructions.md +0 -632
- pixie_qa-0.2.2/.github/workflows/publish.yml +0 -80
- pixie_qa-0.2.2/.gitignore +0 -4
- pixie_qa-0.2.2/README.md +0 -36
- pixie_qa-0.2.2/changelogs/async-handler-processing.md +0 -96
- pixie_qa-0.2.2/changelogs/autoevals-adapters.md +0 -39
- pixie_qa-0.2.2/changelogs/cli-dataset-commands.md +0 -37
- pixie_qa-0.2.2/changelogs/dataset-management.md +0 -91
- pixie_qa-0.2.2/changelogs/deep-research-demo.md +0 -43
- pixie_qa-0.2.2/changelogs/eval-harness.md +0 -128
- pixie_qa-0.2.2/changelogs/expected-output-in-evals.md +0 -42
- pixie_qa-0.2.2/changelogs/instrumentation-module-implementation.md +0 -55
- pixie_qa-0.2.2/changelogs/loud-failure-mode.md +0 -58
- pixie_qa-0.2.2/changelogs/manual-instrumentation-usability.md +0 -56
- pixie_qa-0.2.2/changelogs/observation-store-implementation.md +0 -53
- pixie_qa-0.2.2/changelogs/observe-sensitive-field-stripping.md +0 -22
- pixie_qa-0.2.2/changelogs/pixie-directory-and-skill-improvements.md +0 -63
- pixie_qa-0.2.2/changelogs/pixie-test-e2e-suite.md +0 -69
- pixie_qa-0.2.2/changelogs/root-package-exports-and-trace-id.md +0 -58
- pixie_qa-0.2.2/changelogs/scorecard-branding-and-skill-version-check.md +0 -41
- pixie_qa-0.2.2/changelogs/scorecard-eval-detail-dialog.md +0 -28
- pixie_qa-0.2.2/changelogs/skill-v2-and-rootdir-discovery.md +0 -76
- pixie_qa-0.2.2/changelogs/test-scorecard.md +0 -54
- pixie_qa-0.2.2/changelogs/usability-utils.md +0 -60
- pixie_qa-0.2.2/docs/package.md +0 -233
- pixie_qa-0.2.2/pixie/cli/test_command.py +0 -120
- pixie_qa-0.2.2/pixie/config.py +0 -54
- pixie_qa-0.2.2/pixie/evals/__init__.py +0 -121
- pixie_qa-0.2.2/pixie/evals/criteria.py +0 -77
- pixie_qa-0.2.2/pixie/evals/eval_utils.py +0 -358
- pixie_qa-0.2.2/pixie/evals/runner.py +0 -278
- pixie_qa-0.2.2/pixie/evals/scorecard.py +0 -916
- pixie_qa-0.2.2/pixie/instrumentation/__init__.py +0 -49
- pixie_qa-0.2.2/skills/eval-driven-dev/SKILL.md +0 -378
- pixie_qa-0.2.2/skills/eval-driven-dev/references/dataset-generation.md +0 -235
- pixie_qa-0.2.2/skills/eval-driven-dev/references/eval-tests.md +0 -241
- pixie_qa-0.2.2/skills/eval-driven-dev/references/instrumentation.md +0 -174
- pixie_qa-0.2.2/skills/eval-driven-dev/references/investigation.md +0 -146
- pixie_qa-0.2.2/skills/eval-driven-dev/references/pixie-api.md +0 -257
- pixie_qa-0.2.2/skills/eval-driven-dev/references/run-harness-patterns.md +0 -281
- pixie_qa-0.2.2/skills/eval-driven-dev/references/understanding-app.md +0 -201
- pixie_qa-0.2.2/specs/agent-skill-1.md +0 -25
- pixie_qa-0.2.2/specs/agent-skill.md +0 -71
- pixie_qa-0.2.2/specs/autoevals-adapters.md +0 -301
- pixie_qa-0.2.2/specs/dataset-management.md +0 -727
- pixie_qa-0.2.2/specs/evals-harness.md +0 -649
- pixie_qa-0.2.2/specs/expected-output-in-evals.md +0 -141
- pixie_qa-0.2.2/specs/instrumentation.md +0 -726
- pixie_qa-0.2.2/specs/manual-instrumentation-usability.md +0 -767
- pixie_qa-0.2.2/specs/storage.md +0 -473
- pixie_qa-0.2.2/specs/usability-utils.md +0 -327
- pixie_qa-0.2.2/tests/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/cli/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/cli/conftest.py +0 -15
- pixie_qa-0.2.2/tests/pixie/cli/e2e_cases.json +0 -183
- pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/conftest.py +0 -9
- pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json +0 -45
- pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/mock_evaluators.py +0 -156
- pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/test_customer_faq.py +0 -106
- pixie_qa-0.2.2/tests/pixie/cli/test_dataset_command.py +0 -412
- pixie_qa-0.2.2/tests/pixie/cli/test_e2e_pixie_test.py +0 -343
- pixie_qa-0.2.2/tests/pixie/cli/test_main.py +0 -261
- pixie_qa-0.2.2/tests/pixie/cli/test_trace_command.py +0 -324
- pixie_qa-0.2.2/tests/pixie/dataset/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/dataset/test_models.py +0 -64
- pixie_qa-0.2.2/tests/pixie/dataset/test_store.py +0 -222
- pixie_qa-0.2.2/tests/pixie/evals/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/evals/test_criteria.py +0 -116
- pixie_qa-0.2.2/tests/pixie/evals/test_eval_utils.py +0 -666
- pixie_qa-0.2.2/tests/pixie/evals/test_evaluation.py +0 -186
- pixie_qa-0.2.2/tests/pixie/evals/test_llm_evaluator.py +0 -235
- pixie_qa-0.2.2/tests/pixie/evals/test_runner.py +0 -452
- pixie_qa-0.2.2/tests/pixie/evals/test_scorecard.py +0 -487
- pixie_qa-0.2.2/tests/pixie/evals/test_scorers.py +0 -558
- pixie_qa-0.2.2/tests/pixie/evals/test_trace_capture.py +0 -205
- pixie_qa-0.2.2/tests/pixie/evals/test_trace_helpers.py +0 -154
- pixie_qa-0.2.2/tests/pixie/instrumentation/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/instrumentation/conftest.py +0 -35
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_context.py +0 -157
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_handler.py +0 -192
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_integration.py +0 -208
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_observation.py +0 -196
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_processor.py +0 -560
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_queue.py +0 -223
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_spans.py +0 -254
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_storage_handler.py +0 -108
- pixie_qa-0.2.2/tests/pixie/observation_store/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/observation_store/conftest.py +0 -231
- pixie_qa-0.2.2/tests/pixie/observation_store/test_evaluable.py +0 -191
- pixie_qa-0.2.2/tests/pixie/observation_store/test_serialization.py +0 -156
- pixie_qa-0.2.2/tests/pixie/observation_store/test_store.py +0 -289
- pixie_qa-0.2.2/tests/pixie/observation_store/test_tree.py +0 -248
- pixie_qa-0.2.2/tests/pixie/test_config.py +0 -73
- pixie_qa-0.2.2/tests/pixie/test_init.py +0 -157
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/LICENSE +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/cli/__init__.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/cli/dataset_command.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/dataset/__init__.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/dataset/models.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/dataset/store.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/evals/llm_evaluator.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/evals/trace_capture.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/evals/trace_helpers.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/favicon.png +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/context.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/handler.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/handlers.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/observation.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/processor.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/queue.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/spans.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/__init__.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/piccolo_conf.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/piccolo_migrations/__init__.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/serialization.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/store.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/tables.py +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/tree.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pixie-qa
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Automated quality assurance for AI applications
|
|
5
5
|
Project-URL: Homepage, https://github.com/yiouli/pixie-qa
|
|
6
6
|
Project-URL: Repository, https://github.com/yiouli/pixie-qa
|
|
@@ -47,6 +47,9 @@ Requires-Dist: opentelemetry-sdk>=1.27.0
|
|
|
47
47
|
Requires-Dist: piccolo[sqlite]>=1.33.0
|
|
48
48
|
Requires-Dist: pydantic>=2.0
|
|
49
49
|
Requires-Dist: python-dotenv>=1.2.2
|
|
50
|
+
Requires-Dist: starlette>=1.0.0
|
|
51
|
+
Requires-Dist: uvicorn>=0.42.0
|
|
52
|
+
Requires-Dist: watchfiles>=1.1.1
|
|
50
53
|
Provides-Extra: all
|
|
51
54
|
Requires-Dist: openinference-instrumentation-anthropic; extra == 'all'
|
|
52
55
|
Requires-Dist: openinference-instrumentation-dspy; extra == 'all'
|
|
@@ -77,8 +80,10 @@ The `qa-eval` skill guides your coding agent through the full eval-based QA loop
|
|
|
77
80
|
2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
|
|
78
81
|
3. **Build a dataset** — save representative traces as test cases with `pixie dataset save`
|
|
79
82
|
4. **Write eval tests** — generate `test_*.py` files with `assert_dataset_pass` and appropriate evaluators
|
|
80
|
-
5. **
|
|
81
|
-
6. **
|
|
83
|
+
5. **Validate datasets** — `pixie dataset validate [dir_or_dataset_path]` to catch schema/config errors early
|
|
84
|
+
6. **Run the tests** — `pixie test` to run all evals and report per-case scores
|
|
85
|
+
7. **Analyze results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
|
|
86
|
+
8. **Investigate failures** — look up the stored trace for each failure, diagnose, fix, repeat
|
|
82
87
|
|
|
83
88
|
## Getting Started
|
|
84
89
|
|
|
@@ -101,3 +106,27 @@ Your coding agent will read your code, instrument it, build a dataset from a few
|
|
|
101
106
|
## Python Package
|
|
102
107
|
|
|
103
108
|
The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. For the package API and CLI reference, see [docs/package.md](docs/package.md).
|
|
109
|
+
|
|
110
|
+
## Web UI
|
|
111
|
+
|
|
112
|
+
View all eval artifacts (results, markdown docs, datasets, and legacy scorecards) in a live-updating local web UI:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
pixie start # initializes pixie_qa/ (if needed) and opens http://localhost:7118
|
|
116
|
+
pixie start my_dir # use a custom artifact root
|
|
117
|
+
pixie init # scaffolds pixie_qa/ without starting the server
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
The web UI provides tabbed navigation for results, scorecards (legacy), datasets, and markdown files. Changes to artifacts are pushed to the browser in real time via SSE.
|
|
121
|
+
|
|
122
|
+
The server writes a `server.lock` file to the artifact root directory on startup (containing the port number) and removes it on shutdown, allowing other processes to discover whether the server is already running.
|
|
123
|
+
|
|
124
|
+
## Configuration
|
|
125
|
+
|
|
126
|
+
Pixie reads configuration from environment variables and a local `.env` file through a single central config layer. Existing process env vars win over `.env` values.
|
|
127
|
+
|
|
128
|
+
Useful settings include:
|
|
129
|
+
|
|
130
|
+
- `PIXIE_ROOT` to move all generated artifacts under a different root directory
|
|
131
|
+
- `PIXIE_RATE_LIMIT_ENABLED=true` to enable evaluator throttling for `pixie test`
|
|
132
|
+
- `PIXIE_RATE_LIMIT_RPS`, `PIXIE_RATE_LIMIT_RPM`, `PIXIE_RATE_LIMIT_TPS`, and `PIXIE_RATE_LIMIT_TPM` to tune request and token throughput for LLM-as-judge evaluators
|
pixie_qa-0.4.0/README.md
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# pixie-qa
|
|
2
|
+
|
|
3
|
+
An agent skill that makes your coding agent the QA engineer for LLM applications.
|
|
4
|
+
|
|
5
|
+
## What the Skill Does
|
|
6
|
+
|
|
7
|
+
The `qa-eval` skill guides your coding agent through the full eval-based QA loop for LLM applications:
|
|
8
|
+
|
|
9
|
+
1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
|
|
10
|
+
2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
|
|
11
|
+
3. **Build a dataset** — save representative traces as test cases with `pixie dataset save`
|
|
12
|
+
4. **Write eval tests** — generate `test_*.py` files with `assert_dataset_pass` and appropriate evaluators
|
|
13
|
+
5. **Validate datasets** — `pixie dataset validate [dir_or_dataset_path]` to catch schema/config errors early
|
|
14
|
+
6. **Run the tests** — `pixie test` to run all evals and report per-case scores
|
|
15
|
+
7. **Analyze results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
|
|
16
|
+
8. **Investigate failures** — look up the stored trace for each failure, diagnose, fix, repeat
|
|
17
|
+
|
|
18
|
+
## Getting Started
|
|
19
|
+
|
|
20
|
+
### 1. Add the skill to your coding agent
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
npx skills add yiouli/pixie-qa
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
The accompanying Python package will be installed automatically by the skill when it's used.
|
|
27
|
+
|
|
28
|
+
### 2. Ask coding agent to set up evals
|
|
29
|
+
|
|
30
|
+
When developing a Python-based AI project, open a conversation and say something like:
|
|
31
|
+
|
|
32
|
+
> "setup QA for my agent"
|
|
33
|
+
|
|
34
|
+
Your coding agent will read your code, instrument it, build a dataset from a few real runs, write and run eval-based tests, then investigate failures and fix them.
|
|
35
|
+
|
|
36
|
+
## Python Package
|
|
37
|
+
|
|
38
|
+
The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. For the package API and CLI reference, see [docs/package.md](docs/package.md).
|
|
39
|
+
|
|
40
|
+
## Web UI
|
|
41
|
+
|
|
42
|
+
View all eval artifacts (results, markdown docs, datasets, and legacy scorecards) in a live-updating local web UI:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pixie start # initializes pixie_qa/ (if needed) and opens http://localhost:7118
|
|
46
|
+
pixie start my_dir # use a custom artifact root
|
|
47
|
+
pixie init # scaffolds pixie_qa/ without starting the server
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
The web UI provides tabbed navigation for results, scorecards (legacy), datasets, and markdown files. Changes to artifacts are pushed to the browser in real time via SSE.
|
|
51
|
+
|
|
52
|
+
The server writes a `server.lock` file to the artifact root directory on startup (containing the port number) and removes it on shutdown, allowing other processes to discover whether the server is already running.
|
|
53
|
+
|
|
54
|
+
## Configuration
|
|
55
|
+
|
|
56
|
+
Pixie reads configuration from environment variables and a local `.env` file through a single central config layer. Existing process env vars win over `.env` values.
|
|
57
|
+
|
|
58
|
+
Useful settings include:
|
|
59
|
+
|
|
60
|
+
- `PIXIE_ROOT` to move all generated artifacts under a different root directory
|
|
61
|
+
- `PIXIE_RATE_LIMIT_ENABLED=true` to enable evaluator throttling for `pixie test`
|
|
62
|
+
- `PIXIE_RATE_LIMIT_RPS`, `PIXIE_RATE_LIMIT_RPM`, `PIXIE_RATE_LIMIT_TPS`, and `PIXIE_RATE_LIMIT_TPM` to tune request and token throughput for LLM-as-judge evaluators
|
|
@@ -18,29 +18,30 @@ from pixie.evals.eval_utils import (
|
|
|
18
18
|
)
|
|
19
19
|
from pixie.evals.evaluation import Evaluation, Evaluator, evaluate
|
|
20
20
|
from pixie.evals.llm_evaluator import create_llm_evaluator
|
|
21
|
+
from pixie.evals.rate_limiter import RateLimitConfig, configure_rate_limits
|
|
21
22
|
from pixie.evals.scorers import (
|
|
22
|
-
|
|
23
|
-
|
|
23
|
+
AnswerCorrectness,
|
|
24
|
+
AnswerRelevancy,
|
|
24
25
|
AutoevalsAdapter,
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
26
|
+
Battle,
|
|
27
|
+
ClosedQA,
|
|
28
|
+
ContextRelevancy,
|
|
29
|
+
EmbeddingSimilarity,
|
|
30
|
+
ExactMatch,
|
|
31
|
+
Factuality,
|
|
32
|
+
Faithfulness,
|
|
33
|
+
Humor,
|
|
34
|
+
JSONDiff,
|
|
34
35
|
LevenshteinMatch,
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
36
|
+
ListContains,
|
|
37
|
+
Moderation,
|
|
38
|
+
NumericDiff,
|
|
39
|
+
Possible,
|
|
40
|
+
Security,
|
|
41
|
+
Sql,
|
|
42
|
+
Summary,
|
|
43
|
+
Translation,
|
|
44
|
+
ValidJSON,
|
|
44
45
|
)
|
|
45
46
|
from pixie.evals.trace_capture import MemoryTraceHandler, capture_traces
|
|
46
47
|
from pixie.evals.trace_helpers import last_llm_call, root
|
|
@@ -67,36 +68,38 @@ __all__ = [
|
|
|
67
68
|
"remove_handler",
|
|
68
69
|
"start_observation",
|
|
69
70
|
# Evals
|
|
70
|
-
"
|
|
71
|
-
"
|
|
71
|
+
"AnswerCorrectness",
|
|
72
|
+
"AnswerRelevancy",
|
|
72
73
|
"AutoevalsAdapter",
|
|
73
|
-
"
|
|
74
|
-
"
|
|
75
|
-
"
|
|
76
|
-
"
|
|
74
|
+
"Battle",
|
|
75
|
+
"ClosedQA",
|
|
76
|
+
"ContextRelevancy",
|
|
77
|
+
"EmbeddingSimilarity",
|
|
77
78
|
"EvalAssertionError",
|
|
78
79
|
"Evaluation",
|
|
79
80
|
"Evaluator",
|
|
80
|
-
"
|
|
81
|
-
"
|
|
82
|
-
"
|
|
83
|
-
"
|
|
84
|
-
"
|
|
81
|
+
"ExactMatch",
|
|
82
|
+
"Factuality",
|
|
83
|
+
"Faithfulness",
|
|
84
|
+
"Humor",
|
|
85
|
+
"JSONDiff",
|
|
85
86
|
"LevenshteinMatch",
|
|
86
|
-
"
|
|
87
|
+
"ListContains",
|
|
87
88
|
"MemoryTraceHandler",
|
|
88
|
-
"
|
|
89
|
-
"
|
|
90
|
-
"
|
|
89
|
+
"Moderation",
|
|
90
|
+
"NumericDiff",
|
|
91
|
+
"Possible",
|
|
92
|
+
"RateLimitConfig",
|
|
91
93
|
"ScoreThreshold",
|
|
92
|
-
"
|
|
93
|
-
"
|
|
94
|
-
"
|
|
95
|
-
"
|
|
96
|
-
"
|
|
94
|
+
"Security",
|
|
95
|
+
"Sql",
|
|
96
|
+
"Summary",
|
|
97
|
+
"Translation",
|
|
98
|
+
"ValidJSON",
|
|
97
99
|
"assert_dataset_pass",
|
|
98
100
|
"assert_pass",
|
|
99
101
|
"capture_traces",
|
|
102
|
+
"configure_rate_limits",
|
|
100
103
|
"create_llm_evaluator",
|
|
101
104
|
"evaluate",
|
|
102
105
|
"last_llm_call",
|
|
@@ -0,0 +1,407 @@
|
|
|
1
|
+
{
|
|
2
|
+
"command_args": "pixie test tests/",
|
|
3
|
+
"timestamp": "2025-06-15 12:00:00 UTC",
|
|
4
|
+
"pixie_repo_url": "https://github.com/yiouli/pixie-qa",
|
|
5
|
+
"feedback_url": "https://feedback.gopixie.ai/feedback",
|
|
6
|
+
"brand_icon_url": "https://github.com/user-attachments/assets/76c18199-f00a-4fb3-a12f-ce6c173727af",
|
|
7
|
+
"test_records": [
|
|
8
|
+
{
|
|
9
|
+
"name": "test_customer_faq.py::test_faq_factuality",
|
|
10
|
+
"status": "passed",
|
|
11
|
+
"message": null,
|
|
12
|
+
"asserts": [
|
|
13
|
+
{
|
|
14
|
+
"evaluator_names": [
|
|
15
|
+
"MockFactuality"
|
|
16
|
+
],
|
|
17
|
+
"input_labels": [
|
|
18
|
+
"What is your return policy?",
|
|
19
|
+
"How do I track my order?",
|
|
20
|
+
"Do you offer international shipping?",
|
|
21
|
+
"What payment methods do you accept?",
|
|
22
|
+
"How can I contact support?"
|
|
23
|
+
],
|
|
24
|
+
"results": [
|
|
25
|
+
[
|
|
26
|
+
[
|
|
27
|
+
{
|
|
28
|
+
"score": 0.85,
|
|
29
|
+
"reasoning": "High string similarity between expected and actual output.",
|
|
30
|
+
"details": {}
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
[
|
|
34
|
+
{
|
|
35
|
+
"score": 0.72,
|
|
36
|
+
"reasoning": "Moderate string similarity.",
|
|
37
|
+
"details": {}
|
|
38
|
+
}
|
|
39
|
+
],
|
|
40
|
+
[
|
|
41
|
+
{
|
|
42
|
+
"score": 0.91,
|
|
43
|
+
"reasoning": "Very high similarity match.",
|
|
44
|
+
"details": {}
|
|
45
|
+
}
|
|
46
|
+
],
|
|
47
|
+
[
|
|
48
|
+
{
|
|
49
|
+
"score": 0.68,
|
|
50
|
+
"reasoning": "Reasonable similarity.",
|
|
51
|
+
"details": {}
|
|
52
|
+
}
|
|
53
|
+
],
|
|
54
|
+
[
|
|
55
|
+
{
|
|
56
|
+
"score": 0.77,
|
|
57
|
+
"reasoning": "Good similarity match.",
|
|
58
|
+
"details": {}
|
|
59
|
+
}
|
|
60
|
+
]
|
|
61
|
+
]
|
|
62
|
+
],
|
|
63
|
+
"passed": true,
|
|
64
|
+
"criteria_message": "Pass: 4/5 inputs passed (threshold 0.6, required 80%)",
|
|
65
|
+
"scoring_strategy": "Each evaluator score must be ≥ 0.6. At least 80% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
|
|
66
|
+
"evaluable_dicts": [
|
|
67
|
+
{
|
|
68
|
+
"input": "What is your return policy?",
|
|
69
|
+
"expected_output": "You can return items within 30 days of purchase for a full refund.",
|
|
70
|
+
"actual_output": "Items can be returned within 30 days for a full refund.",
|
|
71
|
+
"metadata": {}
|
|
72
|
+
},
|
|
73
|
+
{
|
|
74
|
+
"input": "How do I track my order?",
|
|
75
|
+
"expected_output": "You can track your order using the tracking link sent to your email.",
|
|
76
|
+
"actual_output": "Check the tracking link in your confirmation email to track your order.",
|
|
77
|
+
"metadata": {}
|
|
78
|
+
},
|
|
79
|
+
{
|
|
80
|
+
"input": "Do you offer international shipping?",
|
|
81
|
+
"expected_output": "Yes, we ship to over 50 countries worldwide.",
|
|
82
|
+
"actual_output": "Yes, we offer international shipping to over 50 countries.",
|
|
83
|
+
"metadata": {}
|
|
84
|
+
},
|
|
85
|
+
{
|
|
86
|
+
"input": "What payment methods do you accept?",
|
|
87
|
+
"expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
|
|
88
|
+
"actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
|
|
89
|
+
"metadata": {}
|
|
90
|
+
},
|
|
91
|
+
{
|
|
92
|
+
"input": "How can I contact support?",
|
|
93
|
+
"expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
|
|
94
|
+
"actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
|
|
95
|
+
"metadata": {}
|
|
96
|
+
}
|
|
97
|
+
]
|
|
98
|
+
}
|
|
99
|
+
]
|
|
100
|
+
},
|
|
101
|
+
{
|
|
102
|
+
"name": "test_customer_faq.py::test_faq_multi_evaluator",
|
|
103
|
+
"status": "failed",
|
|
104
|
+
"message": "AssertionError: 3/5 inputs failed on at least one evaluator",
|
|
105
|
+
"asserts": [
|
|
106
|
+
{
|
|
107
|
+
"evaluator_names": [
|
|
108
|
+
"MockFactuality",
|
|
109
|
+
"MockClosedQA"
|
|
110
|
+
],
|
|
111
|
+
"input_labels": [
|
|
112
|
+
"What is your return policy?",
|
|
113
|
+
"How do I track my order?",
|
|
114
|
+
"Do you offer international shipping?",
|
|
115
|
+
"What payment methods do you accept?",
|
|
116
|
+
"How can I contact support?"
|
|
117
|
+
],
|
|
118
|
+
"results": [
|
|
119
|
+
[
|
|
120
|
+
[
|
|
121
|
+
{
|
|
122
|
+
"score": 0.85,
|
|
123
|
+
"reasoning": "High similarity.",
|
|
124
|
+
"details": {}
|
|
125
|
+
},
|
|
126
|
+
{
|
|
127
|
+
"score": 0.3,
|
|
128
|
+
"reasoning": "Low keyword overlap.",
|
|
129
|
+
"details": {}
|
|
130
|
+
}
|
|
131
|
+
],
|
|
132
|
+
[
|
|
133
|
+
{
|
|
134
|
+
"score": 0.72,
|
|
135
|
+
"reasoning": "Moderate similarity.",
|
|
136
|
+
"details": {}
|
|
137
|
+
},
|
|
138
|
+
{
|
|
139
|
+
"score": 0.45,
|
|
140
|
+
"reasoning": "Below threshold keyword overlap.",
|
|
141
|
+
"details": {}
|
|
142
|
+
}
|
|
143
|
+
],
|
|
144
|
+
[
|
|
145
|
+
{
|
|
146
|
+
"score": 0.91,
|
|
147
|
+
"reasoning": "Very high similarity.",
|
|
148
|
+
"details": {}
|
|
149
|
+
},
|
|
150
|
+
{
|
|
151
|
+
"score": 0.6,
|
|
152
|
+
"reasoning": "Acceptable keyword overlap.",
|
|
153
|
+
"details": {}
|
|
154
|
+
}
|
|
155
|
+
],
|
|
156
|
+
[
|
|
157
|
+
{
|
|
158
|
+
"score": 0.68,
|
|
159
|
+
"reasoning": "Reasonable similarity.",
|
|
160
|
+
"details": {}
|
|
161
|
+
},
|
|
162
|
+
{
|
|
163
|
+
"score": 0.25,
|
|
164
|
+
"reasoning": "Poor keyword match.",
|
|
165
|
+
"details": {}
|
|
166
|
+
}
|
|
167
|
+
],
|
|
168
|
+
[
|
|
169
|
+
{
|
|
170
|
+
"score": 0.77,
|
|
171
|
+
"reasoning": "Good similarity.",
|
|
172
|
+
"details": {}
|
|
173
|
+
},
|
|
174
|
+
{
|
|
175
|
+
"score": 0.55,
|
|
176
|
+
"reasoning": "Marginal keyword overlap.",
|
|
177
|
+
"details": {}
|
|
178
|
+
}
|
|
179
|
+
]
|
|
180
|
+
]
|
|
181
|
+
],
|
|
182
|
+
"passed": false,
|
|
183
|
+
"criteria_message": "Fail: only 2/5 inputs passed on all evaluators (required 100%)",
|
|
184
|
+
"scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
|
|
185
|
+
"evaluable_dicts": [
|
|
186
|
+
{
|
|
187
|
+
"input": "What is your return policy?",
|
|
188
|
+
"expected_output": "You can return items within 30 days of purchase for a full refund.",
|
|
189
|
+
"actual_output": "Items can be returned within 30 days for a full refund.",
|
|
190
|
+
"metadata": {}
|
|
191
|
+
},
|
|
192
|
+
{
|
|
193
|
+
"input": "How do I track my order?",
|
|
194
|
+
"expected_output": "You can track your order using the tracking link sent to your email.",
|
|
195
|
+
"actual_output": "Check the tracking link in your confirmation email to track your order.",
|
|
196
|
+
"metadata": {}
|
|
197
|
+
},
|
|
198
|
+
{
|
|
199
|
+
"input": "Do you offer international shipping?",
|
|
200
|
+
"expected_output": "Yes, we ship to over 50 countries worldwide.",
|
|
201
|
+
"actual_output": "Yes, we offer international shipping to over 50 countries.",
|
|
202
|
+
"metadata": {}
|
|
203
|
+
},
|
|
204
|
+
{
|
|
205
|
+
"input": "What payment methods do you accept?",
|
|
206
|
+
"expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
|
|
207
|
+
"actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
|
|
208
|
+
"metadata": {}
|
|
209
|
+
},
|
|
210
|
+
{
|
|
211
|
+
"input": "How can I contact support?",
|
|
212
|
+
"expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
|
|
213
|
+
"actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
|
|
214
|
+
"metadata": {}
|
|
215
|
+
}
|
|
216
|
+
]
|
|
217
|
+
}
|
|
218
|
+
]
|
|
219
|
+
},
|
|
220
|
+
{
|
|
221
|
+
"name": "test_customer_faq.py::test_faq_no_hallucinations",
|
|
222
|
+
"status": "passed",
|
|
223
|
+
"message": null,
|
|
224
|
+
"asserts": [
|
|
225
|
+
{
|
|
226
|
+
"evaluator_names": [
|
|
227
|
+
"MockHallucination"
|
|
228
|
+
],
|
|
229
|
+
"input_labels": [
|
|
230
|
+
"What is your return policy?",
|
|
231
|
+
"How do I track my order?",
|
|
232
|
+
"Do you offer international shipping?",
|
|
233
|
+
"What payment methods do you accept?",
|
|
234
|
+
"How can I contact support?"
|
|
235
|
+
],
|
|
236
|
+
"results": [
|
|
237
|
+
[
|
|
238
|
+
[
|
|
239
|
+
{
|
|
240
|
+
"score": 0.95,
|
|
241
|
+
"reasoning": "No hallucination detected.",
|
|
242
|
+
"details": {}
|
|
243
|
+
}
|
|
244
|
+
],
|
|
245
|
+
[
|
|
246
|
+
{
|
|
247
|
+
"score": 0.95,
|
|
248
|
+
"reasoning": "No hallucination detected.",
|
|
249
|
+
"details": {}
|
|
250
|
+
}
|
|
251
|
+
],
|
|
252
|
+
[
|
|
253
|
+
{
|
|
254
|
+
"score": 0.95,
|
|
255
|
+
"reasoning": "No hallucination detected.",
|
|
256
|
+
"details": {}
|
|
257
|
+
}
|
|
258
|
+
],
|
|
259
|
+
[
|
|
260
|
+
{
|
|
261
|
+
"score": 0.95,
|
|
262
|
+
"reasoning": "No hallucination detected.",
|
|
263
|
+
"details": {}
|
|
264
|
+
}
|
|
265
|
+
],
|
|
266
|
+
[
|
|
267
|
+
{
|
|
268
|
+
"score": 0.95,
|
|
269
|
+
"reasoning": "No hallucination detected.",
|
|
270
|
+
"details": {}
|
|
271
|
+
}
|
|
272
|
+
]
|
|
273
|
+
]
|
|
274
|
+
],
|
|
275
|
+
"passed": true,
|
|
276
|
+
"criteria_message": "Pass: 5/5 inputs passed (threshold 0.5, required 100%)",
|
|
277
|
+
"scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
|
|
278
|
+
"evaluable_dicts": [
|
|
279
|
+
{
|
|
280
|
+
"input": "What is your return policy?",
|
|
281
|
+
"expected_output": "You can return items within 30 days of purchase for a full refund.",
|
|
282
|
+
"actual_output": "Items can be returned within 30 days for a full refund.",
|
|
283
|
+
"metadata": {}
|
|
284
|
+
},
|
|
285
|
+
{
|
|
286
|
+
"input": "How do I track my order?",
|
|
287
|
+
"expected_output": "You can track your order using the tracking link sent to your email.",
|
|
288
|
+
"actual_output": "Check the tracking link in your confirmation email to track your order.",
|
|
289
|
+
"metadata": {}
|
|
290
|
+
},
|
|
291
|
+
{
|
|
292
|
+
"input": "Do you offer international shipping?",
|
|
293
|
+
"expected_output": "Yes, we ship to over 50 countries worldwide.",
|
|
294
|
+
"actual_output": "Yes, we offer international shipping to over 50 countries.",
|
|
295
|
+
"metadata": {}
|
|
296
|
+
},
|
|
297
|
+
{
|
|
298
|
+
"input": "What payment methods do you accept?",
|
|
299
|
+
"expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
|
|
300
|
+
"actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
|
|
301
|
+
"metadata": {}
|
|
302
|
+
},
|
|
303
|
+
{
|
|
304
|
+
"input": "How can I contact support?",
|
|
305
|
+
"expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
|
|
306
|
+
"actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
|
|
307
|
+
"metadata": {}
|
|
308
|
+
}
|
|
309
|
+
]
|
|
310
|
+
}
|
|
311
|
+
]
|
|
312
|
+
},
|
|
313
|
+
{
|
|
314
|
+
"name": "test_customer_faq.py::test_faq_tone_check",
|
|
315
|
+
"status": "failed",
|
|
316
|
+
"message": "AssertionError: all 5 inputs failed",
|
|
317
|
+
"asserts": [
|
|
318
|
+
{
|
|
319
|
+
"evaluator_names": [
|
|
320
|
+
"MockStrictTone"
|
|
321
|
+
],
|
|
322
|
+
"input_labels": [
|
|
323
|
+
"What is your return policy?",
|
|
324
|
+
"How do I track my order?",
|
|
325
|
+
"Do you offer international shipping?",
|
|
326
|
+
"What payment methods do you accept?",
|
|
327
|
+
"How can I contact support?"
|
|
328
|
+
],
|
|
329
|
+
"results": [
|
|
330
|
+
[
|
|
331
|
+
[
|
|
332
|
+
{
|
|
333
|
+
"score": 0.2,
|
|
334
|
+
"reasoning": "Tone does not meet strict requirements.",
|
|
335
|
+
"details": {}
|
|
336
|
+
}
|
|
337
|
+
],
|
|
338
|
+
[
|
|
339
|
+
{
|
|
340
|
+
"score": 0.2,
|
|
341
|
+
"reasoning": "Tone does not meet strict requirements.",
|
|
342
|
+
"details": {}
|
|
343
|
+
}
|
|
344
|
+
],
|
|
345
|
+
[
|
|
346
|
+
{
|
|
347
|
+
"score": 0.2,
|
|
348
|
+
"reasoning": "Tone does not meet strict requirements.",
|
|
349
|
+
"details": {}
|
|
350
|
+
}
|
|
351
|
+
],
|
|
352
|
+
[
|
|
353
|
+
{
|
|
354
|
+
"score": 0.2,
|
|
355
|
+
"reasoning": "Tone does not meet strict requirements.",
|
|
356
|
+
"details": {}
|
|
357
|
+
}
|
|
358
|
+
],
|
|
359
|
+
[
|
|
360
|
+
{
|
|
361
|
+
"score": 0.2,
|
|
362
|
+
"reasoning": "Tone does not meet strict requirements.",
|
|
363
|
+
"details": {}
|
|
364
|
+
}
|
|
365
|
+
]
|
|
366
|
+
]
|
|
367
|
+
],
|
|
368
|
+
"passed": false,
|
|
369
|
+
"criteria_message": "Fail: 0/5 inputs passed (threshold 0.5, required 100%)",
|
|
370
|
+
"scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
|
|
371
|
+
"evaluable_dicts": [
|
|
372
|
+
{
|
|
373
|
+
"input": "What is your return policy?",
|
|
374
|
+
"expected_output": "You can return items within 30 days of purchase for a full refund.",
|
|
375
|
+
"actual_output": "Items can be returned within 30 days for a full refund.",
|
|
376
|
+
"metadata": {}
|
|
377
|
+
},
|
|
378
|
+
{
|
|
379
|
+
"input": "How do I track my order?",
|
|
380
|
+
"expected_output": "You can track your order using the tracking link sent to your email.",
|
|
381
|
+
"actual_output": "Check the tracking link in your confirmation email to track your order.",
|
|
382
|
+
"metadata": {}
|
|
383
|
+
},
|
|
384
|
+
{
|
|
385
|
+
"input": "Do you offer international shipping?",
|
|
386
|
+
"expected_output": "Yes, we ship to over 50 countries worldwide.",
|
|
387
|
+
"actual_output": "Yes, we offer international shipping to over 50 countries.",
|
|
388
|
+
"metadata": {}
|
|
389
|
+
},
|
|
390
|
+
{
|
|
391
|
+
"input": "What payment methods do you accept?",
|
|
392
|
+
"expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
|
|
393
|
+
"actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
|
|
394
|
+
"metadata": {}
|
|
395
|
+
},
|
|
396
|
+
{
|
|
397
|
+
"input": "How can I contact support?",
|
|
398
|
+
"expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
|
|
399
|
+
"actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
|
|
400
|
+
"metadata": {}
|
|
401
|
+
}
|
|
402
|
+
]
|
|
403
|
+
}
|
|
404
|
+
]
|
|
405
|
+
}
|
|
406
|
+
]
|
|
407
|
+
}
|