pixie-qa 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/PKG-INFO +1 -1
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/eval_utils.py +44 -12
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pyproject.toml +1 -1
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/skills/eval-driven-dev/SKILL.md +20 -3
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/skills/eval-driven-dev/references/eval-tests.md +1 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/skills/eval-driven-dev/references/run-harness-patterns.md +6 -7
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/skills/eval-driven-dev/references/understanding-app.md +4 -4
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_eval_utils.py +135 -0
- pixie_qa-0.2.0/skills/eval-driven-dev/resources/check_version.py +0 -126
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/.github/copilot-instructions.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/.github/workflows/publish.yml +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/.gitignore +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/LICENSE +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/README.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/async-handler-processing.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/autoevals-adapters.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/cli-dataset-commands.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/dataset-management.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/deep-research-demo.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/eval-harness.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/expected-output-in-evals.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/instrumentation-module-implementation.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/loud-failure-mode.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/manual-instrumentation-usability.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/observation-store-implementation.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/observe-sensitive-field-stripping.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/pixie-directory-and-skill-improvements.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/pixie-test-e2e-suite.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/root-package-exports-and-trace-id.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/scorecard-branding-and-skill-version-check.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/scorecard-eval-detail-dialog.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/skill-v2-and-rootdir-discovery.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/test-scorecard.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/usability-utils.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/docs/package.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/__init__.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/cli/__init__.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/cli/dataset_command.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/cli/main.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/cli/test_command.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/cli/trace_command.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/config.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/dataset/__init__.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/dataset/models.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/dataset/store.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/__init__.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/criteria.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/evaluation.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/llm_evaluator.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/runner.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/scorecard.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/scorers.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/trace_capture.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/trace_helpers.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/favicon.png +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/__init__.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/context.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/handler.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/handlers.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/instrumentors.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/observation.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/processor.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/queue.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/spans.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/storage/__init__.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/storage/evaluable.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/storage/piccolo_conf.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/storage/piccolo_migrations/__init__.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/storage/serialization.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/storage/store.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/storage/tables.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/storage/tree.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/skills/eval-driven-dev/references/dataset-generation.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/skills/eval-driven-dev/references/instrumentation.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/skills/eval-driven-dev/references/investigation.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/skills/eval-driven-dev/references/pixie-api.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/agent-skill-1.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/agent-skill.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/autoevals-adapters.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/dataset-management.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/evals-harness.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/expected-output-in-evals.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/instrumentation.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/manual-instrumentation-usability.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/storage.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/usability-utils.md +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/__init__.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/__init__.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/__init__.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/e2e_cases.json +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/e2e_fixtures/conftest.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/e2e_fixtures/mock_evaluators.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/e2e_fixtures/test_customer_faq.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/test_dataset_command.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/test_e2e_pixie_test.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/test_main.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/test_trace_command.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/dataset/__init__.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/dataset/test_models.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/dataset/test_store.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/__init__.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_criteria.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_evaluation.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_llm_evaluator.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_runner.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_scorecard.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_scorers.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_trace_capture.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_trace_helpers.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/__init__.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/conftest.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/test_context.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/test_handler.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/test_integration.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/test_observation.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/test_processor.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/test_queue.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/test_spans.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/test_storage_handler.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/observation_store/__init__.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/observation_store/conftest.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/observation_store/test_evaluable.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/observation_store/test_serialization.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/observation_store/test_store.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/observation_store/test_tree.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/test_config.py +0 -0
- {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/test_init.py +0 -0
|
@@ -81,16 +81,27 @@ def _publish_to_scorecard(
|
|
|
81
81
|
"expected_output": (
|
|
82
82
|
None
|
|
83
83
|
if isinstance(ev.expected_output, _Unset)
|
|
84
|
-
else (
|
|
84
|
+
else (
|
|
85
|
+
str(ev.expected_output)
|
|
86
|
+
if ev.expected_output is not None
|
|
87
|
+
else None
|
|
88
|
+
)
|
|
89
|
+
),
|
|
90
|
+
"actual_output": (
|
|
91
|
+
str(ev.eval_output) if ev.eval_output is not None else None
|
|
85
92
|
),
|
|
86
|
-
"actual_output": str(ev.eval_output) if ev.eval_output is not None else None,
|
|
87
93
|
"metadata": ev.eval_metadata,
|
|
88
94
|
}
|
|
89
95
|
for ev in evaluables
|
|
90
96
|
)
|
|
91
97
|
else:
|
|
92
98
|
ev_dicts = tuple(
|
|
93
|
-
{
|
|
99
|
+
{
|
|
100
|
+
"input": str(inp),
|
|
101
|
+
"expected_output": None,
|
|
102
|
+
"actual_output": None,
|
|
103
|
+
"metadata": None,
|
|
104
|
+
}
|
|
94
105
|
for inp in eval_inputs
|
|
95
106
|
)
|
|
96
107
|
|
|
@@ -200,17 +211,23 @@ async def assert_pass(
|
|
|
200
211
|
If the pass criteria are not met, raises :class:`EvalAssertionError`
|
|
201
212
|
carrying the tensor.
|
|
202
213
|
|
|
203
|
-
When ``evaluables`` is provided,
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
is
|
|
214
|
+
When ``evaluables`` is provided, behaviour depends on whether each
|
|
215
|
+
item already has ``eval_output`` populated:
|
|
216
|
+
|
|
217
|
+
- **eval_output is None** — the ``runnable`` is called via
|
|
218
|
+
``run_and_evaluate`` to produce an output from traces, and
|
|
219
|
+
``expected_output`` from the evaluable is merged into the result.
|
|
220
|
+
- **eval_output is not None** — the evaluable is used directly
|
|
221
|
+
(the runnable is not called for that item).
|
|
207
222
|
|
|
208
223
|
Args:
|
|
209
224
|
runnable: The application function to test.
|
|
210
225
|
eval_inputs: List of inputs, each passed to *runnable*.
|
|
211
226
|
evaluators: List of evaluator callables.
|
|
212
227
|
evaluables: Optional list of ``Evaluable`` items, one per input.
|
|
213
|
-
|
|
228
|
+
When provided, their ``expected_output`` is forwarded to
|
|
229
|
+
``run_and_evaluate``. Must have the same length as
|
|
230
|
+
*eval_inputs*.
|
|
214
231
|
passes: How many times to run the entire test matrix.
|
|
215
232
|
pass_criteria: Receives the results tensor, returns
|
|
216
233
|
``(passed, message)``. Defaults to "every score >= 0.5".
|
|
@@ -234,11 +251,26 @@ async def assert_pass(
|
|
|
234
251
|
pass_results: list[list[Evaluation]] = []
|
|
235
252
|
for idx, inp in enumerate(eval_inputs):
|
|
236
253
|
if evaluables is not None:
|
|
237
|
-
# Use provided evaluable directly — skip trace capture
|
|
238
254
|
ev_item = evaluables[idx]
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
255
|
+
if ev_item.eval_output is None:
|
|
256
|
+
# eval_output not yet computed — run the runnable to
|
|
257
|
+
# produce it via trace capture, and merge the dataset
|
|
258
|
+
# item's expected_output into the result.
|
|
259
|
+
eval_coros = [
|
|
260
|
+
run_and_evaluate(
|
|
261
|
+
evaluator=ev,
|
|
262
|
+
runnable=runnable,
|
|
263
|
+
eval_input=inp,
|
|
264
|
+
expected_output=ev_item.expected_output,
|
|
265
|
+
from_trace=from_trace,
|
|
266
|
+
)
|
|
267
|
+
for ev in evaluators
|
|
268
|
+
]
|
|
269
|
+
else:
|
|
270
|
+
# eval_output already populated — evaluate directly.
|
|
271
|
+
eval_coros = [
|
|
272
|
+
evaluate(evaluator=ev, evaluable=ev_item) for ev in evaluators
|
|
273
|
+
]
|
|
242
274
|
else:
|
|
243
275
|
eval_coros = [
|
|
244
276
|
run_and_evaluate(
|
|
@@ -27,11 +27,28 @@ This skill is about doing the work, not describing it. Read code, edit files, ru
|
|
|
27
27
|
|
|
28
28
|
## Before you start
|
|
29
29
|
|
|
30
|
+
Run the following to keep the skill and package up to date. If any command fails or is blocked by the environment, continue — do not let failures here block the rest of the workflow.
|
|
31
|
+
|
|
32
|
+
**Update the skill:**
|
|
33
|
+
|
|
30
34
|
```bash
|
|
31
|
-
|
|
35
|
+
npx skills update
|
|
32
36
|
```
|
|
33
37
|
|
|
34
|
-
|
|
38
|
+
**Upgrade the `pixie-qa` package:**
|
|
39
|
+
|
|
40
|
+
Make sure the Python virtual environment is active and use the project's package manager:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# uv project (uv.lock exists):
|
|
44
|
+
uv add pixie-qa --upgrade
|
|
45
|
+
|
|
46
|
+
# poetry project (poetry.lock exists):
|
|
47
|
+
poetry add pixie-qa@latest
|
|
48
|
+
|
|
49
|
+
# pip / no lock file:
|
|
50
|
+
pip install --upgrade pixie-qa
|
|
51
|
+
```
|
|
35
52
|
|
|
36
53
|
---
|
|
37
54
|
|
|
@@ -213,8 +230,8 @@ Each dataset item contains:
|
|
|
213
230
|
|
|
214
231
|
- `eval_input`: the made-up input data (app input + external dependency data)
|
|
215
232
|
- `expected_output`: case-specific expectation text (optional — only for test cases with expectations beyond the universal criteria). This is a reference for evaluation, not an exact expected answer.
|
|
216
|
-
- `eval_output`: set to `"UNSET"` — produced at test time by the utility function from Step 3
|
|
217
233
|
|
|
234
|
+
At test time, `eval_output` is produced by the utility function from Step 3 and is not stored in the dataset itself.
|
|
218
235
|
Read `references/dataset-generation.md` for the dataset creation API, data shape matching, expected_output strategy, and validation checklist.
|
|
219
236
|
|
|
220
237
|
#### 4c. Validate the dataset
|
|
@@ -81,6 +81,7 @@ concise_voice_style = create_llm_evaluator(
|
|
|
81
81
|
**How template variables work**: `{eval_input}`, `{eval_output}`, `{expected_output}` are the only placeholders. Each is replaced with a string representation of the corresponding `Evaluable` field — if the field is a dict or list, it becomes a JSON string. The LLM judge sees the full serialized value.
|
|
82
82
|
|
|
83
83
|
**Rules**:
|
|
84
|
+
|
|
84
85
|
- **Only `{eval_input}`, `{eval_output}`, `{expected_output}`** — no nested access like `{eval_input[key]}` (this will crash with a `TypeError`)
|
|
85
86
|
- **Keep templates short and direct** — the system prompt already tells the LLM to return `Score: X.X`. Your template just needs to present the data and define the scoring criteria.
|
|
86
87
|
- **Don't instruct the LLM to "parse" or "extract" data** — just present the values and state the criteria. The LLM can read JSON naturally.
|
|
@@ -8,11 +8,11 @@ For `enable_storage()` and `observe` API details, see `references/pixie-api.md`
|
|
|
8
8
|
|
|
9
9
|
Look at how a real user or client invokes the app, and do the same thing in your utility function:
|
|
10
10
|
|
|
11
|
-
| App type
|
|
12
|
-
|
|
|
13
|
-
| **Web server** (FastAPI, Flask)
|
|
14
|
-
| **CLI application**
|
|
15
|
-
| **Standalone function** (no server, no middleware) | Python function
|
|
11
|
+
| App type | Entry point example | How to invoke it |
|
|
12
|
+
| -------------------------------------------------- | ----------------------- | ---------------------------------------------------- |
|
|
13
|
+
| **Web server** (FastAPI, Flask) | HTTP/WebSocket endpoint | `TestClient`, `httpx`, or subprocess + HTTP requests |
|
|
14
|
+
| **CLI application** | Command-line invocation | `subprocess.run()` |
|
|
15
|
+
| **Standalone function** (no server, no middleware) | Python function | Import and call directly |
|
|
16
16
|
|
|
17
17
|
**Do NOT call an inner function** like `agent.respond()` directly just because it's simpler. Between the entry point and that inner function, the app does request handling, state management, prompt assembly, routing — all of which is under test. When you call an inner function, you skip all of that and end up reimplementing it in your test. Now your test is testing test code, not app code.
|
|
18
18
|
|
|
@@ -137,11 +137,10 @@ from pixie_qa.scripts.mock_backends import (
|
|
|
137
137
|
MockSynthesisBackend,
|
|
138
138
|
)
|
|
139
139
|
|
|
140
|
-
enable_storage()
|
|
141
|
-
|
|
142
140
|
@observe
|
|
143
141
|
def run_app(eval_input: dict) -> dict:
|
|
144
142
|
"""Run the voice agent through its real FastAPI app layer."""
|
|
143
|
+
enable_storage()
|
|
145
144
|
# Patch external dependencies before importing the app
|
|
146
145
|
with patch("myapp.app.transcription_backend", MockTranscriptionBackend()), \
|
|
147
146
|
patch("myapp.app.synthesis_backend", MockSynthesisBackend()), \
|
|
@@ -142,10 +142,10 @@ These are the primary testability seams. In Step 3, you'll write mock implementa
|
|
|
142
142
|
|
|
143
143
|
<For each external dependency, how will you replace it in the utility function (Step 3)?>
|
|
144
144
|
|
|
145
|
-
| Dependency
|
|
146
|
-
|
|
|
147
|
-
| <e.g., Redis>
|
|
148
|
-
| <e.g., STT service> | <MockTranscriptionBackend>
|
|
145
|
+
| Dependency | Mock approach | What mock provides (IN) | What mock captures (OUT) |
|
|
146
|
+
| ------------------- | ------------------------------ | -------------------------------------- | ------------------------ |
|
|
147
|
+
| <e.g., Redis> | <mock.patch / mock class / DI> | <conversation history from eval_input> | <saved messages> |
|
|
148
|
+
| <e.g., STT service> | <MockTranscriptionBackend> | <text from eval_input> | <n/a> |
|
|
149
149
|
|
|
150
150
|
### Intermediate states to capture
|
|
151
151
|
|
|
@@ -438,6 +438,108 @@ class TestAssertPassEvaluables:
|
|
|
438
438
|
)
|
|
439
439
|
assert received_outputs == ["echo:hello"]
|
|
440
440
|
|
|
441
|
+
@pytest.mark.asyncio
|
|
442
|
+
async def test_evaluables_with_runnable_calls_runnable(self) -> None:
|
|
443
|
+
"""When evaluables AND runnable are provided, runnable is still called."""
|
|
444
|
+
call_count = 0
|
|
445
|
+
|
|
446
|
+
def counting_app(input: Any) -> None: # noqa: A002
|
|
447
|
+
nonlocal call_count
|
|
448
|
+
call_count += 1
|
|
449
|
+
with px.start_observation(input=input, name="app") as obs:
|
|
450
|
+
obs.set_output(f"ran:{input}")
|
|
451
|
+
|
|
452
|
+
items = [
|
|
453
|
+
Evaluable(eval_input="q1", expected_output="e1"),
|
|
454
|
+
Evaluable(eval_input="q2", expected_output="e2"),
|
|
455
|
+
]
|
|
456
|
+
await assert_pass(
|
|
457
|
+
runnable=counting_app,
|
|
458
|
+
eval_inputs=["q1", "q2"],
|
|
459
|
+
evaluators=[_always_pass],
|
|
460
|
+
evaluables=items,
|
|
461
|
+
)
|
|
462
|
+
assert call_count == 2, "runnable should be called for each input"
|
|
463
|
+
|
|
464
|
+
@pytest.mark.asyncio
|
|
465
|
+
async def test_evaluables_precomputed_output_used_directly(self) -> None:
|
|
466
|
+
"""When eval_output is already set, the evaluable is used directly."""
|
|
467
|
+
received: list[Evaluable] = []
|
|
468
|
+
|
|
469
|
+
async def capture_eval(
|
|
470
|
+
evaluable: Evaluable,
|
|
471
|
+
*,
|
|
472
|
+
trace: list[ObservationNode] | None = None,
|
|
473
|
+
) -> Evaluation:
|
|
474
|
+
received.append(evaluable)
|
|
475
|
+
return Evaluation(score=1.0, reasoning="ok")
|
|
476
|
+
|
|
477
|
+
items = [
|
|
478
|
+
Evaluable(
|
|
479
|
+
eval_input="hello",
|
|
480
|
+
eval_output="precomputed_output",
|
|
481
|
+
expected_output="ref",
|
|
482
|
+
),
|
|
483
|
+
]
|
|
484
|
+
await assert_pass(
|
|
485
|
+
runnable=_sync_app,
|
|
486
|
+
eval_inputs=["hello"],
|
|
487
|
+
evaluators=[capture_eval],
|
|
488
|
+
evaluables=items,
|
|
489
|
+
)
|
|
490
|
+
# eval_output should be the pre-computed value (runnable not called)
|
|
491
|
+
assert received[0].eval_output == "precomputed_output"
|
|
492
|
+
assert received[0].expected_output == "ref"
|
|
493
|
+
|
|
494
|
+
@pytest.mark.asyncio
|
|
495
|
+
async def test_evaluables_none_output_runs_runnable(self) -> None:
|
|
496
|
+
"""When eval_output is None, the runnable is called to produce it."""
|
|
497
|
+
received: list[Evaluable] = []
|
|
498
|
+
|
|
499
|
+
async def capture_eval(
|
|
500
|
+
evaluable: Evaluable,
|
|
501
|
+
*,
|
|
502
|
+
trace: list[ObservationNode] | None = None,
|
|
503
|
+
) -> Evaluation:
|
|
504
|
+
received.append(evaluable)
|
|
505
|
+
return Evaluation(score=1.0, reasoning="ok")
|
|
506
|
+
|
|
507
|
+
items = [
|
|
508
|
+
Evaluable(eval_input="hello", expected_output="ref"),
|
|
509
|
+
]
|
|
510
|
+
await assert_pass(
|
|
511
|
+
runnable=_sync_app,
|
|
512
|
+
eval_inputs=["hello"],
|
|
513
|
+
evaluators=[capture_eval],
|
|
514
|
+
evaluables=items,
|
|
515
|
+
)
|
|
516
|
+
# eval_output should come from the trace (runnable execution)
|
|
517
|
+
assert received[0].eval_output == "echo:hello"
|
|
518
|
+
# expected_output should come from the evaluable
|
|
519
|
+
assert received[0].expected_output == "ref"
|
|
520
|
+
|
|
521
|
+
@pytest.mark.asyncio
|
|
522
|
+
async def test_evaluables_from_trace_respected(self) -> None:
|
|
523
|
+
"""from_trace is honoured even when evaluables are provided."""
|
|
524
|
+
|
|
525
|
+
async def check_child(
|
|
526
|
+
evaluable: Evaluable,
|
|
527
|
+
*,
|
|
528
|
+
trace: list[ObservationNode] | None = None,
|
|
529
|
+
) -> Evaluation:
|
|
530
|
+
assert evaluable.eval_output == "generated"
|
|
531
|
+
assert evaluable.expected_output == "ref"
|
|
532
|
+
return Evaluation(score=1.0, reasoning="ok")
|
|
533
|
+
|
|
534
|
+
items = [Evaluable(eval_input="q1", expected_output="ref")]
|
|
535
|
+
await assert_pass(
|
|
536
|
+
runnable=_nested_app,
|
|
537
|
+
eval_inputs=["q1"],
|
|
538
|
+
evaluators=[check_child],
|
|
539
|
+
evaluables=items,
|
|
540
|
+
from_trace=lambda tree: as_evaluable(tree[0].find("generator")[0].span),
|
|
541
|
+
)
|
|
542
|
+
|
|
441
543
|
|
|
442
544
|
# ── assert_dataset_pass tests ─────────────────────────────────────────────
|
|
443
545
|
|
|
@@ -529,3 +631,36 @@ class TestAssertDatasetPass:
|
|
|
529
631
|
passes=3,
|
|
530
632
|
)
|
|
531
633
|
assert len(exc_info.value.results) == 3
|
|
634
|
+
|
|
635
|
+
@pytest.mark.asyncio
|
|
636
|
+
async def test_runnable_is_called_with_eval_output_from_trace(
|
|
637
|
+
self, tmp_path: Path
|
|
638
|
+
) -> None:
|
|
639
|
+
"""assert_dataset_pass calls the runnable; eval_output comes from trace."""
|
|
640
|
+
store = DatasetStore(dataset_dir=tmp_path)
|
|
641
|
+
store.create(
|
|
642
|
+
"run-ds",
|
|
643
|
+
items=[
|
|
644
|
+
Evaluable(eval_input="q1", expected_output="e1"),
|
|
645
|
+
],
|
|
646
|
+
)
|
|
647
|
+
received: list[Evaluable] = []
|
|
648
|
+
|
|
649
|
+
async def capture_eval(
|
|
650
|
+
evaluable: Evaluable,
|
|
651
|
+
*,
|
|
652
|
+
trace: list[ObservationNode] | None = None,
|
|
653
|
+
) -> Evaluation:
|
|
654
|
+
received.append(evaluable)
|
|
655
|
+
return Evaluation(score=1.0, reasoning="ok")
|
|
656
|
+
|
|
657
|
+
await assert_dataset_pass(
|
|
658
|
+
runnable=_sync_app,
|
|
659
|
+
dataset_name="run-ds",
|
|
660
|
+
evaluators=[capture_eval],
|
|
661
|
+
dataset_dir=str(tmp_path),
|
|
662
|
+
)
|
|
663
|
+
# eval_output should come from the runnable (trace), not the dataset
|
|
664
|
+
assert received[0].eval_output == "echo:q1"
|
|
665
|
+
# expected_output should still come from the dataset
|
|
666
|
+
assert received[0].expected_output == "e1"
|
|
@@ -1,126 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""Check whether the eval-driven-dev skill and pixie-qa package need updating.
|
|
3
|
-
|
|
4
|
-
Prints one of:
|
|
5
|
-
"SKILL upgrade available"
|
|
6
|
-
"Package upgrade available"
|
|
7
|
-
"SKILL and Package upgrade available"
|
|
8
|
-
"All up to date"
|
|
9
|
-
|
|
10
|
-
Exit codes:
|
|
11
|
-
0 — everything is up to date (or status could not be determined)
|
|
12
|
-
1 — at least one component needs an upgrade
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
from __future__ import annotations
|
|
16
|
-
|
|
17
|
-
import importlib.metadata
|
|
18
|
-
import json
|
|
19
|
-
import re
|
|
20
|
-
from pathlib import Path
|
|
21
|
-
from urllib.error import URLError
|
|
22
|
-
from urllib.request import urlopen
|
|
23
|
-
|
|
24
|
-
# ── Constants ────────────────────────────────────────────────────────────────
|
|
25
|
-
|
|
26
|
-
SKILL_URL = (
|
|
27
|
-
"https://raw.githubusercontent.com/yiouli/pixie-qa/"
|
|
28
|
-
"main/skills/eval-driven-dev/SKILL.md"
|
|
29
|
-
)
|
|
30
|
-
PYPI_URL = "https://pypi.org/pypi/pixie-qa/json"
|
|
31
|
-
|
|
32
|
-
# ── Helpers ──────────────────────────────────────────────────────────────────
|
|
33
|
-
|
|
34
|
-
_RE_FRONTMATTER = re.compile(r"^---\s*\n(.*?)\n---", re.DOTALL)
|
|
35
|
-
_RE_VERSION = re.compile(r"^\s+version:\s*(\S+)$", re.MULTILINE)
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
def _parse_version(text: str) -> str:
|
|
39
|
-
"""Extract metadata.version from SKILL.md YAML frontmatter."""
|
|
40
|
-
match = _RE_FRONTMATTER.search(text)
|
|
41
|
-
frontmatter = match.group(1) if match else text
|
|
42
|
-
m = _RE_VERSION.search(frontmatter)
|
|
43
|
-
return m.group(1).strip() if m else "0.0.0"
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def _normalise_version(version: str) -> tuple[int, ...]:
|
|
47
|
-
parts: list[int] = []
|
|
48
|
-
for part in version.strip().split("."):
|
|
49
|
-
try:
|
|
50
|
-
parts.append(int(part))
|
|
51
|
-
except ValueError:
|
|
52
|
-
break
|
|
53
|
-
return tuple(parts)
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
# ── Skill check ──────────────────────────────────────────────────────────────
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
def _skill_needs_upgrade() -> bool:
|
|
60
|
-
"""Return True if a newer version of the skill is available on GitHub."""
|
|
61
|
-
resource_dir = Path(__file__).resolve().parent
|
|
62
|
-
skill_path = resource_dir.parent / "SKILL.md"
|
|
63
|
-
if not skill_path.exists():
|
|
64
|
-
# SKILL.md is not on disk (e.g. prompt-based agents); skip check.
|
|
65
|
-
return False
|
|
66
|
-
local_text = skill_path.read_text(encoding="utf-8")
|
|
67
|
-
local_version = _parse_version(local_text)
|
|
68
|
-
try:
|
|
69
|
-
with urlopen(SKILL_URL, timeout=10) as resp:
|
|
70
|
-
remote_version = _parse_version(resp.read().decode("utf-8"))
|
|
71
|
-
except (OSError, URLError):
|
|
72
|
-
return False
|
|
73
|
-
return _normalise_version(remote_version) > _normalise_version(local_version)
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
# ── Package check ─────────────────────────────────────────────────────────────
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
def _is_local_install(dist: importlib.metadata.Distribution) -> bool:
|
|
80
|
-
"""Return True if pixie-qa was installed from a local path rather than PyPI."""
|
|
81
|
-
try:
|
|
82
|
-
text = dist.read_text("direct_url.json")
|
|
83
|
-
if text:
|
|
84
|
-
url: str = json.loads(text).get("url", "")
|
|
85
|
-
return url.startswith("file://")
|
|
86
|
-
except Exception:
|
|
87
|
-
pass
|
|
88
|
-
return False
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
def _package_needs_upgrade() -> bool:
|
|
92
|
-
"""Return True if pixie-qa is missing or a newer version is on PyPI."""
|
|
93
|
-
try:
|
|
94
|
-
dist = importlib.metadata.distribution("pixie-qa")
|
|
95
|
-
except importlib.metadata.PackageNotFoundError:
|
|
96
|
-
return True
|
|
97
|
-
if _is_local_install(dist):
|
|
98
|
-
return False
|
|
99
|
-
installed: str = dist.metadata["Version"]
|
|
100
|
-
try:
|
|
101
|
-
with urlopen(PYPI_URL, timeout=10) as resp:
|
|
102
|
-
latest: str = json.loads(resp.read().decode("utf-8"))["info"]["version"]
|
|
103
|
-
except (OSError, URLError, KeyError, ValueError):
|
|
104
|
-
return False
|
|
105
|
-
return _normalise_version(latest) > _normalise_version(installed)
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
# ── Entry point ───────────────────────────────────────────────────────────────
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
def main() -> int:
|
|
112
|
-
skill = _skill_needs_upgrade()
|
|
113
|
-
package = _package_needs_upgrade()
|
|
114
|
-
if skill and package:
|
|
115
|
-
print("SKILL and Package upgrade available")
|
|
116
|
-
elif skill:
|
|
117
|
-
print("SKILL upgrade available")
|
|
118
|
-
elif package:
|
|
119
|
-
print("Package upgrade available")
|
|
120
|
-
else:
|
|
121
|
-
print("All up to date")
|
|
122
|
-
return 1 if (skill or package) else 0
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
if __name__ == "__main__":
|
|
126
|
-
raise SystemExit(main())
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|