pixie-qa 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/PKG-INFO +1 -1
  2. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/eval_utils.py +44 -12
  3. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pyproject.toml +1 -1
  4. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/skills/eval-driven-dev/SKILL.md +20 -3
  5. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/skills/eval-driven-dev/references/eval-tests.md +1 -0
  6. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/skills/eval-driven-dev/references/run-harness-patterns.md +6 -7
  7. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/skills/eval-driven-dev/references/understanding-app.md +4 -4
  8. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_eval_utils.py +135 -0
  9. pixie_qa-0.2.0/skills/eval-driven-dev/resources/check_version.py +0 -126
  10. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/.github/copilot-instructions.md +0 -0
  11. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/.github/workflows/publish.yml +0 -0
  12. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/.gitignore +0 -0
  13. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/LICENSE +0 -0
  14. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/README.md +0 -0
  15. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/async-handler-processing.md +0 -0
  16. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/autoevals-adapters.md +0 -0
  17. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/cli-dataset-commands.md +0 -0
  18. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/dataset-management.md +0 -0
  19. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/deep-research-demo.md +0 -0
  20. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/eval-harness.md +0 -0
  21. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/expected-output-in-evals.md +0 -0
  22. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/instrumentation-module-implementation.md +0 -0
  23. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/loud-failure-mode.md +0 -0
  24. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/manual-instrumentation-usability.md +0 -0
  25. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/observation-store-implementation.md +0 -0
  26. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/observe-sensitive-field-stripping.md +0 -0
  27. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/pixie-directory-and-skill-improvements.md +0 -0
  28. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/pixie-test-e2e-suite.md +0 -0
  29. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/root-package-exports-and-trace-id.md +0 -0
  30. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/scorecard-branding-and-skill-version-check.md +0 -0
  31. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/scorecard-eval-detail-dialog.md +0 -0
  32. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/skill-v2-and-rootdir-discovery.md +0 -0
  33. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/test-scorecard.md +0 -0
  34. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/changelogs/usability-utils.md +0 -0
  35. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/docs/package.md +0 -0
  36. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/__init__.py +0 -0
  37. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/cli/__init__.py +0 -0
  38. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/cli/dataset_command.py +0 -0
  39. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/cli/main.py +0 -0
  40. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/cli/test_command.py +0 -0
  41. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/cli/trace_command.py +0 -0
  42. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/config.py +0 -0
  43. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/dataset/__init__.py +0 -0
  44. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/dataset/models.py +0 -0
  45. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/dataset/store.py +0 -0
  46. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/__init__.py +0 -0
  47. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/criteria.py +0 -0
  48. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/evaluation.py +0 -0
  49. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/llm_evaluator.py +0 -0
  50. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/runner.py +0 -0
  51. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/scorecard.py +0 -0
  52. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/scorers.py +0 -0
  53. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/trace_capture.py +0 -0
  54. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/evals/trace_helpers.py +0 -0
  55. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/favicon.png +0 -0
  56. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/__init__.py +0 -0
  57. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/context.py +0 -0
  58. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/handler.py +0 -0
  59. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/handlers.py +0 -0
  60. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/instrumentors.py +0 -0
  61. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/observation.py +0 -0
  62. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/processor.py +0 -0
  63. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/queue.py +0 -0
  64. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/instrumentation/spans.py +0 -0
  65. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/storage/__init__.py +0 -0
  66. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/storage/evaluable.py +0 -0
  67. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/storage/piccolo_conf.py +0 -0
  68. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/storage/piccolo_migrations/__init__.py +0 -0
  69. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/storage/serialization.py +0 -0
  70. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/storage/store.py +0 -0
  71. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/storage/tables.py +0 -0
  72. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/pixie/storage/tree.py +0 -0
  73. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/skills/eval-driven-dev/references/dataset-generation.md +0 -0
  74. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/skills/eval-driven-dev/references/instrumentation.md +0 -0
  75. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/skills/eval-driven-dev/references/investigation.md +0 -0
  76. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/skills/eval-driven-dev/references/pixie-api.md +0 -0
  77. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/agent-skill-1.md +0 -0
  78. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/agent-skill.md +0 -0
  79. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/autoevals-adapters.md +0 -0
  80. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/dataset-management.md +0 -0
  81. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/evals-harness.md +0 -0
  82. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/expected-output-in-evals.md +0 -0
  83. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/instrumentation.md +0 -0
  84. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/manual-instrumentation-usability.md +0 -0
  85. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/storage.md +0 -0
  86. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/specs/usability-utils.md +0 -0
  87. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/__init__.py +0 -0
  88. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/__init__.py +0 -0
  89. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/__init__.py +0 -0
  90. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/e2e_cases.json +0 -0
  91. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/e2e_fixtures/conftest.py +0 -0
  92. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json +0 -0
  93. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/e2e_fixtures/mock_evaluators.py +0 -0
  94. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/e2e_fixtures/test_customer_faq.py +0 -0
  95. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/test_dataset_command.py +0 -0
  96. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/test_e2e_pixie_test.py +0 -0
  97. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/test_main.py +0 -0
  98. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/cli/test_trace_command.py +0 -0
  99. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/dataset/__init__.py +0 -0
  100. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/dataset/test_models.py +0 -0
  101. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/dataset/test_store.py +0 -0
  102. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/__init__.py +0 -0
  103. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_criteria.py +0 -0
  104. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_evaluation.py +0 -0
  105. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_llm_evaluator.py +0 -0
  106. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_runner.py +0 -0
  107. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_scorecard.py +0 -0
  108. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_scorers.py +0 -0
  109. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_trace_capture.py +0 -0
  110. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/evals/test_trace_helpers.py +0 -0
  111. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/__init__.py +0 -0
  112. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/conftest.py +0 -0
  113. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/test_context.py +0 -0
  114. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/test_handler.py +0 -0
  115. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/test_integration.py +0 -0
  116. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/test_observation.py +0 -0
  117. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/test_processor.py +0 -0
  118. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/test_queue.py +0 -0
  119. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/test_spans.py +0 -0
  120. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/instrumentation/test_storage_handler.py +0 -0
  121. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/observation_store/__init__.py +0 -0
  122. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/observation_store/conftest.py +0 -0
  123. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/observation_store/test_evaluable.py +0 -0
  124. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/observation_store/test_serialization.py +0 -0
  125. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/observation_store/test_store.py +0 -0
  126. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/observation_store/test_tree.py +0 -0
  127. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/test_config.py +0 -0
  128. {pixie_qa-0.2.0 → pixie_qa-0.2.1}/tests/pixie/test_init.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pixie-qa
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: Automated quality assurance for AI applications
5
5
  Project-URL: Homepage, https://github.com/yiouli/pixie-qa
6
6
  Project-URL: Repository, https://github.com/yiouli/pixie-qa
@@ -81,16 +81,27 @@ def _publish_to_scorecard(
81
81
  "expected_output": (
82
82
  None
83
83
  if isinstance(ev.expected_output, _Unset)
84
- else (str(ev.expected_output) if ev.expected_output is not None else None)
84
+ else (
85
+ str(ev.expected_output)
86
+ if ev.expected_output is not None
87
+ else None
88
+ )
89
+ ),
90
+ "actual_output": (
91
+ str(ev.eval_output) if ev.eval_output is not None else None
85
92
  ),
86
- "actual_output": str(ev.eval_output) if ev.eval_output is not None else None,
87
93
  "metadata": ev.eval_metadata,
88
94
  }
89
95
  for ev in evaluables
90
96
  )
91
97
  else:
92
98
  ev_dicts = tuple(
93
- {"input": str(inp), "expected_output": None, "actual_output": None, "metadata": None}
99
+ {
100
+ "input": str(inp),
101
+ "expected_output": None,
102
+ "actual_output": None,
103
+ "metadata": None,
104
+ }
94
105
  for inp in eval_inputs
95
106
  )
96
107
 
@@ -200,17 +211,23 @@ async def assert_pass(
200
211
  If the pass criteria are not met, raises :class:`EvalAssertionError`
201
212
  carrying the tensor.
202
213
 
203
- When ``evaluables`` is provided, each item is used directly as the
204
- evaluable for the corresponding input (it already carries its own
205
- ``expected_output``). When ``evaluables`` is ``None``, the evaluable
206
- is constructed from the captured trace as before.
214
+ When ``evaluables`` is provided, behaviour depends on whether each
215
+ item already has ``eval_output`` populated:
216
+
217
+ - **eval_output is None** — the ``runnable`` is called via
218
+ ``run_and_evaluate`` to produce an output from traces, and
219
+ ``expected_output`` from the evaluable is merged into the result.
220
+ - **eval_output is not None** — the evaluable is used directly
221
+ (the runnable is not called for that item).
207
222
 
208
223
  Args:
209
224
  runnable: The application function to test.
210
225
  eval_inputs: List of inputs, each passed to *runnable*.
211
226
  evaluators: List of evaluator callables.
212
227
  evaluables: Optional list of ``Evaluable`` items, one per input.
213
- Must have the same length as *eval_inputs* when provided.
228
+ When provided, their ``expected_output`` is forwarded to
229
+ ``run_and_evaluate``. Must have the same length as
230
+ *eval_inputs*.
214
231
  passes: How many times to run the entire test matrix.
215
232
  pass_criteria: Receives the results tensor, returns
216
233
  ``(passed, message)``. Defaults to "every score >= 0.5".
@@ -234,11 +251,26 @@ async def assert_pass(
234
251
  pass_results: list[list[Evaluation]] = []
235
252
  for idx, inp in enumerate(eval_inputs):
236
253
  if evaluables is not None:
237
- # Use provided evaluable directly — skip trace capture
238
254
  ev_item = evaluables[idx]
239
- eval_coros = [
240
- evaluate(evaluator=ev, evaluable=ev_item) for ev in evaluators
241
- ]
255
+ if ev_item.eval_output is None:
256
+ # eval_output not yet computed — run the runnable to
257
+ # produce it via trace capture, and merge the dataset
258
+ # item's expected_output into the result.
259
+ eval_coros = [
260
+ run_and_evaluate(
261
+ evaluator=ev,
262
+ runnable=runnable,
263
+ eval_input=inp,
264
+ expected_output=ev_item.expected_output,
265
+ from_trace=from_trace,
266
+ )
267
+ for ev in evaluators
268
+ ]
269
+ else:
270
+ # eval_output already populated — evaluate directly.
271
+ eval_coros = [
272
+ evaluate(evaluator=ev, evaluable=ev_item) for ev in evaluators
273
+ ]
242
274
  else:
243
275
  eval_coros = [
244
276
  run_and_evaluate(
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "pixie-qa"
3
- version = "0.2.0"
3
+ version = "0.2.1"
4
4
  description = "Automated quality assurance for AI applications"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -27,11 +27,28 @@ This skill is about doing the work, not describing it. Read code, edit files, ru
27
27
 
28
28
  ## Before you start
29
29
 
30
+ Run the following to keep the skill and package up to date. If any command fails or is blocked by the environment, continue — do not let failures here block the rest of the workflow.
31
+
32
+ **Update the skill:**
33
+
30
34
  ```bash
31
- python resources/check_version.py
35
+ npx skills update
32
36
  ```
33
37
 
34
- If the script reports updates needed, run the upgrade commands it suggests. If it says "All up to date" or fails, continue.
38
+ **Upgrade the `pixie-qa` package:**
39
+
40
+ Make sure the python virtual environment is active and use the project's package manager:
41
+
42
+ ```bash
43
+ # uv project (uv.lock exists):
44
+ uv add pixie-qa --upgrade
45
+
46
+ # poetry project (poetry.lock exists):
47
+ poetry add pixie-qa@latest
48
+
49
+ # pip / no lock file:
50
+ pip install --upgrade pixie-qa
51
+ ```
35
52
 
36
53
  ---
37
54
 
@@ -213,8 +230,8 @@ Each dataset item contains:
213
230
 
214
231
  - `eval_input`: the made-up input data (app input + external dependency data)
215
232
  - `expected_output`: case-specific expectation text (optional — only for test cases with expectations beyond the universal criteria). This is a reference for evaluation, not an exact expected answer.
216
- - `eval_output`: set to `"UNSET"` — produced at test time by the utility function from Step 3
217
233
 
234
+ At test time, `eval_output` is produced by the utility function from Step 3 and is not stored in the dataset itself.
218
235
  Read `references/dataset-generation.md` for the dataset creation API, data shape matching, expected_output strategy, and validation checklist.
219
236
 
220
237
  #### 4c. Validate the dataset
@@ -81,6 +81,7 @@ concise_voice_style = create_llm_evaluator(
81
81
  **How template variables work**: `{eval_input}`, `{eval_output}`, `{expected_output}` are the only placeholders. Each is replaced with a string representation of the corresponding `Evaluable` field — if the field is a dict or list, it becomes a JSON string. The LLM judge sees the full serialized value.
82
82
 
83
83
  **Rules**:
84
+
84
85
  - **Only `{eval_input}`, `{eval_output}`, `{expected_output}`** — no nested access like `{eval_input[key]}` (this will crash with a `TypeError`)
85
86
  - **Keep templates short and direct** — the system prompt already tells the LLM to return `Score: X.X`. Your template just needs to present the data and define the scoring criteria.
86
87
  - **Don't instruct the LLM to "parse" or "extract" data** — just present the values and state the criteria. The LLM can read JSON naturally.
@@ -8,11 +8,11 @@ For `enable_storage()` and `observe` API details, see `references/pixie-api.md`
8
8
 
9
9
  Look at how a real user or client invokes the app, and do the same thing in your utility function:
10
10
 
11
- | App type | Entry point example | How to invoke it |
12
- | --- | --- | --- |
13
- | **Web server** (FastAPI, Flask) | HTTP/WebSocket endpoint | `TestClient`, `httpx`, or subprocess + HTTP requests |
14
- | **CLI application** | Command-line invocation | `subprocess.run()` |
15
- | **Standalone function** (no server, no middleware) | Python function | Import and call directly |
11
+ | App type | Entry point example | How to invoke it |
12
+ | -------------------------------------------------- | ----------------------- | ---------------------------------------------------- |
13
+ | **Web server** (FastAPI, Flask) | HTTP/WebSocket endpoint | `TestClient`, `httpx`, or subprocess + HTTP requests |
14
+ | **CLI application** | Command-line invocation | `subprocess.run()` |
15
+ | **Standalone function** (no server, no middleware) | Python function | Import and call directly |
16
16
 
17
17
  **Do NOT call an inner function** like `agent.respond()` directly just because it's simpler. Between the entry point and that inner function, the app does request handling, state management, prompt assembly, routing — all of which is under test. When you call an inner function, you skip all of that and end up reimplementing it in your test. Now your test is testing test code, not app code.
18
18
 
@@ -137,11 +137,10 @@ from pixie_qa.scripts.mock_backends import (
137
137
  MockSynthesisBackend,
138
138
  )
139
139
 
140
- enable_storage()
141
-
142
140
  @observe
143
141
  def run_app(eval_input: dict) -> dict:
144
142
  """Run the voice agent through its real FastAPI app layer."""
143
+ enable_storage()
145
144
  # Patch external dependencies before importing the app
146
145
  with patch("myapp.app.transcription_backend", MockTranscriptionBackend()), \
147
146
  patch("myapp.app.synthesis_backend", MockSynthesisBackend()), \
@@ -142,10 +142,10 @@ These are the primary testability seams. In Step 3, you'll write mock implementa
142
142
 
143
143
  <For each external dependency, how will you replace it in the utility function (Step 3)?>
144
144
 
145
- | Dependency | Mock approach | What mock provides (IN) | What mock captures (OUT) |
146
- | --- | --- | --- | --- |
147
- | <e.g., Redis> | <mock.patch / mock class / DI> | <conversation history from eval_input> | <saved messages> |
148
- | <e.g., STT service> | <MockTranscriptionBackend> | <text from eval_input> | <n/a> |
145
+ | Dependency | Mock approach | What mock provides (IN) | What mock captures (OUT) |
146
+ | ------------------- | ------------------------------ | -------------------------------------- | ------------------------ |
147
+ | <e.g., Redis> | <mock.patch / mock class / DI> | <conversation history from eval_input> | <saved messages> |
148
+ | <e.g., STT service> | <MockTranscriptionBackend> | <text from eval_input> | <n/a> |
149
149
 
150
150
  ### Intermediate states to capture
151
151
 
@@ -438,6 +438,108 @@ class TestAssertPassEvaluables:
438
438
  )
439
439
  assert received_outputs == ["echo:hello"]
440
440
 
441
+ @pytest.mark.asyncio
442
+ async def test_evaluables_with_runnable_calls_runnable(self) -> None:
443
+ """When evaluables lack eval_output, the runnable is still called per input."""
444
+ call_count = 0
445
+
446
+ def counting_app(input: Any) -> None: # noqa: A002
447
+ nonlocal call_count
448
+ call_count += 1
449
+ with px.start_observation(input=input, name="app") as obs:
450
+ obs.set_output(f"ran:{input}")
451
+
452
+ items = [
453
+ Evaluable(eval_input="q1", expected_output="e1"),
454
+ Evaluable(eval_input="q2", expected_output="e2"),
455
+ ]
456
+ await assert_pass(
457
+ runnable=counting_app,
458
+ eval_inputs=["q1", "q2"],
459
+ evaluators=[_always_pass],
460
+ evaluables=items,
461
+ )
462
+ assert call_count == 2, "runnable should be called for each input"
463
+
464
+ @pytest.mark.asyncio
465
+ async def test_evaluables_precomputed_output_used_directly(self) -> None:
466
+ """When eval_output is already set, the evaluable is used directly."""
467
+ received: list[Evaluable] = []
468
+
469
+ async def capture_eval(
470
+ evaluable: Evaluable,
471
+ *,
472
+ trace: list[ObservationNode] | None = None,
473
+ ) -> Evaluation:
474
+ received.append(evaluable)
475
+ return Evaluation(score=1.0, reasoning="ok")
476
+
477
+ items = [
478
+ Evaluable(
479
+ eval_input="hello",
480
+ eval_output="precomputed_output",
481
+ expected_output="ref",
482
+ ),
483
+ ]
484
+ await assert_pass(
485
+ runnable=_sync_app,
486
+ eval_inputs=["hello"],
487
+ evaluators=[capture_eval],
488
+ evaluables=items,
489
+ )
490
+ # eval_output should be the pre-computed value (runnable not called)
491
+ assert received[0].eval_output == "precomputed_output"
492
+ assert received[0].expected_output == "ref"
493
+
494
+ @pytest.mark.asyncio
495
+ async def test_evaluables_none_output_runs_runnable(self) -> None:
496
+ """When eval_output is None, the runnable is called to produce it."""
497
+ received: list[Evaluable] = []
498
+
499
+ async def capture_eval(
500
+ evaluable: Evaluable,
501
+ *,
502
+ trace: list[ObservationNode] | None = None,
503
+ ) -> Evaluation:
504
+ received.append(evaluable)
505
+ return Evaluation(score=1.0, reasoning="ok")
506
+
507
+ items = [
508
+ Evaluable(eval_input="hello", expected_output="ref"),
509
+ ]
510
+ await assert_pass(
511
+ runnable=_sync_app,
512
+ eval_inputs=["hello"],
513
+ evaluators=[capture_eval],
514
+ evaluables=items,
515
+ )
516
+ # eval_output should come from the trace (runnable execution)
517
+ assert received[0].eval_output == "echo:hello"
518
+ # expected_output should come from the evaluable
519
+ assert received[0].expected_output == "ref"
520
+
521
+ @pytest.mark.asyncio
522
+ async def test_evaluables_from_trace_respected(self) -> None:
523
+ """from_trace is honoured even when evaluables are provided."""
524
+
525
+ async def check_child(
526
+ evaluable: Evaluable,
527
+ *,
528
+ trace: list[ObservationNode] | None = None,
529
+ ) -> Evaluation:
530
+ assert evaluable.eval_output == "generated"
531
+ assert evaluable.expected_output == "ref"
532
+ return Evaluation(score=1.0, reasoning="ok")
533
+
534
+ items = [Evaluable(eval_input="q1", expected_output="ref")]
535
+ await assert_pass(
536
+ runnable=_nested_app,
537
+ eval_inputs=["q1"],
538
+ evaluators=[check_child],
539
+ evaluables=items,
540
+ from_trace=lambda tree: as_evaluable(tree[0].find("generator")[0].span),
541
+ )
542
+
441
543
 
442
544
  # ── assert_dataset_pass tests ─────────────────────────────────────────────
443
545
 
@@ -529,3 +631,36 @@ class TestAssertDatasetPass:
529
631
  passes=3,
530
632
  )
531
633
  assert len(exc_info.value.results) == 3
634
+
635
+ @pytest.mark.asyncio
636
+ async def test_runnable_is_called_with_eval_output_from_trace(
637
+ self, tmp_path: Path
638
+ ) -> None:
639
+ """assert_dataset_pass calls the runnable; eval_output comes from trace."""
640
+ store = DatasetStore(dataset_dir=tmp_path)
641
+ store.create(
642
+ "run-ds",
643
+ items=[
644
+ Evaluable(eval_input="q1", expected_output="e1"),
645
+ ],
646
+ )
647
+ received: list[Evaluable] = []
648
+
649
+ async def capture_eval(
650
+ evaluable: Evaluable,
651
+ *,
652
+ trace: list[ObservationNode] | None = None,
653
+ ) -> Evaluation:
654
+ received.append(evaluable)
655
+ return Evaluation(score=1.0, reasoning="ok")
656
+
657
+ await assert_dataset_pass(
658
+ runnable=_sync_app,
659
+ dataset_name="run-ds",
660
+ evaluators=[capture_eval],
661
+ dataset_dir=str(tmp_path),
662
+ )
663
+ # eval_output should come from the runnable (trace), not the dataset
664
+ assert received[0].eval_output == "echo:q1"
665
+ # expected_output should still come from the dataset
666
+ assert received[0].expected_output == "e1"
@@ -1,126 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Check whether the eval-driven-dev skill and pixie-qa package need updating.
3
-
4
- Prints one of:
5
- "SKILL upgrade available"
6
- "Package upgrade available"
7
- "SKILL and Package upgrade available"
8
- "All up to date"
9
-
10
- Exit codes:
11
- 0 — everything is up to date (or status could not be determined)
12
- 1 — at least one component needs an upgrade
13
- """
14
-
15
- from __future__ import annotations
16
-
17
- import importlib.metadata
18
- import json
19
- import re
20
- from pathlib import Path
21
- from urllib.error import URLError
22
- from urllib.request import urlopen
23
-
24
- # ── Constants ────────────────────────────────────────────────────────────────
25
-
26
- SKILL_URL = (
27
- "https://raw.githubusercontent.com/yiouli/pixie-qa/"
28
- "main/skills/eval-driven-dev/SKILL.md"
29
- )
30
- PYPI_URL = "https://pypi.org/pypi/pixie-qa/json"
31
-
32
- # ── Helpers ──────────────────────────────────────────────────────────────────
33
-
34
- _RE_FRONTMATTER = re.compile(r"^---\s*\n(.*?)\n---", re.DOTALL)
35
- _RE_VERSION = re.compile(r"^\s+version:\s*(\S+)$", re.MULTILINE)
36
-
37
-
38
- def _parse_version(text: str) -> str:
39
- """Extract metadata.version from SKILL.md YAML frontmatter."""
40
- match = _RE_FRONTMATTER.search(text)
41
- frontmatter = match.group(1) if match else text
42
- m = _RE_VERSION.search(frontmatter)
43
- return m.group(1).strip() if m else "0.0.0"
44
-
45
-
46
- def _normalise_version(version: str) -> tuple[int, ...]:
47
- parts: list[int] = []
48
- for part in version.strip().split("."):
49
- try:
50
- parts.append(int(part))
51
- except ValueError:
52
- break
53
- return tuple(parts)
54
-
55
-
56
- # ── Skill check ──────────────────────────────────────────────────────────────
57
-
58
-
59
- def _skill_needs_upgrade() -> bool:
60
- """Return True if a newer version of the skill is available on GitHub."""
61
- resource_dir = Path(__file__).resolve().parent
62
- skill_path = resource_dir.parent / "SKILL.md"
63
- if not skill_path.exists():
64
- # SKILL.md is not on disk (e.g. prompt-based agents); skip check.
65
- return False
66
- local_text = skill_path.read_text(encoding="utf-8")
67
- local_version = _parse_version(local_text)
68
- try:
69
- with urlopen(SKILL_URL, timeout=10) as resp:
70
- remote_version = _parse_version(resp.read().decode("utf-8"))
71
- except (OSError, URLError):
72
- return False
73
- return _normalise_version(remote_version) > _normalise_version(local_version)
74
-
75
-
76
- # ── Package check ─────────────────────────────────────────────────────────────
77
-
78
-
79
- def _is_local_install(dist: importlib.metadata.Distribution) -> bool:
80
- """Return True if pixie-qa was installed from a local path rather than PyPI."""
81
- try:
82
- text = dist.read_text("direct_url.json")
83
- if text:
84
- url: str = json.loads(text).get("url", "")
85
- return url.startswith("file://")
86
- except Exception:
87
- pass
88
- return False
89
-
90
-
91
- def _package_needs_upgrade() -> bool:
92
- """Return True if pixie-qa is missing or a newer version is on PyPI."""
93
- try:
94
- dist = importlib.metadata.distribution("pixie-qa")
95
- except importlib.metadata.PackageNotFoundError:
96
- return True
97
- if _is_local_install(dist):
98
- return False
99
- installed: str = dist.metadata["Version"]
100
- try:
101
- with urlopen(PYPI_URL, timeout=10) as resp:
102
- latest: str = json.loads(resp.read().decode("utf-8"))["info"]["version"]
103
- except (OSError, URLError, KeyError, ValueError):
104
- return False
105
- return _normalise_version(latest) > _normalise_version(installed)
106
-
107
-
108
- # ── Entry point ───────────────────────────────────────────────────────────────
109
-
110
-
111
- def main() -> int:
112
- skill = _skill_needs_upgrade()
113
- package = _package_needs_upgrade()
114
- if skill and package:
115
- print("SKILL and Package upgrade available")
116
- elif skill:
117
- print("SKILL upgrade available")
118
- elif package:
119
- print("Package upgrade available")
120
- else:
121
- print("All up to date")
122
- return 1 if (skill or package) else 0
123
-
124
-
125
- if __name__ == "__main__":
126
- raise SystemExit(main())
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes