pixie-qa 0.2.2__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. pixie_qa-0.5.0/.gitignore +8 -0
  2. {pixie_qa-0.2.2 → pixie_qa-0.5.0}/PKG-INFO +41 -8
  3. pixie_qa-0.5.0/README.md +67 -0
  4. pixie_qa-0.5.0/pixie/__init__.py +90 -0
  5. pixie_qa-0.5.0/pixie/assets/mock-data.json +407 -0
  6. pixie_qa-0.5.0/pixie/assets/webui.html +64 -0
  7. pixie_qa-0.5.0/pixie/cli/__init__.py +11 -0
  8. pixie_qa-0.5.0/pixie/cli/analyze_command.py +156 -0
  9. pixie_qa-0.5.0/pixie/cli/format_command.py +223 -0
  10. pixie_qa-0.5.0/pixie/cli/init_command.py +55 -0
  11. pixie_qa-0.5.0/pixie/cli/main.py +202 -0
  12. pixie_qa-0.5.0/pixie/cli/start_command.py +43 -0
  13. pixie_qa-0.5.0/pixie/cli/test_command.py +178 -0
  14. pixie_qa-0.5.0/pixie/cli/trace_command.py +128 -0
  15. pixie_qa-0.5.0/pixie/config.py +130 -0
  16. pixie_qa-0.5.0/pixie/eval/__init__.py +143 -0
  17. pixie_qa-0.5.0/pixie/eval/evaluable.py +100 -0
  18. {pixie_qa-0.2.2/pixie/evals → pixie_qa-0.5.0/pixie/eval}/evaluation.py +15 -11
  19. {pixie_qa-0.2.2/pixie/evals → pixie_qa-0.5.0/pixie/eval}/llm_evaluator.py +35 -19
  20. pixie_qa-0.5.0/pixie/eval/rate_limiter.py +140 -0
  21. {pixie_qa-0.2.2/pixie/evals → pixie_qa-0.5.0/pixie/eval}/scorers.py +268 -123
  22. pixie_qa-0.5.0/pixie/harness/__init__.py +8 -0
  23. pixie_qa-0.5.0/pixie/harness/run_result.py +239 -0
  24. pixie_qa-0.5.0/pixie/harness/runnable.py +133 -0
  25. pixie_qa-0.5.0/pixie/harness/runner.py +813 -0
  26. pixie_qa-0.5.0/pixie/instrumentation/__init__.py +99 -0
  27. pixie_qa-0.5.0/pixie/instrumentation/llm_tracing.py +818 -0
  28. pixie_qa-0.5.0/pixie/instrumentation/wrap.py +323 -0
  29. pixie_qa-0.5.0/pixie/web/__init__.py +7 -0
  30. pixie_qa-0.5.0/pixie/web/app.py +266 -0
  31. pixie_qa-0.5.0/pixie/web/server.py +369 -0
  32. pixie_qa-0.5.0/pixie/web/watcher.py +102 -0
  33. {pixie_qa-0.2.2 → pixie_qa-0.5.0}/pyproject.toml +24 -4
  34. pixie_qa-0.2.2/.github/copilot-instructions.md +0 -632
  35. pixie_qa-0.2.2/.github/workflows/publish.yml +0 -80
  36. pixie_qa-0.2.2/.gitignore +0 -4
  37. pixie_qa-0.2.2/README.md +0 -36
  38. pixie_qa-0.2.2/changelogs/async-handler-processing.md +0 -96
  39. pixie_qa-0.2.2/changelogs/autoevals-adapters.md +0 -39
  40. pixie_qa-0.2.2/changelogs/cli-dataset-commands.md +0 -37
  41. pixie_qa-0.2.2/changelogs/dataset-management.md +0 -91
  42. pixie_qa-0.2.2/changelogs/deep-research-demo.md +0 -43
  43. pixie_qa-0.2.2/changelogs/eval-harness.md +0 -128
  44. pixie_qa-0.2.2/changelogs/expected-output-in-evals.md +0 -42
  45. pixie_qa-0.2.2/changelogs/instrumentation-module-implementation.md +0 -55
  46. pixie_qa-0.2.2/changelogs/loud-failure-mode.md +0 -58
  47. pixie_qa-0.2.2/changelogs/manual-instrumentation-usability.md +0 -56
  48. pixie_qa-0.2.2/changelogs/observation-store-implementation.md +0 -53
  49. pixie_qa-0.2.2/changelogs/observe-sensitive-field-stripping.md +0 -22
  50. pixie_qa-0.2.2/changelogs/pixie-directory-and-skill-improvements.md +0 -63
  51. pixie_qa-0.2.2/changelogs/pixie-test-e2e-suite.md +0 -69
  52. pixie_qa-0.2.2/changelogs/root-package-exports-and-trace-id.md +0 -58
  53. pixie_qa-0.2.2/changelogs/scorecard-branding-and-skill-version-check.md +0 -41
  54. pixie_qa-0.2.2/changelogs/scorecard-eval-detail-dialog.md +0 -28
  55. pixie_qa-0.2.2/changelogs/skill-v2-and-rootdir-discovery.md +0 -76
  56. pixie_qa-0.2.2/changelogs/test-scorecard.md +0 -54
  57. pixie_qa-0.2.2/changelogs/usability-utils.md +0 -60
  58. pixie_qa-0.2.2/docs/package.md +0 -233
  59. pixie_qa-0.2.2/pixie/__init__.py +0 -110
  60. pixie_qa-0.2.2/pixie/cli/__init__.py +0 -6
  61. pixie_qa-0.2.2/pixie/cli/dataset_command.py +0 -193
  62. pixie_qa-0.2.2/pixie/cli/main.py +0 -307
  63. pixie_qa-0.2.2/pixie/cli/test_command.py +0 -120
  64. pixie_qa-0.2.2/pixie/cli/trace_command.py +0 -186
  65. pixie_qa-0.2.2/pixie/config.py +0 -54
  66. pixie_qa-0.2.2/pixie/dataset/__init__.py +0 -11
  67. pixie_qa-0.2.2/pixie/dataset/models.py +0 -21
  68. pixie_qa-0.2.2/pixie/dataset/store.py +0 -212
  69. pixie_qa-0.2.2/pixie/evals/__init__.py +0 -121
  70. pixie_qa-0.2.2/pixie/evals/criteria.py +0 -77
  71. pixie_qa-0.2.2/pixie/evals/eval_utils.py +0 -358
  72. pixie_qa-0.2.2/pixie/evals/runner.py +0 -278
  73. pixie_qa-0.2.2/pixie/evals/scorecard.py +0 -916
  74. pixie_qa-0.2.2/pixie/evals/trace_capture.py +0 -70
  75. pixie_qa-0.2.2/pixie/evals/trace_helpers.py +0 -57
  76. pixie_qa-0.2.2/pixie/instrumentation/__init__.py +0 -49
  77. pixie_qa-0.2.2/pixie/instrumentation/context.py +0 -86
  78. pixie_qa-0.2.2/pixie/instrumentation/handler.py +0 -72
  79. pixie_qa-0.2.2/pixie/instrumentation/handlers.py +0 -105
  80. pixie_qa-0.2.2/pixie/instrumentation/instrumentors.py +0 -31
  81. pixie_qa-0.2.2/pixie/instrumentation/observation.py +0 -217
  82. pixie_qa-0.2.2/pixie/instrumentation/processor.py +0 -366
  83. pixie_qa-0.2.2/pixie/instrumentation/queue.py +0 -88
  84. pixie_qa-0.2.2/pixie/instrumentation/spans.py +0 -165
  85. pixie_qa-0.2.2/pixie/storage/__init__.py +0 -27
  86. pixie_qa-0.2.2/pixie/storage/evaluable.py +0 -138
  87. pixie_qa-0.2.2/pixie/storage/piccolo_conf.py +0 -10
  88. pixie_qa-0.2.2/pixie/storage/piccolo_migrations/__init__.py +0 -1
  89. pixie_qa-0.2.2/pixie/storage/serialization.py +0 -227
  90. pixie_qa-0.2.2/pixie/storage/store.py +0 -231
  91. pixie_qa-0.2.2/pixie/storage/tables.py +0 -21
  92. pixie_qa-0.2.2/pixie/storage/tree.py +0 -199
  93. pixie_qa-0.2.2/skills/eval-driven-dev/SKILL.md +0 -378
  94. pixie_qa-0.2.2/skills/eval-driven-dev/references/dataset-generation.md +0 -235
  95. pixie_qa-0.2.2/skills/eval-driven-dev/references/eval-tests.md +0 -241
  96. pixie_qa-0.2.2/skills/eval-driven-dev/references/instrumentation.md +0 -174
  97. pixie_qa-0.2.2/skills/eval-driven-dev/references/investigation.md +0 -146
  98. pixie_qa-0.2.2/skills/eval-driven-dev/references/pixie-api.md +0 -257
  99. pixie_qa-0.2.2/skills/eval-driven-dev/references/run-harness-patterns.md +0 -281
  100. pixie_qa-0.2.2/skills/eval-driven-dev/references/understanding-app.md +0 -201
  101. pixie_qa-0.2.2/specs/agent-skill-1.md +0 -25
  102. pixie_qa-0.2.2/specs/agent-skill.md +0 -71
  103. pixie_qa-0.2.2/specs/autoevals-adapters.md +0 -301
  104. pixie_qa-0.2.2/specs/dataset-management.md +0 -727
  105. pixie_qa-0.2.2/specs/evals-harness.md +0 -649
  106. pixie_qa-0.2.2/specs/expected-output-in-evals.md +0 -141
  107. pixie_qa-0.2.2/specs/instrumentation.md +0 -726
  108. pixie_qa-0.2.2/specs/manual-instrumentation-usability.md +0 -767
  109. pixie_qa-0.2.2/specs/storage.md +0 -473
  110. pixie_qa-0.2.2/specs/usability-utils.md +0 -327
  111. pixie_qa-0.2.2/tests/__init__.py +0 -0
  112. pixie_qa-0.2.2/tests/pixie/__init__.py +0 -0
  113. pixie_qa-0.2.2/tests/pixie/cli/__init__.py +0 -0
  114. pixie_qa-0.2.2/tests/pixie/cli/conftest.py +0 -15
  115. pixie_qa-0.2.2/tests/pixie/cli/e2e_cases.json +0 -183
  116. pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/conftest.py +0 -9
  117. pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json +0 -45
  118. pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/mock_evaluators.py +0 -156
  119. pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/test_customer_faq.py +0 -106
  120. pixie_qa-0.2.2/tests/pixie/cli/test_dataset_command.py +0 -412
  121. pixie_qa-0.2.2/tests/pixie/cli/test_e2e_pixie_test.py +0 -343
  122. pixie_qa-0.2.2/tests/pixie/cli/test_main.py +0 -261
  123. pixie_qa-0.2.2/tests/pixie/cli/test_trace_command.py +0 -324
  124. pixie_qa-0.2.2/tests/pixie/dataset/__init__.py +0 -0
  125. pixie_qa-0.2.2/tests/pixie/dataset/test_models.py +0 -64
  126. pixie_qa-0.2.2/tests/pixie/dataset/test_store.py +0 -222
  127. pixie_qa-0.2.2/tests/pixie/evals/__init__.py +0 -0
  128. pixie_qa-0.2.2/tests/pixie/evals/test_criteria.py +0 -116
  129. pixie_qa-0.2.2/tests/pixie/evals/test_eval_utils.py +0 -666
  130. pixie_qa-0.2.2/tests/pixie/evals/test_evaluation.py +0 -186
  131. pixie_qa-0.2.2/tests/pixie/evals/test_llm_evaluator.py +0 -235
  132. pixie_qa-0.2.2/tests/pixie/evals/test_runner.py +0 -452
  133. pixie_qa-0.2.2/tests/pixie/evals/test_scorecard.py +0 -487
  134. pixie_qa-0.2.2/tests/pixie/evals/test_scorers.py +0 -558
  135. pixie_qa-0.2.2/tests/pixie/evals/test_trace_capture.py +0 -205
  136. pixie_qa-0.2.2/tests/pixie/evals/test_trace_helpers.py +0 -154
  137. pixie_qa-0.2.2/tests/pixie/instrumentation/__init__.py +0 -0
  138. pixie_qa-0.2.2/tests/pixie/instrumentation/conftest.py +0 -35
  139. pixie_qa-0.2.2/tests/pixie/instrumentation/test_context.py +0 -157
  140. pixie_qa-0.2.2/tests/pixie/instrumentation/test_handler.py +0 -192
  141. pixie_qa-0.2.2/tests/pixie/instrumentation/test_integration.py +0 -208
  142. pixie_qa-0.2.2/tests/pixie/instrumentation/test_observation.py +0 -196
  143. pixie_qa-0.2.2/tests/pixie/instrumentation/test_processor.py +0 -560
  144. pixie_qa-0.2.2/tests/pixie/instrumentation/test_queue.py +0 -223
  145. pixie_qa-0.2.2/tests/pixie/instrumentation/test_spans.py +0 -254
  146. pixie_qa-0.2.2/tests/pixie/instrumentation/test_storage_handler.py +0 -108
  147. pixie_qa-0.2.2/tests/pixie/observation_store/__init__.py +0 -0
  148. pixie_qa-0.2.2/tests/pixie/observation_store/conftest.py +0 -231
  149. pixie_qa-0.2.2/tests/pixie/observation_store/test_evaluable.py +0 -191
  150. pixie_qa-0.2.2/tests/pixie/observation_store/test_serialization.py +0 -156
  151. pixie_qa-0.2.2/tests/pixie/observation_store/test_store.py +0 -289
  152. pixie_qa-0.2.2/tests/pixie/observation_store/test_tree.py +0 -248
  153. pixie_qa-0.2.2/tests/pixie/test_config.py +0 -73
  154. pixie_qa-0.2.2/tests/pixie/test_init.py +0 -157
  155. {pixie_qa-0.2.2 → pixie_qa-0.5.0}/LICENSE +0 -0
  156. {pixie_qa-0.2.2 → pixie_qa-0.5.0}/pixie/favicon.png +0 -0
@@ -0,0 +1,8 @@
1
+ .claude
2
+ .agents
3
+ __pycache__
4
+ pixie_qa/
5
+ uv.lock
6
+ pixie/assets/index.html
7
+ frontend/node_modules/
8
+ frontend/dist/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pixie-qa
3
- Version: 0.2.2
3
+ Version: 0.5.0
4
4
  Summary: Automated quality assurance for AI applications
5
5
  Project-URL: Homepage, https://github.com/yiouli/pixie-qa
6
6
  Project-URL: Repository, https://github.com/yiouli/pixie-qa
@@ -44,9 +44,11 @@ Requires-Dist: openai>=2.29.0
44
44
  Requires-Dist: openinference-instrumentation>=0.1.44
45
45
  Requires-Dist: opentelemetry-api>=1.27.0
46
46
  Requires-Dist: opentelemetry-sdk>=1.27.0
47
- Requires-Dist: piccolo[sqlite]>=1.33.0
48
47
  Requires-Dist: pydantic>=2.0
49
48
  Requires-Dist: python-dotenv>=1.2.2
49
+ Requires-Dist: starlette>=1.0.0
50
+ Requires-Dist: uvicorn>=0.42.0
51
+ Requires-Dist: watchfiles>=1.1.1
50
52
  Provides-Extra: all
51
53
  Requires-Dist: openinference-instrumentation-anthropic; extra == 'all'
52
54
  Requires-Dist: openinference-instrumentation-dspy; extra == 'all'
@@ -67,18 +69,19 @@ Description-Content-Type: text/markdown
67
69
 
68
70
  # pixie-qa
69
71
 
70
- An agent skill that make coding agent the QA engineer for LLM applications.
72
+ An agent skill that makes coding agents the QA engineer for LLM applications.
71
73
 
72
74
  ## What the Skill Does
73
75
 
74
76
  The `qa-eval` skill guides your coding agent through the full eval-based QA loop for LLM applications:
75
77
 
76
78
  1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
77
- 2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
78
- 3. **Build a dataset** — save representative traces as test cases with `pixie dataset save`
79
- 4. **Write eval tests** — generate `test_*.py` files with `assert_dataset_pass` and appropriate evaluators
79
+ 2. **Instrument it** — use `wrap()` for data-object tracing and OpenInference auto-instrumentation for LLM span capture
80
+ 3. **Build a dataset** — create JSON datasets of representative inputs and expected outputs
81
+ 4. **Write eval tests** — generate `test_*.py` files with appropriate evaluators
80
82
  5. **Run the tests** — `pixie test` to run all evals and report per-case scores
81
- 6. **Investigate failures** — look up the stored trace for each failure, diagnose, fix, repeat
83
+ 6. **Analyse results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
84
+ 7. **Investigate failures** — diagnose failures, fix, repeat
82
85
 
83
86
  ## Getting Started
84
87
 
@@ -100,4 +103,34 @@ Your coding agent will read your code, instrument it, build a dataset from a few
100
103
 
101
104
  ## Python Package
102
105
 
103
- The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. For the package API and CLI reference, see [docs/package.md](docs/package.md).
106
+ The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. API docs are auto-generated by `pdoc3` into [docs/pixie/index.md](docs/pixie/index.md) via pre-commit. The markdown renderer uses [scripts/pdoc_templates/text.mako](scripts/pdoc_templates/text.mako) so async functions and methods are explicitly shown as `async def` in signatures.
107
+
108
+ Install hooks once per clone:
109
+
110
+ ```bash
111
+ uv run pre-commit install
112
+ ```
113
+
114
+ ## Web UI
115
+
116
+ View all eval artifacts (results, markdown docs, datasets, and legacy scorecards) in a live-updating local web UI:
117
+
118
+ ```bash
119
+ pixie start # initializes pixie_qa/ (if needed) and opens http://localhost:7118
120
+ pixie start my_dir # use a custom artifact root
121
+ pixie init # scaffolds pixie_qa/ without starting the server
122
+ ```
123
+
124
+ The web UI provides tabbed navigation for results, scorecards (legacy), datasets, and markdown files. Changes to artifacts are pushed to the browser in real time via SSE.
125
+
126
+ The server writes a `server.lock` file to the artifact root directory on startup (containing the port number) and removes it on shutdown, allowing other processes to discover whether the server is already running.
127
+
128
+ ## Configuration
129
+
130
+ Pixie reads configuration from environment variables and a local `.env` file through a single central config layer. Existing process env vars win over `.env` values.
131
+
132
+ Useful settings include:
133
+
134
+ - `PIXIE_ROOT` to move all generated artifacts under a different root directory
135
+ - `PIXIE_RATE_LIMIT_ENABLED=true` to enable evaluator throttling for `pixie test`
136
+ - `PIXIE_RATE_LIMIT_RPS`, `PIXIE_RATE_LIMIT_RPM`, `PIXIE_RATE_LIMIT_TPS`, and `PIXIE_RATE_LIMIT_TPM` to tune request and token throughput for LLM-as-judge evaluators
@@ -0,0 +1,67 @@
1
+ # pixie-qa
2
+
3
+ An agent skill that makes coding agents the QA engineer for LLM applications.
4
+
5
+ ## What the Skill Does
6
+
7
+ The `qa-eval` skill guides your coding agent through the full eval-based QA loop for LLM applications:
8
+
9
+ 1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
10
+ 2. **Instrument it** — use `wrap()` for data-object tracing and OpenInference auto-instrumentation for LLM span capture
11
+ 3. **Build a dataset** — create JSON datasets of representative inputs and expected outputs
12
+ 4. **Write eval tests** — generate `test_*.py` files with appropriate evaluators
13
+ 5. **Run the tests** — `pixie test` to run all evals and report per-case scores
14
+ 6. **Analyse results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
15
+ 7. **Investigate failures** — diagnose failures, fix, repeat
16
+
17
+ ## Getting Started
18
+
19
+ ### 1. Add the skill to your coding agent
20
+
21
+ ```bash
22
+ npx skills add yiouli/pixie-qa
23
+ ```
24
+
25
+ The accompanying Python package is installed automatically by the skill the first time it is used.
26
+
27
+ ### 2. Ask coding agent to set up evals
28
+
29
+ When developing a Python-based AI project, open a conversation with your coding agent and say something like:
30
+
31
+ > "setup QA for my agent"
32
+
33
+ Your coding agent will read your code, instrument it, build a dataset from a few real runs, write and run eval-based tests, investigate failures, and fix them.
34
+
35
+ ## Python Package
36
+
37
+ The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. API docs are auto-generated by `pdoc3` into [docs/pixie/index.md](docs/pixie/index.md) via pre-commit. The markdown renderer uses [scripts/pdoc_templates/text.mako](scripts/pdoc_templates/text.mako) so async functions and methods are explicitly shown as `async def` in signatures.
38
+
39
+ Install hooks once per clone:
40
+
41
+ ```bash
42
+ uv run pre-commit install
43
+ ```
44
+
45
+ ## Web UI
46
+
47
+ View all eval artifacts (results, markdown docs, datasets, and legacy scorecards) in a live-updating local web UI:
48
+
49
+ ```bash
50
+ pixie start # initializes pixie_qa/ (if needed) and opens http://localhost:7118
51
+ pixie start my_dir # use a custom artifact root
52
+ pixie init # scaffolds pixie_qa/ without starting the server
53
+ ```
54
+
55
+ The web UI provides tabbed navigation for results, scorecards (legacy), datasets, and markdown files. Changes to artifacts are pushed to the browser in real time via SSE.
56
+
57
+ The server writes a `server.lock` file to the artifact root directory on startup (containing the port number) and removes it on shutdown, allowing other processes to discover whether the server is already running.
58
+
59
+ ## Configuration
60
+
61
+ Pixie reads configuration from environment variables and a local `.env` file through a single central config layer. Existing process env vars win over `.env` values.
62
+
63
+ Useful settings include:
64
+
65
+ - `PIXIE_ROOT` to move all generated artifacts under a different root directory
66
+ - `PIXIE_RATE_LIMIT_ENABLED=true` to enable evaluator throttling for `pixie test`
67
+ - `PIXIE_RATE_LIMIT_RPS`, `PIXIE_RATE_LIMIT_RPM`, `PIXIE_RATE_LIMIT_TPS`, and `PIXIE_RATE_LIMIT_TPM` to tune request and token throughput for LLM-as-judge evaluators
@@ -0,0 +1,90 @@
1
+ """pixie — automated quality assurance for AI applications.
2
+
3
+ Re-exports the full public API so users can ``from pixie import ...``
4
+ for every commonly used symbol without needing submodule paths.
5
+ """
6
+
7
+ from pixie.eval.evaluable import Evaluable, TestCase
8
+ from pixie.eval.evaluation import Evaluation, Evaluator, evaluate
9
+ from pixie.eval.llm_evaluator import create_llm_evaluator
10
+ from pixie.eval.scorers import (
11
+ AnswerCorrectness,
12
+ AnswerRelevancy,
13
+ AutoevalsAdapter,
14
+ Battle,
15
+ ClosedQA,
16
+ ContextRelevancy,
17
+ EmbeddingSimilarity,
18
+ ExactMatch,
19
+ Factuality,
20
+ Faithfulness,
21
+ Humor,
22
+ JSONDiff,
23
+ LevenshteinMatch,
24
+ ListContains,
25
+ Moderation,
26
+ NumericDiff,
27
+ Possible,
28
+ Security,
29
+ Sql,
30
+ Summary,
31
+ Translation,
32
+ ValidJSON,
33
+ )
34
+
35
+ # -- Harness ------------------------------------------------------------------
36
+ from pixie.harness.runnable import Runnable
37
+
38
+ # -- Instrumentation ----------------------------------------------------------
39
+ from pixie.instrumentation.llm_tracing import (
40
+ add_handler,
41
+ enable_llm_tracing,
42
+ flush,
43
+ remove_handler,
44
+ )
45
+ from pixie.instrumentation.wrap import (
46
+ WrappedData,
47
+ wrap,
48
+ )
49
+
50
+ __all__ = [
51
+ # Instrumentation
52
+ "WrappedData",
53
+ "flush",
54
+ "enable_llm_tracing",
55
+ "add_handler",
56
+ "remove_handler",
57
+ "wrap",
58
+ # Harness
59
+ "Runnable",
60
+ # Eval data models
61
+ "Evaluable",
62
+ "TestCase",
63
+ "Evaluation",
64
+ "Evaluator",
65
+ "evaluate",
66
+ "create_llm_evaluator",
67
+ # Pre-made evaluators (autoevals adapters)
68
+ "AnswerCorrectness",
69
+ "AnswerRelevancy",
70
+ "AutoevalsAdapter",
71
+ "Battle",
72
+ "ClosedQA",
73
+ "ContextRelevancy",
74
+ "EmbeddingSimilarity",
75
+ "ExactMatch",
76
+ "Factuality",
77
+ "Faithfulness",
78
+ "Humor",
79
+ "JSONDiff",
80
+ "LevenshteinMatch",
81
+ "ListContains",
82
+ "Moderation",
83
+ "NumericDiff",
84
+ "Possible",
85
+ "Security",
86
+ "Sql",
87
+ "Summary",
88
+ "Translation",
89
+ "ValidJSON",
90
+ ]
@@ -0,0 +1,407 @@
1
+ {
2
+ "command_args": "pixie test tests/",
3
+ "timestamp": "2025-06-15 12:00:00 UTC",
4
+ "pixie_repo_url": "https://github.com/yiouli/pixie-qa",
5
+ "feedback_url": "https://feedback.gopixie.ai/feedback",
6
+ "brand_icon_url": "https://github.com/user-attachments/assets/76c18199-f00a-4fb3-a12f-ce6c173727af",
7
+ "test_records": [
8
+ {
9
+ "name": "test_customer_faq.py::test_faq_factuality",
10
+ "status": "passed",
11
+ "message": null,
12
+ "asserts": [
13
+ {
14
+ "evaluator_names": [
15
+ "MockFactuality"
16
+ ],
17
+ "input_labels": [
18
+ "What is your return policy?",
19
+ "How do I track my order?",
20
+ "Do you offer international shipping?",
21
+ "What payment methods do you accept?",
22
+ "How can I contact support?"
23
+ ],
24
+ "results": [
25
+ [
26
+ [
27
+ {
28
+ "score": 0.85,
29
+ "reasoning": "High string similarity between expected and actual output.",
30
+ "details": {}
31
+ }
32
+ ],
33
+ [
34
+ {
35
+ "score": 0.72,
36
+ "reasoning": "Moderate string similarity.",
37
+ "details": {}
38
+ }
39
+ ],
40
+ [
41
+ {
42
+ "score": 0.91,
43
+ "reasoning": "Very high similarity match.",
44
+ "details": {}
45
+ }
46
+ ],
47
+ [
48
+ {
49
+ "score": 0.68,
50
+ "reasoning": "Reasonable similarity.",
51
+ "details": {}
52
+ }
53
+ ],
54
+ [
55
+ {
56
+ "score": 0.77,
57
+ "reasoning": "Good similarity match.",
58
+ "details": {}
59
+ }
60
+ ]
61
+ ]
62
+ ],
63
+ "passed": true,
64
+ "criteria_message": "Pass: 5/5 inputs passed (threshold 0.6, required 80%)",
65
+ "scoring_strategy": "Each evaluator score must be ≥ 0.6. At least 80% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
66
+ "evaluable_dicts": [
67
+ {
68
+ "input": "What is your return policy?",
69
+ "expected_output": "You can return items within 30 days of purchase for a full refund.",
70
+ "actual_output": "Items can be returned within 30 days for a full refund.",
71
+ "metadata": {}
72
+ },
73
+ {
74
+ "input": "How do I track my order?",
75
+ "expected_output": "You can track your order using the tracking link sent to your email.",
76
+ "actual_output": "Check the tracking link in your confirmation email to track your order.",
77
+ "metadata": {}
78
+ },
79
+ {
80
+ "input": "Do you offer international shipping?",
81
+ "expected_output": "Yes, we ship to over 50 countries worldwide.",
82
+ "actual_output": "Yes, we offer international shipping to over 50 countries.",
83
+ "metadata": {}
84
+ },
85
+ {
86
+ "input": "What payment methods do you accept?",
87
+ "expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
88
+ "actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
89
+ "metadata": {}
90
+ },
91
+ {
92
+ "input": "How can I contact support?",
93
+ "expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
94
+ "actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
95
+ "metadata": {}
96
+ }
97
+ ]
98
+ }
99
+ ]
100
+ },
101
+ {
102
+ "name": "test_customer_faq.py::test_faq_multi_evaluator",
103
+ "status": "failed",
104
+ "message": "AssertionError: 3/5 inputs failed on at least one evaluator",
105
+ "asserts": [
106
+ {
107
+ "evaluator_names": [
108
+ "MockFactuality",
109
+ "MockClosedQA"
110
+ ],
111
+ "input_labels": [
112
+ "What is your return policy?",
113
+ "How do I track my order?",
114
+ "Do you offer international shipping?",
115
+ "What payment methods do you accept?",
116
+ "How can I contact support?"
117
+ ],
118
+ "results": [
119
+ [
120
+ [
121
+ {
122
+ "score": 0.85,
123
+ "reasoning": "High similarity.",
124
+ "details": {}
125
+ },
126
+ {
127
+ "score": 0.3,
128
+ "reasoning": "Low keyword overlap.",
129
+ "details": {}
130
+ }
131
+ ],
132
+ [
133
+ {
134
+ "score": 0.72,
135
+ "reasoning": "Moderate similarity.",
136
+ "details": {}
137
+ },
138
+ {
139
+ "score": 0.45,
140
+ "reasoning": "Below threshold keyword overlap.",
141
+ "details": {}
142
+ }
143
+ ],
144
+ [
145
+ {
146
+ "score": 0.91,
147
+ "reasoning": "Very high similarity.",
148
+ "details": {}
149
+ },
150
+ {
151
+ "score": 0.6,
152
+ "reasoning": "Acceptable keyword overlap.",
153
+ "details": {}
154
+ }
155
+ ],
156
+ [
157
+ {
158
+ "score": 0.68,
159
+ "reasoning": "Reasonable similarity.",
160
+ "details": {}
161
+ },
162
+ {
163
+ "score": 0.25,
164
+ "reasoning": "Poor keyword match.",
165
+ "details": {}
166
+ }
167
+ ],
168
+ [
169
+ {
170
+ "score": 0.77,
171
+ "reasoning": "Good similarity.",
172
+ "details": {}
173
+ },
174
+ {
175
+ "score": 0.55,
176
+ "reasoning": "Marginal keyword overlap.",
177
+ "details": {}
178
+ }
179
+ ]
180
+ ]
181
+ ],
182
+ "passed": false,
183
+ "criteria_message": "Fail: only 2/5 inputs passed on all evaluators (required 100%)",
184
+ "scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
185
+ "evaluable_dicts": [
186
+ {
187
+ "input": "What is your return policy?",
188
+ "expected_output": "You can return items within 30 days of purchase for a full refund.",
189
+ "actual_output": "Items can be returned within 30 days for a full refund.",
190
+ "metadata": {}
191
+ },
192
+ {
193
+ "input": "How do I track my order?",
194
+ "expected_output": "You can track your order using the tracking link sent to your email.",
195
+ "actual_output": "Check the tracking link in your confirmation email to track your order.",
196
+ "metadata": {}
197
+ },
198
+ {
199
+ "input": "Do you offer international shipping?",
200
+ "expected_output": "Yes, we ship to over 50 countries worldwide.",
201
+ "actual_output": "Yes, we offer international shipping to over 50 countries.",
202
+ "metadata": {}
203
+ },
204
+ {
205
+ "input": "What payment methods do you accept?",
206
+ "expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
207
+ "actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
208
+ "metadata": {}
209
+ },
210
+ {
211
+ "input": "How can I contact support?",
212
+ "expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
213
+ "actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
214
+ "metadata": {}
215
+ }
216
+ ]
217
+ }
218
+ ]
219
+ },
220
+ {
221
+ "name": "test_customer_faq.py::test_faq_no_hallucinations",
222
+ "status": "passed",
223
+ "message": null,
224
+ "asserts": [
225
+ {
226
+ "evaluator_names": [
227
+ "MockHallucination"
228
+ ],
229
+ "input_labels": [
230
+ "What is your return policy?",
231
+ "How do I track my order?",
232
+ "Do you offer international shipping?",
233
+ "What payment methods do you accept?",
234
+ "How can I contact support?"
235
+ ],
236
+ "results": [
237
+ [
238
+ [
239
+ {
240
+ "score": 0.95,
241
+ "reasoning": "No hallucination detected.",
242
+ "details": {}
243
+ }
244
+ ],
245
+ [
246
+ {
247
+ "score": 0.95,
248
+ "reasoning": "No hallucination detected.",
249
+ "details": {}
250
+ }
251
+ ],
252
+ [
253
+ {
254
+ "score": 0.95,
255
+ "reasoning": "No hallucination detected.",
256
+ "details": {}
257
+ }
258
+ ],
259
+ [
260
+ {
261
+ "score": 0.95,
262
+ "reasoning": "No hallucination detected.",
263
+ "details": {}
264
+ }
265
+ ],
266
+ [
267
+ {
268
+ "score": 0.95,
269
+ "reasoning": "No hallucination detected.",
270
+ "details": {}
271
+ }
272
+ ]
273
+ ]
274
+ ],
275
+ "passed": true,
276
+ "criteria_message": "Pass: 5/5 inputs passed (threshold 0.5, required 100%)",
277
+ "scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
278
+ "evaluable_dicts": [
279
+ {
280
+ "input": "What is your return policy?",
281
+ "expected_output": "You can return items within 30 days of purchase for a full refund.",
282
+ "actual_output": "Items can be returned within 30 days for a full refund.",
283
+ "metadata": {}
284
+ },
285
+ {
286
+ "input": "How do I track my order?",
287
+ "expected_output": "You can track your order using the tracking link sent to your email.",
288
+ "actual_output": "Check the tracking link in your confirmation email to track your order.",
289
+ "metadata": {}
290
+ },
291
+ {
292
+ "input": "Do you offer international shipping?",
293
+ "expected_output": "Yes, we ship to over 50 countries worldwide.",
294
+ "actual_output": "Yes, we offer international shipping to over 50 countries.",
295
+ "metadata": {}
296
+ },
297
+ {
298
+ "input": "What payment methods do you accept?",
299
+ "expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
300
+ "actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
301
+ "metadata": {}
302
+ },
303
+ {
304
+ "input": "How can I contact support?",
305
+ "expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
306
+ "actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
307
+ "metadata": {}
308
+ }
309
+ ]
310
+ }
311
+ ]
312
+ },
313
+ {
314
+ "name": "test_customer_faq.py::test_faq_tone_check",
315
+ "status": "failed",
316
+ "message": "AssertionError: all 5 inputs failed",
317
+ "asserts": [
318
+ {
319
+ "evaluator_names": [
320
+ "MockStrictTone"
321
+ ],
322
+ "input_labels": [
323
+ "What is your return policy?",
324
+ "How do I track my order?",
325
+ "Do you offer international shipping?",
326
+ "What payment methods do you accept?",
327
+ "How can I contact support?"
328
+ ],
329
+ "results": [
330
+ [
331
+ [
332
+ {
333
+ "score": 0.2,
334
+ "reasoning": "Tone does not meet strict requirements.",
335
+ "details": {}
336
+ }
337
+ ],
338
+ [
339
+ {
340
+ "score": 0.2,
341
+ "reasoning": "Tone does not meet strict requirements.",
342
+ "details": {}
343
+ }
344
+ ],
345
+ [
346
+ {
347
+ "score": 0.2,
348
+ "reasoning": "Tone does not meet strict requirements.",
349
+ "details": {}
350
+ }
351
+ ],
352
+ [
353
+ {
354
+ "score": 0.2,
355
+ "reasoning": "Tone does not meet strict requirements.",
356
+ "details": {}
357
+ }
358
+ ],
359
+ [
360
+ {
361
+ "score": 0.2,
362
+ "reasoning": "Tone does not meet strict requirements.",
363
+ "details": {}
364
+ }
365
+ ]
366
+ ]
367
+ ],
368
+ "passed": false,
369
+ "criteria_message": "Fail: 0/5 inputs passed (threshold 0.5, required 100%)",
370
+ "scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
371
+ "evaluable_dicts": [
372
+ {
373
+ "input": "What is your return policy?",
374
+ "expected_output": "You can return items within 30 days of purchase for a full refund.",
375
+ "actual_output": "Items can be returned within 30 days for a full refund.",
376
+ "metadata": {}
377
+ },
378
+ {
379
+ "input": "How do I track my order?",
380
+ "expected_output": "You can track your order using the tracking link sent to your email.",
381
+ "actual_output": "Check the tracking link in your confirmation email to track your order.",
382
+ "metadata": {}
383
+ },
384
+ {
385
+ "input": "Do you offer international shipping?",
386
+ "expected_output": "Yes, we ship to over 50 countries worldwide.",
387
+ "actual_output": "Yes, we offer international shipping to over 50 countries.",
388
+ "metadata": {}
389
+ },
390
+ {
391
+ "input": "What payment methods do you accept?",
392
+ "expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
393
+ "actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
394
+ "metadata": {}
395
+ },
396
+ {
397
+ "input": "How can I contact support?",
398
+ "expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
399
+ "actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
400
+ "metadata": {}
401
+ }
402
+ ]
403
+ }
404
+ ]
405
+ }
406
+ ]
407
+ }