pixie-qa 0.2.2__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152) hide show
  1. pixie_qa-0.4.0/.gitignore +8 -0
  2. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/PKG-INFO +32 -3
  3. pixie_qa-0.4.0/README.md +62 -0
  4. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/__init__.py +43 -40
  5. pixie_qa-0.4.0/pixie/assets/mock-data.json +407 -0
  6. pixie_qa-0.4.0/pixie/assets/webui.html +64 -0
  7. pixie_qa-0.4.0/pixie/cli/analyze_command.py +156 -0
  8. pixie_qa-0.4.0/pixie/cli/dag_command.py +75 -0
  9. pixie_qa-0.4.0/pixie/cli/init_command.py +55 -0
  10. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/cli/main.py +161 -12
  11. pixie_qa-0.4.0/pixie/cli/start_command.py +43 -0
  12. pixie_qa-0.4.0/pixie/cli/test_command.py +257 -0
  13. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/cli/trace_command.py +108 -0
  14. pixie_qa-0.4.0/pixie/config.py +130 -0
  15. pixie_qa-0.4.0/pixie/dag/__init__.py +400 -0
  16. pixie_qa-0.4.0/pixie/dag/trace_check.py +183 -0
  17. pixie_qa-0.4.0/pixie/evals/__init__.py +184 -0
  18. pixie_qa-0.4.0/pixie/evals/criteria.py +61 -0
  19. pixie_qa-0.4.0/pixie/evals/dataset_runner.py +495 -0
  20. pixie_qa-0.4.0/pixie/evals/eval_utils.py +334 -0
  21. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/evals/evaluation.py +10 -0
  22. pixie_qa-0.4.0/pixie/evals/rate_limiter.py +140 -0
  23. pixie_qa-0.4.0/pixie/evals/scorecard.py +252 -0
  24. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/evals/scorers.py +252 -110
  25. pixie_qa-0.4.0/pixie/evals/test_result.py +239 -0
  26. pixie_qa-0.4.0/pixie/instrumentation/__init__.py +80 -0
  27. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/instrumentors.py +18 -2
  28. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/evaluable.py +2 -0
  29. pixie_qa-0.4.0/pixie/web/__init__.py +1 -0
  30. pixie_qa-0.4.0/pixie/web/app.py +255 -0
  31. pixie_qa-0.4.0/pixie/web/server.py +369 -0
  32. pixie_qa-0.4.0/pixie/web/watcher.py +99 -0
  33. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pyproject.toml +23 -2
  34. pixie_qa-0.2.2/.github/copilot-instructions.md +0 -632
  35. pixie_qa-0.2.2/.github/workflows/publish.yml +0 -80
  36. pixie_qa-0.2.2/.gitignore +0 -4
  37. pixie_qa-0.2.2/README.md +0 -36
  38. pixie_qa-0.2.2/changelogs/async-handler-processing.md +0 -96
  39. pixie_qa-0.2.2/changelogs/autoevals-adapters.md +0 -39
  40. pixie_qa-0.2.2/changelogs/cli-dataset-commands.md +0 -37
  41. pixie_qa-0.2.2/changelogs/dataset-management.md +0 -91
  42. pixie_qa-0.2.2/changelogs/deep-research-demo.md +0 -43
  43. pixie_qa-0.2.2/changelogs/eval-harness.md +0 -128
  44. pixie_qa-0.2.2/changelogs/expected-output-in-evals.md +0 -42
  45. pixie_qa-0.2.2/changelogs/instrumentation-module-implementation.md +0 -55
  46. pixie_qa-0.2.2/changelogs/loud-failure-mode.md +0 -58
  47. pixie_qa-0.2.2/changelogs/manual-instrumentation-usability.md +0 -56
  48. pixie_qa-0.2.2/changelogs/observation-store-implementation.md +0 -53
  49. pixie_qa-0.2.2/changelogs/observe-sensitive-field-stripping.md +0 -22
  50. pixie_qa-0.2.2/changelogs/pixie-directory-and-skill-improvements.md +0 -63
  51. pixie_qa-0.2.2/changelogs/pixie-test-e2e-suite.md +0 -69
  52. pixie_qa-0.2.2/changelogs/root-package-exports-and-trace-id.md +0 -58
  53. pixie_qa-0.2.2/changelogs/scorecard-branding-and-skill-version-check.md +0 -41
  54. pixie_qa-0.2.2/changelogs/scorecard-eval-detail-dialog.md +0 -28
  55. pixie_qa-0.2.2/changelogs/skill-v2-and-rootdir-discovery.md +0 -76
  56. pixie_qa-0.2.2/changelogs/test-scorecard.md +0 -54
  57. pixie_qa-0.2.2/changelogs/usability-utils.md +0 -60
  58. pixie_qa-0.2.2/docs/package.md +0 -233
  59. pixie_qa-0.2.2/pixie/cli/test_command.py +0 -120
  60. pixie_qa-0.2.2/pixie/config.py +0 -54
  61. pixie_qa-0.2.2/pixie/evals/__init__.py +0 -121
  62. pixie_qa-0.2.2/pixie/evals/criteria.py +0 -77
  63. pixie_qa-0.2.2/pixie/evals/eval_utils.py +0 -358
  64. pixie_qa-0.2.2/pixie/evals/runner.py +0 -278
  65. pixie_qa-0.2.2/pixie/evals/scorecard.py +0 -916
  66. pixie_qa-0.2.2/pixie/instrumentation/__init__.py +0 -49
  67. pixie_qa-0.2.2/skills/eval-driven-dev/SKILL.md +0 -378
  68. pixie_qa-0.2.2/skills/eval-driven-dev/references/dataset-generation.md +0 -235
  69. pixie_qa-0.2.2/skills/eval-driven-dev/references/eval-tests.md +0 -241
  70. pixie_qa-0.2.2/skills/eval-driven-dev/references/instrumentation.md +0 -174
  71. pixie_qa-0.2.2/skills/eval-driven-dev/references/investigation.md +0 -146
  72. pixie_qa-0.2.2/skills/eval-driven-dev/references/pixie-api.md +0 -257
  73. pixie_qa-0.2.2/skills/eval-driven-dev/references/run-harness-patterns.md +0 -281
  74. pixie_qa-0.2.2/skills/eval-driven-dev/references/understanding-app.md +0 -201
  75. pixie_qa-0.2.2/specs/agent-skill-1.md +0 -25
  76. pixie_qa-0.2.2/specs/agent-skill.md +0 -71
  77. pixie_qa-0.2.2/specs/autoevals-adapters.md +0 -301
  78. pixie_qa-0.2.2/specs/dataset-management.md +0 -727
  79. pixie_qa-0.2.2/specs/evals-harness.md +0 -649
  80. pixie_qa-0.2.2/specs/expected-output-in-evals.md +0 -141
  81. pixie_qa-0.2.2/specs/instrumentation.md +0 -726
  82. pixie_qa-0.2.2/specs/manual-instrumentation-usability.md +0 -767
  83. pixie_qa-0.2.2/specs/storage.md +0 -473
  84. pixie_qa-0.2.2/specs/usability-utils.md +0 -327
  85. pixie_qa-0.2.2/tests/__init__.py +0 -0
  86. pixie_qa-0.2.2/tests/pixie/__init__.py +0 -0
  87. pixie_qa-0.2.2/tests/pixie/cli/__init__.py +0 -0
  88. pixie_qa-0.2.2/tests/pixie/cli/conftest.py +0 -15
  89. pixie_qa-0.2.2/tests/pixie/cli/e2e_cases.json +0 -183
  90. pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/conftest.py +0 -9
  91. pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json +0 -45
  92. pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/mock_evaluators.py +0 -156
  93. pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/test_customer_faq.py +0 -106
  94. pixie_qa-0.2.2/tests/pixie/cli/test_dataset_command.py +0 -412
  95. pixie_qa-0.2.2/tests/pixie/cli/test_e2e_pixie_test.py +0 -343
  96. pixie_qa-0.2.2/tests/pixie/cli/test_main.py +0 -261
  97. pixie_qa-0.2.2/tests/pixie/cli/test_trace_command.py +0 -324
  98. pixie_qa-0.2.2/tests/pixie/dataset/__init__.py +0 -0
  99. pixie_qa-0.2.2/tests/pixie/dataset/test_models.py +0 -64
  100. pixie_qa-0.2.2/tests/pixie/dataset/test_store.py +0 -222
  101. pixie_qa-0.2.2/tests/pixie/evals/__init__.py +0 -0
  102. pixie_qa-0.2.2/tests/pixie/evals/test_criteria.py +0 -116
  103. pixie_qa-0.2.2/tests/pixie/evals/test_eval_utils.py +0 -666
  104. pixie_qa-0.2.2/tests/pixie/evals/test_evaluation.py +0 -186
  105. pixie_qa-0.2.2/tests/pixie/evals/test_llm_evaluator.py +0 -235
  106. pixie_qa-0.2.2/tests/pixie/evals/test_runner.py +0 -452
  107. pixie_qa-0.2.2/tests/pixie/evals/test_scorecard.py +0 -487
  108. pixie_qa-0.2.2/tests/pixie/evals/test_scorers.py +0 -558
  109. pixie_qa-0.2.2/tests/pixie/evals/test_trace_capture.py +0 -205
  110. pixie_qa-0.2.2/tests/pixie/evals/test_trace_helpers.py +0 -154
  111. pixie_qa-0.2.2/tests/pixie/instrumentation/__init__.py +0 -0
  112. pixie_qa-0.2.2/tests/pixie/instrumentation/conftest.py +0 -35
  113. pixie_qa-0.2.2/tests/pixie/instrumentation/test_context.py +0 -157
  114. pixie_qa-0.2.2/tests/pixie/instrumentation/test_handler.py +0 -192
  115. pixie_qa-0.2.2/tests/pixie/instrumentation/test_integration.py +0 -208
  116. pixie_qa-0.2.2/tests/pixie/instrumentation/test_observation.py +0 -196
  117. pixie_qa-0.2.2/tests/pixie/instrumentation/test_processor.py +0 -560
  118. pixie_qa-0.2.2/tests/pixie/instrumentation/test_queue.py +0 -223
  119. pixie_qa-0.2.2/tests/pixie/instrumentation/test_spans.py +0 -254
  120. pixie_qa-0.2.2/tests/pixie/instrumentation/test_storage_handler.py +0 -108
  121. pixie_qa-0.2.2/tests/pixie/observation_store/__init__.py +0 -0
  122. pixie_qa-0.2.2/tests/pixie/observation_store/conftest.py +0 -231
  123. pixie_qa-0.2.2/tests/pixie/observation_store/test_evaluable.py +0 -191
  124. pixie_qa-0.2.2/tests/pixie/observation_store/test_serialization.py +0 -156
  125. pixie_qa-0.2.2/tests/pixie/observation_store/test_store.py +0 -289
  126. pixie_qa-0.2.2/tests/pixie/observation_store/test_tree.py +0 -248
  127. pixie_qa-0.2.2/tests/pixie/test_config.py +0 -73
  128. pixie_qa-0.2.2/tests/pixie/test_init.py +0 -157
  129. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/LICENSE +0 -0
  130. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/cli/__init__.py +0 -0
  131. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/cli/dataset_command.py +0 -0
  132. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/dataset/__init__.py +0 -0
  133. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/dataset/models.py +0 -0
  134. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/dataset/store.py +0 -0
  135. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/evals/llm_evaluator.py +0 -0
  136. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/evals/trace_capture.py +0 -0
  137. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/evals/trace_helpers.py +0 -0
  138. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/favicon.png +0 -0
  139. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/context.py +0 -0
  140. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/handler.py +0 -0
  141. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/handlers.py +0 -0
  142. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/observation.py +0 -0
  143. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/processor.py +0 -0
  144. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/queue.py +0 -0
  145. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/spans.py +0 -0
  146. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/__init__.py +0 -0
  147. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/piccolo_conf.py +0 -0
  148. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/piccolo_migrations/__init__.py +0 -0
  149. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/serialization.py +0 -0
  150. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/store.py +0 -0
  151. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/tables.py +0 -0
  152. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/tree.py +0 -0
@@ -0,0 +1,8 @@
1
+ .claude
2
+ .agents
3
+ __pycache__
4
+ pixie_qa/
5
+ uv.lock
6
+ pixie/assets/index.html
7
+ frontend/node_modules/
8
+ frontend/dist/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pixie-qa
3
- Version: 0.2.2
3
+ Version: 0.4.0
4
4
  Summary: Automated quality assurance for AI applications
5
5
  Project-URL: Homepage, https://github.com/yiouli/pixie-qa
6
6
  Project-URL: Repository, https://github.com/yiouli/pixie-qa
@@ -47,6 +47,9 @@ Requires-Dist: opentelemetry-sdk>=1.27.0
47
47
  Requires-Dist: piccolo[sqlite]>=1.33.0
48
48
  Requires-Dist: pydantic>=2.0
49
49
  Requires-Dist: python-dotenv>=1.2.2
50
+ Requires-Dist: starlette>=1.0.0
51
+ Requires-Dist: uvicorn>=0.42.0
52
+ Requires-Dist: watchfiles>=1.1.1
50
53
  Provides-Extra: all
51
54
  Requires-Dist: openinference-instrumentation-anthropic; extra == 'all'
52
55
  Requires-Dist: openinference-instrumentation-dspy; extra == 'all'
@@ -77,8 +80,10 @@ The `qa-eval` skill guides your coding agent through the full eval-based QA loop
77
80
  2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
78
81
  3. **Build a dataset** — save representative traces as test cases with `pixie dataset save`
79
82
  4. **Write eval tests** — generate `test_*.py` files with `assert_dataset_pass` and appropriate evaluators
80
- 5. **Run the tests** — `pixie test` to run all evals and report per-case scores
81
- 6. **Investigate failures** — look up the stored trace for each failure, diagnose, fix, repeat
83
+ 5. **Validate datasets** — `pixie dataset validate [dir_or_dataset_path]` to catch schema/config errors early
84
+ 6. **Run the tests** — `pixie test` to run all evals and report per-case scores
85
+ 7. **Analyze results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
86
+ 8. **Investigate failures** — look up the stored trace for each failure, diagnose, fix, repeat
82
87
 
83
88
  ## Getting Started
84
89
 
@@ -101,3 +106,27 @@ Your coding agent will read your code, instrument it, build a dataset from a few
101
106
  ## Python Package
102
107
 
103
108
  The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. For the package API and CLI reference, see [docs/package.md](docs/package.md).
109
+
110
+ ## Web UI
111
+
112
+ View all eval artifacts (results, markdown docs, datasets, and legacy scorecards) in a live-updating local web UI:
113
+
114
+ ```bash
115
+ pixie start # initializes pixie_qa/ (if needed) and opens http://localhost:7118
116
+ pixie start my_dir # use a custom artifact root
117
+ pixie init # scaffolds pixie_qa/ without starting the server
118
+ ```
119
+
120
+ The web UI provides tabbed navigation for results, scorecards (legacy), datasets, and markdown files. Changes to artifacts are pushed to the browser in real time via SSE.
121
+
122
+ The server writes a `server.lock` file to the artifact root directory on startup (containing the port number) and removes it on shutdown, allowing other processes to discover whether the server is already running.
123
+
124
+ ## Configuration
125
+
126
+ Pixie reads configuration from environment variables and a local `.env` file through a single central config layer. Existing process env vars win over `.env` values.
127
+
128
+ Useful settings include:
129
+
130
+ - `PIXIE_ROOT` to move all generated artifacts under a different root directory
131
+ - `PIXIE_RATE_LIMIT_ENABLED=true` to enable evaluator throttling for `pixie test`
132
+ - `PIXIE_RATE_LIMIT_RPS`, `PIXIE_RATE_LIMIT_RPM`, `PIXIE_RATE_LIMIT_TPS`, and `PIXIE_RATE_LIMIT_TPM` to tune request and token throughput for LLM-as-judge evaluators
@@ -0,0 +1,62 @@
1
+ # pixie-qa
2
+
3
+ An agent skill that makes your coding agent the QA engineer for LLM applications.
4
+
5
+ ## What the Skill Does
6
+
7
+ The `qa-eval` skill guides your coding agent through the full eval-based QA loop for LLM applications:
8
+
9
+ 1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
10
+ 2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
11
+ 3. **Build a dataset** — save representative traces as test cases with `pixie dataset save`
12
+ 4. **Write eval tests** — generate `test_*.py` files with `assert_dataset_pass` and appropriate evaluators
13
+ 5. **Validate datasets** — `pixie dataset validate [dir_or_dataset_path]` to catch schema/config errors early
14
+ 6. **Run the tests** — `pixie test` to run all evals and report per-case scores
15
+ 7. **Analyze results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
16
+ 8. **Investigate failures** — look up the stored trace for each failure, diagnose, fix, repeat
17
+
18
+ ## Getting Started
19
+
20
+ ### 1. Add the skill to your coding agent
21
+
22
+ ```bash
23
+ npx skills add yiouli/pixie-qa
24
+ ```
25
+
26
+ The accompanying Python package will be installed automatically by the skill when it is used.
27
+
28
+ ### 2. Ask coding agent to set up evals
29
+
30
+ When developing a Python-based AI project, open a conversation and say something like:
31
+
32
+ > "setup QA for my agent"
33
+
34
+ Your coding agent will read your code, instrument it, build a dataset from a few real runs, write and run eval-based tests, investigate failures, and fix them.
35
+
36
+ ## Python Package
37
+
38
+ The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. For the package API and CLI reference, see [docs/package.md](docs/package.md).
39
+
40
+ ## Web UI
41
+
42
+ View all eval artifacts (results, markdown docs, datasets, and legacy scorecards) in a live-updating local web UI:
43
+
44
+ ```bash
45
+ pixie start # initializes pixie_qa/ (if needed) and opens http://localhost:7118
46
+ pixie start my_dir # use a custom artifact root
47
+ pixie init # scaffolds pixie_qa/ without starting the server
48
+ ```
49
+
50
+ The web UI provides tabbed navigation for results, scorecards (legacy), datasets, and markdown files. Changes to artifacts are pushed to the browser in real time via SSE.
51
+
52
+ The server writes a `server.lock` file to the artifact root directory on startup (containing the port number) and removes it on shutdown, allowing other processes to discover whether the server is already running.
53
+
54
+ ## Configuration
55
+
56
+ Pixie reads configuration from environment variables and a local `.env` file through a single central config layer. Existing process env vars win over `.env` values.
57
+
58
+ Useful settings include:
59
+
60
+ - `PIXIE_ROOT` to move all generated artifacts under a different root directory
61
+ - `PIXIE_RATE_LIMIT_ENABLED=true` to enable evaluator throttling for `pixie test`
62
+ - `PIXIE_RATE_LIMIT_RPS`, `PIXIE_RATE_LIMIT_RPM`, `PIXIE_RATE_LIMIT_TPS`, and `PIXIE_RATE_LIMIT_TPM` to tune request and token throughput for LLM-as-judge evaluators
@@ -18,29 +18,30 @@ from pixie.evals.eval_utils import (
18
18
  )
19
19
  from pixie.evals.evaluation import Evaluation, Evaluator, evaluate
20
20
  from pixie.evals.llm_evaluator import create_llm_evaluator
21
+ from pixie.evals.rate_limiter import RateLimitConfig, configure_rate_limits
21
22
  from pixie.evals.scorers import (
22
- AnswerCorrectnessEval,
23
- AnswerRelevancyEval,
23
+ AnswerCorrectness,
24
+ AnswerRelevancy,
24
25
  AutoevalsAdapter,
25
- BattleEval,
26
- ClosedQAEval,
27
- ContextRelevancyEval,
28
- EmbeddingSimilarityEval,
29
- ExactMatchEval,
30
- FactualityEval,
31
- FaithfulnessEval,
32
- HumorEval,
33
- JSONDiffEval,
26
+ Battle,
27
+ ClosedQA,
28
+ ContextRelevancy,
29
+ EmbeddingSimilarity,
30
+ ExactMatch,
31
+ Factuality,
32
+ Faithfulness,
33
+ Humor,
34
+ JSONDiff,
34
35
  LevenshteinMatch,
35
- ListContainsEval,
36
- ModerationEval,
37
- NumericDiffEval,
38
- PossibleEval,
39
- SecurityEval,
40
- SqlEval,
41
- SummaryEval,
42
- TranslationEval,
43
- ValidJSONEval,
36
+ ListContains,
37
+ Moderation,
38
+ NumericDiff,
39
+ Possible,
40
+ Security,
41
+ Sql,
42
+ Summary,
43
+ Translation,
44
+ ValidJSON,
44
45
  )
45
46
  from pixie.evals.trace_capture import MemoryTraceHandler, capture_traces
46
47
  from pixie.evals.trace_helpers import last_llm_call, root
@@ -67,36 +68,38 @@ __all__ = [
67
68
  "remove_handler",
68
69
  "start_observation",
69
70
  # Evals
70
- "AnswerCorrectnessEval",
71
- "AnswerRelevancyEval",
71
+ "AnswerCorrectness",
72
+ "AnswerRelevancy",
72
73
  "AutoevalsAdapter",
73
- "BattleEval",
74
- "ClosedQAEval",
75
- "ContextRelevancyEval",
76
- "EmbeddingSimilarityEval",
74
+ "Battle",
75
+ "ClosedQA",
76
+ "ContextRelevancy",
77
+ "EmbeddingSimilarity",
77
78
  "EvalAssertionError",
78
79
  "Evaluation",
79
80
  "Evaluator",
80
- "ExactMatchEval",
81
- "FactualityEval",
82
- "FaithfulnessEval",
83
- "HumorEval",
84
- "JSONDiffEval",
81
+ "ExactMatch",
82
+ "Factuality",
83
+ "Faithfulness",
84
+ "Humor",
85
+ "JSONDiff",
85
86
  "LevenshteinMatch",
86
- "ListContainsEval",
87
+ "ListContains",
87
88
  "MemoryTraceHandler",
88
- "ModerationEval",
89
- "NumericDiffEval",
90
- "PossibleEval",
89
+ "Moderation",
90
+ "NumericDiff",
91
+ "Possible",
92
+ "RateLimitConfig",
91
93
  "ScoreThreshold",
92
- "SecurityEval",
93
- "SqlEval",
94
- "SummaryEval",
95
- "TranslationEval",
96
- "ValidJSONEval",
94
+ "Security",
95
+ "Sql",
96
+ "Summary",
97
+ "Translation",
98
+ "ValidJSON",
97
99
  "assert_dataset_pass",
98
100
  "assert_pass",
99
101
  "capture_traces",
102
+ "configure_rate_limits",
100
103
  "create_llm_evaluator",
101
104
  "evaluate",
102
105
  "last_llm_call",
@@ -0,0 +1,407 @@
1
+ {
2
+ "command_args": "pixie test tests/",
3
+ "timestamp": "2025-06-15 12:00:00 UTC",
4
+ "pixie_repo_url": "https://github.com/yiouli/pixie-qa",
5
+ "feedback_url": "https://feedback.gopixie.ai/feedback",
6
+ "brand_icon_url": "https://github.com/user-attachments/assets/76c18199-f00a-4fb3-a12f-ce6c173727af",
7
+ "test_records": [
8
+ {
9
+ "name": "test_customer_faq.py::test_faq_factuality",
10
+ "status": "passed",
11
+ "message": null,
12
+ "asserts": [
13
+ {
14
+ "evaluator_names": [
15
+ "MockFactuality"
16
+ ],
17
+ "input_labels": [
18
+ "What is your return policy?",
19
+ "How do I track my order?",
20
+ "Do you offer international shipping?",
21
+ "What payment methods do you accept?",
22
+ "How can I contact support?"
23
+ ],
24
+ "results": [
25
+ [
26
+ [
27
+ {
28
+ "score": 0.85,
29
+ "reasoning": "High string similarity between expected and actual output.",
30
+ "details": {}
31
+ }
32
+ ],
33
+ [
34
+ {
35
+ "score": 0.72,
36
+ "reasoning": "Moderate string similarity.",
37
+ "details": {}
38
+ }
39
+ ],
40
+ [
41
+ {
42
+ "score": 0.91,
43
+ "reasoning": "Very high similarity match.",
44
+ "details": {}
45
+ }
46
+ ],
47
+ [
48
+ {
49
+ "score": 0.68,
50
+ "reasoning": "Reasonable similarity.",
51
+ "details": {}
52
+ }
53
+ ],
54
+ [
55
+ {
56
+ "score": 0.77,
57
+ "reasoning": "Good similarity match.",
58
+ "details": {}
59
+ }
60
+ ]
61
+ ]
62
+ ],
63
+ "passed": true,
64
+ "criteria_message": "Pass: 4/5 inputs passed (threshold 0.6, required 80%)",
65
+ "scoring_strategy": "Each evaluator score must be ≥ 0.6. At least 80% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
66
+ "evaluable_dicts": [
67
+ {
68
+ "input": "What is your return policy?",
69
+ "expected_output": "You can return items within 30 days of purchase for a full refund.",
70
+ "actual_output": "Items can be returned within 30 days for a full refund.",
71
+ "metadata": {}
72
+ },
73
+ {
74
+ "input": "How do I track my order?",
75
+ "expected_output": "You can track your order using the tracking link sent to your email.",
76
+ "actual_output": "Check the tracking link in your confirmation email to track your order.",
77
+ "metadata": {}
78
+ },
79
+ {
80
+ "input": "Do you offer international shipping?",
81
+ "expected_output": "Yes, we ship to over 50 countries worldwide.",
82
+ "actual_output": "Yes, we offer international shipping to over 50 countries.",
83
+ "metadata": {}
84
+ },
85
+ {
86
+ "input": "What payment methods do you accept?",
87
+ "expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
88
+ "actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
89
+ "metadata": {}
90
+ },
91
+ {
92
+ "input": "How can I contact support?",
93
+ "expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
94
+ "actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
95
+ "metadata": {}
96
+ }
97
+ ]
98
+ }
99
+ ]
100
+ },
101
+ {
102
+ "name": "test_customer_faq.py::test_faq_multi_evaluator",
103
+ "status": "failed",
104
+ "message": "AssertionError: 3/5 inputs failed on at least one evaluator",
105
+ "asserts": [
106
+ {
107
+ "evaluator_names": [
108
+ "MockFactuality",
109
+ "MockClosedQA"
110
+ ],
111
+ "input_labels": [
112
+ "What is your return policy?",
113
+ "How do I track my order?",
114
+ "Do you offer international shipping?",
115
+ "What payment methods do you accept?",
116
+ "How can I contact support?"
117
+ ],
118
+ "results": [
119
+ [
120
+ [
121
+ {
122
+ "score": 0.85,
123
+ "reasoning": "High similarity.",
124
+ "details": {}
125
+ },
126
+ {
127
+ "score": 0.3,
128
+ "reasoning": "Low keyword overlap.",
129
+ "details": {}
130
+ }
131
+ ],
132
+ [
133
+ {
134
+ "score": 0.72,
135
+ "reasoning": "Moderate similarity.",
136
+ "details": {}
137
+ },
138
+ {
139
+ "score": 0.45,
140
+ "reasoning": "Below threshold keyword overlap.",
141
+ "details": {}
142
+ }
143
+ ],
144
+ [
145
+ {
146
+ "score": 0.91,
147
+ "reasoning": "Very high similarity.",
148
+ "details": {}
149
+ },
150
+ {
151
+ "score": 0.6,
152
+ "reasoning": "Acceptable keyword overlap.",
153
+ "details": {}
154
+ }
155
+ ],
156
+ [
157
+ {
158
+ "score": 0.68,
159
+ "reasoning": "Reasonable similarity.",
160
+ "details": {}
161
+ },
162
+ {
163
+ "score": 0.25,
164
+ "reasoning": "Poor keyword match.",
165
+ "details": {}
166
+ }
167
+ ],
168
+ [
169
+ {
170
+ "score": 0.77,
171
+ "reasoning": "Good similarity.",
172
+ "details": {}
173
+ },
174
+ {
175
+ "score": 0.55,
176
+ "reasoning": "Marginal keyword overlap.",
177
+ "details": {}
178
+ }
179
+ ]
180
+ ]
181
+ ],
182
+ "passed": false,
183
+ "criteria_message": "Fail: only 2/5 inputs passed on all evaluators (required 100%)",
184
+ "scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
185
+ "evaluable_dicts": [
186
+ {
187
+ "input": "What is your return policy?",
188
+ "expected_output": "You can return items within 30 days of purchase for a full refund.",
189
+ "actual_output": "Items can be returned within 30 days for a full refund.",
190
+ "metadata": {}
191
+ },
192
+ {
193
+ "input": "How do I track my order?",
194
+ "expected_output": "You can track your order using the tracking link sent to your email.",
195
+ "actual_output": "Check the tracking link in your confirmation email to track your order.",
196
+ "metadata": {}
197
+ },
198
+ {
199
+ "input": "Do you offer international shipping?",
200
+ "expected_output": "Yes, we ship to over 50 countries worldwide.",
201
+ "actual_output": "Yes, we offer international shipping to over 50 countries.",
202
+ "metadata": {}
203
+ },
204
+ {
205
+ "input": "What payment methods do you accept?",
206
+ "expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
207
+ "actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
208
+ "metadata": {}
209
+ },
210
+ {
211
+ "input": "How can I contact support?",
212
+ "expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
213
+ "actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
214
+ "metadata": {}
215
+ }
216
+ ]
217
+ }
218
+ ]
219
+ },
220
+ {
221
+ "name": "test_customer_faq.py::test_faq_no_hallucinations",
222
+ "status": "passed",
223
+ "message": null,
224
+ "asserts": [
225
+ {
226
+ "evaluator_names": [
227
+ "MockHallucination"
228
+ ],
229
+ "input_labels": [
230
+ "What is your return policy?",
231
+ "How do I track my order?",
232
+ "Do you offer international shipping?",
233
+ "What payment methods do you accept?",
234
+ "How can I contact support?"
235
+ ],
236
+ "results": [
237
+ [
238
+ [
239
+ {
240
+ "score": 0.95,
241
+ "reasoning": "No hallucination detected.",
242
+ "details": {}
243
+ }
244
+ ],
245
+ [
246
+ {
247
+ "score": 0.95,
248
+ "reasoning": "No hallucination detected.",
249
+ "details": {}
250
+ }
251
+ ],
252
+ [
253
+ {
254
+ "score": 0.95,
255
+ "reasoning": "No hallucination detected.",
256
+ "details": {}
257
+ }
258
+ ],
259
+ [
260
+ {
261
+ "score": 0.95,
262
+ "reasoning": "No hallucination detected.",
263
+ "details": {}
264
+ }
265
+ ],
266
+ [
267
+ {
268
+ "score": 0.95,
269
+ "reasoning": "No hallucination detected.",
270
+ "details": {}
271
+ }
272
+ ]
273
+ ]
274
+ ],
275
+ "passed": true,
276
+ "criteria_message": "Pass: 5/5 inputs passed (threshold 0.5, required 100%)",
277
+ "scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
278
+ "evaluable_dicts": [
279
+ {
280
+ "input": "What is your return policy?",
281
+ "expected_output": "You can return items within 30 days of purchase for a full refund.",
282
+ "actual_output": "Items can be returned within 30 days for a full refund.",
283
+ "metadata": {}
284
+ },
285
+ {
286
+ "input": "How do I track my order?",
287
+ "expected_output": "You can track your order using the tracking link sent to your email.",
288
+ "actual_output": "Check the tracking link in your confirmation email to track your order.",
289
+ "metadata": {}
290
+ },
291
+ {
292
+ "input": "Do you offer international shipping?",
293
+ "expected_output": "Yes, we ship to over 50 countries worldwide.",
294
+ "actual_output": "Yes, we offer international shipping to over 50 countries.",
295
+ "metadata": {}
296
+ },
297
+ {
298
+ "input": "What payment methods do you accept?",
299
+ "expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
300
+ "actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
301
+ "metadata": {}
302
+ },
303
+ {
304
+ "input": "How can I contact support?",
305
+ "expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
306
+ "actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
307
+ "metadata": {}
308
+ }
309
+ ]
310
+ }
311
+ ]
312
+ },
313
+ {
314
+ "name": "test_customer_faq.py::test_faq_tone_check",
315
+ "status": "failed",
316
+ "message": "AssertionError: all 5 inputs failed",
317
+ "asserts": [
318
+ {
319
+ "evaluator_names": [
320
+ "MockStrictTone"
321
+ ],
322
+ "input_labels": [
323
+ "What is your return policy?",
324
+ "How do I track my order?",
325
+ "Do you offer international shipping?",
326
+ "What payment methods do you accept?",
327
+ "How can I contact support?"
328
+ ],
329
+ "results": [
330
+ [
331
+ [
332
+ {
333
+ "score": 0.2,
334
+ "reasoning": "Tone does not meet strict requirements.",
335
+ "details": {}
336
+ }
337
+ ],
338
+ [
339
+ {
340
+ "score": 0.2,
341
+ "reasoning": "Tone does not meet strict requirements.",
342
+ "details": {}
343
+ }
344
+ ],
345
+ [
346
+ {
347
+ "score": 0.2,
348
+ "reasoning": "Tone does not meet strict requirements.",
349
+ "details": {}
350
+ }
351
+ ],
352
+ [
353
+ {
354
+ "score": 0.2,
355
+ "reasoning": "Tone does not meet strict requirements.",
356
+ "details": {}
357
+ }
358
+ ],
359
+ [
360
+ {
361
+ "score": 0.2,
362
+ "reasoning": "Tone does not meet strict requirements.",
363
+ "details": {}
364
+ }
365
+ ]
366
+ ]
367
+ ],
368
+ "passed": false,
369
+ "criteria_message": "Fail: 0/5 inputs passed (threshold 0.5, required 100%)",
370
+ "scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
371
+ "evaluable_dicts": [
372
+ {
373
+ "input": "What is your return policy?",
374
+ "expected_output": "You can return items within 30 days of purchase for a full refund.",
375
+ "actual_output": "Items can be returned within 30 days for a full refund.",
376
+ "metadata": {}
377
+ },
378
+ {
379
+ "input": "How do I track my order?",
380
+ "expected_output": "You can track your order using the tracking link sent to your email.",
381
+ "actual_output": "Check the tracking link in your confirmation email to track your order.",
382
+ "metadata": {}
383
+ },
384
+ {
385
+ "input": "Do you offer international shipping?",
386
+ "expected_output": "Yes, we ship to over 50 countries worldwide.",
387
+ "actual_output": "Yes, we offer international shipping to over 50 countries.",
388
+ "metadata": {}
389
+ },
390
+ {
391
+ "input": "What payment methods do you accept?",
392
+ "expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
393
+ "actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
394
+ "metadata": {}
395
+ },
396
+ {
397
+ "input": "How can I contact support?",
398
+ "expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
399
+ "actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
400
+ "metadata": {}
401
+ }
402
+ ]
403
+ }
404
+ ]
405
+ }
406
+ ]
407
+ }