pixie-qa 0.2.2__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152) hide show
  1. pixie_qa-0.4.0/.gitignore +8 -0
  2. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/PKG-INFO +32 -3
  3. pixie_qa-0.4.0/README.md +62 -0
  4. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/__init__.py +43 -40
  5. pixie_qa-0.4.0/pixie/assets/mock-data.json +407 -0
  6. pixie_qa-0.4.0/pixie/assets/webui.html +64 -0
  7. pixie_qa-0.4.0/pixie/cli/analyze_command.py +156 -0
  8. pixie_qa-0.4.0/pixie/cli/dag_command.py +75 -0
  9. pixie_qa-0.4.0/pixie/cli/init_command.py +55 -0
  10. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/cli/main.py +161 -12
  11. pixie_qa-0.4.0/pixie/cli/start_command.py +43 -0
  12. pixie_qa-0.4.0/pixie/cli/test_command.py +257 -0
  13. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/cli/trace_command.py +108 -0
  14. pixie_qa-0.4.0/pixie/config.py +130 -0
  15. pixie_qa-0.4.0/pixie/dag/__init__.py +400 -0
  16. pixie_qa-0.4.0/pixie/dag/trace_check.py +183 -0
  17. pixie_qa-0.4.0/pixie/evals/__init__.py +184 -0
  18. pixie_qa-0.4.0/pixie/evals/criteria.py +61 -0
  19. pixie_qa-0.4.0/pixie/evals/dataset_runner.py +495 -0
  20. pixie_qa-0.4.0/pixie/evals/eval_utils.py +334 -0
  21. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/evals/evaluation.py +10 -0
  22. pixie_qa-0.4.0/pixie/evals/rate_limiter.py +140 -0
  23. pixie_qa-0.4.0/pixie/evals/scorecard.py +252 -0
  24. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/evals/scorers.py +252 -110
  25. pixie_qa-0.4.0/pixie/evals/test_result.py +239 -0
  26. pixie_qa-0.4.0/pixie/instrumentation/__init__.py +80 -0
  27. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/instrumentors.py +18 -2
  28. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/evaluable.py +2 -0
  29. pixie_qa-0.4.0/pixie/web/__init__.py +1 -0
  30. pixie_qa-0.4.0/pixie/web/app.py +255 -0
  31. pixie_qa-0.4.0/pixie/web/server.py +369 -0
  32. pixie_qa-0.4.0/pixie/web/watcher.py +99 -0
  33. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pyproject.toml +23 -2
  34. pixie_qa-0.2.2/.github/copilot-instructions.md +0 -632
  35. pixie_qa-0.2.2/.github/workflows/publish.yml +0 -80
  36. pixie_qa-0.2.2/.gitignore +0 -4
  37. pixie_qa-0.2.2/README.md +0 -36
  38. pixie_qa-0.2.2/changelogs/async-handler-processing.md +0 -96
  39. pixie_qa-0.2.2/changelogs/autoevals-adapters.md +0 -39
  40. pixie_qa-0.2.2/changelogs/cli-dataset-commands.md +0 -37
  41. pixie_qa-0.2.2/changelogs/dataset-management.md +0 -91
  42. pixie_qa-0.2.2/changelogs/deep-research-demo.md +0 -43
  43. pixie_qa-0.2.2/changelogs/eval-harness.md +0 -128
  44. pixie_qa-0.2.2/changelogs/expected-output-in-evals.md +0 -42
  45. pixie_qa-0.2.2/changelogs/instrumentation-module-implementation.md +0 -55
  46. pixie_qa-0.2.2/changelogs/loud-failure-mode.md +0 -58
  47. pixie_qa-0.2.2/changelogs/manual-instrumentation-usability.md +0 -56
  48. pixie_qa-0.2.2/changelogs/observation-store-implementation.md +0 -53
  49. pixie_qa-0.2.2/changelogs/observe-sensitive-field-stripping.md +0 -22
  50. pixie_qa-0.2.2/changelogs/pixie-directory-and-skill-improvements.md +0 -63
  51. pixie_qa-0.2.2/changelogs/pixie-test-e2e-suite.md +0 -69
  52. pixie_qa-0.2.2/changelogs/root-package-exports-and-trace-id.md +0 -58
  53. pixie_qa-0.2.2/changelogs/scorecard-branding-and-skill-version-check.md +0 -41
  54. pixie_qa-0.2.2/changelogs/scorecard-eval-detail-dialog.md +0 -28
  55. pixie_qa-0.2.2/changelogs/skill-v2-and-rootdir-discovery.md +0 -76
  56. pixie_qa-0.2.2/changelogs/test-scorecard.md +0 -54
  57. pixie_qa-0.2.2/changelogs/usability-utils.md +0 -60
  58. pixie_qa-0.2.2/docs/package.md +0 -233
  59. pixie_qa-0.2.2/pixie/cli/test_command.py +0 -120
  60. pixie_qa-0.2.2/pixie/config.py +0 -54
  61. pixie_qa-0.2.2/pixie/evals/__init__.py +0 -121
  62. pixie_qa-0.2.2/pixie/evals/criteria.py +0 -77
  63. pixie_qa-0.2.2/pixie/evals/eval_utils.py +0 -358
  64. pixie_qa-0.2.2/pixie/evals/runner.py +0 -278
  65. pixie_qa-0.2.2/pixie/evals/scorecard.py +0 -916
  66. pixie_qa-0.2.2/pixie/instrumentation/__init__.py +0 -49
  67. pixie_qa-0.2.2/skills/eval-driven-dev/SKILL.md +0 -378
  68. pixie_qa-0.2.2/skills/eval-driven-dev/references/dataset-generation.md +0 -235
  69. pixie_qa-0.2.2/skills/eval-driven-dev/references/eval-tests.md +0 -241
  70. pixie_qa-0.2.2/skills/eval-driven-dev/references/instrumentation.md +0 -174
  71. pixie_qa-0.2.2/skills/eval-driven-dev/references/investigation.md +0 -146
  72. pixie_qa-0.2.2/skills/eval-driven-dev/references/pixie-api.md +0 -257
  73. pixie_qa-0.2.2/skills/eval-driven-dev/references/run-harness-patterns.md +0 -281
  74. pixie_qa-0.2.2/skills/eval-driven-dev/references/understanding-app.md +0 -201
  75. pixie_qa-0.2.2/specs/agent-skill-1.md +0 -25
  76. pixie_qa-0.2.2/specs/agent-skill.md +0 -71
  77. pixie_qa-0.2.2/specs/autoevals-adapters.md +0 -301
  78. pixie_qa-0.2.2/specs/dataset-management.md +0 -727
  79. pixie_qa-0.2.2/specs/evals-harness.md +0 -649
  80. pixie_qa-0.2.2/specs/expected-output-in-evals.md +0 -141
  81. pixie_qa-0.2.2/specs/instrumentation.md +0 -726
  82. pixie_qa-0.2.2/specs/manual-instrumentation-usability.md +0 -767
  83. pixie_qa-0.2.2/specs/storage.md +0 -473
  84. pixie_qa-0.2.2/specs/usability-utils.md +0 -327
  85. pixie_qa-0.2.2/tests/__init__.py +0 -0
  86. pixie_qa-0.2.2/tests/pixie/__init__.py +0 -0
  87. pixie_qa-0.2.2/tests/pixie/cli/__init__.py +0 -0
  88. pixie_qa-0.2.2/tests/pixie/cli/conftest.py +0 -15
  89. pixie_qa-0.2.2/tests/pixie/cli/e2e_cases.json +0 -183
  90. pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/conftest.py +0 -9
  91. pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json +0 -45
  92. pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/mock_evaluators.py +0 -156
  93. pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/test_customer_faq.py +0 -106
  94. pixie_qa-0.2.2/tests/pixie/cli/test_dataset_command.py +0 -412
  95. pixie_qa-0.2.2/tests/pixie/cli/test_e2e_pixie_test.py +0 -343
  96. pixie_qa-0.2.2/tests/pixie/cli/test_main.py +0 -261
  97. pixie_qa-0.2.2/tests/pixie/cli/test_trace_command.py +0 -324
  98. pixie_qa-0.2.2/tests/pixie/dataset/__init__.py +0 -0
  99. pixie_qa-0.2.2/tests/pixie/dataset/test_models.py +0 -64
  100. pixie_qa-0.2.2/tests/pixie/dataset/test_store.py +0 -222
  101. pixie_qa-0.2.2/tests/pixie/evals/__init__.py +0 -0
  102. pixie_qa-0.2.2/tests/pixie/evals/test_criteria.py +0 -116
  103. pixie_qa-0.2.2/tests/pixie/evals/test_eval_utils.py +0 -666
  104. pixie_qa-0.2.2/tests/pixie/evals/test_evaluation.py +0 -186
  105. pixie_qa-0.2.2/tests/pixie/evals/test_llm_evaluator.py +0 -235
  106. pixie_qa-0.2.2/tests/pixie/evals/test_runner.py +0 -452
  107. pixie_qa-0.2.2/tests/pixie/evals/test_scorecard.py +0 -487
  108. pixie_qa-0.2.2/tests/pixie/evals/test_scorers.py +0 -558
  109. pixie_qa-0.2.2/tests/pixie/evals/test_trace_capture.py +0 -205
  110. pixie_qa-0.2.2/tests/pixie/evals/test_trace_helpers.py +0 -154
  111. pixie_qa-0.2.2/tests/pixie/instrumentation/__init__.py +0 -0
  112. pixie_qa-0.2.2/tests/pixie/instrumentation/conftest.py +0 -35
  113. pixie_qa-0.2.2/tests/pixie/instrumentation/test_context.py +0 -157
  114. pixie_qa-0.2.2/tests/pixie/instrumentation/test_handler.py +0 -192
  115. pixie_qa-0.2.2/tests/pixie/instrumentation/test_integration.py +0 -208
  116. pixie_qa-0.2.2/tests/pixie/instrumentation/test_observation.py +0 -196
  117. pixie_qa-0.2.2/tests/pixie/instrumentation/test_processor.py +0 -560
  118. pixie_qa-0.2.2/tests/pixie/instrumentation/test_queue.py +0 -223
  119. pixie_qa-0.2.2/tests/pixie/instrumentation/test_spans.py +0 -254
  120. pixie_qa-0.2.2/tests/pixie/instrumentation/test_storage_handler.py +0 -108
  121. pixie_qa-0.2.2/tests/pixie/observation_store/__init__.py +0 -0
  122. pixie_qa-0.2.2/tests/pixie/observation_store/conftest.py +0 -231
  123. pixie_qa-0.2.2/tests/pixie/observation_store/test_evaluable.py +0 -191
  124. pixie_qa-0.2.2/tests/pixie/observation_store/test_serialization.py +0 -156
  125. pixie_qa-0.2.2/tests/pixie/observation_store/test_store.py +0 -289
  126. pixie_qa-0.2.2/tests/pixie/observation_store/test_tree.py +0 -248
  127. pixie_qa-0.2.2/tests/pixie/test_config.py +0 -73
  128. pixie_qa-0.2.2/tests/pixie/test_init.py +0 -157
  129. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/LICENSE +0 -0
  130. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/cli/__init__.py +0 -0
  131. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/cli/dataset_command.py +0 -0
  132. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/dataset/__init__.py +0 -0
  133. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/dataset/models.py +0 -0
  134. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/dataset/store.py +0 -0
  135. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/evals/llm_evaluator.py +0 -0
  136. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/evals/trace_capture.py +0 -0
  137. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/evals/trace_helpers.py +0 -0
  138. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/favicon.png +0 -0
  139. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/context.py +0 -0
  140. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/handler.py +0 -0
  141. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/handlers.py +0 -0
  142. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/observation.py +0 -0
  143. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/processor.py +0 -0
  144. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/queue.py +0 -0
  145. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/instrumentation/spans.py +0 -0
  146. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/__init__.py +0 -0
  147. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/piccolo_conf.py +0 -0
  148. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/piccolo_migrations/__init__.py +0 -0
  149. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/serialization.py +0 -0
  150. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/store.py +0 -0
  151. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/tables.py +0 -0
  152. {pixie_qa-0.2.2 → pixie_qa-0.4.0}/pixie/storage/tree.py +0 -0
@@ -0,0 +1,8 @@
1
+ .claude
2
+ .agents
3
+ __pycache__
4
+ pixie_qa/
5
+ uv.lock
6
+ pixie/assets/index.html
7
+ frontend/node_modules/
8
+ frontend/dist/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pixie-qa
3
- Version: 0.2.2
3
+ Version: 0.4.0
4
4
  Summary: Automated quality assurance for AI applications
5
5
  Project-URL: Homepage, https://github.com/yiouli/pixie-qa
6
6
  Project-URL: Repository, https://github.com/yiouli/pixie-qa
@@ -47,6 +47,9 @@ Requires-Dist: opentelemetry-sdk>=1.27.0
47
47
  Requires-Dist: piccolo[sqlite]>=1.33.0
48
48
  Requires-Dist: pydantic>=2.0
49
49
  Requires-Dist: python-dotenv>=1.2.2
50
+ Requires-Dist: starlette>=1.0.0
51
+ Requires-Dist: uvicorn>=0.42.0
52
+ Requires-Dist: watchfiles>=1.1.1
50
53
  Provides-Extra: all
51
54
  Requires-Dist: openinference-instrumentation-anthropic; extra == 'all'
52
55
  Requires-Dist: openinference-instrumentation-dspy; extra == 'all'
@@ -77,8 +80,10 @@ The `qa-eval` skill guides your coding agent through the full eval-based QA loop
77
80
  2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
78
81
  3. **Build a dataset** — save representative traces as test cases with `pixie dataset save`
79
82
  4. **Write eval tests** — generate `test_*.py` files with `assert_dataset_pass` and appropriate evaluators
80
- 5. **Run the tests** — `pixie test` to run all evals and report per-case scores
81
- 6. **Investigate failures** — look up the stored trace for each failure, diagnose, fix, repeat
83
+ 5. **Validate datasets** — `pixie dataset validate [dir_or_dataset_path]` to catch schema/config errors early
84
+ 6. **Run the tests** — `pixie test` to run all evals and report per-case scores
85
+ 7. **Analyze results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
86
+ 8. **Investigate failures** — look up the stored trace for each failure, diagnose, fix, repeat
82
87
 
83
88
  ## Getting Started
84
89
 
@@ -101,3 +106,27 @@ Your coding agent will read your code, instrument it, build a dataset from a few
101
106
  ## Python Package
102
107
 
103
108
  The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. For the package API and CLI reference, see [docs/package.md](docs/package.md).
109
+
110
+ ## Web UI
111
+
112
+ View all eval artifacts (results, markdown docs, datasets, and legacy scorecards) in a live-updating local web UI:
113
+
114
+ ```bash
115
+ pixie start # initializes pixie_qa/ (if needed) and opens http://localhost:7118
116
+ pixie start my_dir # use a custom artifact root
117
+ pixie init # scaffolds pixie_qa/ without starting the server
118
+ ```
119
+
120
+ The web UI provides tabbed navigation for results, scorecards (legacy), datasets, and markdown files. Changes to artifacts are pushed to the browser in real time via SSE.
121
+
122
+ The server writes a `server.lock` file to the artifact root directory on startup (containing the port number) and removes it on shutdown, allowing other processes to discover whether the server is already running.
123
+
124
+ ## Configuration
125
+
126
+ Pixie reads configuration from environment variables and a local `.env` file through a single central config layer. Existing process env vars win over `.env` values.
127
+
128
+ Useful settings include:
129
+
130
+ - `PIXIE_ROOT` to move all generated artifacts under a different root directory
131
+ - `PIXIE_RATE_LIMIT_ENABLED=true` to enable evaluator throttling for `pixie test`
132
+ - `PIXIE_RATE_LIMIT_RPS`, `PIXIE_RATE_LIMIT_RPM`, `PIXIE_RATE_LIMIT_TPS`, and `PIXIE_RATE_LIMIT_TPM` to tune request and token throughput for LLM-as-judge evaluators
@@ -0,0 +1,62 @@
1
+ # pixie-qa
2
+
3
+ An agent skill that makes your coding agent the QA engineer for LLM applications.
4
+
5
+ ## What the Skill Does
6
+
7
+ The `qa-eval` skill guides your coding agent through the full eval-based QA loop for LLM applications:
8
+
9
+ 1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
10
+ 2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
11
+ 3. **Build a dataset** — save representative traces as test cases with `pixie dataset save`
12
+ 4. **Write eval tests** — generate `test_*.py` files with `assert_dataset_pass` and appropriate evaluators
13
+ 5. **Validate datasets** — `pixie dataset validate [dir_or_dataset_path]` to catch schema/config errors early
14
+ 6. **Run the tests** — `pixie test` to run all evals and report per-case scores
15
+ 7. **Analyze results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
16
+ 8. **Investigate failures** — look up the stored trace for each failure, diagnose, fix, repeat
17
+
18
+ ## Getting Started
19
+
20
+ ### 1. Add the skill to your coding agent
21
+
22
+ ```bash
23
+ npx skills add yiouli/pixie-qa
24
+ ```
25
+
26
+ The accompanying Python package will be installed automatically by the skill when it is used.
27
+
28
+ ### 2. Ask coding agent to set up evals
29
+
30
+ When developing a Python-based AI project, open a conversation and say something like:
31
+
32
+ > "setup QA for my agent"
33
+
34
+ Your coding agent will read your code, instrument it, build a dataset from a few real runs, write and run eval-based tests, investigate failures, and fix them.
35
+
36
+ ## Python Package
37
+
38
+ The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. For the package API and CLI reference, see [docs/package.md](docs/package.md).
39
+
40
+ ## Web UI
41
+
42
+ View all eval artifacts (results, markdown docs, datasets, and legacy scorecards) in a live-updating local web UI:
43
+
44
+ ```bash
45
+ pixie start # initializes pixie_qa/ (if needed) and opens http://localhost:7118
46
+ pixie start my_dir # use a custom artifact root
47
+ pixie init # scaffolds pixie_qa/ without starting the server
48
+ ```
49
+
50
+ The web UI provides tabbed navigation for results, scorecards (legacy), datasets, and markdown files. Changes to artifacts are pushed to the browser in real time via SSE.
51
+
52
+ The server writes a `server.lock` file to the artifact root directory on startup (containing the port number) and removes it on shutdown, allowing other processes to discover whether the server is already running.
53
+
54
+ ## Configuration
55
+
56
+ Pixie reads configuration from environment variables and a local `.env` file through a single central config layer. Existing process env vars win over `.env` values.
57
+
58
+ Useful settings include:
59
+
60
+ - `PIXIE_ROOT` to move all generated artifacts under a different root directory
61
+ - `PIXIE_RATE_LIMIT_ENABLED=true` to enable evaluator throttling for `pixie test`
62
+ - `PIXIE_RATE_LIMIT_RPS`, `PIXIE_RATE_LIMIT_RPM`, `PIXIE_RATE_LIMIT_TPS`, and `PIXIE_RATE_LIMIT_TPM` to tune request and token throughput for LLM-as-judge evaluators
@@ -18,29 +18,30 @@ from pixie.evals.eval_utils import (
18
18
  )
19
19
  from pixie.evals.evaluation import Evaluation, Evaluator, evaluate
20
20
  from pixie.evals.llm_evaluator import create_llm_evaluator
21
+ from pixie.evals.rate_limiter import RateLimitConfig, configure_rate_limits
21
22
  from pixie.evals.scorers import (
22
- AnswerCorrectnessEval,
23
- AnswerRelevancyEval,
23
+ AnswerCorrectness,
24
+ AnswerRelevancy,
24
25
  AutoevalsAdapter,
25
- BattleEval,
26
- ClosedQAEval,
27
- ContextRelevancyEval,
28
- EmbeddingSimilarityEval,
29
- ExactMatchEval,
30
- FactualityEval,
31
- FaithfulnessEval,
32
- HumorEval,
33
- JSONDiffEval,
26
+ Battle,
27
+ ClosedQA,
28
+ ContextRelevancy,
29
+ EmbeddingSimilarity,
30
+ ExactMatch,
31
+ Factuality,
32
+ Faithfulness,
33
+ Humor,
34
+ JSONDiff,
34
35
  LevenshteinMatch,
35
- ListContainsEval,
36
- ModerationEval,
37
- NumericDiffEval,
38
- PossibleEval,
39
- SecurityEval,
40
- SqlEval,
41
- SummaryEval,
42
- TranslationEval,
43
- ValidJSONEval,
36
+ ListContains,
37
+ Moderation,
38
+ NumericDiff,
39
+ Possible,
40
+ Security,
41
+ Sql,
42
+ Summary,
43
+ Translation,
44
+ ValidJSON,
44
45
  )
45
46
  from pixie.evals.trace_capture import MemoryTraceHandler, capture_traces
46
47
  from pixie.evals.trace_helpers import last_llm_call, root
@@ -67,36 +68,38 @@ __all__ = [
67
68
  "remove_handler",
68
69
  "start_observation",
69
70
  # Evals
70
- "AnswerCorrectnessEval",
71
- "AnswerRelevancyEval",
71
+ "AnswerCorrectness",
72
+ "AnswerRelevancy",
72
73
  "AutoevalsAdapter",
73
- "BattleEval",
74
- "ClosedQAEval",
75
- "ContextRelevancyEval",
76
- "EmbeddingSimilarityEval",
74
+ "Battle",
75
+ "ClosedQA",
76
+ "ContextRelevancy",
77
+ "EmbeddingSimilarity",
77
78
  "EvalAssertionError",
78
79
  "Evaluation",
79
80
  "Evaluator",
80
- "ExactMatchEval",
81
- "FactualityEval",
82
- "FaithfulnessEval",
83
- "HumorEval",
84
- "JSONDiffEval",
81
+ "ExactMatch",
82
+ "Factuality",
83
+ "Faithfulness",
84
+ "Humor",
85
+ "JSONDiff",
85
86
  "LevenshteinMatch",
86
- "ListContainsEval",
87
+ "ListContains",
87
88
  "MemoryTraceHandler",
88
- "ModerationEval",
89
- "NumericDiffEval",
90
- "PossibleEval",
89
+ "Moderation",
90
+ "NumericDiff",
91
+ "Possible",
92
+ "RateLimitConfig",
91
93
  "ScoreThreshold",
92
- "SecurityEval",
93
- "SqlEval",
94
- "SummaryEval",
95
- "TranslationEval",
96
- "ValidJSONEval",
94
+ "Security",
95
+ "Sql",
96
+ "Summary",
97
+ "Translation",
98
+ "ValidJSON",
97
99
  "assert_dataset_pass",
98
100
  "assert_pass",
99
101
  "capture_traces",
102
+ "configure_rate_limits",
100
103
  "create_llm_evaluator",
101
104
  "evaluate",
102
105
  "last_llm_call",
@@ -0,0 +1,407 @@
1
+ {
2
+ "command_args": "pixie test tests/",
3
+ "timestamp": "2025-06-15 12:00:00 UTC",
4
+ "pixie_repo_url": "https://github.com/yiouli/pixie-qa",
5
+ "feedback_url": "https://feedback.gopixie.ai/feedback",
6
+ "brand_icon_url": "https://github.com/user-attachments/assets/76c18199-f00a-4fb3-a12f-ce6c173727af",
7
+ "test_records": [
8
+ {
9
+ "name": "test_customer_faq.py::test_faq_factuality",
10
+ "status": "passed",
11
+ "message": null,
12
+ "asserts": [
13
+ {
14
+ "evaluator_names": [
15
+ "MockFactuality"
16
+ ],
17
+ "input_labels": [
18
+ "What is your return policy?",
19
+ "How do I track my order?",
20
+ "Do you offer international shipping?",
21
+ "What payment methods do you accept?",
22
+ "How can I contact support?"
23
+ ],
24
+ "results": [
25
+ [
26
+ [
27
+ {
28
+ "score": 0.85,
29
+ "reasoning": "High string similarity between expected and actual output.",
30
+ "details": {}
31
+ }
32
+ ],
33
+ [
34
+ {
35
+ "score": 0.72,
36
+ "reasoning": "Moderate string similarity.",
37
+ "details": {}
38
+ }
39
+ ],
40
+ [
41
+ {
42
+ "score": 0.91,
43
+ "reasoning": "Very high similarity match.",
44
+ "details": {}
45
+ }
46
+ ],
47
+ [
48
+ {
49
+ "score": 0.68,
50
+ "reasoning": "Reasonable similarity.",
51
+ "details": {}
52
+ }
53
+ ],
54
+ [
55
+ {
56
+ "score": 0.77,
57
+ "reasoning": "Good similarity match.",
58
+ "details": {}
59
+ }
60
+ ]
61
+ ]
62
+ ],
63
+ "passed": true,
64
+ "criteria_message": "Pass: 4/5 inputs passed (threshold 0.6, required 80%)",
65
+ "scoring_strategy": "Each evaluator score must be ≥ 0.6. At least 80% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
66
+ "evaluable_dicts": [
67
+ {
68
+ "input": "What is your return policy?",
69
+ "expected_output": "You can return items within 30 days of purchase for a full refund.",
70
+ "actual_output": "Items can be returned within 30 days for a full refund.",
71
+ "metadata": {}
72
+ },
73
+ {
74
+ "input": "How do I track my order?",
75
+ "expected_output": "You can track your order using the tracking link sent to your email.",
76
+ "actual_output": "Check the tracking link in your confirmation email to track your order.",
77
+ "metadata": {}
78
+ },
79
+ {
80
+ "input": "Do you offer international shipping?",
81
+ "expected_output": "Yes, we ship to over 50 countries worldwide.",
82
+ "actual_output": "Yes, we offer international shipping to over 50 countries.",
83
+ "metadata": {}
84
+ },
85
+ {
86
+ "input": "What payment methods do you accept?",
87
+ "expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
88
+ "actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
89
+ "metadata": {}
90
+ },
91
+ {
92
+ "input": "How can I contact support?",
93
+ "expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
94
+ "actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
95
+ "metadata": {}
96
+ }
97
+ ]
98
+ }
99
+ ]
100
+ },
101
+ {
102
+ "name": "test_customer_faq.py::test_faq_multi_evaluator",
103
+ "status": "failed",
104
+ "message": "AssertionError: 3/5 inputs failed on at least one evaluator",
105
+ "asserts": [
106
+ {
107
+ "evaluator_names": [
108
+ "MockFactuality",
109
+ "MockClosedQA"
110
+ ],
111
+ "input_labels": [
112
+ "What is your return policy?",
113
+ "How do I track my order?",
114
+ "Do you offer international shipping?",
115
+ "What payment methods do you accept?",
116
+ "How can I contact support?"
117
+ ],
118
+ "results": [
119
+ [
120
+ [
121
+ {
122
+ "score": 0.85,
123
+ "reasoning": "High similarity.",
124
+ "details": {}
125
+ },
126
+ {
127
+ "score": 0.3,
128
+ "reasoning": "Low keyword overlap.",
129
+ "details": {}
130
+ }
131
+ ],
132
+ [
133
+ {
134
+ "score": 0.72,
135
+ "reasoning": "Moderate similarity.",
136
+ "details": {}
137
+ },
138
+ {
139
+ "score": 0.45,
140
+ "reasoning": "Below threshold keyword overlap.",
141
+ "details": {}
142
+ }
143
+ ],
144
+ [
145
+ {
146
+ "score": 0.91,
147
+ "reasoning": "Very high similarity.",
148
+ "details": {}
149
+ },
150
+ {
151
+ "score": 0.6,
152
+ "reasoning": "Acceptable keyword overlap.",
153
+ "details": {}
154
+ }
155
+ ],
156
+ [
157
+ {
158
+ "score": 0.68,
159
+ "reasoning": "Reasonable similarity.",
160
+ "details": {}
161
+ },
162
+ {
163
+ "score": 0.25,
164
+ "reasoning": "Poor keyword match.",
165
+ "details": {}
166
+ }
167
+ ],
168
+ [
169
+ {
170
+ "score": 0.77,
171
+ "reasoning": "Good similarity.",
172
+ "details": {}
173
+ },
174
+ {
175
+ "score": 0.55,
176
+ "reasoning": "Marginal keyword overlap.",
177
+ "details": {}
178
+ }
179
+ ]
180
+ ]
181
+ ],
182
+ "passed": false,
183
+ "criteria_message": "Fail: only 2/5 inputs passed on all evaluators (required 100%)",
184
+ "scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
185
+ "evaluable_dicts": [
186
+ {
187
+ "input": "What is your return policy?",
188
+ "expected_output": "You can return items within 30 days of purchase for a full refund.",
189
+ "actual_output": "Items can be returned within 30 days for a full refund.",
190
+ "metadata": {}
191
+ },
192
+ {
193
+ "input": "How do I track my order?",
194
+ "expected_output": "You can track your order using the tracking link sent to your email.",
195
+ "actual_output": "Check the tracking link in your confirmation email to track your order.",
196
+ "metadata": {}
197
+ },
198
+ {
199
+ "input": "Do you offer international shipping?",
200
+ "expected_output": "Yes, we ship to over 50 countries worldwide.",
201
+ "actual_output": "Yes, we offer international shipping to over 50 countries.",
202
+ "metadata": {}
203
+ },
204
+ {
205
+ "input": "What payment methods do you accept?",
206
+ "expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
207
+ "actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
208
+ "metadata": {}
209
+ },
210
+ {
211
+ "input": "How can I contact support?",
212
+ "expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
213
+ "actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
214
+ "metadata": {}
215
+ }
216
+ ]
217
+ }
218
+ ]
219
+ },
220
+ {
221
+ "name": "test_customer_faq.py::test_faq_no_hallucinations",
222
+ "status": "passed",
223
+ "message": null,
224
+ "asserts": [
225
+ {
226
+ "evaluator_names": [
227
+ "MockHallucination"
228
+ ],
229
+ "input_labels": [
230
+ "What is your return policy?",
231
+ "How do I track my order?",
232
+ "Do you offer international shipping?",
233
+ "What payment methods do you accept?",
234
+ "How can I contact support?"
235
+ ],
236
+ "results": [
237
+ [
238
+ [
239
+ {
240
+ "score": 0.95,
241
+ "reasoning": "No hallucination detected.",
242
+ "details": {}
243
+ }
244
+ ],
245
+ [
246
+ {
247
+ "score": 0.95,
248
+ "reasoning": "No hallucination detected.",
249
+ "details": {}
250
+ }
251
+ ],
252
+ [
253
+ {
254
+ "score": 0.95,
255
+ "reasoning": "No hallucination detected.",
256
+ "details": {}
257
+ }
258
+ ],
259
+ [
260
+ {
261
+ "score": 0.95,
262
+ "reasoning": "No hallucination detected.",
263
+ "details": {}
264
+ }
265
+ ],
266
+ [
267
+ {
268
+ "score": 0.95,
269
+ "reasoning": "No hallucination detected.",
270
+ "details": {}
271
+ }
272
+ ]
273
+ ]
274
+ ],
275
+ "passed": true,
276
+ "criteria_message": "Pass: 5/5 inputs passed (threshold 0.5, required 100%)",
277
+ "scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
278
+ "evaluable_dicts": [
279
+ {
280
+ "input": "What is your return policy?",
281
+ "expected_output": "You can return items within 30 days of purchase for a full refund.",
282
+ "actual_output": "Items can be returned within 30 days for a full refund.",
283
+ "metadata": {}
284
+ },
285
+ {
286
+ "input": "How do I track my order?",
287
+ "expected_output": "You can track your order using the tracking link sent to your email.",
288
+ "actual_output": "Check the tracking link in your confirmation email to track your order.",
289
+ "metadata": {}
290
+ },
291
+ {
292
+ "input": "Do you offer international shipping?",
293
+ "expected_output": "Yes, we ship to over 50 countries worldwide.",
294
+ "actual_output": "Yes, we offer international shipping to over 50 countries.",
295
+ "metadata": {}
296
+ },
297
+ {
298
+ "input": "What payment methods do you accept?",
299
+ "expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
300
+ "actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
301
+ "metadata": {}
302
+ },
303
+ {
304
+ "input": "How can I contact support?",
305
+ "expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
306
+ "actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
307
+ "metadata": {}
308
+ }
309
+ ]
310
+ }
311
+ ]
312
+ },
313
+ {
314
+ "name": "test_customer_faq.py::test_faq_tone_check",
315
+ "status": "failed",
316
+ "message": "AssertionError: all 5 inputs failed",
317
+ "asserts": [
318
+ {
319
+ "evaluator_names": [
320
+ "MockStrictTone"
321
+ ],
322
+ "input_labels": [
323
+ "What is your return policy?",
324
+ "How do I track my order?",
325
+ "Do you offer international shipping?",
326
+ "What payment methods do you accept?",
327
+ "How can I contact support?"
328
+ ],
329
+ "results": [
330
+ [
331
+ [
332
+ {
333
+ "score": 0.2,
334
+ "reasoning": "Tone does not meet strict requirements.",
335
+ "details": {}
336
+ }
337
+ ],
338
+ [
339
+ {
340
+ "score": 0.2,
341
+ "reasoning": "Tone does not meet strict requirements.",
342
+ "details": {}
343
+ }
344
+ ],
345
+ [
346
+ {
347
+ "score": 0.2,
348
+ "reasoning": "Tone does not meet strict requirements.",
349
+ "details": {}
350
+ }
351
+ ],
352
+ [
353
+ {
354
+ "score": 0.2,
355
+ "reasoning": "Tone does not meet strict requirements.",
356
+ "details": {}
357
+ }
358
+ ],
359
+ [
360
+ {
361
+ "score": 0.2,
362
+ "reasoning": "Tone does not meet strict requirements.",
363
+ "details": {}
364
+ }
365
+ ]
366
+ ]
367
+ ],
368
+ "passed": false,
369
+ "criteria_message": "Fail: 0/5 inputs passed (threshold 0.5, required 100%)",
370
+ "scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
371
+ "evaluable_dicts": [
372
+ {
373
+ "input": "What is your return policy?",
374
+ "expected_output": "You can return items within 30 days of purchase for a full refund.",
375
+ "actual_output": "Items can be returned within 30 days for a full refund.",
376
+ "metadata": {}
377
+ },
378
+ {
379
+ "input": "How do I track my order?",
380
+ "expected_output": "You can track your order using the tracking link sent to your email.",
381
+ "actual_output": "Check the tracking link in your confirmation email to track your order.",
382
+ "metadata": {}
383
+ },
384
+ {
385
+ "input": "Do you offer international shipping?",
386
+ "expected_output": "Yes, we ship to over 50 countries worldwide.",
387
+ "actual_output": "Yes, we offer international shipping to over 50 countries.",
388
+ "metadata": {}
389
+ },
390
+ {
391
+ "input": "What payment methods do you accept?",
392
+ "expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
393
+ "actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
394
+ "metadata": {}
395
+ },
396
+ {
397
+ "input": "How can I contact support?",
398
+ "expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
399
+ "actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
400
+ "metadata": {}
401
+ }
402
+ ]
403
+ }
404
+ ]
405
+ }
406
+ ]
407
+ }