pixie-qa 0.2.2__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixie_qa-0.5.0/.gitignore +8 -0
- {pixie_qa-0.2.2 → pixie_qa-0.5.0}/PKG-INFO +41 -8
- pixie_qa-0.5.0/README.md +67 -0
- pixie_qa-0.5.0/pixie/__init__.py +90 -0
- pixie_qa-0.5.0/pixie/assets/mock-data.json +407 -0
- pixie_qa-0.5.0/pixie/assets/webui.html +64 -0
- pixie_qa-0.5.0/pixie/cli/__init__.py +11 -0
- pixie_qa-0.5.0/pixie/cli/analyze_command.py +156 -0
- pixie_qa-0.5.0/pixie/cli/format_command.py +223 -0
- pixie_qa-0.5.0/pixie/cli/init_command.py +55 -0
- pixie_qa-0.5.0/pixie/cli/main.py +202 -0
- pixie_qa-0.5.0/pixie/cli/start_command.py +43 -0
- pixie_qa-0.5.0/pixie/cli/test_command.py +178 -0
- pixie_qa-0.5.0/pixie/cli/trace_command.py +128 -0
- pixie_qa-0.5.0/pixie/config.py +130 -0
- pixie_qa-0.5.0/pixie/eval/__init__.py +143 -0
- pixie_qa-0.5.0/pixie/eval/evaluable.py +100 -0
- {pixie_qa-0.2.2/pixie/evals → pixie_qa-0.5.0/pixie/eval}/evaluation.py +15 -11
- {pixie_qa-0.2.2/pixie/evals → pixie_qa-0.5.0/pixie/eval}/llm_evaluator.py +35 -19
- pixie_qa-0.5.0/pixie/eval/rate_limiter.py +140 -0
- {pixie_qa-0.2.2/pixie/evals → pixie_qa-0.5.0/pixie/eval}/scorers.py +268 -123
- pixie_qa-0.5.0/pixie/harness/__init__.py +8 -0
- pixie_qa-0.5.0/pixie/harness/run_result.py +239 -0
- pixie_qa-0.5.0/pixie/harness/runnable.py +133 -0
- pixie_qa-0.5.0/pixie/harness/runner.py +813 -0
- pixie_qa-0.5.0/pixie/instrumentation/__init__.py +99 -0
- pixie_qa-0.5.0/pixie/instrumentation/llm_tracing.py +818 -0
- pixie_qa-0.5.0/pixie/instrumentation/wrap.py +323 -0
- pixie_qa-0.5.0/pixie/web/__init__.py +7 -0
- pixie_qa-0.5.0/pixie/web/app.py +266 -0
- pixie_qa-0.5.0/pixie/web/server.py +369 -0
- pixie_qa-0.5.0/pixie/web/watcher.py +102 -0
- {pixie_qa-0.2.2 → pixie_qa-0.5.0}/pyproject.toml +24 -4
- pixie_qa-0.2.2/.github/copilot-instructions.md +0 -632
- pixie_qa-0.2.2/.github/workflows/publish.yml +0 -80
- pixie_qa-0.2.2/.gitignore +0 -4
- pixie_qa-0.2.2/README.md +0 -36
- pixie_qa-0.2.2/changelogs/async-handler-processing.md +0 -96
- pixie_qa-0.2.2/changelogs/autoevals-adapters.md +0 -39
- pixie_qa-0.2.2/changelogs/cli-dataset-commands.md +0 -37
- pixie_qa-0.2.2/changelogs/dataset-management.md +0 -91
- pixie_qa-0.2.2/changelogs/deep-research-demo.md +0 -43
- pixie_qa-0.2.2/changelogs/eval-harness.md +0 -128
- pixie_qa-0.2.2/changelogs/expected-output-in-evals.md +0 -42
- pixie_qa-0.2.2/changelogs/instrumentation-module-implementation.md +0 -55
- pixie_qa-0.2.2/changelogs/loud-failure-mode.md +0 -58
- pixie_qa-0.2.2/changelogs/manual-instrumentation-usability.md +0 -56
- pixie_qa-0.2.2/changelogs/observation-store-implementation.md +0 -53
- pixie_qa-0.2.2/changelogs/observe-sensitive-field-stripping.md +0 -22
- pixie_qa-0.2.2/changelogs/pixie-directory-and-skill-improvements.md +0 -63
- pixie_qa-0.2.2/changelogs/pixie-test-e2e-suite.md +0 -69
- pixie_qa-0.2.2/changelogs/root-package-exports-and-trace-id.md +0 -58
- pixie_qa-0.2.2/changelogs/scorecard-branding-and-skill-version-check.md +0 -41
- pixie_qa-0.2.2/changelogs/scorecard-eval-detail-dialog.md +0 -28
- pixie_qa-0.2.2/changelogs/skill-v2-and-rootdir-discovery.md +0 -76
- pixie_qa-0.2.2/changelogs/test-scorecard.md +0 -54
- pixie_qa-0.2.2/changelogs/usability-utils.md +0 -60
- pixie_qa-0.2.2/docs/package.md +0 -233
- pixie_qa-0.2.2/pixie/__init__.py +0 -110
- pixie_qa-0.2.2/pixie/cli/__init__.py +0 -6
- pixie_qa-0.2.2/pixie/cli/dataset_command.py +0 -193
- pixie_qa-0.2.2/pixie/cli/main.py +0 -307
- pixie_qa-0.2.2/pixie/cli/test_command.py +0 -120
- pixie_qa-0.2.2/pixie/cli/trace_command.py +0 -186
- pixie_qa-0.2.2/pixie/config.py +0 -54
- pixie_qa-0.2.2/pixie/dataset/__init__.py +0 -11
- pixie_qa-0.2.2/pixie/dataset/models.py +0 -21
- pixie_qa-0.2.2/pixie/dataset/store.py +0 -212
- pixie_qa-0.2.2/pixie/evals/__init__.py +0 -121
- pixie_qa-0.2.2/pixie/evals/criteria.py +0 -77
- pixie_qa-0.2.2/pixie/evals/eval_utils.py +0 -358
- pixie_qa-0.2.2/pixie/evals/runner.py +0 -278
- pixie_qa-0.2.2/pixie/evals/scorecard.py +0 -916
- pixie_qa-0.2.2/pixie/evals/trace_capture.py +0 -70
- pixie_qa-0.2.2/pixie/evals/trace_helpers.py +0 -57
- pixie_qa-0.2.2/pixie/instrumentation/__init__.py +0 -49
- pixie_qa-0.2.2/pixie/instrumentation/context.py +0 -86
- pixie_qa-0.2.2/pixie/instrumentation/handler.py +0 -72
- pixie_qa-0.2.2/pixie/instrumentation/handlers.py +0 -105
- pixie_qa-0.2.2/pixie/instrumentation/instrumentors.py +0 -31
- pixie_qa-0.2.2/pixie/instrumentation/observation.py +0 -217
- pixie_qa-0.2.2/pixie/instrumentation/processor.py +0 -366
- pixie_qa-0.2.2/pixie/instrumentation/queue.py +0 -88
- pixie_qa-0.2.2/pixie/instrumentation/spans.py +0 -165
- pixie_qa-0.2.2/pixie/storage/__init__.py +0 -27
- pixie_qa-0.2.2/pixie/storage/evaluable.py +0 -138
- pixie_qa-0.2.2/pixie/storage/piccolo_conf.py +0 -10
- pixie_qa-0.2.2/pixie/storage/piccolo_migrations/__init__.py +0 -1
- pixie_qa-0.2.2/pixie/storage/serialization.py +0 -227
- pixie_qa-0.2.2/pixie/storage/store.py +0 -231
- pixie_qa-0.2.2/pixie/storage/tables.py +0 -21
- pixie_qa-0.2.2/pixie/storage/tree.py +0 -199
- pixie_qa-0.2.2/skills/eval-driven-dev/SKILL.md +0 -378
- pixie_qa-0.2.2/skills/eval-driven-dev/references/dataset-generation.md +0 -235
- pixie_qa-0.2.2/skills/eval-driven-dev/references/eval-tests.md +0 -241
- pixie_qa-0.2.2/skills/eval-driven-dev/references/instrumentation.md +0 -174
- pixie_qa-0.2.2/skills/eval-driven-dev/references/investigation.md +0 -146
- pixie_qa-0.2.2/skills/eval-driven-dev/references/pixie-api.md +0 -257
- pixie_qa-0.2.2/skills/eval-driven-dev/references/run-harness-patterns.md +0 -281
- pixie_qa-0.2.2/skills/eval-driven-dev/references/understanding-app.md +0 -201
- pixie_qa-0.2.2/specs/agent-skill-1.md +0 -25
- pixie_qa-0.2.2/specs/agent-skill.md +0 -71
- pixie_qa-0.2.2/specs/autoevals-adapters.md +0 -301
- pixie_qa-0.2.2/specs/dataset-management.md +0 -727
- pixie_qa-0.2.2/specs/evals-harness.md +0 -649
- pixie_qa-0.2.2/specs/expected-output-in-evals.md +0 -141
- pixie_qa-0.2.2/specs/instrumentation.md +0 -726
- pixie_qa-0.2.2/specs/manual-instrumentation-usability.md +0 -767
- pixie_qa-0.2.2/specs/storage.md +0 -473
- pixie_qa-0.2.2/specs/usability-utils.md +0 -327
- pixie_qa-0.2.2/tests/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/cli/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/cli/conftest.py +0 -15
- pixie_qa-0.2.2/tests/pixie/cli/e2e_cases.json +0 -183
- pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/conftest.py +0 -9
- pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json +0 -45
- pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/mock_evaluators.py +0 -156
- pixie_qa-0.2.2/tests/pixie/cli/e2e_fixtures/test_customer_faq.py +0 -106
- pixie_qa-0.2.2/tests/pixie/cli/test_dataset_command.py +0 -412
- pixie_qa-0.2.2/tests/pixie/cli/test_e2e_pixie_test.py +0 -343
- pixie_qa-0.2.2/tests/pixie/cli/test_main.py +0 -261
- pixie_qa-0.2.2/tests/pixie/cli/test_trace_command.py +0 -324
- pixie_qa-0.2.2/tests/pixie/dataset/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/dataset/test_models.py +0 -64
- pixie_qa-0.2.2/tests/pixie/dataset/test_store.py +0 -222
- pixie_qa-0.2.2/tests/pixie/evals/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/evals/test_criteria.py +0 -116
- pixie_qa-0.2.2/tests/pixie/evals/test_eval_utils.py +0 -666
- pixie_qa-0.2.2/tests/pixie/evals/test_evaluation.py +0 -186
- pixie_qa-0.2.2/tests/pixie/evals/test_llm_evaluator.py +0 -235
- pixie_qa-0.2.2/tests/pixie/evals/test_runner.py +0 -452
- pixie_qa-0.2.2/tests/pixie/evals/test_scorecard.py +0 -487
- pixie_qa-0.2.2/tests/pixie/evals/test_scorers.py +0 -558
- pixie_qa-0.2.2/tests/pixie/evals/test_trace_capture.py +0 -205
- pixie_qa-0.2.2/tests/pixie/evals/test_trace_helpers.py +0 -154
- pixie_qa-0.2.2/tests/pixie/instrumentation/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/instrumentation/conftest.py +0 -35
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_context.py +0 -157
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_handler.py +0 -192
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_integration.py +0 -208
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_observation.py +0 -196
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_processor.py +0 -560
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_queue.py +0 -223
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_spans.py +0 -254
- pixie_qa-0.2.2/tests/pixie/instrumentation/test_storage_handler.py +0 -108
- pixie_qa-0.2.2/tests/pixie/observation_store/__init__.py +0 -0
- pixie_qa-0.2.2/tests/pixie/observation_store/conftest.py +0 -231
- pixie_qa-0.2.2/tests/pixie/observation_store/test_evaluable.py +0 -191
- pixie_qa-0.2.2/tests/pixie/observation_store/test_serialization.py +0 -156
- pixie_qa-0.2.2/tests/pixie/observation_store/test_store.py +0 -289
- pixie_qa-0.2.2/tests/pixie/observation_store/test_tree.py +0 -248
- pixie_qa-0.2.2/tests/pixie/test_config.py +0 -73
- pixie_qa-0.2.2/tests/pixie/test_init.py +0 -157
- {pixie_qa-0.2.2 → pixie_qa-0.5.0}/LICENSE +0 -0
- {pixie_qa-0.2.2 → pixie_qa-0.5.0}/pixie/favicon.png +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pixie-qa
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Automated quality assurance for AI applications
|
|
5
5
|
Project-URL: Homepage, https://github.com/yiouli/pixie-qa
|
|
6
6
|
Project-URL: Repository, https://github.com/yiouli/pixie-qa
|
|
@@ -44,9 +44,11 @@ Requires-Dist: openai>=2.29.0
|
|
|
44
44
|
Requires-Dist: openinference-instrumentation>=0.1.44
|
|
45
45
|
Requires-Dist: opentelemetry-api>=1.27.0
|
|
46
46
|
Requires-Dist: opentelemetry-sdk>=1.27.0
|
|
47
|
-
Requires-Dist: piccolo[sqlite]>=1.33.0
|
|
48
47
|
Requires-Dist: pydantic>=2.0
|
|
49
48
|
Requires-Dist: python-dotenv>=1.2.2
|
|
49
|
+
Requires-Dist: starlette>=1.0.0
|
|
50
|
+
Requires-Dist: uvicorn>=0.42.0
|
|
51
|
+
Requires-Dist: watchfiles>=1.1.1
|
|
50
52
|
Provides-Extra: all
|
|
51
53
|
Requires-Dist: openinference-instrumentation-anthropic; extra == 'all'
|
|
52
54
|
Requires-Dist: openinference-instrumentation-dspy; extra == 'all'
|
|
@@ -67,18 +69,19 @@ Description-Content-Type: text/markdown
|
|
|
67
69
|
|
|
68
70
|
# pixie-qa
|
|
69
71
|
|
|
70
|
-
An agent skill that
|
|
72
|
+
An agent skill that makes coding agents the QA engineer for LLM applications.
|
|
71
73
|
|
|
72
74
|
## What the Skill Does
|
|
73
75
|
|
|
74
76
|
The `qa-eval` skill guides your coding agent through the full eval-based QA loop for LLM applications:
|
|
75
77
|
|
|
76
78
|
1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
|
|
77
|
-
2. **Instrument it** —
|
|
78
|
-
3. **Build a dataset** —
|
|
79
|
-
4. **Write eval tests** — generate `test_*.py` files with
|
|
79
|
+
2. **Instrument it** — use `wrap()` for data-object tracing and OpenInference auto-instrumentation for LLM span capture
|
|
80
|
+
3. **Build a dataset** — create JSON datasets of representative inputs and expected outputs
|
|
81
|
+
4. **Write eval tests** — generate `test_*.py` files with appropriate evaluators
|
|
80
82
|
5. **Run the tests** — `pixie test` to run all evals and report per-case scores
|
|
81
|
-
6. **
|
|
83
|
+
6. **Analyse results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
|
|
84
|
+
7. **Investigate failures** — diagnose failures, fix, repeat
|
|
82
85
|
|
|
83
86
|
## Getting Started
|
|
84
87
|
|
|
@@ -100,4 +103,34 @@ Your coding agent will read your code, instrument it, build a dataset from a few
|
|
|
100
103
|
|
|
101
104
|
## Python Package
|
|
102
105
|
|
|
103
|
-
The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project.
|
|
106
|
+
The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. API docs are auto-generated by `pdoc3` into [docs/pixie/index.md](docs/pixie/index.md) via pre-commit. The markdown renderer uses [scripts/pdoc_templates/text.mako](scripts/pdoc_templates/text.mako) so async functions and methods are explicitly shown as `async def` in signatures.
|
|
107
|
+
|
|
108
|
+
Install hooks once per clone:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
uv run pre-commit install
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Web UI
|
|
115
|
+
|
|
116
|
+
View all eval artifacts (results, markdown docs, datasets, and legacy scorecards) in a live-updating local web UI:
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
pixie start # initializes pixie_qa/ (if needed) and opens http://localhost:7118
|
|
120
|
+
pixie start my_dir # use a custom artifact root
|
|
121
|
+
pixie init # scaffolds pixie_qa/ without starting the server
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
The web UI provides tabbed navigation for results, scorecards (legacy), datasets, and markdown files. Changes to artifacts are pushed to the browser in real time via SSE.
|
|
125
|
+
|
|
126
|
+
The server writes a `server.lock` file to the artifact root directory on startup (containing the port number) and removes it on shutdown, allowing other processes to discover whether the server is already running.
|
|
127
|
+
|
|
128
|
+
## Configuration
|
|
129
|
+
|
|
130
|
+
Pixie reads configuration from environment variables and a local `.env` file through a single central config layer. Existing process env vars win over `.env` values.
|
|
131
|
+
|
|
132
|
+
Useful settings include:
|
|
133
|
+
|
|
134
|
+
- `PIXIE_ROOT` to move all generated artifacts under a different root directory
|
|
135
|
+
- `PIXIE_RATE_LIMIT_ENABLED=true` to enable evaluator throttling for `pixie test`
|
|
136
|
+
- `PIXIE_RATE_LIMIT_RPS`, `PIXIE_RATE_LIMIT_RPM`, `PIXIE_RATE_LIMIT_TPS`, and `PIXIE_RATE_LIMIT_TPM` to tune request and token throughput for LLM-as-judge evaluators
|
pixie_qa-0.5.0/README.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# pixie-qa
|
|
2
|
+
|
|
3
|
+
An agent skill that makes coding agents the QA engineer for LLM applications.
|
|
4
|
+
|
|
5
|
+
## What the Skill Does
|
|
6
|
+
|
|
7
|
+
The `qa-eval` skill guides your coding agent through the full eval-based QA loop for LLM applications:
|
|
8
|
+
|
|
9
|
+
1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
|
|
10
|
+
2. **Instrument it** — use `wrap()` for data-object tracing and OpenInference auto-instrumentation for LLM span capture
|
|
11
|
+
3. **Build a dataset** — create JSON datasets of representative inputs and expected outputs
|
|
12
|
+
4. **Write eval tests** — generate `test_*.py` files with appropriate evaluators
|
|
13
|
+
5. **Run the tests** — `pixie test` to run all evals and report per-case scores
|
|
14
|
+
6. **Analyse results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
|
|
15
|
+
7. **Investigate failures** — diagnose failures, fix, repeat
|
|
16
|
+
|
|
17
|
+
## Getting Started
|
|
18
|
+
|
|
19
|
+
### 1. Add the skill to your coding agent
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
npx skills add yiouli/pixie-qa
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
The accompanying Python package will be installed automatically by the skill when it is used.
|
|
26
|
+
|
|
27
|
+
### 2. Ask coding agent to set up evals
|
|
28
|
+
|
|
29
|
+
When developing a Python-based AI project, open a conversation and say something like:
|
|
30
|
+
|
|
31
|
+
> "setup QA for my agent"
|
|
32
|
+
|
|
33
|
+
Your coding agent will read your code, instrument it, build a dataset from a few real runs, write and run eval-based tests, investigate failures and fix.
|
|
34
|
+
|
|
35
|
+
## Python Package
|
|
36
|
+
|
|
37
|
+
The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. API docs are auto-generated by `pdoc3` into [docs/pixie/index.md](docs/pixie/index.md) via pre-commit. The markdown renderer uses [scripts/pdoc_templates/text.mako](scripts/pdoc_templates/text.mako) so async functions and methods are explicitly shown as `async def` in signatures.
|
|
38
|
+
|
|
39
|
+
Install hooks once per clone:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
uv run pre-commit install
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Web UI
|
|
46
|
+
|
|
47
|
+
View all eval artifacts (results, markdown docs, datasets, and legacy scorecards) in a live-updating local web UI:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pixie start # initializes pixie_qa/ (if needed) and opens http://localhost:7118
|
|
51
|
+
pixie start my_dir # use a custom artifact root
|
|
52
|
+
pixie init # scaffolds pixie_qa/ without starting the server
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
The web UI provides tabbed navigation for results, scorecards (legacy), datasets, and markdown files. Changes to artifacts are pushed to the browser in real time via SSE.
|
|
56
|
+
|
|
57
|
+
The server writes a `server.lock` file to the artifact root directory on startup (containing the port number) and removes it on shutdown, allowing other processes to discover whether the server is already running.
|
|
58
|
+
|
|
59
|
+
## Configuration
|
|
60
|
+
|
|
61
|
+
Pixie reads configuration from environment variables and a local `.env` file through a single central config layer. Existing process env vars win over `.env` values.
|
|
62
|
+
|
|
63
|
+
Useful settings include:
|
|
64
|
+
|
|
65
|
+
- `PIXIE_ROOT` to move all generated artifacts under a different root directory
|
|
66
|
+
- `PIXIE_RATE_LIMIT_ENABLED=true` to enable evaluator throttling for `pixie test`
|
|
67
|
+
- `PIXIE_RATE_LIMIT_RPS`, `PIXIE_RATE_LIMIT_RPM`, `PIXIE_RATE_LIMIT_TPS`, and `PIXIE_RATE_LIMIT_TPM` to tune request and token throughput for LLM-as-judge evaluators
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""pixie — automated quality assurance for AI applications.
|
|
2
|
+
|
|
3
|
+
Re-exports the full public API so users can ``from pixie import ...``
|
|
4
|
+
for every commonly used symbol without needing submodule paths.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from pixie.eval.evaluable import Evaluable, TestCase
|
|
8
|
+
from pixie.eval.evaluation import Evaluation, Evaluator, evaluate
|
|
9
|
+
from pixie.eval.llm_evaluator import create_llm_evaluator
|
|
10
|
+
from pixie.eval.scorers import (
|
|
11
|
+
AnswerCorrectness,
|
|
12
|
+
AnswerRelevancy,
|
|
13
|
+
AutoevalsAdapter,
|
|
14
|
+
Battle,
|
|
15
|
+
ClosedQA,
|
|
16
|
+
ContextRelevancy,
|
|
17
|
+
EmbeddingSimilarity,
|
|
18
|
+
ExactMatch,
|
|
19
|
+
Factuality,
|
|
20
|
+
Faithfulness,
|
|
21
|
+
Humor,
|
|
22
|
+
JSONDiff,
|
|
23
|
+
LevenshteinMatch,
|
|
24
|
+
ListContains,
|
|
25
|
+
Moderation,
|
|
26
|
+
NumericDiff,
|
|
27
|
+
Possible,
|
|
28
|
+
Security,
|
|
29
|
+
Sql,
|
|
30
|
+
Summary,
|
|
31
|
+
Translation,
|
|
32
|
+
ValidJSON,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
# -- Harness ------------------------------------------------------------------
|
|
36
|
+
from pixie.harness.runnable import Runnable
|
|
37
|
+
|
|
38
|
+
# -- Instrumentation ----------------------------------------------------------
|
|
39
|
+
from pixie.instrumentation.llm_tracing import (
|
|
40
|
+
add_handler,
|
|
41
|
+
enable_llm_tracing,
|
|
42
|
+
flush,
|
|
43
|
+
remove_handler,
|
|
44
|
+
)
|
|
45
|
+
from pixie.instrumentation.wrap import (
|
|
46
|
+
WrappedData,
|
|
47
|
+
wrap,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
__all__ = [
|
|
51
|
+
# Instrumentation
|
|
52
|
+
"WrappedData",
|
|
53
|
+
"flush",
|
|
54
|
+
"enable_llm_tracing",
|
|
55
|
+
"add_handler",
|
|
56
|
+
"remove_handler",
|
|
57
|
+
"wrap",
|
|
58
|
+
# Harness
|
|
59
|
+
"Runnable",
|
|
60
|
+
# Eval data models
|
|
61
|
+
"Evaluable",
|
|
62
|
+
"TestCase",
|
|
63
|
+
"Evaluation",
|
|
64
|
+
"Evaluator",
|
|
65
|
+
"evaluate",
|
|
66
|
+
"create_llm_evaluator",
|
|
67
|
+
# Pre-made evaluators (autoevals adapters)
|
|
68
|
+
"AnswerCorrectness",
|
|
69
|
+
"AnswerRelevancy",
|
|
70
|
+
"AutoevalsAdapter",
|
|
71
|
+
"Battle",
|
|
72
|
+
"ClosedQA",
|
|
73
|
+
"ContextRelevancy",
|
|
74
|
+
"EmbeddingSimilarity",
|
|
75
|
+
"ExactMatch",
|
|
76
|
+
"Factuality",
|
|
77
|
+
"Faithfulness",
|
|
78
|
+
"Humor",
|
|
79
|
+
"JSONDiff",
|
|
80
|
+
"LevenshteinMatch",
|
|
81
|
+
"ListContains",
|
|
82
|
+
"Moderation",
|
|
83
|
+
"NumericDiff",
|
|
84
|
+
"Possible",
|
|
85
|
+
"Security",
|
|
86
|
+
"Sql",
|
|
87
|
+
"Summary",
|
|
88
|
+
"Translation",
|
|
89
|
+
"ValidJSON",
|
|
90
|
+
]
|
|
@@ -0,0 +1,407 @@
|
|
|
1
|
+
{
|
|
2
|
+
"command_args": "pixie test tests/",
|
|
3
|
+
"timestamp": "2025-06-15 12:00:00 UTC",
|
|
4
|
+
"pixie_repo_url": "https://github.com/yiouli/pixie-qa",
|
|
5
|
+
"feedback_url": "https://feedback.gopixie.ai/feedback",
|
|
6
|
+
"brand_icon_url": "https://github.com/user-attachments/assets/76c18199-f00a-4fb3-a12f-ce6c173727af",
|
|
7
|
+
"test_records": [
|
|
8
|
+
{
|
|
9
|
+
"name": "test_customer_faq.py::test_faq_factuality",
|
|
10
|
+
"status": "passed",
|
|
11
|
+
"message": null,
|
|
12
|
+
"asserts": [
|
|
13
|
+
{
|
|
14
|
+
"evaluator_names": [
|
|
15
|
+
"MockFactuality"
|
|
16
|
+
],
|
|
17
|
+
"input_labels": [
|
|
18
|
+
"What is your return policy?",
|
|
19
|
+
"How do I track my order?",
|
|
20
|
+
"Do you offer international shipping?",
|
|
21
|
+
"What payment methods do you accept?",
|
|
22
|
+
"How can I contact support?"
|
|
23
|
+
],
|
|
24
|
+
"results": [
|
|
25
|
+
[
|
|
26
|
+
[
|
|
27
|
+
{
|
|
28
|
+
"score": 0.85,
|
|
29
|
+
"reasoning": "High string similarity between expected and actual output.",
|
|
30
|
+
"details": {}
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
[
|
|
34
|
+
{
|
|
35
|
+
"score": 0.72,
|
|
36
|
+
"reasoning": "Moderate string similarity.",
|
|
37
|
+
"details": {}
|
|
38
|
+
}
|
|
39
|
+
],
|
|
40
|
+
[
|
|
41
|
+
{
|
|
42
|
+
"score": 0.91,
|
|
43
|
+
"reasoning": "Very high similarity match.",
|
|
44
|
+
"details": {}
|
|
45
|
+
}
|
|
46
|
+
],
|
|
47
|
+
[
|
|
48
|
+
{
|
|
49
|
+
"score": 0.68,
|
|
50
|
+
"reasoning": "Reasonable similarity.",
|
|
51
|
+
"details": {}
|
|
52
|
+
}
|
|
53
|
+
],
|
|
54
|
+
[
|
|
55
|
+
{
|
|
56
|
+
"score": 0.77,
|
|
57
|
+
"reasoning": "Good similarity match.",
|
|
58
|
+
"details": {}
|
|
59
|
+
}
|
|
60
|
+
]
|
|
61
|
+
]
|
|
62
|
+
],
|
|
63
|
+
"passed": true,
|
|
64
|
+
"criteria_message": "Pass: 4/5 inputs passed (threshold 0.6, required 80%)",
|
|
65
|
+
"scoring_strategy": "Each evaluator score must be ≥ 0.6. At least 80% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
|
|
66
|
+
"evaluable_dicts": [
|
|
67
|
+
{
|
|
68
|
+
"input": "What is your return policy?",
|
|
69
|
+
"expected_output": "You can return items within 30 days of purchase for a full refund.",
|
|
70
|
+
"actual_output": "Items can be returned within 30 days for a full refund.",
|
|
71
|
+
"metadata": {}
|
|
72
|
+
},
|
|
73
|
+
{
|
|
74
|
+
"input": "How do I track my order?",
|
|
75
|
+
"expected_output": "You can track your order using the tracking link sent to your email.",
|
|
76
|
+
"actual_output": "Check the tracking link in your confirmation email to track your order.",
|
|
77
|
+
"metadata": {}
|
|
78
|
+
},
|
|
79
|
+
{
|
|
80
|
+
"input": "Do you offer international shipping?",
|
|
81
|
+
"expected_output": "Yes, we ship to over 50 countries worldwide.",
|
|
82
|
+
"actual_output": "Yes, we offer international shipping to over 50 countries.",
|
|
83
|
+
"metadata": {}
|
|
84
|
+
},
|
|
85
|
+
{
|
|
86
|
+
"input": "What payment methods do you accept?",
|
|
87
|
+
"expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
|
|
88
|
+
"actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
|
|
89
|
+
"metadata": {}
|
|
90
|
+
},
|
|
91
|
+
{
|
|
92
|
+
"input": "How can I contact support?",
|
|
93
|
+
"expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
|
|
94
|
+
"actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
|
|
95
|
+
"metadata": {}
|
|
96
|
+
}
|
|
97
|
+
]
|
|
98
|
+
}
|
|
99
|
+
]
|
|
100
|
+
},
|
|
101
|
+
{
|
|
102
|
+
"name": "test_customer_faq.py::test_faq_multi_evaluator",
|
|
103
|
+
"status": "failed",
|
|
104
|
+
"message": "AssertionError: 3/5 inputs failed on at least one evaluator",
|
|
105
|
+
"asserts": [
|
|
106
|
+
{
|
|
107
|
+
"evaluator_names": [
|
|
108
|
+
"MockFactuality",
|
|
109
|
+
"MockClosedQA"
|
|
110
|
+
],
|
|
111
|
+
"input_labels": [
|
|
112
|
+
"What is your return policy?",
|
|
113
|
+
"How do I track my order?",
|
|
114
|
+
"Do you offer international shipping?",
|
|
115
|
+
"What payment methods do you accept?",
|
|
116
|
+
"How can I contact support?"
|
|
117
|
+
],
|
|
118
|
+
"results": [
|
|
119
|
+
[
|
|
120
|
+
[
|
|
121
|
+
{
|
|
122
|
+
"score": 0.85,
|
|
123
|
+
"reasoning": "High similarity.",
|
|
124
|
+
"details": {}
|
|
125
|
+
},
|
|
126
|
+
{
|
|
127
|
+
"score": 0.3,
|
|
128
|
+
"reasoning": "Low keyword overlap.",
|
|
129
|
+
"details": {}
|
|
130
|
+
}
|
|
131
|
+
],
|
|
132
|
+
[
|
|
133
|
+
{
|
|
134
|
+
"score": 0.72,
|
|
135
|
+
"reasoning": "Moderate similarity.",
|
|
136
|
+
"details": {}
|
|
137
|
+
},
|
|
138
|
+
{
|
|
139
|
+
"score": 0.45,
|
|
140
|
+
"reasoning": "Below threshold keyword overlap.",
|
|
141
|
+
"details": {}
|
|
142
|
+
}
|
|
143
|
+
],
|
|
144
|
+
[
|
|
145
|
+
{
|
|
146
|
+
"score": 0.91,
|
|
147
|
+
"reasoning": "Very high similarity.",
|
|
148
|
+
"details": {}
|
|
149
|
+
},
|
|
150
|
+
{
|
|
151
|
+
"score": 0.6,
|
|
152
|
+
"reasoning": "Acceptable keyword overlap.",
|
|
153
|
+
"details": {}
|
|
154
|
+
}
|
|
155
|
+
],
|
|
156
|
+
[
|
|
157
|
+
{
|
|
158
|
+
"score": 0.68,
|
|
159
|
+
"reasoning": "Reasonable similarity.",
|
|
160
|
+
"details": {}
|
|
161
|
+
},
|
|
162
|
+
{
|
|
163
|
+
"score": 0.25,
|
|
164
|
+
"reasoning": "Poor keyword match.",
|
|
165
|
+
"details": {}
|
|
166
|
+
}
|
|
167
|
+
],
|
|
168
|
+
[
|
|
169
|
+
{
|
|
170
|
+
"score": 0.77,
|
|
171
|
+
"reasoning": "Good similarity.",
|
|
172
|
+
"details": {}
|
|
173
|
+
},
|
|
174
|
+
{
|
|
175
|
+
"score": 0.55,
|
|
176
|
+
"reasoning": "Marginal keyword overlap.",
|
|
177
|
+
"details": {}
|
|
178
|
+
}
|
|
179
|
+
]
|
|
180
|
+
]
|
|
181
|
+
],
|
|
182
|
+
"passed": false,
|
|
183
|
+
"criteria_message": "Fail: only 2/5 inputs passed on all evaluators (required 100%)",
|
|
184
|
+
"scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
|
|
185
|
+
"evaluable_dicts": [
|
|
186
|
+
{
|
|
187
|
+
"input": "What is your return policy?",
|
|
188
|
+
"expected_output": "You can return items within 30 days of purchase for a full refund.",
|
|
189
|
+
"actual_output": "Items can be returned within 30 days for a full refund.",
|
|
190
|
+
"metadata": {}
|
|
191
|
+
},
|
|
192
|
+
{
|
|
193
|
+
"input": "How do I track my order?",
|
|
194
|
+
"expected_output": "You can track your order using the tracking link sent to your email.",
|
|
195
|
+
"actual_output": "Check the tracking link in your confirmation email to track your order.",
|
|
196
|
+
"metadata": {}
|
|
197
|
+
},
|
|
198
|
+
{
|
|
199
|
+
"input": "Do you offer international shipping?",
|
|
200
|
+
"expected_output": "Yes, we ship to over 50 countries worldwide.",
|
|
201
|
+
"actual_output": "Yes, we offer international shipping to over 50 countries.",
|
|
202
|
+
"metadata": {}
|
|
203
|
+
},
|
|
204
|
+
{
|
|
205
|
+
"input": "What payment methods do you accept?",
|
|
206
|
+
"expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
|
|
207
|
+
"actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
|
|
208
|
+
"metadata": {}
|
|
209
|
+
},
|
|
210
|
+
{
|
|
211
|
+
"input": "How can I contact support?",
|
|
212
|
+
"expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
|
|
213
|
+
"actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
|
|
214
|
+
"metadata": {}
|
|
215
|
+
}
|
|
216
|
+
]
|
|
217
|
+
}
|
|
218
|
+
]
|
|
219
|
+
},
|
|
220
|
+
{
|
|
221
|
+
"name": "test_customer_faq.py::test_faq_no_hallucinations",
|
|
222
|
+
"status": "passed",
|
|
223
|
+
"message": null,
|
|
224
|
+
"asserts": [
|
|
225
|
+
{
|
|
226
|
+
"evaluator_names": [
|
|
227
|
+
"MockHallucination"
|
|
228
|
+
],
|
|
229
|
+
"input_labels": [
|
|
230
|
+
"What is your return policy?",
|
|
231
|
+
"How do I track my order?",
|
|
232
|
+
"Do you offer international shipping?",
|
|
233
|
+
"What payment methods do you accept?",
|
|
234
|
+
"How can I contact support?"
|
|
235
|
+
],
|
|
236
|
+
"results": [
|
|
237
|
+
[
|
|
238
|
+
[
|
|
239
|
+
{
|
|
240
|
+
"score": 0.95,
|
|
241
|
+
"reasoning": "No hallucination detected.",
|
|
242
|
+
"details": {}
|
|
243
|
+
}
|
|
244
|
+
],
|
|
245
|
+
[
|
|
246
|
+
{
|
|
247
|
+
"score": 0.95,
|
|
248
|
+
"reasoning": "No hallucination detected.",
|
|
249
|
+
"details": {}
|
|
250
|
+
}
|
|
251
|
+
],
|
|
252
|
+
[
|
|
253
|
+
{
|
|
254
|
+
"score": 0.95,
|
|
255
|
+
"reasoning": "No hallucination detected.",
|
|
256
|
+
"details": {}
|
|
257
|
+
}
|
|
258
|
+
],
|
|
259
|
+
[
|
|
260
|
+
{
|
|
261
|
+
"score": 0.95,
|
|
262
|
+
"reasoning": "No hallucination detected.",
|
|
263
|
+
"details": {}
|
|
264
|
+
}
|
|
265
|
+
],
|
|
266
|
+
[
|
|
267
|
+
{
|
|
268
|
+
"score": 0.95,
|
|
269
|
+
"reasoning": "No hallucination detected.",
|
|
270
|
+
"details": {}
|
|
271
|
+
}
|
|
272
|
+
]
|
|
273
|
+
]
|
|
274
|
+
],
|
|
275
|
+
"passed": true,
|
|
276
|
+
"criteria_message": "Pass: 5/5 inputs passed (threshold 0.5, required 100%)",
|
|
277
|
+
"scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
|
|
278
|
+
"evaluable_dicts": [
|
|
279
|
+
{
|
|
280
|
+
"input": "What is your return policy?",
|
|
281
|
+
"expected_output": "You can return items within 30 days of purchase for a full refund.",
|
|
282
|
+
"actual_output": "Items can be returned within 30 days for a full refund.",
|
|
283
|
+
"metadata": {}
|
|
284
|
+
},
|
|
285
|
+
{
|
|
286
|
+
"input": "How do I track my order?",
|
|
287
|
+
"expected_output": "You can track your order using the tracking link sent to your email.",
|
|
288
|
+
"actual_output": "Check the tracking link in your confirmation email to track your order.",
|
|
289
|
+
"metadata": {}
|
|
290
|
+
},
|
|
291
|
+
{
|
|
292
|
+
"input": "Do you offer international shipping?",
|
|
293
|
+
"expected_output": "Yes, we ship to over 50 countries worldwide.",
|
|
294
|
+
"actual_output": "Yes, we offer international shipping to over 50 countries.",
|
|
295
|
+
"metadata": {}
|
|
296
|
+
},
|
|
297
|
+
{
|
|
298
|
+
"input": "What payment methods do you accept?",
|
|
299
|
+
"expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
|
|
300
|
+
"actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
|
|
301
|
+
"metadata": {}
|
|
302
|
+
},
|
|
303
|
+
{
|
|
304
|
+
"input": "How can I contact support?",
|
|
305
|
+
"expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
|
|
306
|
+
"actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
|
|
307
|
+
"metadata": {}
|
|
308
|
+
}
|
|
309
|
+
]
|
|
310
|
+
}
|
|
311
|
+
]
|
|
312
|
+
},
|
|
313
|
+
{
|
|
314
|
+
"name": "test_customer_faq.py::test_faq_tone_check",
|
|
315
|
+
"status": "failed",
|
|
316
|
+
"message": "AssertionError: all 5 inputs failed",
|
|
317
|
+
"asserts": [
|
|
318
|
+
{
|
|
319
|
+
"evaluator_names": [
|
|
320
|
+
"MockStrictTone"
|
|
321
|
+
],
|
|
322
|
+
"input_labels": [
|
|
323
|
+
"What is your return policy?",
|
|
324
|
+
"How do I track my order?",
|
|
325
|
+
"Do you offer international shipping?",
|
|
326
|
+
"What payment methods do you accept?",
|
|
327
|
+
"How can I contact support?"
|
|
328
|
+
],
|
|
329
|
+
"results": [
|
|
330
|
+
[
|
|
331
|
+
[
|
|
332
|
+
{
|
|
333
|
+
"score": 0.2,
|
|
334
|
+
"reasoning": "Tone does not meet strict requirements.",
|
|
335
|
+
"details": {}
|
|
336
|
+
}
|
|
337
|
+
],
|
|
338
|
+
[
|
|
339
|
+
{
|
|
340
|
+
"score": 0.2,
|
|
341
|
+
"reasoning": "Tone does not meet strict requirements.",
|
|
342
|
+
"details": {}
|
|
343
|
+
}
|
|
344
|
+
],
|
|
345
|
+
[
|
|
346
|
+
{
|
|
347
|
+
"score": 0.2,
|
|
348
|
+
"reasoning": "Tone does not meet strict requirements.",
|
|
349
|
+
"details": {}
|
|
350
|
+
}
|
|
351
|
+
],
|
|
352
|
+
[
|
|
353
|
+
{
|
|
354
|
+
"score": 0.2,
|
|
355
|
+
"reasoning": "Tone does not meet strict requirements.",
|
|
356
|
+
"details": {}
|
|
357
|
+
}
|
|
358
|
+
],
|
|
359
|
+
[
|
|
360
|
+
{
|
|
361
|
+
"score": 0.2,
|
|
362
|
+
"reasoning": "Tone does not meet strict requirements.",
|
|
363
|
+
"details": {}
|
|
364
|
+
}
|
|
365
|
+
]
|
|
366
|
+
]
|
|
367
|
+
],
|
|
368
|
+
"passed": false,
|
|
369
|
+
"criteria_message": "Fail: 0/5 inputs passed (threshold 0.5, required 100%)",
|
|
370
|
+
"scoring_strategy": "Each evaluator score must be ≥ 0.5. At least 100% of test-case inputs must pass on all evaluators. Uses best-of-N-passes semantics (any single pass meeting criteria is sufficient).",
|
|
371
|
+
"evaluable_dicts": [
|
|
372
|
+
{
|
|
373
|
+
"input": "What is your return policy?",
|
|
374
|
+
"expected_output": "You can return items within 30 days of purchase for a full refund.",
|
|
375
|
+
"actual_output": "Items can be returned within 30 days for a full refund.",
|
|
376
|
+
"metadata": {}
|
|
377
|
+
},
|
|
378
|
+
{
|
|
379
|
+
"input": "How do I track my order?",
|
|
380
|
+
"expected_output": "You can track your order using the tracking link sent to your email.",
|
|
381
|
+
"actual_output": "Check the tracking link in your confirmation email to track your order.",
|
|
382
|
+
"metadata": {}
|
|
383
|
+
},
|
|
384
|
+
{
|
|
385
|
+
"input": "Do you offer international shipping?",
|
|
386
|
+
"expected_output": "Yes, we ship to over 50 countries worldwide.",
|
|
387
|
+
"actual_output": "Yes, we offer international shipping to over 50 countries.",
|
|
388
|
+
"metadata": {}
|
|
389
|
+
},
|
|
390
|
+
{
|
|
391
|
+
"input": "What payment methods do you accept?",
|
|
392
|
+
"expected_output": "We accept Visa, Mastercard, PayPal, and Apple Pay.",
|
|
393
|
+
"actual_output": "We accept major credit cards including Visa and Mastercard, plus PayPal and Apple Pay.",
|
|
394
|
+
"metadata": {}
|
|
395
|
+
},
|
|
396
|
+
{
|
|
397
|
+
"input": "How can I contact support?",
|
|
398
|
+
"expected_output": "Email us at support@example.com or call 1-800-EXAMPLE.",
|
|
399
|
+
"actual_output": "You can reach support at support@example.com or by calling 1-800-EXAMPLE.",
|
|
400
|
+
"metadata": {}
|
|
401
|
+
}
|
|
402
|
+
]
|
|
403
|
+
}
|
|
404
|
+
]
|
|
405
|
+
}
|
|
406
|
+
]
|
|
407
|
+
}
|