pixie-qa 0.1.2__tar.gz → 0.1.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/.github/copilot-instructions.md +110 -4
- pixie_qa-0.1.8/.gitignore +3 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/PKG-INFO +4 -5
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/README.md +2 -4
- pixie_qa-0.1.8/changelogs/deep-research-demo.md +43 -0
- pixie_qa-0.1.8/changelogs/pixie-test-e2e-suite.md +69 -0
- pixie_qa-0.1.8/changelogs/scorecard-branding-and-skill-version-check.md +41 -0
- pixie_qa-0.1.8/changelogs/scorecard-eval-detail-dialog.md +28 -0
- pixie_qa-0.1.8/changelogs/skill-v2-and-rootdir-discovery.md +76 -0
- pixie_qa-0.1.8/changelogs/test-scorecard.md +54 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/docs/package.md +24 -5
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/cli/main.py +3 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/cli/test_command.py +40 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/config.py +2 -2
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/evals/__init__.py +10 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/evals/eval_utils.py +60 -3
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/evals/runner.py +64 -11
- pixie_qa-0.1.8/pixie/evals/scorecard.py +815 -0
- pixie_qa-0.1.8/pixie/favicon.png +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/handlers.py +1 -1
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pyproject.toml +2 -1
- pixie_qa-0.1.8/skills/eval-driven-dev/SKILL.md +852 -0
- {pixie_qa-0.1.2/.claude → pixie_qa-0.1.8}/skills/eval-driven-dev/references/pixie-api.md +8 -8
- pixie_qa-0.1.8/skills/eval-driven-dev/resources/check_version.py +70 -0
- pixie_qa-0.1.8/skills/eval-driven-dev/resources/version.json +4 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/agent-skill.md +13 -6
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/evals-harness.md +105 -2
- pixie_qa-0.1.8/tests/pixie/cli/e2e_cases.json +183 -0
- pixie_qa-0.1.8/tests/pixie/cli/e2e_fixtures/conftest.py +9 -0
- pixie_qa-0.1.8/tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json +45 -0
- pixie_qa-0.1.8/tests/pixie/cli/e2e_fixtures/mock_evaluators.py +156 -0
- pixie_qa-0.1.8/tests/pixie/cli/e2e_fixtures/test_customer_faq.py +106 -0
- pixie_qa-0.1.8/tests/pixie/cli/test_e2e_pixie_test.py +343 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/evals/test_runner.py +128 -0
- pixie_qa-0.1.8/tests/pixie/evals/test_scorecard.py +487 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/test_config.py +3 -3
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev/SKILL.md +0 -522
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/benchmark.json +0 -363
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/benchmark.md +0 -13
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/eval_metadata.json +0 -13
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/with_skill/outputs/metrics.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/with_skill/outputs/response.md +0 -176
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/with_skill/run-1/grading.json +0 -43
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/with_skill/run-1/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/without_skill/outputs/metrics.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/without_skill/outputs/response.md +0 -180
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/without_skill/run-1/grading.json +0 -44
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/without_skill/run-1/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/eval_metadata.json +0 -13
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/with_skill/outputs/metrics.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/with_skill/outputs/response.md +0 -330
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/with_skill/run-1/grading.json +0 -44
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/with_skill/run-1/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/without_skill/outputs/metrics.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/without_skill/outputs/response.md +0 -387
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/without_skill/run-1/grading.json +0 -44
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/without_skill/run-1/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/eval_metadata.json +0 -14
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/with_skill/outputs/metrics.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/with_skill/outputs/response.md +0 -329
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/with_skill/run-1/grading.json +0 -49
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/with_skill/run-1/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/without_skill/outputs/metrics.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/without_skill/outputs/response.md +0 -243
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/without_skill/run-1/grading.json +0 -49
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/without_skill/run-1/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/benchmark.json +0 -353
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/benchmark.md +0 -13
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/eval_metadata.json +0 -13
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/grading.json +0 -51
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/outputs/metrics.json +0 -33
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/outputs/summary.md +0 -49
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/project/pixie_datasets/qa-golden-set.json +0 -23
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/project/qa_app.py +0 -26
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/project/requirements.txt +0 -2
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/project/tests/test_qa.py +0 -24
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/timing.json +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/grading.json +0 -51
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/outputs/metrics.json +0 -47
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/outputs/summary.md +0 -87
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/project/pixie_datasets/qa-golden-set.json +0 -23
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/project/qa_app.py +0 -26
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/project/requirements.txt +0 -2
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/project/tests/test_qa.py +0 -46
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/timing.json +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/eval_metadata.json +0 -13
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/grading.json +0 -52
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/outputs/metrics.json +0 -45
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/outputs/summary.md +0 -80
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project/MEMORY.md +0 -83
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project/build_dataset.py +0 -141
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project/extractor.py +0 -46
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project/requirements.txt +0 -2
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project/tests/test_email_extraction.py +0 -229
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/timing.json +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/grading.json +0 -52
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/outputs/metrics.json +0 -28
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/outputs/summary.md +0 -56
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/project/build_dataset.py +0 -108
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/project/extractor.py +0 -55
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/project/requirements.txt +0 -2
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/project/test_extractor.py +0 -290
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/timing.json +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/eval_metadata.json +0 -13
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/grading.json +0 -51
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/outputs/metrics.json +0 -15
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/outputs/summary.md +0 -75
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/project/MEMORY.md +0 -52
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/project/build_dataset.py +0 -91
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/project/chatbot.py +0 -60
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/project/requirements.txt +0 -2
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/project/tests/test_rag_chatbot.py +0 -109
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/timing.json +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/grading.json +0 -52
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/outputs/metrics.json +0 -14
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/outputs/summary.md +0 -50
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/project/build_dataset.py +0 -56
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/project/chatbot.py +0 -66
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/project/requirements.txt +0 -2
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/project/test_chatbot.py +0 -137
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/timing.json +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/benchmark.json +0 -363
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/benchmark.md +0 -13
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/eval_metadata.json +0 -12
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/grading.json +0 -47
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/project/MEMORY.md +0 -40
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/project/pixie_datasets/qa-golden-set.json +0 -23
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/project/qa_app.py +0 -26
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/project/requirements.txt +0 -2
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/project/tests/test_qa.py +0 -25
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/run-1/grading.json +0 -47
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/run-1/outputs/MEMORY.md +0 -40
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/run-1/outputs/test_qa.py +0 -25
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/grading.json +0 -53
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/project/INVESTIGATION_NOTES.md +0 -74
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/project/pixie_datasets/qa-golden-set.json +0 -23
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/project/qa_app.py +0 -26
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/project/requirements.txt +0 -2
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/project/tests/test_qa.py +0 -83
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/run-1/grading.json +0 -53
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/run-1/outputs/INVESTIGATION_NOTES.md +0 -74
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/run-1/outputs/test_qa.py +0 -83
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/eval_metadata.json +0 -14
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/grading.json +0 -57
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/MEMORY.md +0 -23
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/build_dataset.py +0 -91
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/extractor.py +0 -64
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/requirements.txt +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/run_evals.sh +0 -23
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/tests/test_classifier.py +0 -117
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/run-1/grading.json +0 -57
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/run-1/outputs/MEMORY.md +0 -23
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/run-1/outputs/build_dataset.py +0 -91
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/run-1/outputs/extractor.py +0 -64
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/run-1/outputs/test_classifier.py +0 -117
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/grading.json +0 -63
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/project/collect_traces.py +0 -80
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/project/extractor.py +0 -57
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/project/requirements.txt +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/run-1/grading.json +0 -63
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/run-1/outputs/collect_traces.py +0 -80
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/run-1/outputs/extractor.py +0 -57
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/eval_metadata.json +0 -14
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/grading.json +0 -67
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/MEMORY.md +0 -27
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/chatbot.py +0 -51
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/pixie_datasets/rag-chatbot-golden.json +0 -37
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/pixie_observations.db +0 -0
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/requirements.txt +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/tests/test_chatbot.py +0 -21
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/run-1/grading.json +0 -67
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/run-1/outputs/MEMORY.md +0 -27
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/run-1/outputs/chatbot.py +0 -51
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/run-1/outputs/test_chatbot.py +0 -21
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/timing.json +0 -6
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/grading.json +0 -63
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/project/capture_traces.py +0 -92
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/project/chatbot.py +0 -53
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/project/requirements.txt +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/project/test_chatbot_evals.py +0 -273
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/run-1/grading.json +0 -63
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/run-1/outputs/capture_traces.py +0 -92
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/run-1/outputs/chatbot.py +0 -53
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/run-1/outputs/test_chatbot_evals.py +0 -273
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/benchmark.json +0 -363
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/benchmark.md +0 -13
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/eval_metadata.json +0 -12
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/grading.json +0 -40
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/project/MEMORY.md +0 -40
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/project/pixie_datasets/qa-golden-set.json +0 -23
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/project/qa_app.py +0 -26
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/project/requirements.txt +0 -2
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/project/tests/test_qa.py +0 -25
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/run-1/grading.json +0 -40
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/run-1/outputs/MEMORY.md +0 -40
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/run-1/outputs/test_qa.py +0 -25
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/grading.json +0 -51
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/project/pixie_datasets/qa-golden-set.json +0 -23
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/project/qa_app.py +0 -26
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/project/requirements.txt +0 -2
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/project/tests/test_qa.py +0 -24
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/run-1/grading.json +0 -51
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/run-1/outputs/test_qa.py +0 -24
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/eval_metadata.json +0 -14
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/grading.json +0 -50
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/MEMORY.md +0 -23
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/extractor.py +0 -63
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/pixie_datasets/email-classifier-golden.json +0 -29
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/pixie_observations.db +0 -0
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/requirements.txt +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/tests/test_email_classifier.py +0 -86
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/run-1/grading.json +0 -50
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/run-1/outputs/MEMORY.md +0 -23
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/run-1/outputs/extractor.py +0 -63
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/run-1/outputs/test_email_classifier.py +0 -86
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/timing.json +0 -6
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/grading.json +0 -57
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/conftest.py +0 -2
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/extractor.py +0 -57
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/generate_dataset.py +0 -78
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/instrumented_extractor.py +0 -22
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/pytest.ini +0 -2
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/requirements.txt +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/test_email_classifier.py +0 -329
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/run-1/grading.json +0 -57
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/run-1/outputs/extractor.py +0 -57
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/run-1/outputs/test_email_classifier.py +0 -329
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/eval_metadata.json +0 -14
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/grading.json +0 -50
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/MEMORY.md +0 -22
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/chatbot.py +0 -52
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/pixie_datasets/rag-chatbot-golden.json +0 -29
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/pixie_observations.db +0 -0
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/requirements.txt +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/tests/test_rag_chatbot.py +0 -28
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/run-1/grading.json +0 -50
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/run-1/outputs/MEMORY.md +0 -22
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/run-1/outputs/chatbot.py +0 -52
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/run-1/outputs/test_rag_chatbot.py +0 -28
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/timing.json +0 -6
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/grading.json +0 -57
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/project/chatbot.py +0 -46
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/project/chatbot_instrumented.py +0 -72
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/project/requirements.txt +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/project/save_dataset.py +0 -86
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/project/test_chatbot_evals.py +0 -180
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/run-1/grading.json +0 -57
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/run-1/outputs/chatbot_instrumented.py +0 -72
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/run-1/outputs/test_chatbot_evals.py +0 -180
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/benchmark.json +0 -363
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/benchmark.md +0 -13
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/eval_metadata.json +0 -12
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/grading.json +0 -71
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/project/MEMORY.md +0 -51
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/project/pixie_datasets/qa-golden-set.json +0 -23
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/project/qa_app.py +0 -26
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/project/requirements.txt +0 -2
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/project/tests/test_qa.py +0 -60
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/grading.json +0 -71
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/outputs/MEMORY.md +0 -51
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/outputs/pixie_datasets/qa-golden-set.json +0 -23
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/outputs/qa_app.py +0 -26
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/outputs/requirements.txt +0 -2
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/outputs/tests/test_qa.py +0 -60
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/grading.json +0 -77
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/project/MEMORY.md +0 -48
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/project/pixie_datasets/qa-golden-set.json +0 -23
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/project/qa_app.py +0 -26
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/project/requirements.txt +0 -2
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/project/tests/test_qa.py +0 -44
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/grading.json +0 -77
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/outputs/MEMORY.md +0 -48
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/outputs/pixie_datasets/qa-golden-set.json +0 -23
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/outputs/qa_app.py +0 -26
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/outputs/requirements.txt +0 -2
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/outputs/tests/test_qa.py +0 -44
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/eval_metadata.json +0 -14
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/grading.json +0 -77
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/project/MEMORY.md +0 -48
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/project/build_dataset.py +0 -93
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/project/extractor.py +0 -65
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/project/requirements.txt +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/project/tests/test_email_classifier.py +0 -22
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/grading.json +0 -77
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/outputs/MEMORY.md +0 -48
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/outputs/build_dataset.py +0 -93
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/outputs/extractor.py +0 -65
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/outputs/requirements.txt +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/outputs/tests/test_email_classifier.py +0 -22
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/grading.json +0 -82
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/project/build_dataset.py +0 -156
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/project/extractor.py +0 -62
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/project/requirements.txt +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/project/test_email_classifier.py +0 -345
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/run-1/grading.json +0 -82
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/run-1/outputs/build_dataset.py +0 -156
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/run-1/outputs/extractor.py +0 -62
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/run-1/outputs/requirements.txt +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/run-1/outputs/test_email_classifier.py +0 -345
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/eval_metadata.json +0 -14
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/grading.json +0 -81
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/project/MEMORY.md +0 -71
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/project/build_dataset.py +0 -63
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/project/chatbot.py +0 -53
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/project/requirements.txt +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/project/tests/test_rag_chatbot.py +0 -54
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/grading.json +0 -81
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/outputs/MEMORY.md +0 -71
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/outputs/build_dataset.py +0 -63
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/outputs/chatbot.py +0 -53
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/outputs/requirements.txt +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/outputs/tests/test_rag_chatbot.py +0 -54
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/grading.json +0 -81
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/project/MEMORY.md +0 -62
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/project/chatbot.py +0 -52
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/project/datasets/rag-chatbot-golden.json +0 -41
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/project/requirements.txt +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/project/test_chatbot_eval.py +0 -152
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/grading.json +0 -81
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/outputs/MEMORY.md +0 -62
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/outputs/chatbot.py +0 -52
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/outputs/datasets/rag-chatbot-golden.json +0 -41
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/outputs/requirements.txt +0 -1
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/outputs/test_chatbot_eval.py +0 -152
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/timing.json +0 -5
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/review-iteration-1.html +0 -1325
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/review-iteration-2.html +0 -1325
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/review-iteration-3.html +0 -1325
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/review-iteration-4.html +0 -1325
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/review-iteration-5.html +0 -1325
- pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/trigger-eval-set.json +0 -82
- pixie_qa-0.1.2/.github/workflows/daily-release.yml +0 -139
- pixie_qa-0.1.2/.gitignore +0 -4
- pixie_qa-0.1.2/tests/pixie/observation_store/__init__.py +0 -0
- pixie_qa-0.1.2/uv.lock +0 -1299
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/.github/workflows/publish.yml +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/LICENSE +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/async-handler-processing.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/autoevals-adapters.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/cli-dataset-commands.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/dataset-management.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/eval-harness.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/expected-output-in-evals.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/instrumentation-module-implementation.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/loud-failure-mode.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/manual-instrumentation-usability.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/observation-store-implementation.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/pixie-directory-and-skill-improvements.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/root-package-exports-and-trace-id.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/usability-utils.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/__init__.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/cli/__init__.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/cli/dataset_command.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/dataset/__init__.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/dataset/models.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/dataset/store.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/evals/criteria.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/evals/evaluation.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/evals/scorers.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/evals/trace_capture.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/evals/trace_helpers.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/__init__.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/context.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/handler.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/instrumentors.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/observation.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/processor.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/queue.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/spans.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/storage/__init__.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/storage/evaluable.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/storage/piccolo_conf.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/storage/piccolo_migrations/__init__.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/storage/serialization.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/storage/store.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/storage/tables.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/storage/tree.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/agent-skill-1.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/autoevals-adapters.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/dataset-management.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/expected-output-in-evals.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/instrumentation.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/manual-instrumentation-usability.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/storage.md +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/usability-utils.md +0 -0
- {pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project → pixie_qa-0.1.8}/tests/__init__.py +0 -0
- {pixie_qa-0.1.2/tests → pixie_qa-0.1.8/tests/pixie}/__init__.py +0 -0
- {pixie_qa-0.1.2/tests/pixie → pixie_qa-0.1.8/tests/pixie/cli}/__init__.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/cli/test_dataset_command.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/cli/test_main.py +0 -0
- {pixie_qa-0.1.2/tests/pixie/cli → pixie_qa-0.1.8/tests/pixie/dataset}/__init__.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/dataset/test_models.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/dataset/test_store.py +0 -0
- {pixie_qa-0.1.2/tests/pixie/dataset → pixie_qa-0.1.8/tests/pixie/evals}/__init__.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/evals/test_criteria.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/evals/test_eval_utils.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/evals/test_evaluation.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/evals/test_scorers.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/evals/test_trace_capture.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/evals/test_trace_helpers.py +0 -0
- {pixie_qa-0.1.2/tests/pixie/evals → pixie_qa-0.1.8/tests/pixie/instrumentation}/__init__.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/conftest.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/test_context.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/test_handler.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/test_integration.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/test_observation.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/test_processor.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/test_queue.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/test_spans.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/test_storage_handler.py +0 -0
- {pixie_qa-0.1.2/tests/pixie/instrumentation → pixie_qa-0.1.8/tests/pixie/observation_store}/__init__.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/observation_store/conftest.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/observation_store/test_evaluable.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/observation_store/test_serialization.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/observation_store/test_store.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/observation_store/test_tree.py +0 -0
- {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/test_init.py +0 -0
|
@@ -144,6 +144,103 @@ uv run pytest -k "test_function_name" # Run specific test
|
|
|
144
144
|
uv run pytest --cov=pixie # Run with coverage report
|
|
145
145
|
```
|
|
146
146
|
|
|
147
|
+
### 4a. End-to-End Tests for `pixie test`
|
|
148
|
+
|
|
149
|
+
The `pixie test` CLI command has a dedicated e2e test suite that verifies the full
|
|
150
|
+
command lifecycle — test discovery, execution, console output, exit codes, and
|
|
151
|
+
HTML scorecard generation. The suite uses **realistic fixtures** that mirror how
|
|
152
|
+
a real user would configure datasets, evaluators, and test files.
|
|
153
|
+
|
|
154
|
+
**Fixture layout:**
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
tests/pixie/cli/
|
|
158
|
+
e2e_fixtures/
|
|
159
|
+
datasets/
|
|
160
|
+
customer-faq.json # 5-item golden dataset (Evaluable items)
|
|
161
|
+
mock_evaluators.py # Deterministic mock evaluators (no LLM calls)
|
|
162
|
+
test_customer_faq.py # Realistic test file using assert_dataset_pass
|
|
163
|
+
e2e_cases.json # Edge-case scenario definitions
|
|
164
|
+
test_e2e_pixie_test.py # Automated pytest e2e tests
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
The automated pytest file (`test_e2e_pixie_test.py`) contains two test classes:
|
|
168
|
+
|
|
169
|
+
1. **`TestPixieTestRealisticE2E`** (10 tests) — Runs `pixie test` on the
|
|
170
|
+
realistic fixture (`test_customer_faq.py`) that uses 4 evaluator/criteria
|
|
171
|
+
combinations against the customer-FAQ dataset. Verifies exit code, console
|
|
172
|
+
summary, test names, check/cross marks, scorecard HTML generation, evaluator
|
|
173
|
+
names, PASS/FAIL badges, per-input scores, summary counts, and scoring
|
|
174
|
+
strategy descriptions.
|
|
175
|
+
|
|
176
|
+
2. **`TestPixieTestEdgeCases`** (32 tests) — Parametrised from `e2e_cases.json`
|
|
177
|
+
covering empty dirs, filters, verbose mode, single file targeting, etc.
|
|
178
|
+
|
|
179
|
+
**Mock evaluators** (`e2e_fixtures/mock_evaluators.py`) are deterministic
|
|
180
|
+
replacements for LLM-as-judge evaluators. They use string similarity, keyword
|
|
181
|
+
overlap, or fixed scores to produce realistic but reproducible results:
|
|
182
|
+
- `MockFactualityEval` — SequenceMatcher string similarity (most items pass)
|
|
183
|
+
- `MockClosedQAEval` — keyword overlap ratio (strict; some items fail)
|
|
184
|
+
- `MockHallucinationEval` — always returns score 0.95
|
|
185
|
+
- `MockFailingEval` (name="MockStrictTone") — always returns score 0.2
|
|
186
|
+
|
|
187
|
+
**Expected realistic fixture results:**
|
|
188
|
+
- `test_faq_factuality` → PASS (MockFactuality, threshold=0.6, pct=0.8)
|
|
189
|
+
- `test_faq_multi_evaluator` → FAIL (MockFactuality+MockClosedQA, threshold=0.5, pct=1.0)
|
|
190
|
+
- `test_faq_no_hallucinations` → PASS (MockHallucination, threshold=0.5, pct=1.0)
|
|
191
|
+
- `test_faq_tone_check` → FAIL (MockStrictTone, threshold=0.5, pct=1.0)
|
|
192
|
+
- Console: "2 passed, 2 failed", exit code 1
|
|
193
|
+
- Scorecard: HTML with evaluator names, scores, PASS/FAIL badges
|
|
194
|
+
|
|
195
|
+
**When to run e2e tests:**
|
|
196
|
+
|
|
197
|
+
Run the e2e suite whenever you change anything in:
|
|
198
|
+
- `pixie/cli/test_command.py` — the `pixie test` entry point
|
|
199
|
+
- `pixie/evals/runner.py` — test discovery, execution, formatting
|
|
200
|
+
- `pixie/evals/scorecard.py` — scorecard models, HTML generation
|
|
201
|
+
- `pixie/evals/eval_utils.py` — `assert_pass` / `assert_dataset_pass`
|
|
202
|
+
- `pixie/evals/criteria.py` — pass criteria
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
uv run pytest tests/pixie/cli/test_e2e_pixie_test.py -v # Run all 42 e2e tests
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
**Agent verification protocol (manual inspection):**
|
|
209
|
+
|
|
210
|
+
In addition to the automated pytest tests, the coding agent should manually
|
|
211
|
+
verify the `pixie test` output after making changes to CLI/eval/scorecard code:
|
|
212
|
+
|
|
213
|
+
1. **Run the realistic fixture directly:**
|
|
214
|
+
```bash
|
|
215
|
+
PIXIE_ROOT=/tmp/pixie_e2e_verify uv run pixie test tests/pixie/cli/e2e_fixtures/test_customer_faq.py
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
2. **Inspect the console output** — verify that:
|
|
219
|
+
- All 4 test names appear with correct ✓/✗ marks
|
|
220
|
+
- Summary shows "2 passed, 2 failed"
|
|
221
|
+
- No unexpected errors or tracebacks
|
|
222
|
+
|
|
223
|
+
3. **Inspect the HTML scorecard** — open the generated file and verify:
|
|
224
|
+
- All 4 evaluator names appear (MockFactuality, MockClosedQA, etc.)
|
|
225
|
+
- Per-input score cells show reasonable numeric values
|
|
226
|
+
- PASS/FAIL badges match expectations (2 PASS, 2 FAIL)
|
|
227
|
+
- Scoring strategy descriptions are human-readable
|
|
228
|
+
- The scorecard is well-formatted and renders correctly
|
|
229
|
+
|
|
230
|
+
4. **Evaluate holistically** — given the dataset contents and evaluator
|
|
231
|
+
definitions, do the scores and pass/fail outcomes make sense? For example,
|
|
232
|
+
MockFactuality should score high on items where `eval_output` is similar to
|
|
233
|
+
`expected_output`, and MockStrictTone should always fail.
|
|
234
|
+
|
|
235
|
+
This manual step catches rendering issues, layout regressions, and semantic
|
|
236
|
+
correctness problems that simple string assertions can miss.
|
|
237
|
+
|
|
238
|
+
**Adding new edge-case scenarios:**
|
|
239
|
+
|
|
240
|
+
1. Add a new object to `tests/pixie/cli/e2e_cases.json`.
|
|
241
|
+
2. Run `uv run pytest tests/pixie/cli/test_e2e_pixie_test.py -v` to verify.
|
|
242
|
+
3. No code changes needed in the test file — it auto-discovers all cases.
|
|
243
|
+
|
|
147
244
|
### 5. Test Quality Guidelines
|
|
148
245
|
|
|
149
246
|
**Good tests are:**
|
|
@@ -344,11 +441,17 @@ uv run ruff format . # Format code
|
|
|
344
441
|
Before committing, run:
|
|
345
442
|
|
|
346
443
|
```bash
|
|
347
|
-
uv run pytest # All tests must pass
|
|
444
|
+
uv run pytest # All tests must pass (includes e2e)
|
|
348
445
|
uv run mypy pixie/ # Zero type errors
|
|
349
446
|
uv run ruff check . # No linting errors
|
|
350
447
|
```
|
|
351
448
|
|
|
449
|
+
When changing `pixie test` or scorecard-related code, also run the e2e suite explicitly:
|
|
450
|
+
|
|
451
|
+
```bash
|
|
452
|
+
uv run pytest tests/pixie/cli/test_e2e_pixie_test.py -v # Verify pixie test e2e
|
|
453
|
+
```
|
|
454
|
+
|
|
352
455
|
Also verify **zero Pylance errors** in the VS Code Problems panel (Pylance can catch type mismatches that mypy misses for untyped third-party packages).
|
|
353
456
|
|
|
354
457
|
---
|
|
@@ -509,6 +612,8 @@ This project has strict error-handling conventions due to operating inside OTel
|
|
|
509
612
|
6. ✅ Update docstrings / `README.md` / relevant `specs/` docs
|
|
510
613
|
7. ✅ Add/update `changelogs/<feature>.md` for non-trivial changes
|
|
511
614
|
8. ✅ Verify functionality works as expected
|
|
615
|
+
9. ✅ If touching `pixie test` / scorecard / runner / eval code, run `uv run pytest tests/pixie/cli/test_e2e_pixie_test.py -v` — all 42 e2e tests must pass (10 realistic + 32 edge-case)
|
|
616
|
+
10. ✅ If touching `pixie test` / scorecard code, also run the **agent verification protocol** (section 4a) — manually run `pixie test` on the realistic fixture and inspect console + scorecard output
|
|
512
617
|
|
|
513
618
|
**Development cycle:**
|
|
514
619
|
|
|
@@ -519,8 +624,9 @@ This project has strict error-handling conventions due to operating inside OTel
|
|
|
519
624
|
5. Implement feature (reuse existing code when possible)
|
|
520
625
|
6. After each task: run tests and type check
|
|
521
626
|
7. Run linting (`uv run ruff check .`)
|
|
522
|
-
8.
|
|
523
|
-
9.
|
|
524
|
-
10.
|
|
627
|
+
8. Run `pixie test` e2e suite if CLI/eval/scorecard code changed
|
|
628
|
+
9. Update docs and changelog for the task
|
|
629
|
+
10. Fix any issues
|
|
630
|
+
11. Commit
|
|
525
631
|
|
|
526
632
|
Following these practices ensures high code quality, type safety, maintainability, and reliability.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pixie-qa
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.8
|
|
4
4
|
Summary: Automated quality assurance for AI applications
|
|
5
5
|
Project-URL: Homepage, https://github.com/yiouli/pixie-qa
|
|
6
6
|
Project-URL: Repository, https://github.com/yiouli/pixie-qa
|
|
@@ -45,6 +45,7 @@ Requires-Dist: opentelemetry-api>=1.27.0
|
|
|
45
45
|
Requires-Dist: opentelemetry-sdk>=1.27.0
|
|
46
46
|
Requires-Dist: piccolo[sqlite]>=1.33.0
|
|
47
47
|
Requires-Dist: pydantic>=2.0
|
|
48
|
+
Requires-Dist: python-dotenv>=1.2.2
|
|
48
49
|
Provides-Extra: all
|
|
49
50
|
Requires-Dist: openinference-instrumentation-anthropic; extra == 'all'
|
|
50
51
|
Requires-Dist: openinference-instrumentation-dspy; extra == 'all'
|
|
@@ -67,13 +68,11 @@ Description-Content-Type: text/markdown
|
|
|
67
68
|
|
|
68
69
|
An agent skill for **eval-driven development** of LLM-powered applications.
|
|
69
70
|
|
|
70
|
-
Use this skill to instrument your app, build golden datasets from real runs, write eval-based tests, and catch regressions before they ship — all from a single conversation with Claude.
|
|
71
|
-
|
|
72
71
|
## What the Skill Does
|
|
73
72
|
|
|
74
73
|
The `eval-driven-dev` skill guides your coding agent through the full QA loop for LLM applications:
|
|
75
74
|
|
|
76
|
-
1. **Understand the
|
|
75
|
+
1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
|
|
77
76
|
2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
|
|
78
77
|
3. **Build a dataset** — save representative traces as test cases with `pixie dataset save`
|
|
79
78
|
4. **Write eval tests** — generate `test_*.py` files with `assert_dataset_pass` and appropriate evaluators
|
|
@@ -85,7 +84,7 @@ The `eval-driven-dev` skill guides your coding agent through the full QA loop fo
|
|
|
85
84
|
### 1. Add the skill to your coding agent
|
|
86
85
|
|
|
87
86
|
```bash
|
|
88
|
-
npx
|
|
87
|
+
npx skills add yiouli/pixie-qa
|
|
89
88
|
```
|
|
90
89
|
|
|
91
90
|
The accompanying Python package is installed automatically by the skill when it's used.
|
|
@@ -2,13 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
An agent skill for **eval-driven development** of LLM-powered applications.
|
|
4
4
|
|
|
5
|
-
Use this skill to instrument your app, build golden datasets from real runs, write eval-based tests, and catch regressions before they ship — all from a single conversation with Claude.
|
|
6
|
-
|
|
7
5
|
## What the Skill Does
|
|
8
6
|
|
|
9
7
|
The `eval-driven-dev` skill guides your coding agent through the full QA loop for LLM applications:
|
|
10
8
|
|
|
11
|
-
1. **Understand the
|
|
9
|
+
1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
|
|
12
10
|
2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
|
|
13
11
|
3. **Build a dataset** — save representative traces as test cases with `pixie dataset save`
|
|
14
12
|
4. **Write eval tests** — generate `test_*.py` files with `assert_dataset_pass` and appropriate evaluators
|
|
@@ -20,7 +18,7 @@ The `eval-driven-dev` skill guides your coding agent through the full QA loop fo
|
|
|
20
18
|
### 1. Add the skill to your coding agent
|
|
21
19
|
|
|
22
20
|
```bash
|
|
23
|
-
npx
|
|
21
|
+
npx skills add yiouli/pixie-qa
|
|
24
22
|
```
|
|
25
23
|
|
|
26
24
|
The accompanying Python package is installed automatically by the skill when it's used.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Deep Research Demo Project
|
|
2
|
+
|
|
3
|
+
## What Changed
|
|
4
|
+
|
|
5
|
+
Added a simplified demo project under `demo/deep_research/` based on the
|
|
6
|
+
[GPT Researcher](https://github.com/assafelovic/gpt-researcher) project
|
|
7
|
+
(commit `7c32174`, Apache 2.0 license).
|
|
8
|
+
|
|
9
|
+
The demo serves as a real-world AI application that the **pixie-qa** skill
|
|
10
|
+
can be tested against.
|
|
11
|
+
|
|
12
|
+
### Simplifications from the original
|
|
13
|
+
|
|
14
|
+
| Removed | Reason |
|
|
15
|
+
|---------|--------|
|
|
16
|
+
| UI (frontend + backend server) | Not needed for programmatic evaluation |
|
|
17
|
+
| Deep Research mode | Complex multi-agent workflow, out of scope |
|
|
18
|
+
| Image generation | Not needed for text-based evaluation |
|
|
19
|
+
| MCP integrations | External tool integrations, not needed |
|
|
20
|
+
| All retrievers except DuckDuckGo | Simplifies dependencies, avoids paid APIs |
|
|
21
|
+
| Tavily / Firecrawl scrapers | Paid service dependencies |
|
|
22
|
+
| PDF / DOCX export | Not needed for evaluation |
|
|
23
|
+
| Docker / Terraform / multi-agent | Infrastructure, not needed |
|
|
24
|
+
|
|
25
|
+
### What remains
|
|
26
|
+
|
|
27
|
+
- Programmatic entry point (`run.py`) to run research with a string query
|
|
28
|
+
- Full agent workflow: query → sub-queries → web search → scrape → summarize → report
|
|
29
|
+
- DuckDuckGo as the sole search retriever (free, no API key needed for search)
|
|
30
|
+
- OpenAI as the LLM provider (requires `OPENAI_API_KEY`)
|
|
31
|
+
|
|
32
|
+
## Files Affected
|
|
33
|
+
|
|
34
|
+
- `demo/deep_research/` — entire new directory
|
|
35
|
+
- `demo/deep_research/gpt_researcher/` — simplified core library
|
|
36
|
+
- `demo/deep_research/run.py` — entry point
|
|
37
|
+
- `demo/deep_research/pyproject.toml` — dependencies
|
|
38
|
+
- `demo/deep_research/LICENSE` — Apache 2.0 (from upstream)
|
|
39
|
+
- `demo/deep_research/NOTICE` — attribution notice
|
|
40
|
+
|
|
41
|
+
## Migration Notes
|
|
42
|
+
|
|
43
|
+
This is a new addition — no migration required.
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# pixie test — e2e test suite
|
|
2
|
+
|
|
3
|
+
## What changed
|
|
4
|
+
|
|
5
|
+
Added a comprehensive end-to-end test suite for the `pixie test` CLI command
|
|
6
|
+
with two complementary layers:
|
|
7
|
+
|
|
8
|
+
1. **Realistic fixture tests** (10 tests) — run `pixie test` on a realistic
|
|
9
|
+
test file with a 5-item customer-FAQ golden dataset and 4 deterministic
|
|
10
|
+
mock evaluators. Verifies exit code, console summary, test names,
|
|
11
|
+
check/cross marks, scorecard HTML generation, evaluator names, PASS/FAIL
|
|
12
|
+
badges, per-input scores, summary counts, and scoring strategy descriptions.
|
|
13
|
+
|
|
14
|
+
2. **Edge-case tests** (32 tests) — parametrised from `e2e_cases.json`
|
|
15
|
+
covering empty dirs, filters, verbose mode, single file targeting, etc.
|
|
16
|
+
|
|
17
|
+
The copilot instructions now include an **agent verification protocol** that
|
|
18
|
+
tells the coding agent to manually run `pixie test` on the realistic fixtures
|
|
19
|
+
and holistically evaluate the console output and HTML scorecard after making
|
|
20
|
+
changes to CLI/eval/scorecard code.
|
|
21
|
+
|
|
22
|
+
### New files
|
|
23
|
+
|
|
24
|
+
- **`tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json`** — 5-item
|
|
25
|
+
golden dataset with FAQ questions, chatbot answers, and reference answers.
|
|
26
|
+
|
|
27
|
+
- **`tests/pixie/cli/e2e_fixtures/mock_evaluators.py`** — 4 deterministic
|
|
28
|
+
mock evaluators: MockFactualityEval (SequenceMatcher string similarity),
|
|
29
|
+
MockClosedQAEval (keyword overlap), MockHallucinationEval (always 0.95),
|
|
30
|
+
MockFailingEval/MockStrictTone (always 0.2). No LLM calls.
|
|
31
|
+
|
|
32
|
+
- **`tests/pixie/cli/e2e_fixtures/test_customer_faq.py`** — Realistic test
|
|
33
|
+
file using `assert_dataset_pass` with different scoring strategies.
|
|
34
|
+
Expected: 2 PASS (`test_faq_factuality`, `test_faq_no_hallucinations`),
|
|
35
|
+
2 FAIL (`test_faq_multi_evaluator`, `test_faq_tone_check`).
|
|
36
|
+
|
|
37
|
+
- **`tests/pixie/cli/e2e_cases.json`** — 8 edge-case scenarios as JSON data.
|
|
38
|
+
|
|
39
|
+
- **`tests/pixie/cli/test_e2e_pixie_test.py`** — Two test classes:
|
|
40
|
+
`TestPixieTestRealisticE2E` (10 tests) and `TestPixieTestEdgeCases`
|
|
41
|
+
(32 tests). Total: 42 test cases.
|
|
42
|
+
|
|
43
|
+
### Modified files
|
|
44
|
+
|
|
45
|
+
- **`.github/copilot-instructions.md`** — Rewrote section 4a with realistic
|
|
46
|
+
fixture layout, mock evaluator descriptions, expected results, and a full
|
|
47
|
+
agent verification protocol. Updated summary checklist to require both
|
|
48
|
+
automated e2e tests (42) and manual agent inspection.
|
|
49
|
+
|
|
50
|
+
- **`specs/evals-harness.md`** — Updated E2E Test Suite section to describe
|
|
51
|
+
both realistic fixtures and edge-case scenarios.
|
|
52
|
+
|
|
53
|
+
## Files affected
|
|
54
|
+
|
|
55
|
+
- `tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json`
|
|
56
|
+
- `tests/pixie/cli/e2e_fixtures/mock_evaluators.py`
|
|
57
|
+
- `tests/pixie/cli/e2e_fixtures/test_customer_faq.py`
|
|
58
|
+
- `tests/pixie/cli/e2e_cases.json`
|
|
59
|
+
- `tests/pixie/cli/test_e2e_pixie_test.py`
|
|
60
|
+
- `.github/copilot-instructions.md`
|
|
61
|
+
- `specs/evals-harness.md`
|
|
62
|
+
|
|
63
|
+
## Migration notes
|
|
64
|
+
|
|
65
|
+
No API changes. The e2e test suite is purely additive.
|
|
66
|
+
|
|
67
|
+
- To add new edge-case scenarios: edit `tests/pixie/cli/e2e_cases.json`.
|
|
68
|
+
- To modify realistic fixture behavior: edit mock evaluators or the test file
|
|
69
|
+
in `tests/pixie/cli/e2e_fixtures/`.
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# Scorecard branding and skill version check
|
|
2
|
+
|
|
3
|
+
## What changed and why
|
|
4
|
+
|
|
5
|
+
Two user-facing upgrades were added:
|
|
6
|
+
|
|
7
|
+
1. The HTML scorecard generated by `pixie test` now has a branded Pixie header
|
|
8
|
+
with a repo CTA and a feedback modal so users can quickly star the project
|
|
9
|
+
and send feedback without leaving the report context.
|
|
10
|
+
2. The `eval-driven-dev` skill now ships with version metadata and a bundled
|
|
11
|
+
version-check script that compares the local skill resource with the latest
|
|
12
|
+
`main` branch metadata and updates both the skill and Python package when the
|
|
13
|
+
local version is behind.
|
|
14
|
+
|
|
15
|
+
## Files affected
|
|
16
|
+
|
|
17
|
+
- `pixie/evals/scorecard.py` — adds the branded header, repo CTA, feedback
|
|
18
|
+
modal HTML, styling, and modal toggle script.
|
|
19
|
+
- `tests/pixie/evals/test_scorecard.py` — verifies the branded header and
|
|
20
|
+
feedback form wiring in generated scorecard HTML.
|
|
21
|
+
- `tests/pixie/cli/test_e2e_pixie_test.py` — verifies the generated scorecard
|
|
22
|
+
contains the branding and feedback actions end-to-end.
|
|
23
|
+
- `tests/pixie/test_skill_resources.py` — verifies the skill version metadata
|
|
24
|
+
and update command behavior.
|
|
25
|
+
- `.claude/skills/eval-driven-dev/resources/version.json` — source of truth for
|
|
26
|
+
the local skill version metadata.
|
|
27
|
+
- `.claude/skills/eval-driven-dev/resources/check_version.py` — checks the
|
|
28
|
+
remote version file and triggers updates when needed.
|
|
29
|
+
- `.claude/skills/eval-driven-dev/SKILL.md` — instructs the skill to run the
|
|
30
|
+
version check before any other skill steps.
|
|
31
|
+
- `README.md`, `docs/package.md`, and `specs/evals-harness.md` — document the
|
|
32
|
+
new header and skill update flow.
|
|
33
|
+
|
|
34
|
+
## Migration notes
|
|
35
|
+
|
|
36
|
+
- No API changes are required for existing `pixie test` users. The scorecard
|
|
37
|
+
remains a self-contained HTML file, now with extra header UI and a feedback
|
|
38
|
+
form.
|
|
39
|
+
- The version-check script is additive. If network access or package-manager
|
|
40
|
+
commands are unavailable, it exits cleanly after printing what could not be
|
|
41
|
+
checked or run.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Scorecard: Evaluator cell "details" dialog
|
|
2
|
+
|
|
3
|
+
## What changed
|
|
4
|
+
|
|
5
|
+
Each evaluator score cell in the scorecard detail table now has a **"details"** hyperlink.
|
|
6
|
+
Clicking it opens a modal dialog showing:
|
|
7
|
+
|
|
8
|
+
- **Score** — numeric value with green ✓ or red ✗ indicator
|
|
9
|
+
- **Reasoning** — the full `Evaluation.reasoning` string (previously only shown as a tooltip)
|
|
10
|
+
- **Details** — the `Evaluation.details` dict rendered as pretty-printed JSON (hidden when empty)
|
|
11
|
+
|
|
12
|
+
The modal is dismissible via the **Close** button, the **Esc** key, or a click on the backdrop — consistent with the existing feedback modal.
|
|
13
|
+
|
|
14
|
+
## Files affected
|
|
15
|
+
|
|
16
|
+
- `pixie/evals/scorecard.py`
|
|
17
|
+
- Added `import json`
|
|
18
|
+
- New `_render_eval_detail_modal()` helper — renders the reusable hidden modal
|
|
19
|
+
- `generate_scorecard_html()` — calls `_render_eval_detail_modal()` after the brand header
|
|
20
|
+
- `_render_pass_table()` — each evaluator cell now embeds a `data-eval` JSON attribute and a `details` link
|
|
21
|
+
- `_HTML_HEAD` — added CSS for `.details-link`, `.eval-detail-body`, `.eval-detail-row`, `.eval-detail-label`, `.eval-detail-value`, `.eval-detail-score-pass/fail`, `.eval-detail-json`
|
|
22
|
+
- `_HTML_FOOT` — added `showEvalDetail(link)` and `closeEvalDetailModal()` JS functions; updated Esc and backdrop-click handlers to also close the eval-detail modal
|
|
23
|
+
|
|
24
|
+
## Migration notes
|
|
25
|
+
|
|
26
|
+
No API changes. The `AssertRecord`, `Evaluation`, and `ScorecardReport` models are unchanged.
|
|
27
|
+
Previously, scorecards exposed `Evaluation.reasoning` only as a cell `title` attribute (tooltip);
|
|
28
|
+
that attribute has been replaced by the clickable details link — tooltip-only access is no longer available.
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# Skill v2: setup-vs-iterate, eval boundary, rootdir discovery
|
|
2
|
+
|
|
3
|
+
## What changed and why
|
|
4
|
+
|
|
5
|
+
### 1. Renamed default root directory from `.pixie` to `pixie_qa`
|
|
6
|
+
|
|
7
|
+
The dot-prefix `.pixie` caused Python import resolution issues (treated as a hidden
|
|
8
|
+
directory, confused with relative imports). Renamed to `pixie_qa` — a plain,
|
|
9
|
+
importable name that avoids these problems.
|
|
10
|
+
|
|
11
|
+
- `pixie/config.py`: `DEFAULT_ROOT` changed from `".pixie"` to `"pixie_qa"`
|
|
12
|
+
- All documentation updated: SKILL.md, pixie-api.md, specs/agent-skill.md
|
|
13
|
+
|
|
14
|
+
### 2. Test runner rootdir discovery (pytest-style)
|
|
15
|
+
|
|
16
|
+
The old `_load_module()` in `pixie/evals/runner.py` added the test file's parent
|
|
17
|
+
and grandparent to `sys.path`. This broke for test files nested deeper than two
|
|
18
|
+
levels from the project root (e.g. `pixie_qa/tests/test_foo.py`).
|
|
19
|
+
|
|
20
|
+
Rewrote `_load_module()` to use rootdir discovery: `_find_rootdir()` walks up from
|
|
21
|
+
the test file directory looking for `pyproject.toml`, `setup.py`, or `setup.cfg` —
|
|
22
|
+
the same strategy pytest uses. The discovered rootdir is added to `sys.path`,
|
|
23
|
+
making project-root imports work regardless of test file depth.
|
|
24
|
+
|
|
25
|
+
### 3. SKILL.md: setup vs. iteration checkpoint
|
|
26
|
+
|
|
27
|
+
Added a "Setup vs. Iteration" section at the top of the skill. When the user says
|
|
28
|
+
"setup QA" / "set up evals" / "add tests", the agent now stops after Stage 6
|
|
29
|
+
(first test run) and reports results without fixing anything. It only proceeds
|
|
30
|
+
to Stage 7 (investigate and fix) if the user explicitly confirms.
|
|
31
|
+
|
|
32
|
+
Previously, the skill had no checkpoint — the agent would eagerly iterate on
|
|
33
|
+
failures, modifying application code without being asked.
|
|
34
|
+
|
|
35
|
+
### 4. SKILL.md: eval boundary guidance
|
|
36
|
+
|
|
37
|
+
Added "The eval boundary: what to evaluate" section. Evals focus on LLM-dependent
|
|
38
|
+
behaviour only (response quality, routing decisions, prompt effectiveness). Tool
|
|
39
|
+
implementations, database queries, keyword matching, and other deterministic logic
|
|
40
|
+
are explicitly out of scope — they should be tested with traditional unit tests.
|
|
41
|
+
|
|
42
|
+
The investigation section (Stage 7) now classifies failures into "LLM-related"
|
|
43
|
+
and "non-LLM" categories with guidance on how to handle each.
|
|
44
|
+
|
|
45
|
+
### 5. SKILL.md: instrument production code only
|
|
46
|
+
|
|
47
|
+
Strengthened Stage 3 with explicit rules against creating wrapper functions or
|
|
48
|
+
alternate code paths for eval purposes. Added a ❌ WRONG example showing the
|
|
49
|
+
anti-pattern (creating `run_for_eval()` that duplicates `main()` logic) and
|
|
50
|
+
✅ CORRECT examples showing `@observe` on existing functions and
|
|
51
|
+
`start_observation` context manager inside existing functions.
|
|
52
|
+
|
|
53
|
+
## Files affected
|
|
54
|
+
|
|
55
|
+
| File | Change |
|
|
56
|
+
| -------------------------------------------------------- | ------------------------------------------------- |
|
|
57
|
+
| `pixie/config.py` | `DEFAULT_ROOT = "pixie_qa"` |
|
|
58
|
+
| `pixie/instrumentation/handlers.py` | Docstring updated |
|
|
59
|
+
| `pixie/evals/runner.py` | New `_find_rootdir()`, rewritten `_load_module()` |
|
|
60
|
+
| `tests/pixie/test_config.py` | Updated assertions for `"pixie_qa"` default |
|
|
61
|
+
| `tests/pixie/evals/test_runner.py` | 8 new tests (rootdir + import resolution) |
|
|
62
|
+
| `.claude/skills/eval-driven-dev/SKILL.md` | Major rewrite (issues 3, 4, 5 + rename) |
|
|
63
|
+
| `.claude/skills/eval-driven-dev/references/pixie-api.md` | Config table updated |
|
|
64
|
+
| `specs/agent-skill.md` | `.pixie` → `pixie_qa` throughout |
|
|
65
|
+
|
|
66
|
+
## Migration notes
|
|
67
|
+
|
|
68
|
+
- **Breaking default change**: The default root directory changed from `.pixie` to
|
|
69
|
+
`pixie_qa`. Existing projects using the old default should either:
|
|
70
|
+
- Set `PIXIE_ROOT=.pixie` to preserve the old location, or
|
|
71
|
+
- Rename the directory: `mv .pixie pixie_qa`
|
|
72
|
+
- **Test runner**: `_load_module()` now uses rootdir discovery instead of
|
|
73
|
+
parent/grandparent. No action needed — this is backwards compatible and more
|
|
74
|
+
reliable.
|
|
75
|
+
- **Skill behaviour**: Agents following the updated SKILL.md will stop after
|
|
76
|
+
initial test setup and ask before iterating on failures.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# Test Scorecard Feature
|
|
2
|
+
|
|
3
|
+
## What Changed
|
|
4
|
+
|
|
5
|
+
Added an HTML scorecard report that is automatically generated and saved to disk
|
|
6
|
+
for every `pixie test` command run. The scorecard provides a detailed,
|
|
7
|
+
human-readable breakdown of eval-based test results beyond the terminal summary.
|
|
8
|
+
|
|
9
|
+
### Scorecard Contents
|
|
10
|
+
|
|
11
|
+
- **Test run overview** — command args, timestamp, pass/fail summary, and a
|
|
12
|
+
table of all discovered tests with their status badges.
|
|
13
|
+
- **Per-test-function detail** — for each test that calls `assert_pass` or
|
|
14
|
+
`assert_dataset_pass`:
|
|
15
|
+
- Human-readable scoring strategy description.
|
|
16
|
+
- Per-evaluator pass rate summary table.
|
|
17
|
+
- Input × evaluator score grid with hover tooltips showing reasoning.
|
|
18
|
+
- Tabbed view for multi-pass runs (one tab per pass).
|
|
19
|
+
|
|
20
|
+
### Scorecard Storage
|
|
21
|
+
|
|
22
|
+
HTML files are saved to `{config.root}/scorecards/<YYYYMMDD-HHMMSS-normalized-args>.html`.
|
|
23
|
+
The CLI prints the full path after each run so users can click to open it.
|
|
24
|
+
|
|
25
|
+
## Files Affected
|
|
26
|
+
|
|
27
|
+
### New Files
|
|
28
|
+
|
|
29
|
+
- `pixie/evals/scorecard.py` — data models (`AssertRecord`, `TestRecord`,
|
|
30
|
+
`ScorecardReport`), `ScorecardCollector` (context-var-based accumulator),
|
|
31
|
+
HTML generation, and `save_scorecard()`.
|
|
32
|
+
- `tests/pixie/evals/test_scorecard.py` — 28 tests covering models, collector,
|
|
33
|
+
HTML generation, file saving, and integration with `assert_pass` / runner.
|
|
34
|
+
|
|
35
|
+
### Modified Files
|
|
36
|
+
|
|
37
|
+
- `pixie/evals/eval_utils.py` — `assert_pass` now publishes an `AssertRecord`
|
|
38
|
+
to the active `ScorecardCollector` (no-op when no collector is active).
|
|
39
|
+
- `pixie/evals/runner.py` — `_run_single()` activates a `ScorecardCollector`
|
|
40
|
+
per test; `EvalTestResult` gains an `assert_records` field.
|
|
41
|
+
- `pixie/cli/test_command.py` — builds a `ScorecardReport`, calls
|
|
42
|
+
`save_scorecard()`, and prints the path.
|
|
43
|
+
- `pixie/evals/__init__.py` — re-exports `ScorecardCollector`,
|
|
44
|
+
`ScorecardReport`, `generate_scorecard_html`, `save_scorecard`.
|
|
45
|
+
- `docs/package.md` — documents the HTML scorecard section under "Running Tests".
|
|
46
|
+
|
|
47
|
+
## Migration Notes
|
|
48
|
+
|
|
49
|
+
- No breaking API changes. Existing `pixie test` invocations behave identically
|
|
50
|
+
to before, with the addition of an HTML file being written and a path printed
|
|
51
|
+
at the end.
|
|
52
|
+
- `EvalTestResult.assert_records` defaults to an empty tuple, so any code
|
|
53
|
+
that accesses `EvalTestResult` is unaffected.
|
|
54
|
+
- The scorecard directory (`{config.root}/scorecards/`) is created on demand.
|
|
@@ -188,13 +188,32 @@ All evaluators are importable from `pixie` (e.g. `from pixie import FactualityEv
|
|
|
188
188
|
|
|
189
189
|
## Running Tests
|
|
190
190
|
|
|
191
|
-
Use `pixie
|
|
191
|
+
Use `pixie test` (or the equivalent `pixie-test` entry point) instead of bare `pytest`
|
|
192
|
+
to run eval tests. It sets up the async environment and provides eval-specific
|
|
193
|
+
output formatting:
|
|
192
194
|
|
|
193
195
|
```bash
|
|
194
|
-
pixie
|
|
195
|
-
pixie
|
|
196
|
-
pixie
|
|
197
|
-
pixie
|
|
196
|
+
pixie test # run all test_*.py in the current directory
|
|
197
|
+
pixie test tests/ # specify a path
|
|
198
|
+
pixie test -k factuality # filter by name substring
|
|
199
|
+
pixie test -v # verbose: shows per-case scores and reasoning
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### HTML Scorecard
|
|
203
|
+
|
|
204
|
+
Every `pixie test` run generates an **HTML scorecard** saved to `{PIXIE_ROOT}/scorecards/<YYYYMMDD-HHMMSS-normalized-args>.html`. The scorecard contains:
|
|
205
|
+
|
|
206
|
+
- **Test run overview** — command args, pass/fail summary, and a table of all tests with their status.
|
|
207
|
+
- **Per-test detail** — for each test function that calls `assert_pass` / `assert_dataset_pass`:
|
|
208
|
+
- Scoring strategy description (human-readable).
|
|
209
|
+
- Per-evaluator pass rate table.
|
|
210
|
+
- Per-input × per-evaluator score grid; each cell has a "details" link that opens a dialog showing the score, reasoning, and details.
|
|
211
|
+
- **Tabbed view** for multi-pass runs (one tab per pass).
|
|
212
|
+
|
|
213
|
+
After the test run, the CLI prints the scorecard path:
|
|
214
|
+
|
|
215
|
+
```text
|
|
216
|
+
See /path/to/pixie_qa/scorecards/20250615-120000-pixie-test.html for test details
|
|
198
217
|
```
|
|
199
218
|
|
|
200
219
|
---
|
|
@@ -20,6 +20,7 @@ import json
|
|
|
20
20
|
import sys
|
|
21
21
|
from typing import TextIO
|
|
22
22
|
|
|
23
|
+
from dotenv import load_dotenv
|
|
23
24
|
from piccolo.engine.sqlite import SQLiteEngine
|
|
24
25
|
from pydantic import JsonValue
|
|
25
26
|
|
|
@@ -183,6 +184,8 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
183
184
|
parser.print_help()
|
|
184
185
|
return 1
|
|
185
186
|
|
|
187
|
+
load_dotenv()
|
|
188
|
+
|
|
186
189
|
if args.command == "dataset":
|
|
187
190
|
if args.dataset_action is None:
|
|
188
191
|
parser.parse_args(["dataset", "--help"])
|
|
@@ -5,15 +5,48 @@ Usage::
|
|
|
5
5
|
pixie test [path] [--filter PATTERN] [--verbose]
|
|
6
6
|
|
|
7
7
|
Discovers and runs eval test functions, reporting pass/fail results.
|
|
8
|
+
Generates an HTML scorecard report saved to
|
|
9
|
+
``{config.root}/scorecards/<timestamp>.html``.
|
|
8
10
|
"""
|
|
9
11
|
|
|
10
12
|
from __future__ import annotations
|
|
11
13
|
|
|
12
14
|
import argparse
|
|
13
15
|
import sys
|
|
16
|
+
from collections.abc import Sequence
|
|
14
17
|
|
|
15
18
|
import pixie.instrumentation as px
|
|
16
19
|
from pixie.evals.runner import discover_tests, format_results, run_tests
|
|
20
|
+
from pixie.evals.scorecard import ScorecardReport, TestRecord, save_scorecard
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _build_report(
|
|
24
|
+
results: Sequence[object],
|
|
25
|
+
command_args: str,
|
|
26
|
+
) -> ScorecardReport:
|
|
27
|
+
"""Build a :class:`ScorecardReport` from runner results.
|
|
28
|
+
|
|
29
|
+
Args:
|
|
30
|
+
results: List of ``EvalTestResult`` objects.
|
|
31
|
+
command_args: The command-line arguments string.
|
|
32
|
+
|
|
33
|
+
Returns:
|
|
34
|
+
A fully populated ``ScorecardReport``.
|
|
35
|
+
"""
|
|
36
|
+
from pixie.evals.runner import EvalTestResult
|
|
37
|
+
|
|
38
|
+
test_records: list[TestRecord] = []
|
|
39
|
+
for r in results:
|
|
40
|
+
assert isinstance(r, EvalTestResult)
|
|
41
|
+
test_records.append(
|
|
42
|
+
TestRecord(
|
|
43
|
+
name=r.name,
|
|
44
|
+
status=r.status,
|
|
45
|
+
message=r.message,
|
|
46
|
+
asserts=list(r.assert_records),
|
|
47
|
+
)
|
|
48
|
+
)
|
|
49
|
+
return ScorecardReport(command_args=command_args, test_records=test_records)
|
|
17
50
|
|
|
18
51
|
|
|
19
52
|
def main(argv: list[str] | None = None) -> int:
|
|
@@ -60,6 +93,13 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
60
93
|
output = format_results(results, verbose=args.verbose)
|
|
61
94
|
print(output) # noqa: T201
|
|
62
95
|
|
|
96
|
+
# ── Generate and save scorecard ───────────────────────────────
|
|
97
|
+
raw_argv = argv if argv is not None else sys.argv[1:]
|
|
98
|
+
command_str = "pixie test " + " ".join(raw_argv)
|
|
99
|
+
report = _build_report(results, command_args=command_str)
|
|
100
|
+
scorecard_path = save_scorecard(report)
|
|
101
|
+
print(f"\nSee {scorecard_path} for test details") # noqa: T201
|
|
102
|
+
|
|
63
103
|
all_passed = all(r.status == "passed" for r in results)
|
|
64
104
|
return 0 if all_passed else 1
|
|
65
105
|
|
|
@@ -11,14 +11,14 @@ import os
|
|
|
11
11
|
from dataclasses import dataclass
|
|
12
12
|
|
|
13
13
|
#: Default root directory for all pixie-generated artefacts.
|
|
14
|
-
DEFAULT_ROOT = "
|
|
14
|
+
DEFAULT_ROOT = "pixie_qa"
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
@dataclass(frozen=True)
|
|
18
18
|
class PixieConfig:
|
|
19
19
|
"""Immutable configuration snapshot.
|
|
20
20
|
|
|
21
|
-
All paths default to subdirectories / files within a single
|
|
21
|
+
All paths default to subdirectories / files within a single ``pixie_qa``
|
|
22
22
|
project folder so that observations, datasets, tests, scripts and notes
|
|
23
23
|
live in one predictable location.
|
|
24
24
|
|