pixie-qa 0.1.2__tar.gz → 0.1.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431)
  1. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/.github/copilot-instructions.md +110 -4
  2. pixie_qa-0.1.8/.gitignore +3 -0
  3. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/PKG-INFO +4 -5
  4. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/README.md +2 -4
  5. pixie_qa-0.1.8/changelogs/deep-research-demo.md +43 -0
  6. pixie_qa-0.1.8/changelogs/pixie-test-e2e-suite.md +69 -0
  7. pixie_qa-0.1.8/changelogs/scorecard-branding-and-skill-version-check.md +41 -0
  8. pixie_qa-0.1.8/changelogs/scorecard-eval-detail-dialog.md +28 -0
  9. pixie_qa-0.1.8/changelogs/skill-v2-and-rootdir-discovery.md +76 -0
  10. pixie_qa-0.1.8/changelogs/test-scorecard.md +54 -0
  11. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/docs/package.md +24 -5
  12. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/cli/main.py +3 -0
  13. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/cli/test_command.py +40 -0
  14. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/config.py +2 -2
  15. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/evals/__init__.py +10 -0
  16. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/evals/eval_utils.py +60 -3
  17. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/evals/runner.py +64 -11
  18. pixie_qa-0.1.8/pixie/evals/scorecard.py +815 -0
  19. pixie_qa-0.1.8/pixie/favicon.png +0 -0
  20. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/handlers.py +1 -1
  21. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pyproject.toml +2 -1
  22. pixie_qa-0.1.8/skills/eval-driven-dev/SKILL.md +852 -0
  23. {pixie_qa-0.1.2/.claude → pixie_qa-0.1.8}/skills/eval-driven-dev/references/pixie-api.md +8 -8
  24. pixie_qa-0.1.8/skills/eval-driven-dev/resources/check_version.py +70 -0
  25. pixie_qa-0.1.8/skills/eval-driven-dev/resources/version.json +4 -0
  26. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/agent-skill.md +13 -6
  27. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/evals-harness.md +105 -2
  28. pixie_qa-0.1.8/tests/pixie/cli/e2e_cases.json +183 -0
  29. pixie_qa-0.1.8/tests/pixie/cli/e2e_fixtures/conftest.py +9 -0
  30. pixie_qa-0.1.8/tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json +45 -0
  31. pixie_qa-0.1.8/tests/pixie/cli/e2e_fixtures/mock_evaluators.py +156 -0
  32. pixie_qa-0.1.8/tests/pixie/cli/e2e_fixtures/test_customer_faq.py +106 -0
  33. pixie_qa-0.1.8/tests/pixie/cli/test_e2e_pixie_test.py +343 -0
  34. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/evals/test_runner.py +128 -0
  35. pixie_qa-0.1.8/tests/pixie/evals/test_scorecard.py +487 -0
  36. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/test_config.py +3 -3
  37. pixie_qa-0.1.2/.claude/skills/eval-driven-dev/SKILL.md +0 -522
  38. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/benchmark.json +0 -363
  39. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/benchmark.md +0 -13
  40. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/eval_metadata.json +0 -13
  41. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/with_skill/outputs/metrics.json +0 -5
  42. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/with_skill/outputs/response.md +0 -176
  43. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/with_skill/run-1/grading.json +0 -43
  44. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/with_skill/run-1/timing.json +0 -5
  45. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/without_skill/outputs/metrics.json +0 -5
  46. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/without_skill/outputs/response.md +0 -180
  47. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/without_skill/run-1/grading.json +0 -44
  48. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/without_skill/run-1/timing.json +0 -5
  49. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/eval_metadata.json +0 -13
  50. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/with_skill/outputs/metrics.json +0 -5
  51. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/with_skill/outputs/response.md +0 -330
  52. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/with_skill/run-1/grading.json +0 -44
  53. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/with_skill/run-1/timing.json +0 -5
  54. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/without_skill/outputs/metrics.json +0 -5
  55. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/without_skill/outputs/response.md +0 -387
  56. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/without_skill/run-1/grading.json +0 -44
  57. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/without_skill/run-1/timing.json +0 -5
  58. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/eval_metadata.json +0 -14
  59. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/with_skill/outputs/metrics.json +0 -5
  60. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/with_skill/outputs/response.md +0 -329
  61. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/with_skill/run-1/grading.json +0 -49
  62. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/with_skill/run-1/timing.json +0 -5
  63. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/without_skill/outputs/metrics.json +0 -5
  64. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/without_skill/outputs/response.md +0 -243
  65. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/without_skill/run-1/grading.json +0 -49
  66. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/without_skill/run-1/timing.json +0 -5
  67. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/benchmark.json +0 -353
  68. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/benchmark.md +0 -13
  69. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/eval_metadata.json +0 -13
  70. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/grading.json +0 -51
  71. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/outputs/metrics.json +0 -33
  72. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/outputs/summary.md +0 -49
  73. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/project/pixie_datasets/qa-golden-set.json +0 -23
  74. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/project/qa_app.py +0 -26
  75. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/project/requirements.txt +0 -2
  76. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/project/tests/test_qa.py +0 -24
  77. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/timing.json +0 -1
  78. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/grading.json +0 -51
  79. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/outputs/metrics.json +0 -47
  80. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/outputs/summary.md +0 -87
  81. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/project/pixie_datasets/qa-golden-set.json +0 -23
  82. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/project/qa_app.py +0 -26
  83. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/project/requirements.txt +0 -2
  84. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/project/tests/test_qa.py +0 -46
  85. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/timing.json +0 -1
  86. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/eval_metadata.json +0 -13
  87. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/grading.json +0 -52
  88. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/outputs/metrics.json +0 -45
  89. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/outputs/summary.md +0 -80
  90. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project/MEMORY.md +0 -83
  91. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project/build_dataset.py +0 -141
  92. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project/extractor.py +0 -46
  93. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project/requirements.txt +0 -2
  94. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project/tests/test_email_extraction.py +0 -229
  95. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/timing.json +0 -1
  96. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/grading.json +0 -52
  97. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/outputs/metrics.json +0 -28
  98. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/outputs/summary.md +0 -56
  99. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/project/build_dataset.py +0 -108
  100. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/project/extractor.py +0 -55
  101. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/project/requirements.txt +0 -2
  102. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/project/test_extractor.py +0 -290
  103. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/timing.json +0 -1
  104. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/eval_metadata.json +0 -13
  105. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/grading.json +0 -51
  106. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/outputs/metrics.json +0 -15
  107. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/outputs/summary.md +0 -75
  108. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/project/MEMORY.md +0 -52
  109. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/project/build_dataset.py +0 -91
  110. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/project/chatbot.py +0 -60
  111. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/project/requirements.txt +0 -2
  112. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/project/tests/test_rag_chatbot.py +0 -109
  113. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/timing.json +0 -1
  114. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/grading.json +0 -52
  115. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/outputs/metrics.json +0 -14
  116. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/outputs/summary.md +0 -50
  117. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/project/build_dataset.py +0 -56
  118. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/project/chatbot.py +0 -66
  119. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/project/requirements.txt +0 -2
  120. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/project/test_chatbot.py +0 -137
  121. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/timing.json +0 -1
  122. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/benchmark.json +0 -363
  123. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/benchmark.md +0 -13
  124. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/eval_metadata.json +0 -12
  125. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/grading.json +0 -47
  126. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/project/MEMORY.md +0 -40
  127. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/project/pixie_datasets/qa-golden-set.json +0 -23
  128. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/project/qa_app.py +0 -26
  129. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/project/requirements.txt +0 -2
  130. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/project/tests/test_qa.py +0 -25
  131. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/run-1/grading.json +0 -47
  132. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/run-1/outputs/MEMORY.md +0 -40
  133. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/run-1/outputs/test_qa.py +0 -25
  134. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/timing.json +0 -5
  135. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/grading.json +0 -53
  136. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/project/INVESTIGATION_NOTES.md +0 -74
  137. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/project/pixie_datasets/qa-golden-set.json +0 -23
  138. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/project/qa_app.py +0 -26
  139. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/project/requirements.txt +0 -2
  140. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/project/tests/test_qa.py +0 -83
  141. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/run-1/grading.json +0 -53
  142. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/run-1/outputs/INVESTIGATION_NOTES.md +0 -74
  143. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/run-1/outputs/test_qa.py +0 -83
  144. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/timing.json +0 -5
  145. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/eval_metadata.json +0 -14
  146. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/grading.json +0 -57
  147. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/MEMORY.md +0 -23
  148. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/build_dataset.py +0 -91
  149. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/extractor.py +0 -64
  150. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/requirements.txt +0 -1
  151. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/run_evals.sh +0 -23
  152. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/tests/test_classifier.py +0 -117
  153. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/run-1/grading.json +0 -57
  154. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/run-1/outputs/MEMORY.md +0 -23
  155. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/run-1/outputs/build_dataset.py +0 -91
  156. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/run-1/outputs/extractor.py +0 -64
  157. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/run-1/outputs/test_classifier.py +0 -117
  158. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/timing.json +0 -5
  159. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/grading.json +0 -63
  160. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/project/collect_traces.py +0 -80
  161. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/project/extractor.py +0 -57
  162. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/project/requirements.txt +0 -1
  163. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/run-1/grading.json +0 -63
  164. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/run-1/outputs/collect_traces.py +0 -80
  165. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/run-1/outputs/extractor.py +0 -57
  166. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/timing.json +0 -5
  167. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/eval_metadata.json +0 -14
  168. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/grading.json +0 -67
  169. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/MEMORY.md +0 -27
  170. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/chatbot.py +0 -51
  171. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/pixie_datasets/rag-chatbot-golden.json +0 -37
  172. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/pixie_observations.db +0 -0
  173. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/requirements.txt +0 -1
  174. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/tests/test_chatbot.py +0 -21
  175. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/run-1/grading.json +0 -67
  176. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/run-1/outputs/MEMORY.md +0 -27
  177. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/run-1/outputs/chatbot.py +0 -51
  178. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/run-1/outputs/test_chatbot.py +0 -21
  179. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/timing.json +0 -6
  180. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/grading.json +0 -63
  181. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/project/capture_traces.py +0 -92
  182. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/project/chatbot.py +0 -53
  183. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/project/requirements.txt +0 -1
  184. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/project/test_chatbot_evals.py +0 -273
  185. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/run-1/grading.json +0 -63
  186. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/run-1/outputs/capture_traces.py +0 -92
  187. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/run-1/outputs/chatbot.py +0 -53
  188. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/run-1/outputs/test_chatbot_evals.py +0 -273
  189. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/timing.json +0 -5
  190. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/benchmark.json +0 -363
  191. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/benchmark.md +0 -13
  192. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/eval_metadata.json +0 -12
  193. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/grading.json +0 -40
  194. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/project/MEMORY.md +0 -40
  195. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/project/pixie_datasets/qa-golden-set.json +0 -23
  196. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/project/qa_app.py +0 -26
  197. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/project/requirements.txt +0 -2
  198. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/project/tests/test_qa.py +0 -25
  199. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/run-1/grading.json +0 -40
  200. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/run-1/outputs/MEMORY.md +0 -40
  201. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/run-1/outputs/test_qa.py +0 -25
  202. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/timing.json +0 -5
  203. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/grading.json +0 -51
  204. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/project/pixie_datasets/qa-golden-set.json +0 -23
  205. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/project/qa_app.py +0 -26
  206. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/project/requirements.txt +0 -2
  207. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/project/tests/test_qa.py +0 -24
  208. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/run-1/grading.json +0 -51
  209. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/run-1/outputs/test_qa.py +0 -24
  210. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/timing.json +0 -5
  211. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/eval_metadata.json +0 -14
  212. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/grading.json +0 -50
  213. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/MEMORY.md +0 -23
  214. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/extractor.py +0 -63
  215. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/pixie_datasets/email-classifier-golden.json +0 -29
  216. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/pixie_observations.db +0 -0
  217. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/requirements.txt +0 -1
  218. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/tests/test_email_classifier.py +0 -86
  219. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/run-1/grading.json +0 -50
  220. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/run-1/outputs/MEMORY.md +0 -23
  221. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/run-1/outputs/extractor.py +0 -63
  222. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/run-1/outputs/test_email_classifier.py +0 -86
  223. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/timing.json +0 -6
  224. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/grading.json +0 -57
  225. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/conftest.py +0 -2
  226. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/extractor.py +0 -57
  227. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/generate_dataset.py +0 -78
  228. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/instrumented_extractor.py +0 -22
  229. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/pytest.ini +0 -2
  230. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/requirements.txt +0 -1
  231. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/test_email_classifier.py +0 -329
  232. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/run-1/grading.json +0 -57
  233. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/run-1/outputs/extractor.py +0 -57
  234. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/run-1/outputs/test_email_classifier.py +0 -329
  235. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/timing.json +0 -5
  236. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/eval_metadata.json +0 -14
  237. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/grading.json +0 -50
  238. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/MEMORY.md +0 -22
  239. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/chatbot.py +0 -52
  240. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/pixie_datasets/rag-chatbot-golden.json +0 -29
  241. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/pixie_observations.db +0 -0
  242. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/requirements.txt +0 -1
  243. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/tests/test_rag_chatbot.py +0 -28
  244. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/run-1/grading.json +0 -50
  245. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/run-1/outputs/MEMORY.md +0 -22
  246. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/run-1/outputs/chatbot.py +0 -52
  247. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/run-1/outputs/test_rag_chatbot.py +0 -28
  248. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/timing.json +0 -6
  249. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/grading.json +0 -57
  250. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/project/chatbot.py +0 -46
  251. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/project/chatbot_instrumented.py +0 -72
  252. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/project/requirements.txt +0 -1
  253. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/project/save_dataset.py +0 -86
  254. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/project/test_chatbot_evals.py +0 -180
  255. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/run-1/grading.json +0 -57
  256. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/run-1/outputs/chatbot_instrumented.py +0 -72
  257. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/run-1/outputs/test_chatbot_evals.py +0 -180
  258. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/timing.json +0 -5
  259. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/benchmark.json +0 -363
  260. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/benchmark.md +0 -13
  261. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/eval_metadata.json +0 -12
  262. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/grading.json +0 -71
  263. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/project/MEMORY.md +0 -51
  264. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/project/pixie_datasets/qa-golden-set.json +0 -23
  265. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/project/qa_app.py +0 -26
  266. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/project/requirements.txt +0 -2
  267. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/project/tests/test_qa.py +0 -60
  268. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/grading.json +0 -71
  269. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/outputs/MEMORY.md +0 -51
  270. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/outputs/pixie_datasets/qa-golden-set.json +0 -23
  271. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/outputs/qa_app.py +0 -26
  272. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/outputs/requirements.txt +0 -2
  273. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/outputs/tests/test_qa.py +0 -60
  274. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/timing.json +0 -5
  275. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/grading.json +0 -77
  276. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/project/MEMORY.md +0 -48
  277. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/project/pixie_datasets/qa-golden-set.json +0 -23
  278. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/project/qa_app.py +0 -26
  279. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/project/requirements.txt +0 -2
  280. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/project/tests/test_qa.py +0 -44
  281. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/grading.json +0 -77
  282. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/outputs/MEMORY.md +0 -48
  283. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/outputs/pixie_datasets/qa-golden-set.json +0 -23
  284. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/outputs/qa_app.py +0 -26
  285. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/outputs/requirements.txt +0 -2
  286. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/outputs/tests/test_qa.py +0 -44
  287. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/timing.json +0 -5
  288. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/eval_metadata.json +0 -14
  289. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/grading.json +0 -77
  290. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/project/MEMORY.md +0 -48
  291. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/project/build_dataset.py +0 -93
  292. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/project/extractor.py +0 -65
  293. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/project/requirements.txt +0 -1
  294. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/project/tests/test_email_classifier.py +0 -22
  295. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/grading.json +0 -77
  296. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/outputs/MEMORY.md +0 -48
  297. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/outputs/build_dataset.py +0 -93
  298. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/outputs/extractor.py +0 -65
  299. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/outputs/requirements.txt +0 -1
  300. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/outputs/tests/test_email_classifier.py +0 -22
  301. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/timing.json +0 -5
  302. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/grading.json +0 -82
  303. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/project/build_dataset.py +0 -156
  304. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/project/extractor.py +0 -62
  305. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/project/requirements.txt +0 -1
  306. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/project/test_email_classifier.py +0 -345
  307. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/run-1/grading.json +0 -82
  308. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/run-1/outputs/build_dataset.py +0 -156
  309. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/run-1/outputs/extractor.py +0 -62
  310. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/run-1/outputs/requirements.txt +0 -1
  311. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/run-1/outputs/test_email_classifier.py +0 -345
  312. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/timing.json +0 -5
  313. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/eval_metadata.json +0 -14
  314. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/grading.json +0 -81
  315. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/project/MEMORY.md +0 -71
  316. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/project/build_dataset.py +0 -63
  317. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/project/chatbot.py +0 -53
  318. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/project/requirements.txt +0 -1
  319. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/project/tests/test_rag_chatbot.py +0 -54
  320. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/grading.json +0 -81
  321. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/outputs/MEMORY.md +0 -71
  322. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/outputs/build_dataset.py +0 -63
  323. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/outputs/chatbot.py +0 -53
  324. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/outputs/requirements.txt +0 -1
  325. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/outputs/tests/test_rag_chatbot.py +0 -54
  326. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/timing.json +0 -5
  327. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/grading.json +0 -81
  328. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/project/MEMORY.md +0 -62
  329. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/project/chatbot.py +0 -52
  330. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/project/datasets/rag-chatbot-golden.json +0 -41
  331. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/project/requirements.txt +0 -1
  332. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/project/test_chatbot_eval.py +0 -152
  333. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/grading.json +0 -81
  334. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/outputs/MEMORY.md +0 -62
  335. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/outputs/chatbot.py +0 -52
  336. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/outputs/datasets/rag-chatbot-golden.json +0 -41
  337. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/outputs/requirements.txt +0 -1
  338. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/outputs/test_chatbot_eval.py +0 -152
  339. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/timing.json +0 -5
  340. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/review-iteration-1.html +0 -1325
  341. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/review-iteration-2.html +0 -1325
  342. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/review-iteration-3.html +0 -1325
  343. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/review-iteration-4.html +0 -1325
  344. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/review-iteration-5.html +0 -1325
  345. pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/trigger-eval-set.json +0 -82
  346. pixie_qa-0.1.2/.github/workflows/daily-release.yml +0 -139
  347. pixie_qa-0.1.2/.gitignore +0 -4
  348. pixie_qa-0.1.2/tests/pixie/observation_store/__init__.py +0 -0
  349. pixie_qa-0.1.2/uv.lock +0 -1299
  350. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/.github/workflows/publish.yml +0 -0
  351. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/LICENSE +0 -0
  352. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/async-handler-processing.md +0 -0
  353. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/autoevals-adapters.md +0 -0
  354. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/cli-dataset-commands.md +0 -0
  355. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/dataset-management.md +0 -0
  356. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/eval-harness.md +0 -0
  357. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/expected-output-in-evals.md +0 -0
  358. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/instrumentation-module-implementation.md +0 -0
  359. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/loud-failure-mode.md +0 -0
  360. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/manual-instrumentation-usability.md +0 -0
  361. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/observation-store-implementation.md +0 -0
  362. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/pixie-directory-and-skill-improvements.md +0 -0
  363. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/root-package-exports-and-trace-id.md +0 -0
  364. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/changelogs/usability-utils.md +0 -0
  365. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/__init__.py +0 -0
  366. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/cli/__init__.py +0 -0
  367. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/cli/dataset_command.py +0 -0
  368. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/dataset/__init__.py +0 -0
  369. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/dataset/models.py +0 -0
  370. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/dataset/store.py +0 -0
  371. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/evals/criteria.py +0 -0
  372. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/evals/evaluation.py +0 -0
  373. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/evals/scorers.py +0 -0
  374. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/evals/trace_capture.py +0 -0
  375. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/evals/trace_helpers.py +0 -0
  376. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/__init__.py +0 -0
  377. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/context.py +0 -0
  378. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/handler.py +0 -0
  379. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/instrumentors.py +0 -0
  380. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/observation.py +0 -0
  381. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/processor.py +0 -0
  382. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/queue.py +0 -0
  383. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/instrumentation/spans.py +0 -0
  384. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/storage/__init__.py +0 -0
  385. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/storage/evaluable.py +0 -0
  386. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/storage/piccolo_conf.py +0 -0
  387. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/storage/piccolo_migrations/__init__.py +0 -0
  388. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/storage/serialization.py +0 -0
  389. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/storage/store.py +0 -0
  390. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/storage/tables.py +0 -0
  391. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/pixie/storage/tree.py +0 -0
  392. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/agent-skill-1.md +0 -0
  393. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/autoevals-adapters.md +0 -0
  394. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/dataset-management.md +0 -0
  395. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/expected-output-in-evals.md +0 -0
  396. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/instrumentation.md +0 -0
  397. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/manual-instrumentation-usability.md +0 -0
  398. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/storage.md +0 -0
  399. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/specs/usability-utils.md +0 -0
  400. {pixie_qa-0.1.2/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project → pixie_qa-0.1.8}/tests/__init__.py +0 -0
  401. {pixie_qa-0.1.2/tests → pixie_qa-0.1.8/tests/pixie}/__init__.py +0 -0
  402. {pixie_qa-0.1.2/tests/pixie → pixie_qa-0.1.8/tests/pixie/cli}/__init__.py +0 -0
  403. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/cli/test_dataset_command.py +0 -0
  404. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/cli/test_main.py +0 -0
  405. {pixie_qa-0.1.2/tests/pixie/cli → pixie_qa-0.1.8/tests/pixie/dataset}/__init__.py +0 -0
  406. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/dataset/test_models.py +0 -0
  407. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/dataset/test_store.py +0 -0
  408. {pixie_qa-0.1.2/tests/pixie/dataset → pixie_qa-0.1.8/tests/pixie/evals}/__init__.py +0 -0
  409. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/evals/test_criteria.py +0 -0
  410. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/evals/test_eval_utils.py +0 -0
  411. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/evals/test_evaluation.py +0 -0
  412. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/evals/test_scorers.py +0 -0
  413. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/evals/test_trace_capture.py +0 -0
  414. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/evals/test_trace_helpers.py +0 -0
  415. {pixie_qa-0.1.2/tests/pixie/evals → pixie_qa-0.1.8/tests/pixie/instrumentation}/__init__.py +0 -0
  416. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/conftest.py +0 -0
  417. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/test_context.py +0 -0
  418. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/test_handler.py +0 -0
  419. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/test_integration.py +0 -0
  420. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/test_observation.py +0 -0
  421. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/test_processor.py +0 -0
  422. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/test_queue.py +0 -0
  423. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/test_spans.py +0 -0
  424. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/instrumentation/test_storage_handler.py +0 -0
  425. {pixie_qa-0.1.2/tests/pixie/instrumentation → pixie_qa-0.1.8/tests/pixie/observation_store}/__init__.py +0 -0
  426. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/observation_store/conftest.py +0 -0
  427. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/observation_store/test_evaluable.py +0 -0
  428. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/observation_store/test_serialization.py +0 -0
  429. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/observation_store/test_store.py +0 -0
  430. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/observation_store/test_tree.py +0 -0
  431. {pixie_qa-0.1.2 → pixie_qa-0.1.8}/tests/pixie/test_init.py +0 -0
@@ -144,6 +144,103 @@ uv run pytest -k "test_function_name" # Run specific test
144
144
  uv run pytest --cov=pixie # Run with coverage report
145
145
  ```
146
146
 
147
+ ### 4a. End-to-End Tests for `pixie test`
148
+
149
+ The `pixie test` CLI command has a dedicated e2e test suite that verifies the full
150
+ command lifecycle — test discovery, execution, console output, exit codes, and
151
+ HTML scorecard generation. The suite uses **realistic fixtures** that mirror how
152
+ a real user would configure datasets, evaluators, and test files.
153
+
154
+ **Fixture layout:**
155
+
156
+ ```
157
+ tests/pixie/cli/
158
+ e2e_fixtures/
159
+ datasets/
160
+ customer-faq.json # 5-item golden dataset (Evaluable items)
161
+ mock_evaluators.py # Deterministic mock evaluators (no LLM calls)
162
+ test_customer_faq.py # Realistic test file using assert_dataset_pass
163
+ e2e_cases.json # Edge-case scenario definitions
164
+ test_e2e_pixie_test.py # Automated pytest e2e tests
165
+ ```
166
+
167
+ The automated pytest file (`test_e2e_pixie_test.py`) contains two test classes:
168
+
169
+ 1. **`TestPixieTestRealisticE2E`** (10 tests) — Runs `pixie test` on the
170
+ realistic fixture (`test_customer_faq.py`) that uses 4 evaluator/criteria
171
+ combinations against the customer-FAQ dataset. Verifies exit code, console
172
+ summary, test names, check/cross marks, scorecard HTML generation, evaluator
173
+ names, PASS/FAIL badges, per-input scores, summary counts, and scoring
174
+ strategy descriptions.
175
+
176
+ 2. **`TestPixieTestEdgeCases`** (32 tests) — Parametrised from `e2e_cases.json`
177
+ covering empty dirs, filters, verbose mode, single file targeting, etc.
178
+
179
+ **Mock evaluators** (`e2e_fixtures/mock_evaluators.py`) are deterministic
180
+ replacements for LLM-as-judge evaluators. They use string similarity, keyword
181
+ overlap, or fixed scores to produce realistic but reproducible results:
182
+ - `MockFactualityEval` — SequenceMatcher string similarity (most items pass)
183
+ - `MockClosedQAEval` — keyword overlap ratio (strict; some items fail)
184
+ - `MockHallucinationEval` — always returns score 0.95
185
+ - `MockFailingEval` (name="MockStrictTone") — always returns score 0.2
186
+
187
+ **Expected realistic fixture results:**
188
+ - `test_faq_factuality` → PASS (MockFactuality, threshold=0.6, pct=0.8)
189
+ - `test_faq_multi_evaluator` → FAIL (MockFactuality+MockClosedQA, threshold=0.5, pct=1.0)
190
+ - `test_faq_no_hallucinations` → PASS (MockHallucination, threshold=0.5, pct=1.0)
191
+ - `test_faq_tone_check` → FAIL (MockStrictTone, threshold=0.5, pct=1.0)
192
+ - Console: "2 passed, 2 failed", exit code 1
193
+ - Scorecard: HTML with evaluator names, scores, PASS/FAIL badges
194
+
195
+ **When to run e2e tests:**
196
+
197
+ Run the e2e suite whenever you change anything in:
198
+ - `pixie/cli/test_command.py` — the `pixie test` entry point
199
+ - `pixie/evals/runner.py` — test discovery, execution, formatting
200
+ - `pixie/evals/scorecard.py` — scorecard models, HTML generation
201
+ - `pixie/evals/eval_utils.py` — `assert_pass` / `assert_dataset_pass`
202
+ - `pixie/evals/criteria.py` — pass criteria
203
+
204
+ ```bash
205
+ uv run pytest tests/pixie/cli/test_e2e_pixie_test.py -v # Run all 42 e2e tests
206
+ ```
207
+
208
+ **Agent verification protocol (manual inspection):**
209
+
210
+ In addition to the automated pytest tests, the coding agent should manually
211
+ verify the `pixie test` output after making changes to CLI/eval/scorecard code:
212
+
213
+ 1. **Run the realistic fixture directly:**
214
+ ```bash
215
+ PIXIE_ROOT=/tmp/pixie_e2e_verify uv run pixie test tests/pixie/cli/e2e_fixtures/test_customer_faq.py
216
+ ```
217
+
218
+ 2. **Inspect the console output** — verify that:
219
+ - All 4 test names appear with correct ✓/✗ marks
220
+ - Summary shows "2 passed, 2 failed"
221
+ - No unexpected errors or tracebacks
222
+
223
+ 3. **Inspect the HTML scorecard** — open the generated file and verify:
224
+ - All 4 evaluator names appear (MockFactuality, MockClosedQA, etc.)
225
+ - Per-input score cells show reasonable numeric values
226
+ - PASS/FAIL badges match expectations (2 PASS, 2 FAIL)
227
+ - Scoring strategy descriptions are human-readable
228
+ - The scorecard is well-formatted and renders correctly
229
+
230
+ 4. **Evaluate holistically** — given the dataset contents and evaluator
231
+ definitions, do the scores and pass/fail outcomes make sense? For example,
232
+ MockFactuality should score high on items where `eval_output` is similar to
233
+ `expected_output`, and MockStrictTone should always fail.
234
+
235
+ This manual step catches rendering issues, layout regressions, and semantic
236
+ correctness problems that simple string assertions can miss.
237
+
238
+ **Adding new edge-case scenarios:**
239
+
240
+ 1. Add a new object to `tests/pixie/cli/e2e_cases.json`.
241
+ 2. Run `uv run pytest tests/pixie/cli/test_e2e_pixie_test.py -v` to verify.
242
+ 3. No code changes needed in the test file — it auto-discovers all cases.
243
+
147
244
  ### 5. Test Quality Guidelines
148
245
 
149
246
  **Good tests are:**
@@ -344,11 +441,17 @@ uv run ruff format . # Format code
344
441
  Before committing, run:
345
442
 
346
443
  ```bash
347
- uv run pytest # All tests must pass
444
+ uv run pytest # All tests must pass (includes e2e)
348
445
  uv run mypy pixie/ # Zero type errors
349
446
  uv run ruff check . # No linting errors
350
447
  ```
351
448
 
449
+ When changing `pixie test` or scorecard-related code, also run e2e explicitly:
450
+
451
+ ```bash
452
+ uv run pytest tests/pixie/cli/test_e2e_pixie_test.py -v # Verify pixie test e2e
453
+ ```
454
+
352
455
  Also verify **zero Pylance errors** in VS Code Problems panel (Pylance can catch type mismatches that mypy misses for untyped third-party packages).
353
456
 
354
457
  ---
@@ -509,6 +612,8 @@ This project has strict error-handling conventions due to operating inside OTel
509
612
  6. ✅ Update docstrings / `README.md` / relevant `specs/` docs
510
613
  7. ✅ Add/update `changelogs/<feature>.md` for non-trivial changes
511
614
  8. ✅ Verify functionality works as expected
615
+ 9. ✅ If touching `pixie test` / scorecard / runner / eval code, run `uv run pytest tests/pixie/cli/test_e2e_pixie_test.py -v` — all 42 e2e tests must pass (10 realistic + 32 edge-case)
616
+ 10. ✅ If touching `pixie test` / scorecard code, also run the **agent verification protocol** (section 4a) — manually run `pixie test` on the realistic fixture and inspect console + scorecard output
512
617
 
513
618
  **Development cycle:**
514
619
 
@@ -519,8 +624,9 @@ This project has strict error-handling conventions due to operating inside OTel
519
624
  5. Implement feature (reuse existing code when possible)
520
625
  6. After each task: run tests and type check
521
626
  7. Run linting (`uv run ruff check .`)
522
- 8. Update docs and changelog for the task
523
- 9. Fix any issues
524
- 10. Commit
627
+ 8. Run `pixie test` e2e suite if CLI/eval/scorecard code changed
628
+ 9. Update docs and changelog for the task
629
+ 10. Fix any issues
630
+ 11. Commit
525
631
 
526
632
  Following these practices ensures high code quality, type safety, maintainability, and reliability.
@@ -0,0 +1,3 @@
1
+ .claude
2
+ __pycache__
3
+ pixie_qa/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pixie-qa
3
- Version: 0.1.2
3
+ Version: 0.1.8
4
4
  Summary: Automated quality assurance for AI applications
5
5
  Project-URL: Homepage, https://github.com/yiouli/pixie-qa
6
6
  Project-URL: Repository, https://github.com/yiouli/pixie-qa
@@ -45,6 +45,7 @@ Requires-Dist: opentelemetry-api>=1.27.0
45
45
  Requires-Dist: opentelemetry-sdk>=1.27.0
46
46
  Requires-Dist: piccolo[sqlite]>=1.33.0
47
47
  Requires-Dist: pydantic>=2.0
48
+ Requires-Dist: python-dotenv>=1.2.2
48
49
  Provides-Extra: all
49
50
  Requires-Dist: openinference-instrumentation-anthropic; extra == 'all'
50
51
  Requires-Dist: openinference-instrumentation-dspy; extra == 'all'
@@ -67,13 +68,11 @@ Description-Content-Type: text/markdown
67
68
 
68
69
  An agent skill for **eval-driven development** of LLM-powered applications.
69
70
 
70
- Use this skill to instrument your app, build golden datasets from real runs, write eval-based tests, and catch regressions before they ship — all from a single conversation with Claude.
71
-
72
71
  ## What the Skill Does
73
72
 
74
73
  The `eval-driven-dev` skill guides your coding agent through the full QA loop for LLM applications:
75
74
 
76
- 1. **Understand the app** — read the codebase, trace the data flow, learn what the app is supposed to do
75
+ 1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
77
76
  2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
78
77
  3. **Build a dataset** — save representative traces as test cases with `pixie dataset save`
79
78
  4. **Write eval tests** — generate `test_*.py` files with `assert_dataset_pass` and appropriate evaluators
@@ -85,7 +84,7 @@ The `eval-driven-dev` skill guides your coding agent through the full QA loop fo
85
84
  ### 1. Add the skill to your coding agent
86
85
 
87
86
  ```bash
88
- npx openskills install yiouli/pixie-qa
87
+ npx skills add yiouli/pixie-qa
89
88
  ```
90
89
 
91
90
  The accompanying Python package will be installed automatically by the skill when it's used.
@@ -2,13 +2,11 @@
2
2
 
3
3
  An agent skill for **eval-driven development** of LLM-powered applications.
4
4
 
5
- Use this skill to instrument your app, build golden datasets from real runs, write eval-based tests, and catch regressions before they ship — all from a single conversation with Claude.
6
-
7
5
  ## What the Skill Does
8
6
 
9
7
  The `eval-driven-dev` skill guides your coding agent through the full QA loop for LLM applications:
10
8
 
11
- 1. **Understand the app** — read the codebase, trace the data flow, learn what the app is supposed to do
9
+ 1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
12
10
  2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
13
11
  3. **Build a dataset** — save representative traces as test cases with `pixie dataset save`
14
12
  4. **Write eval tests** — generate `test_*.py` files with `assert_dataset_pass` and appropriate evaluators
@@ -20,7 +18,7 @@ The `eval-driven-dev` skill guides your coding agent through the full QA loop fo
20
18
  ### 1. Add the skill to your coding agent
21
19
 
22
20
  ```bash
23
- npx openskills install yiouli/pixie-qa
21
+ npx skills add yiouli/pixie-qa
24
22
  ```
25
23
 
26
24
  The accompanying Python package will be installed automatically by the skill when it's used.
@@ -0,0 +1,43 @@
1
+ # Deep Research Demo Project
2
+
3
+ ## What Changed
4
+
5
+ Added a simplified demo project under `demo/deep_research/` based on the
6
+ [GPT Researcher](https://github.com/assafelovic/gpt-researcher) project
7
+ (commit `7c32174`, Apache 2.0 license).
8
+
9
+ The demo serves as a real-world AI application that the **pixie-qa** skill
10
+ can be tested against.
11
+
12
+ ### Simplifications from the original
13
+
14
+ | Removed | Reason |
15
+ |---------|--------|
16
+ | UI (frontend + backend server) | Not needed for programmatic evaluation |
17
+ | Deep Research mode | Complex multi-agent workflow, out of scope |
18
+ | Image generation | Not needed for text-based evaluation |
19
+ | MCP integrations | External tool integrations, not needed |
20
+ | All retrievers except DuckDuckGo | Simplifies dependencies, avoids paid APIs |
21
+ | Tavily / Firecrawl scrapers | Paid service dependencies |
22
+ | PDF / DOCX export | Not needed for evaluation |
23
+ | Docker / Terraform / multi-agent | Infrastructure, not needed |
24
+
25
+ ### What remains
26
+
27
+ - Programmatic entry point (`run.py`) to run research with a string query
28
+ - Full agent workflow: query → sub-queries → web search → scrape → summarize → report
29
+ - DuckDuckGo as the sole search retriever (free, no API key needed for search)
30
+ - OpenAI as the LLM provider (requires `OPENAI_API_KEY`)
31
+
32
+ ## Files Affected
33
+
34
+ - `demo/deep_research/` — entire new directory
35
+ - `demo/deep_research/gpt_researcher/` — simplified core library
36
+ - `demo/deep_research/run.py` — entry point
37
+ - `demo/deep_research/pyproject.toml` — dependencies
38
+ - `demo/deep_research/LICENSE` — Apache 2.0 (from upstream)
39
+ - `demo/deep_research/NOTICE` — attribution notice
40
+
41
+ ## Migration Notes
42
+
43
+ This is a new addition — no migration required.
@@ -0,0 +1,69 @@
1
+ # pixie test — e2e test suite
2
+
3
+ ## What changed
4
+
5
+ Added a comprehensive end-to-end test suite for the `pixie test` CLI command
6
+ with two complementary layers:
7
+
8
+ 1. **Realistic fixture tests** (10 tests) — run `pixie test` on a realistic
9
+ test file with a 5-item customer-FAQ golden dataset and 4 deterministic
10
+ mock evaluators. Verifies exit code, console summary, test names,
11
+ check/cross marks, scorecard HTML generation, evaluator names, PASS/FAIL
12
+ badges, per-input scores, summary counts, and scoring strategy descriptions.
13
+
14
+ 2. **Edge-case tests** (32 tests) — parametrised from `e2e_cases.json`
15
+ covering empty dirs, filters, verbose mode, single-file targeting, etc.
16
+
17
+ The copilot instructions now include an **agent verification protocol** that
18
+ tells the coding agent to manually run `pixie test` on the realistic fixtures
19
+ and holistically evaluate the console output and HTML scorecard after making
20
+ changes to CLI/eval/scorecard code.
21
+
22
+ ### New files
23
+
24
+ - **`tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json`** — 5-item
25
+ golden dataset with FAQ questions, chatbot answers, and reference answers.
26
+
27
+ - **`tests/pixie/cli/e2e_fixtures/mock_evaluators.py`** — 4 deterministic
28
+ mock evaluators: MockFactualityEval (SequenceMatcher string similarity),
29
+ MockClosedQAEval (keyword overlap), MockHallucinationEval (always 0.95),
30
+ MockFailingEval/MockStrictTone (always 0.2). No LLM calls.
31
+
32
+ - **`tests/pixie/cli/e2e_fixtures/test_customer_faq.py`** — Realistic test
33
+ file using `assert_dataset_pass` with different scoring strategies.
34
+ Expected: 2 PASS (`test_faq_factuality`, `test_faq_no_hallucinations`),
35
+ 2 FAIL (`test_faq_multi_evaluator`, `test_faq_tone_check`).
36
+
37
+ - **`tests/pixie/cli/e2e_cases.json`** — 8 edge-case scenarios as JSON data.
38
+
39
+ - **`tests/pixie/cli/test_e2e_pixie_test.py`** — Two test classes:
40
+ `TestPixieTestRealisticE2E` (10 tests) and `TestPixieTestEdgeCases`
41
+ (32 tests). Total: 42 test cases.
42
+
43
+ ### Modified files
44
+
45
+ - **`.github/copilot-instructions.md`** — Rewrote section 4a with realistic
46
+ fixture layout, mock evaluator descriptions, expected results, and a full
47
+ agent verification protocol. Updated summary checklist to require both
48
+ automated e2e tests (42) and manual agent inspection.
49
+
50
+ - **`specs/evals-harness.md`** — Updated E2E Test Suite section to describe
51
+ both realistic fixtures and edge-case scenarios.
52
+
53
+ ## Files affected
54
+
55
+ - `tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json`
56
+ - `tests/pixie/cli/e2e_fixtures/mock_evaluators.py`
57
+ - `tests/pixie/cli/e2e_fixtures/test_customer_faq.py`
58
+ - `tests/pixie/cli/e2e_cases.json`
59
+ - `tests/pixie/cli/test_e2e_pixie_test.py`
60
+ - `.github/copilot-instructions.md`
61
+ - `specs/evals-harness.md`
62
+
63
+ ## Migration notes
64
+
65
+ No API changes. The e2e test suite is purely additive.
66
+
67
+ - To add new edge-case scenarios: edit `tests/pixie/cli/e2e_cases.json`.
68
+ - To modify realistic fixture behavior: edit mock evaluators or the test file
69
+ in `tests/pixie/cli/e2e_fixtures/`.
@@ -0,0 +1,41 @@
1
+ # Scorecard branding and skill version check
2
+
3
+ ## What changed and why
4
+
5
+ Two user-facing upgrades were added:
6
+
7
+ 1. The HTML scorecard generated by `pixie test` now has a branded Pixie header
8
+ with a repo CTA and a feedback modal so users can quickly star the project
9
+ and send feedback without leaving the report context.
10
+ 2. The `eval-driven-dev` skill now ships with version metadata and a bundled
11
+ version-check script that compares the local skill resource with the latest
12
+ `main` branch metadata and updates both the skill and Python package when the
13
+ local version is behind.
14
+
15
+ ## Files affected
16
+
17
+ - `pixie/evals/scorecard.py` — adds the branded header, repo CTA, feedback
18
+ modal HTML, styling, and modal toggle script.
19
+ - `tests/pixie/evals/test_scorecard.py` — verifies the branded header and
20
+ feedback form wiring in generated scorecard HTML.
21
+ - `tests/pixie/cli/test_e2e_pixie_test.py` — verifies the generated scorecard
22
+ contains the branding and feedback actions end-to-end.
23
+ - `tests/pixie/test_skill_resources.py` — verifies the skill version metadata
24
+ and update command behavior.
25
+ - `.claude/skills/eval-driven-dev/resources/version.json` — source of truth for
26
+ the local skill version metadata.
27
+ - `.claude/skills/eval-driven-dev/resources/check_version.py` — checks the
28
+ remote version file and triggers updates when needed.
29
+ - `.claude/skills/eval-driven-dev/SKILL.md` — instructs the skill to run the
30
+ version check before any other skill steps.
31
+ - `README.md`, `docs/package.md`, and `specs/evals-harness.md` — document the
32
+ new header and skill update flow.
33
+
34
+ ## Migration notes
35
+
36
+ - No API changes are required for existing `pixie test` users. The scorecard
37
+ remains a self-contained HTML file, now with extra header UI and a feedback
38
+ form.
39
+ - The version-check script is additive. If network access or package-manager
40
+ commands are unavailable, it exits cleanly after printing what could not be
41
+ checked or run.
@@ -0,0 +1,28 @@
1
+ # Scorecard: Evaluator cell "details" dialog
2
+
3
+ ## What changed
4
+
5
+ Each evaluator score cell in the scorecard detail table now has a **"details"** hyperlink.
6
+ Clicking it opens a modal dialog showing:
7
+
8
+ - **Score** — numeric value with green ✓ or red ✗ indicator
9
+ - **Reasoning** — the full `Evaluation.reasoning` string (previously only shown as a tooltip)
10
+ - **Details** — the `Evaluation.details` dict rendered as pretty-printed JSON (hidden when empty)
11
+
12
+ The modal is dismissible via the **Close** button, the **Esc** key, or a click on the backdrop — consistent with the existing feedback modal.
13
+
14
+ ## Files affected
15
+
16
+ - `pixie/evals/scorecard.py`
17
+ - Added `import json`
18
+ - New `_render_eval_detail_modal()` helper — renders the reusable hidden modal
19
+ - `generate_scorecard_html()` — calls `_render_eval_detail_modal()` after the brand header
20
+ - `_render_pass_table()` — each evaluator cell now embeds a `data-eval` JSON attribute and a `details` link
21
+ - `_HTML_HEAD` — added CSS for `.details-link`, `.eval-detail-body`, `.eval-detail-row`, `.eval-detail-label`, `.eval-detail-value`, `.eval-detail-score-pass/fail`, `.eval-detail-json`
22
+ - `_HTML_FOOT` — added `showEvalDetail(link)` and `closeEvalDetailModal()` JS functions; updated Esc and backdrop-click handlers to also close the eval-detail modal
23
+
24
+ ## Migration notes
25
+
26
+ No API changes. The `AssertRecord`, `Evaluation`, and `ScorecardReport` models are unchanged.
27
+ Existing scorecards already stored `Evaluation.reasoning` as a cell `title` attribute (tooltip);
28
+ that attribute has been replaced by the clickable details link — tooltip-only access is no longer available.
@@ -0,0 +1,76 @@
1
+ # Skill v2: setup-vs-iterate, eval boundary, rootdir discovery
2
+
3
+ ## What changed and why
4
+
5
+ ### 1. Renamed default root directory from `.pixie` to `pixie_qa`
6
+
7
+ The dot-prefix `.pixie` caused Python import resolution issues (treated as hidden
8
+ directory, confused with relative imports). Renamed to `pixie_qa` — a plain,
9
+ importable name that avoids these problems.
10
+
11
+ - `pixie/config.py`: `DEFAULT_ROOT` changed from `".pixie"` to `"pixie_qa"`
12
+ - All documentation updated: SKILL.md, pixie-api.md, specs/agent-skill.md
13
+
14
+ ### 2. Test runner rootdir discovery (pytest-style)
15
+
16
+ The old `_load_module()` in `pixie/evals/runner.py` added the test file's parent
17
+ and grandparent to `sys.path`. This broke for test files nested deeper than two
18
+ levels from the project root (e.g. `pixie_qa/tests/test_foo.py`).
19
+
20
+ Rewrote `_load_module()` to use rootdir discovery: `_find_rootdir()` walks up from
21
+ the test file directory looking for `pyproject.toml`, `setup.py`, or `setup.cfg` —
22
+ the same strategy pytest uses. The discovered rootdir is added to `sys.path`,
23
+ making project-root imports work regardless of test file depth.
24
+
25
+ ### 3. SKILL.md: setup vs. iteration checkpoint
26
+
27
+ Added a "Setup vs. Iteration" section at the top of the skill. When the user says
28
+ "setup QA" / "set up evals" / "add tests", the agent now stops after Stage 6
29
+ (first test run) and reports results without fixing anything. It only proceeds
30
+ to Stage 7 (investigate and fix) if the user explicitly confirms.
31
+
32
+ Previously, the skill had no checkpoint — the agent would eagerly iterate on
33
+ failures, modifying application code without being asked.
34
+
35
+ ### 4. SKILL.md: eval boundary guidance
36
+
37
+ Added "The eval boundary: what to evaluate" section. Evals focus on LLM-dependent
38
+ behaviour only (response quality, routing decisions, prompt effectiveness). Tool
39
+ implementations, database queries, keyword matching, and other deterministic logic
40
+ are explicitly out of scope — they should be tested with traditional unit tests.
41
+
42
+ The investigation section (Stage 7) now classifies failures into "LLM-related"
43
+ and "non-LLM" categories with guidance on how to handle each.
44
+
45
+ ### 5. SKILL.md: instrument production code only
46
+
47
+ Strengthened Stage 3 with explicit rules against creating wrapper functions or
48
+ alternate code paths for eval purposes. Added a ❌ WRONG example showing the
49
+ anti-pattern (creating `run_for_eval()` that duplicates `main()` logic) and
50
+ ✅ CORRECT examples showing `@observe` on existing functions and
51
+ `start_observation` context manager inside existing functions.
52
+
53
+ ## Files affected
54
+
55
+ | File | Change |
56
+ | -------------------------------------------------------- | ------------------------------------------------- |
57
+ | `pixie/config.py` | `DEFAULT_ROOT = "pixie_qa"` |
58
+ | `pixie/instrumentation/handlers.py` | Docstring updated |
59
+ | `pixie/evals/runner.py` | New `_find_rootdir()`, rewritten `_load_module()` |
60
+ | `tests/pixie/test_config.py` | Updated assertions for `"pixie_qa"` default |
61
+ | `tests/pixie/evals/test_runner.py` | 8 new tests (rootdir + import resolution) |
62
+ | `.claude/skills/eval-driven-dev/SKILL.md` | Major rewrite (issues 3, 4, 5 + rename) |
63
+ | `.claude/skills/eval-driven-dev/references/pixie-api.md` | Config table updated |
64
+ | `specs/agent-skill.md` | `.pixie` → `pixie_qa` throughout |
65
+
66
+ ## Migration notes
67
+
68
+ - **Breaking default change**: The default root directory changed from `.pixie` to
69
+ `pixie_qa`. Existing projects using the old default should either:
70
+ - Set `PIXIE_ROOT=.pixie` to preserve the old location, or
71
+ - Rename the directory: `mv .pixie pixie_qa`
72
+ - **Test runner**: `_load_module()` now uses rootdir discovery instead of
73
+ parent/grandparent. No action needed — this is backwards compatible and more
74
+ reliable.
75
+ - **Skill behaviour**: Agents following the updated SKILL.md will stop after
76
+ initial test setup and ask before iterating on failures.
@@ -0,0 +1,54 @@
1
+ # Test Scorecard Feature
2
+
3
+ ## What Changed
4
+
5
+ Added an HTML scorecard report that is automatically generated and saved to disk
6
+ for every `pixie test` command run. The scorecard provides a detailed,
7
+ human-readable breakdown of eval-based test results beyond the terminal summary.
8
+
9
+ ### Scorecard Contents
10
+
11
+ - **Test run overview** — command args, timestamp, pass/fail summary, and a
12
+ table of all discovered tests with their status badges.
13
+ - **Per-test-function detail** — for each test that calls `assert_pass` or
14
+ `assert_dataset_pass`:
15
+ - Human-readable scoring strategy description.
16
+ - Per-evaluator pass rate summary table.
17
+ - Input × evaluator score grid with hover tooltips showing reasoning.
18
+ - Tabbed view for multi-pass runs (one tab per pass).
19
+
20
+ ### Scorecard Storage
21
+
22
+ HTML files are saved to `{config.root}/scorecards/<YYYYMMDD-HHMMSS-normalized-args>.html`.
23
+ The CLI prints the full path after each run so users can click to open it.
24
+
25
+ ## Files Affected
26
+
27
+ ### New Files
28
+
29
+ - `pixie/evals/scorecard.py` — data models (`AssertRecord`, `TestRecord`,
30
+ `ScorecardReport`), `ScorecardCollector` (context-var-based accumulator),
31
+ HTML generation, and `save_scorecard()`.
32
+ - `tests/pixie/evals/test_scorecard.py` — 28 tests covering models, collector,
33
+ HTML generation, file saving, and integration with `assert_pass` / runner.
34
+
35
+ ### Modified Files
36
+
37
+ - `pixie/evals/eval_utils.py` — `assert_pass` now publishes an `AssertRecord`
38
+ to the active `ScorecardCollector` (no-op when no collector is active).
39
+ - `pixie/evals/runner.py` — `_run_single()` activates a `ScorecardCollector`
40
+ per test; `EvalTestResult` gains an `assert_records` field.
41
+ - `pixie/cli/test_command.py` — builds a `ScorecardReport`, calls
42
+ `save_scorecard()`, and prints the path.
43
+ - `pixie/evals/__init__.py` — re-exports `ScorecardCollector`,
44
+ `ScorecardReport`, `generate_scorecard_html`, `save_scorecard`.
45
+ - `docs/package.md` — documents the HTML scorecard section under "Running Tests".
46
+
47
+ ## Migration Notes
48
+
49
+ - No breaking API changes. Existing `pixie test` invocations behave identically
50
+ to before, with the addition of an HTML file being written and a path printed
51
+ at the end.
52
+ - `EvalTestResult.assert_records` defaults to an empty tuple, so any code
53
+ that accesses `EvalTestResult` is unaffected.
54
+ - The scorecard directory (`{config.root}/scorecards/`) is created on demand.
@@ -188,13 +188,32 @@ All evaluators are importable from `pixie` (e.g. `from pixie import FactualityEv
188
188
 
189
189
  ## Running Tests
190
190
 
191
- Use `pixie-test` (not bare `pytest`) to run eval tests. It sets up the async environment and provides eval-specific output formatting:
191
+ Use `pixie test` (or the equivalent `pixie-test` entry point, not bare `pytest`)
192
+ to run eval tests. It sets up the async environment and provides eval-specific
193
+ output formatting:
192
194
 
193
195
  ```bash
194
- pixie-test # run all test_*.py in the current directory
195
- pixie-test tests/ # specify a path
196
- pixie-test -k factuality # filter by name substring
197
- pixie-test -v # verbose: shows per-case scores and reasoning
196
+ pixie test # run all test_*.py in the current directory
197
+ pixie test tests/ # specify a path
198
+ pixie test -k factuality # filter by name substring
199
+ pixie test -v # verbose: shows per-case scores and reasoning
200
+ ```
201
+
202
+ ### HTML Scorecard
203
+
204
+ Every `pixie test` run generates an **HTML scorecard** saved to `{PIXIE_ROOT}/scorecards/<timestamp>.html`. The scorecard contains:
205
+
206
+ - **Test run overview** — command args, pass/fail summary, and a table of all tests with their status.
207
+ - **Per-test detail** — for each test function that calls `assert_pass` / `assert_dataset_pass`:
208
+ - Scoring strategy description (human-readable).
209
+ - Per-evaluator pass rate table.
210
+ - Per-input × per-evaluator score grid with tooltips showing reasoning.
211
+ - **Tabbed view** for multi-pass runs (one tab per pass).
212
+
213
+ After the test run, the CLI prints the scorecard path:
214
+
215
+ ```text
216
+ See /path/to/pixie_qa/scorecards/20250615-120000-pixie-test.html for test details
198
217
  ```
199
218
 
200
219
  ---
@@ -20,6 +20,7 @@ import json
20
20
  import sys
21
21
  from typing import TextIO
22
22
 
23
+ from dotenv import load_dotenv
23
24
  from piccolo.engine.sqlite import SQLiteEngine
24
25
  from pydantic import JsonValue
25
26
 
@@ -183,6 +184,8 @@ def main(argv: list[str] | None = None) -> int:
183
184
  parser.print_help()
184
185
  return 1
185
186
 
187
+ load_dotenv()
188
+
186
189
  if args.command == "dataset":
187
190
  if args.dataset_action is None:
188
191
  parser.parse_args(["dataset", "--help"])
@@ -5,15 +5,48 @@ Usage::
5
5
  pixie test [path] [--filter PATTERN] [--verbose]
6
6
 
7
7
  Discovers and runs eval test functions, reporting pass/fail results.
8
+ Generates an HTML scorecard report saved to
9
+ ``{config.root}/scorecards/<timestamp>.html``.
8
10
  """
9
11
 
10
12
  from __future__ import annotations
11
13
 
12
14
  import argparse
13
15
  import sys
16
+ from collections.abc import Sequence
14
17
 
15
18
  import pixie.instrumentation as px
16
19
  from pixie.evals.runner import discover_tests, format_results, run_tests
20
+ from pixie.evals.scorecard import ScorecardReport, TestRecord, save_scorecard
21
+
22
+
23
+ def _build_report(
24
+ results: Sequence[object],
25
+ command_args: str,
26
+ ) -> ScorecardReport:
27
+ """Build a :class:`ScorecardReport` from runner results.
28
+
29
+ Args:
30
+ results: List of ``EvalTestResult`` objects.
31
+ command_args: The command-line arguments string.
32
+
33
+ Returns:
34
+ A fully populated ``ScorecardReport``.
35
+ """
36
+ from pixie.evals.runner import EvalTestResult
37
+
38
+ test_records: list[TestRecord] = []
39
+ for r in results:
40
+ assert isinstance(r, EvalTestResult)
41
+ test_records.append(
42
+ TestRecord(
43
+ name=r.name,
44
+ status=r.status,
45
+ message=r.message,
46
+ asserts=list(r.assert_records),
47
+ )
48
+ )
49
+ return ScorecardReport(command_args=command_args, test_records=test_records)
17
50
 
18
51
 
19
52
  def main(argv: list[str] | None = None) -> int:
@@ -60,6 +93,13 @@ def main(argv: list[str] | None = None) -> int:
60
93
  output = format_results(results, verbose=args.verbose)
61
94
  print(output) # noqa: T201
62
95
 
96
+ # ── Generate and save scorecard ───────────────────────────────
97
+ raw_argv = argv if argv is not None else sys.argv[1:]
98
+ command_str = "pixie test " + " ".join(raw_argv)
99
+ report = _build_report(results, command_args=command_str)
100
+ scorecard_path = save_scorecard(report)
101
+ print(f"\nSee {scorecard_path} for test details") # noqa: T201
102
+
63
103
  all_passed = all(r.status == "passed" for r in results)
64
104
  return 0 if all_passed else 1
65
105
 
@@ -11,14 +11,14 @@ import os
11
11
  from dataclasses import dataclass
12
12
 
13
13
  #: Default root directory for all pixie-generated artefacts.
14
- DEFAULT_ROOT = ".pixie"
14
+ DEFAULT_ROOT = "pixie_qa"
15
15
 
16
16
 
17
17
  @dataclass(frozen=True)
18
18
  class PixieConfig:
19
19
  """Immutable configuration snapshot.
20
20
 
21
- All paths default to subdirectories / files within a single ``.pixie``
21
+ All paths default to subdirectories / files within a single ``pixie_qa``
22
22
  project folder so that observations, datasets, tests, scripts and notes
23
23
  live in one predictable location.
24
24