pixie-qa 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixie_qa-0.1.0/.claude/settings.local.json +42 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev/SKILL.md +282 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev/evals/evals.json +52 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev/evals/sample-projects/email-classifier/extractor.py +40 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev/evals/sample-projects/email-classifier/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev/evals/sample-projects/email-classifier-mock/extractor.py +57 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev/evals/sample-projects/email-classifier-mock/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev/evals/sample-projects/qa-app-with-tests/pixie_datasets/qa-golden-set.json +23 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev/evals/sample-projects/qa-app-with-tests/qa_app.py +26 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev/evals/sample-projects/qa-app-with-tests/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev/evals/sample-projects/qa-app-with-tests/tests/test_qa.py +24 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev/evals/sample-projects/rag-chatbot/chatbot.py +53 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev/evals/sample-projects/rag-chatbot/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev/evals/sample-projects/rag-chatbot-mock/chatbot.py +46 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev/evals/sample-projects/rag-chatbot-mock/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev/references/pixie-api.md +190 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/benchmark.json +363 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/benchmark.md +13 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/eval_metadata.json +13 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/with_skill/outputs/metrics.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/with_skill/outputs/response.md +176 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/with_skill/run-1/grading.json +43 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/with_skill/run-1/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/without_skill/outputs/metrics.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/without_skill/outputs/response.md +180 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/without_skill/run-1/grading.json +44 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/without_skill/run-1/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/eval_metadata.json +13 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/with_skill/outputs/metrics.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/with_skill/outputs/response.md +330 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/with_skill/run-1/grading.json +44 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/with_skill/run-1/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/without_skill/outputs/metrics.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/without_skill/outputs/response.md +387 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/without_skill/run-1/grading.json +44 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/without_skill/run-1/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/eval_metadata.json +14 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/with_skill/outputs/metrics.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/with_skill/outputs/response.md +329 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/with_skill/run-1/grading.json +49 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/with_skill/run-1/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/without_skill/outputs/metrics.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/without_skill/outputs/response.md +243 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/without_skill/run-1/grading.json +49 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/without_skill/run-1/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/benchmark.json +353 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/benchmark.md +13 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/eval_metadata.json +13 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/grading.json +51 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/outputs/metrics.json +33 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/outputs/summary.md +49 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/project/pixie_datasets/qa-golden-set.json +23 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/project/qa_app.py +26 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/project/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/project/tests/test_qa.py +24 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/with_skill/run-1/timing.json +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/grading.json +51 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/outputs/metrics.json +47 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/outputs/summary.md +87 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/project/pixie_datasets/qa-golden-set.json +23 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/project/qa_app.py +26 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/project/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/project/tests/test_qa.py +46 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-debug-failures/without_skill/run-1/timing.json +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/eval_metadata.json +13 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/grading.json +52 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/outputs/metrics.json +45 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/outputs/summary.md +80 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project/MEMORY.md +83 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project/build_dataset.py +141 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project/extractor.py +46 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project/tests/__init__.py +0 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/project/tests/test_email_extraction.py +229 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/with_skill/run-1/timing.json +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/grading.json +52 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/outputs/metrics.json +28 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/outputs/summary.md +56 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/project/build_dataset.py +108 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/project/extractor.py +55 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/project/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/project/test_extractor.py +290 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-json-extraction/without_skill/run-1/timing.json +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/eval_metadata.json +13 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/grading.json +51 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/outputs/metrics.json +15 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/outputs/summary.md +75 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/project/MEMORY.md +52 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/project/build_dataset.py +91 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/project/chatbot.py +60 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/project/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/project/tests/test_rag_chatbot.py +109 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/with_skill/run-1/timing.json +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/grading.json +52 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/outputs/metrics.json +14 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/outputs/summary.md +50 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/project/build_dataset.py +56 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/project/chatbot.py +66 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/project/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/project/test_chatbot.py +137 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-2/eval-rag-chatbot/without_skill/run-1/timing.json +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/benchmark.json +363 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/benchmark.md +13 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/eval_metadata.json +12 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/grading.json +47 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/project/MEMORY.md +40 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/project/pixie_datasets/qa-golden-set.json +23 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/project/qa_app.py +26 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/project/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/project/tests/test_qa.py +25 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/run-1/grading.json +47 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/run-1/outputs/MEMORY.md +40 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/run-1/outputs/test_qa.py +25 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/with_skill/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/grading.json +53 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/project/INVESTIGATION_NOTES.md +74 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/project/pixie_datasets/qa-golden-set.json +23 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/project/qa_app.py +26 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/project/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/project/tests/test_qa.py +83 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/run-1/grading.json +53 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/run-1/outputs/INVESTIGATION_NOTES.md +74 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/run-1/outputs/test_qa.py +83 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-debug-failures/without_skill/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/eval_metadata.json +14 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/grading.json +57 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/MEMORY.md +23 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/build_dataset.py +91 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/extractor.py +64 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/run_evals.sh +23 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/project/tests/test_classifier.py +117 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/run-1/grading.json +57 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/run-1/outputs/MEMORY.md +23 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/run-1/outputs/build_dataset.py +91 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/run-1/outputs/extractor.py +64 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/run-1/outputs/test_classifier.py +117 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/with_skill/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/grading.json +63 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/project/collect_traces.py +80 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/project/extractor.py +57 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/project/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/run-1/grading.json +63 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/run-1/outputs/collect_traces.py +80 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/run-1/outputs/extractor.py +57 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-email-classifier/without_skill/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/eval_metadata.json +14 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/grading.json +67 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/MEMORY.md +27 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/chatbot.py +51 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/pixie_datasets/rag-chatbot-golden.json +37 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/pixie_observations.db +0 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project/tests/test_chatbot.py +21 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/run-1/grading.json +67 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/run-1/outputs/MEMORY.md +27 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/run-1/outputs/chatbot.py +51 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/run-1/outputs/test_chatbot.py +21 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/timing.json +6 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/grading.json +63 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/project/capture_traces.py +92 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/project/chatbot.py +53 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/project/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/project/test_chatbot_evals.py +273 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/run-1/grading.json +63 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/run-1/outputs/capture_traces.py +92 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/run-1/outputs/chatbot.py +53 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/run-1/outputs/test_chatbot_evals.py +273 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/without_skill/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/benchmark.json +363 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/benchmark.md +13 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/eval_metadata.json +12 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/grading.json +40 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/project/MEMORY.md +40 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/project/pixie_datasets/qa-golden-set.json +23 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/project/qa_app.py +26 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/project/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/project/tests/test_qa.py +25 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/run-1/grading.json +40 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/run-1/outputs/MEMORY.md +40 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/run-1/outputs/test_qa.py +25 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/with_skill/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/grading.json +51 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/project/pixie_datasets/qa-golden-set.json +23 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/project/qa_app.py +26 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/project/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/project/tests/test_qa.py +24 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/run-1/grading.json +51 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/run-1/outputs/test_qa.py +24 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-debug-failures/without_skill/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/eval_metadata.json +14 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/grading.json +50 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/MEMORY.md +23 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/extractor.py +63 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/pixie_datasets/email-classifier-golden.json +29 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/pixie_observations.db +0 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/project/tests/test_email_classifier.py +86 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/run-1/grading.json +50 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/run-1/outputs/MEMORY.md +23 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/run-1/outputs/extractor.py +63 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/run-1/outputs/test_email_classifier.py +86 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/with_skill/timing.json +6 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/grading.json +57 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/conftest.py +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/extractor.py +57 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/generate_dataset.py +78 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/instrumented_extractor.py +22 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/pytest.ini +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/project/test_email_classifier.py +329 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/run-1/grading.json +57 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/run-1/outputs/extractor.py +57 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/run-1/outputs/test_email_classifier.py +329 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-email-classifier/without_skill/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/eval_metadata.json +14 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/grading.json +50 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/MEMORY.md +22 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/chatbot.py +52 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/pixie_datasets/rag-chatbot-golden.json +29 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/pixie_observations.db +0 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project/tests/test_rag_chatbot.py +28 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/run-1/grading.json +50 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/run-1/outputs/MEMORY.md +22 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/run-1/outputs/chatbot.py +52 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/run-1/outputs/test_rag_chatbot.py +28 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/timing.json +6 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/grading.json +57 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/project/chatbot.py +46 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/project/chatbot_instrumented.py +72 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/project/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/project/save_dataset.py +86 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/project/test_chatbot_evals.py +180 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/run-1/grading.json +57 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/run-1/outputs/chatbot_instrumented.py +72 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/run-1/outputs/test_chatbot_evals.py +180 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/without_skill/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/benchmark.json +363 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/benchmark.md +13 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/eval_metadata.json +12 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/grading.json +71 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/project/MEMORY.md +51 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/project/pixie_datasets/qa-golden-set.json +23 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/project/qa_app.py +26 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/project/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/project/tests/test_qa.py +60 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/grading.json +71 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/outputs/MEMORY.md +51 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/outputs/pixie_datasets/qa-golden-set.json +23 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/outputs/qa_app.py +26 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/outputs/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/run-1/outputs/tests/test_qa.py +60 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/with_skill/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/grading.json +77 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/project/MEMORY.md +48 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/project/pixie_datasets/qa-golden-set.json +23 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/project/qa_app.py +26 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/project/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/project/tests/test_qa.py +44 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/grading.json +77 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/outputs/MEMORY.md +48 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/outputs/pixie_datasets/qa-golden-set.json +23 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/outputs/qa_app.py +26 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/outputs/requirements.txt +2 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/run-1/outputs/tests/test_qa.py +44 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-debug-failures/without_skill/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/eval_metadata.json +14 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/grading.json +77 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/project/MEMORY.md +48 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/project/build_dataset.py +93 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/project/extractor.py +65 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/project/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/project/tests/test_email_classifier.py +22 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/grading.json +77 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/outputs/MEMORY.md +48 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/outputs/build_dataset.py +93 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/outputs/extractor.py +65 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/outputs/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/run-1/outputs/tests/test_email_classifier.py +22 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/with_skill/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/grading.json +82 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/project/build_dataset.py +156 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/project/extractor.py +62 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/project/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/project/test_email_classifier.py +345 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/run-1/grading.json +82 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/run-1/outputs/build_dataset.py +156 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/run-1/outputs/extractor.py +62 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/run-1/outputs/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/run-1/outputs/test_email_classifier.py +345 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-email-classifier/without_skill/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/eval_metadata.json +14 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/grading.json +81 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/project/MEMORY.md +71 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/project/build_dataset.py +63 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/project/chatbot.py +53 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/project/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/project/tests/test_rag_chatbot.py +54 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/grading.json +81 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/outputs/MEMORY.md +71 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/outputs/build_dataset.py +63 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/outputs/chatbot.py +53 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/outputs/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/run-1/outputs/tests/test_rag_chatbot.py +54 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/with_skill/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/grading.json +81 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/project/MEMORY.md +62 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/project/chatbot.py +52 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/project/datasets/rag-chatbot-golden.json +41 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/project/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/project/test_chatbot_eval.py +152 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/grading.json +81 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/outputs/MEMORY.md +62 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/outputs/chatbot.py +52 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/outputs/datasets/rag-chatbot-golden.json +41 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/outputs/requirements.txt +1 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/run-1/outputs/test_chatbot_eval.py +152 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/iteration-5/eval-rag-chatbot/without_skill/timing.json +5 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/review-iteration-1.html +1325 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/review-iteration-2.html +1325 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/review-iteration-3.html +1325 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/review-iteration-4.html +1325 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/review-iteration-5.html +1325 -0
- pixie_qa-0.1.0/.claude/skills/eval-driven-dev-workspace/trigger-eval-set.json +82 -0
- pixie_qa-0.1.0/.github/copilot-instructions.md +526 -0
- pixie_qa-0.1.0/.github/workflows/daily-release.yml +139 -0
- pixie_qa-0.1.0/.github/workflows/deploy-docs.yml +171 -0
- pixie_qa-0.1.0/.github/workflows/publish.yml +80 -0
- pixie_qa-0.1.0/.gitignore +4 -0
- pixie_qa-0.1.0/LICENSE +21 -0
- pixie_qa-0.1.0/PKG-INFO +162 -0
- pixie_qa-0.1.0/README.md +97 -0
- pixie_qa-0.1.0/changelogs/async-handler-processing.md +96 -0
- pixie_qa-0.1.0/changelogs/autoevals-adapters.md +39 -0
- pixie_qa-0.1.0/changelogs/cli-dataset-commands.md +37 -0
- pixie_qa-0.1.0/changelogs/dataset-management.md +91 -0
- pixie_qa-0.1.0/changelogs/eval-harness.md +128 -0
- pixie_qa-0.1.0/changelogs/expected-output-in-evals.md +42 -0
- pixie_qa-0.1.0/changelogs/instrumentation-module-implementation.md +55 -0
- pixie_qa-0.1.0/changelogs/manual-instrumentation-usability.md +56 -0
- pixie_qa-0.1.0/changelogs/observation-store-implementation.md +53 -0
- pixie_qa-0.1.0/changelogs/usability-utils.md +60 -0
- pixie_qa-0.1.0/docs/package.md +213 -0
- pixie_qa-0.1.0/pixie/__init__.py +11 -0
- pixie_qa-0.1.0/pixie/cli/__init__.py +6 -0
- pixie_qa-0.1.0/pixie/cli/dataset_command.py +193 -0
- pixie_qa-0.1.0/pixie/cli/main.py +192 -0
- pixie_qa-0.1.0/pixie/cli/test_command.py +68 -0
- pixie_qa-0.1.0/pixie/config.py +41 -0
- pixie_qa-0.1.0/pixie/dataset/__init__.py +11 -0
- pixie_qa-0.1.0/pixie/dataset/models.py +21 -0
- pixie_qa-0.1.0/pixie/dataset/store.py +212 -0
- pixie_qa-0.1.0/pixie/evals/__init__.py +111 -0
- pixie_qa-0.1.0/pixie/evals/criteria.py +77 -0
- pixie_qa-0.1.0/pixie/evals/eval_utils.py +244 -0
- pixie_qa-0.1.0/pixie/evals/evaluation.py +112 -0
- pixie_qa-0.1.0/pixie/evals/runner.py +187 -0
- pixie_qa-0.1.0/pixie/evals/scorers.py +755 -0
- pixie_qa-0.1.0/pixie/evals/trace_capture.py +70 -0
- pixie_qa-0.1.0/pixie/evals/trace_helpers.py +57 -0
- pixie_qa-0.1.0/pixie/instrumentation/__init__.py +49 -0
- pixie_qa-0.1.0/pixie/instrumentation/context.py +86 -0
- pixie_qa-0.1.0/pixie/instrumentation/handler.py +72 -0
- pixie_qa-0.1.0/pixie/instrumentation/handlers.py +83 -0
- pixie_qa-0.1.0/pixie/instrumentation/instrumentors.py +31 -0
- pixie_qa-0.1.0/pixie/instrumentation/observation.py +211 -0
- pixie_qa-0.1.0/pixie/instrumentation/processor.py +366 -0
- pixie_qa-0.1.0/pixie/instrumentation/queue.py +88 -0
- pixie_qa-0.1.0/pixie/instrumentation/spans.py +165 -0
- pixie_qa-0.1.0/pixie/storage/__init__.py +27 -0
- pixie_qa-0.1.0/pixie/storage/evaluable.py +129 -0
- pixie_qa-0.1.0/pixie/storage/piccolo_conf.py +10 -0
- pixie_qa-0.1.0/pixie/storage/piccolo_migrations/__init__.py +1 -0
- pixie_qa-0.1.0/pixie/storage/serialization.py +227 -0
- pixie_qa-0.1.0/pixie/storage/store.py +231 -0
- pixie_qa-0.1.0/pixie/storage/tables.py +21 -0
- pixie_qa-0.1.0/pixie/storage/tree.py +199 -0
- pixie_qa-0.1.0/pyproject.toml +101 -0
- pixie_qa-0.1.0/specs/agent-skill.md +54 -0
- pixie_qa-0.1.0/specs/autoevals-adapters.md +301 -0
- pixie_qa-0.1.0/specs/dataset-management.md +727 -0
- pixie_qa-0.1.0/specs/evals-harness.md +542 -0
- pixie_qa-0.1.0/specs/expected-output-in-evals.md +141 -0
- pixie_qa-0.1.0/specs/instrumentation.md +726 -0
- pixie_qa-0.1.0/specs/manual-instrumentation-usability.md +767 -0
- pixie_qa-0.1.0/specs/storage.md +473 -0
- pixie_qa-0.1.0/specs/usability-utils.md +327 -0
- pixie_qa-0.1.0/tests/__init__.py +0 -0
- pixie_qa-0.1.0/tests/pixie/__init__.py +0 -0
- pixie_qa-0.1.0/tests/pixie/cli/__init__.py +0 -0
- pixie_qa-0.1.0/tests/pixie/cli/test_dataset_command.py +412 -0
- pixie_qa-0.1.0/tests/pixie/cli/test_main.py +261 -0
- pixie_qa-0.1.0/tests/pixie/dataset/__init__.py +0 -0
- pixie_qa-0.1.0/tests/pixie/dataset/test_models.py +64 -0
- pixie_qa-0.1.0/tests/pixie/dataset/test_store.py +222 -0
- pixie_qa-0.1.0/tests/pixie/evals/__init__.py +0 -0
- pixie_qa-0.1.0/tests/pixie/evals/test_criteria.py +116 -0
- pixie_qa-0.1.0/tests/pixie/evals/test_eval_utils.py +531 -0
- pixie_qa-0.1.0/tests/pixie/evals/test_evaluation.py +177 -0
- pixie_qa-0.1.0/tests/pixie/evals/test_runner.py +238 -0
- pixie_qa-0.1.0/tests/pixie/evals/test_scorers.py +558 -0
- pixie_qa-0.1.0/tests/pixie/evals/test_trace_capture.py +205 -0
- pixie_qa-0.1.0/tests/pixie/evals/test_trace_helpers.py +154 -0
- pixie_qa-0.1.0/tests/pixie/instrumentation/__init__.py +0 -0
- pixie_qa-0.1.0/tests/pixie/instrumentation/conftest.py +35 -0
- pixie_qa-0.1.0/tests/pixie/instrumentation/test_context.py +157 -0
- pixie_qa-0.1.0/tests/pixie/instrumentation/test_handler.py +192 -0
- pixie_qa-0.1.0/tests/pixie/instrumentation/test_integration.py +208 -0
- pixie_qa-0.1.0/tests/pixie/instrumentation/test_observation.py +150 -0
- pixie_qa-0.1.0/tests/pixie/instrumentation/test_processor.py +560 -0
- pixie_qa-0.1.0/tests/pixie/instrumentation/test_queue.py +223 -0
- pixie_qa-0.1.0/tests/pixie/instrumentation/test_spans.py +254 -0
- pixie_qa-0.1.0/tests/pixie/instrumentation/test_storage_handler.py +108 -0
- pixie_qa-0.1.0/tests/pixie/observation_store/__init__.py +0 -0
- pixie_qa-0.1.0/tests/pixie/observation_store/conftest.py +231 -0
- pixie_qa-0.1.0/tests/pixie/observation_store/test_evaluable.py +151 -0
- pixie_qa-0.1.0/tests/pixie/observation_store/test_serialization.py +156 -0
- pixie_qa-0.1.0/tests/pixie/observation_store/test_store.py +289 -0
- pixie_qa-0.1.0/tests/pixie/observation_store/test_tree.py +248 -0
- pixie_qa-0.1.0/tests/pixie/test_config.py +58 -0
- pixie_qa-0.1.0/uv.lock +1456 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(mkdir -p /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/with_skill/outputs\nmkdir -p /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-rag-chatbot/without_skill/outputs\nmkdir -p /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/with_skill/outputs\nmkdir -p /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-json-extraction/without_skill/outputs\nmkdir -p /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/with_skill/outputs\nmkdir -p /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-1/eval-debug-failures/without_skill/outputs\necho \"Directories created\")",
|
|
5
|
+
"Bash(cd /home/yiouli/repo/pixie-qa/.claude/skills/skill-creator && python -m scripts.aggregate_benchmark /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-1 --skill-name eval-driven-dev 2>&1)",
|
|
6
|
+
"Bash(python3 -c \"\nimport sys; sys.path.insert\\(0, '/home/yiouli/repo/pixie-qa/.claude/skills/skill-creator'\\)\nfrom scripts.aggregate_benchmark import generate_benchmark\nimport inspect, json\nsrc = inspect.getsource\\(generate_benchmark\\)\nprint\\(src[:3000]\\)\n\" 2>&1)",
|
|
7
|
+
"Bash(python3 -c \"\nimport sys; sys.path.insert\\(0, '/home/yiouli/repo/pixie-qa/.claude/skills/skill-creator'\\)\nfrom scripts.aggregate_benchmark import load_run_results\nimport inspect\nsrc = inspect.getsource\\(load_run_results\\)\nprint\\(src\\)\n\" 2>&1)",
|
|
8
|
+
"Bash(python -m scripts.aggregate_benchmark /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-1 --skill-name eval-driven-dev 2>&1)",
|
|
9
|
+
"Bash(python /home/yiouli/repo/pixie-qa/.claude/skills/skill-creator/eval-viewer/generate_review.py \\\\\n /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-1 \\\\\n --skill-name \"eval-driven-dev\" \\\\\n --benchmark /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-1/benchmark.json \\\\\n --static /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/review-iteration-1.html \\\\\n 2>&1)",
|
|
10
|
+
"Bash(mkdir -p /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev/evals/sample-projects/rag-chatbot\nmkdir -p /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev/evals/sample-projects/email-classifier\nmkdir -p /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev/evals/sample-projects/qa-app-with-tests/tests\nmkdir -p /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev/evals/sample-projects/qa-app-with-tests/pixie_datasets\necho \"done\")",
|
|
11
|
+
"Bash(python -m scripts.aggregate_benchmark /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-2 --skill-name eval-driven-dev 2>&1)",
|
|
12
|
+
"Bash(python /home/yiouli/repo/pixie-qa/.claude/skills/skill-creator/eval-viewer/generate_review.py \\\\\n /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-2 \\\\\n --skill-name \"eval-driven-dev\" \\\\\n --benchmark /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-2/benchmark.json \\\\\n --previous-workspace /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-1 \\\\\n --static /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/review-iteration-2.html \\\\\n 2>&1)",
|
|
13
|
+
"Bash(mkdir -p /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev/evals/sample-projects/rag-chatbot-mock\nmkdir -p /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev/evals/sample-projects/email-classifier-mock\necho \"done\")",
|
|
14
|
+
"Bash(cd /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-3/eval-rag-chatbot/with_skill/project && PYTHONPATH=/home/yiouli/repo/pixie-qa python chatbot.py)",
|
|
15
|
+
"Bash(PYTHONPATH=/home/yiouli/repo/pixie-qa pixie dataset create rag-chatbot-golden && PYTHONPATH=/home/yiouli/repo/pixie-qa pixie dataset list)",
|
|
16
|
+
"Bash(for i:*)",
|
|
17
|
+
"Bash(for f:*)",
|
|
18
|
+
"Bash(for id:*)",
|
|
19
|
+
"Bash(cd /home/yiouli/repo/pixie-qa/.claude/skills/skill-creator && python -m scripts.aggregate_benchmark /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-3 --skill-name eval-driven-dev 2>&1)",
|
|
20
|
+
"Bash(python eval-viewer/generate_review.py \\\\\n /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-3 \\\\\n --skill-name \"eval-driven-dev\" \\\\\n --benchmark /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-3/benchmark.json \\\\\n --previous-workspace /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-2 \\\\\n --static /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/review-iteration-3.html \\\\\n 2>&1)",
|
|
21
|
+
"Bash(python eval-viewer/generate_review.py --help 2>&1 | head -30)",
|
|
22
|
+
"Bash(python -m scripts.aggregate_benchmark /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-3 --skill-name eval-driven-dev 2>&1 && cat /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-3/benchmark.md)",
|
|
23
|
+
"Bash(WORKSPACE=/home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-4\nEVALS_DIR=/home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev/evals/sample-projects\n\nmkdir -p \"$WORKSPACE/eval-rag-chatbot/with_skill/project\"\nmkdir -p \"$WORKSPACE/eval-rag-chatbot/without_skill/project\"\nmkdir -p \"$WORKSPACE/eval-email-classifier/with_skill/project\"\nmkdir -p \"$WORKSPACE/eval-email-classifier/without_skill/project\"\nmkdir -p \"$WORKSPACE/eval-debug-failures/with_skill/project\"\nmkdir -p \"$WORKSPACE/eval-debug-failures/without_skill/project\"\n\ncp \"$EVALS_DIR/rag-chatbot-mock/chatbot.py\" \"$WORKSPACE/eval-rag-chatbot/with_skill/project/\"\ncp \"$EVALS_DIR/rag-chatbot-mock/requirements.txt\" \"$WORKSPACE/eval-rag-chatbot/with_skill/project/\"\ncp \"$EVALS_DIR/rag-chatbot-mock/chatbot.py\" \"$WORKSPACE/eval-rag-chatbot/without_skill/project/\"\ncp \"$EVALS_DIR/rag-chatbot-mock/requirements.txt\" \"$WORKSPACE/eval-rag-chatbot/without_skill/project/\"\n\ncp \"$EVALS_DIR/email-classifier-mock/extractor.py\" \"$WORKSPACE/eval-email-classifier/with_skill/project/\"\ncp \"$EVALS_DIR/email-classifier-mock/requirements.txt\" \"$WORKSPACE/eval-email-classifier/with_skill/project/\"\ncp \"$EVALS_DIR/email-classifier-mock/extractor.py\" \"$WORKSPACE/eval-email-classifier/without_skill/project/\"\ncp \"$EVALS_DIR/email-classifier-mock/requirements.txt\" \"$WORKSPACE/eval-email-classifier/without_skill/project/\"\n\ncp -r \"$EVALS_DIR/qa-app-with-tests/.\" \"$WORKSPACE/eval-debug-failures/with_skill/project/\"\ncp -r \"$EVALS_DIR/qa-app-with-tests/.\" \"$WORKSPACE/eval-debug-failures/without_skill/project/\"\n\necho \"Done\")",
|
|
24
|
+
"Bash(cd /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-4/eval-rag-chatbot/with_skill/project\n\nPYTHONPATH=/home/yiouli/repo/pixie-qa python chatbot.py\nPYTHONPATH=/home/yiouli/repo/pixie-qa pixie dataset create rag-chatbot-golden\nfor i in 1 2 3 4; do PYTHONPATH=/home/yiouli/repo/pixie-qa pixie dataset save rag-chatbot-golden; done\nPYTHONPATH=/home/yiouli/repo/pixie-qa pixie dataset list)",
|
|
25
|
+
"Bash(git checkout:*)",
|
|
26
|
+
"Bash(cd /home/yiouli/repo/pixie-qa/.claude/skills/skill-creator\n\npython -m scripts.aggregate_benchmark /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-4 --skill-name eval-driven-dev 2>&1\n\npython eval-viewer/generate_review.py \\\\\n /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-4 \\\\\n --skill-name \"eval-driven-dev\" \\\\\n --benchmark /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-4/benchmark.json \\\\\n --previous-workspace /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-3 \\\\\n --static /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/review-iteration-4.html \\\\\n 2>&1)",
|
|
27
|
+
"Bash(xdg-open /tmp/eval_review_eval-driven-dev.html 2>/dev/null || open /tmp/eval_review_eval-driven-dev.html 2>/dev/null || echo \"no browser available\")",
|
|
28
|
+
"Bash(python -m scripts.run_loop \\\\\n --eval-set /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/trigger-eval-set.json \\\\\n --skill-path /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev \\\\\n --model claude-sonnet-4-6 \\\\\n --max-iterations 5 \\\\\n --verbose 2>&1 | tee /tmp/run_loop_output.txt)",
|
|
29
|
+
"Bash(ls /tmp/skill_description_report_*.html 2>/dev/null | tail -1)",
|
|
30
|
+
"Bash(xdg-open /tmp/skill_description_report_eval-driven-dev_20260312_004156.html 2>/dev/null; echo \"done\")",
|
|
31
|
+
"Bash(python -m scripts.run_eval \\\\\n --skill-path /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev \\\\\n --query \"i have a rag chatbot in ~/projects/support-bot/chatbot.py that answers questions based on our zendesk docs. it's been giving inconsistent answers lately and i want to make sure it's actually working correctly\" \\\\\n --runs 1 2>&1)",
|
|
32
|
+
"Bash(python -m scripts.run_loop \\\\\n --eval-set /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/trigger-eval-set.json \\\\\n --skill-path /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev \\\\\n --model claude-sonnet-4-6 \\\\\n --max-iterations 5 \\\\\n --verbose 2>&1 | tee /tmp/run_loop_output_v2.txt)",
|
|
33
|
+
"Bash(cd /home/yiouli/repo/pixie-qa && env -u CLAUDECODE claude -p \"list all available skills you can see\" --output-format text 2>&1 | head -30)",
|
|
34
|
+
"Bash(find /home/yiouli/repo/pixie-qa/.claude -name \"*.json\" -o -name \"settings*\" 2>/dev/null | head -20)",
|
|
35
|
+
"Bash(env -u CLAUDECODE claude -p \"what is the FULL description of the eval-driven-dev skill?\" --output-format text 2>&1)",
|
|
36
|
+
"Bash(cat /home/yiouli/repo/pixie-qa/.claude/skills/mcp-builder/.openskills.json 2>/dev/null | python3 -m json.tool | head -20)",
|
|
37
|
+
"Bash(WORKSPACE=/home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace\nEVALS_DIR=/home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev/evals/sample-projects\n\nmkdir -p \"$WORKSPACE/iteration-5/eval-rag-chatbot/with_skill/project\"\nmkdir -p \"$WORKSPACE/iteration-5/eval-rag-chatbot/without_skill/project\"\nmkdir -p \"$WORKSPACE/iteration-5/eval-email-classifier/with_skill/project\"\nmkdir -p \"$WORKSPACE/iteration-5/eval-email-classifier/without_skill/project\"\nmkdir -p \"$WORKSPACE/iteration-5/eval-debug-failures/with_skill/project\"\nmkdir -p \"$WORKSPACE/iteration-5/eval-debug-failures/without_skill/project\"\n\ncp \"$EVALS_DIR/rag-chatbot-mock/chatbot.py\" \"$WORKSPACE/iteration-5/eval-rag-chatbot/with_skill/project/\"\ncp \"$EVALS_DIR/rag-chatbot-mock/requirements.txt\" \"$WORKSPACE/iteration-5/eval-rag-chatbot/with_skill/project/\"\ncp \"$EVALS_DIR/rag-chatbot-mock/chatbot.py\" \"$WORKSPACE/iteration-5/eval-rag-chatbot/without_skill/project/\"\ncp \"$EVALS_DIR/rag-chatbot-mock/requirements.txt\" \"$WORKSPACE/iteration-5/eval-rag-chatbot/without_skill/project/\"\n\ncp \"$EVALS_DIR/email-classifier-mock/extractor.py\" \"$WORKSPACE/iteration-5/eval-email-classifier/with_skill/project/\"\ncp \"$EVALS_DIR/email-classifier-mock/requirements.txt\" \"$WORKSPACE/iteration-5/eval-email-classifier/with_skill/project/\"\ncp \"$EVALS_DIR/email-classifier-mock/extractor.py\" \"$WORKSPACE/iteration-5/eval-email-classifier/without_skill/project/\"\ncp \"$EVALS_DIR/email-classifier-mock/requirements.txt\" \"$WORKSPACE/iteration-5/eval-email-classifier/without_skill/project/\"\n\ncp -r \"$EVALS_DIR/qa-app-with-tests/.\" \"$WORKSPACE/iteration-5/eval-debug-failures/with_skill/project/\"\ncp -r \"$EVALS_DIR/qa-app-with-tests/.\" \"$WORKSPACE/iteration-5/eval-debug-failures/without_skill/project/\"\n\necho \"Done\")",
|
|
38
|
+
"Bash(cd /home/yiouli/repo/pixie-qa/.claude/skills/skill-creator && python -m scripts.aggregate_benchmark /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-5 --skill-name eval-driven-dev 2>&1 && cat /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-5/benchmark.md)",
|
|
39
|
+
"Bash(python eval-viewer/generate_review.py \\\\\n /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-5 \\\\\n --skill-name \"eval-driven-dev\" \\\\\n --benchmark /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-5/benchmark.json \\\\\n --previous-workspace /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/iteration-4 \\\\\n --static /home/yiouli/repo/pixie-qa/.claude/skills/eval-driven-dev-workspace/review-iteration-5.html \\\\\n 2>&1)"
|
|
40
|
+
]
|
|
41
|
+
}
|
|
42
|
+
}
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: eval-driven-dev
|
|
3
|
+
description: Instrument Python LLM apps, build golden datasets, write eval-based tests, run them, and root-cause failures — covering the full eval-driven development cycle. Make sure to use this skill whenever a user is developing, testing, QA-ing, evaluating, or benchmarking a Python project that calls an LLM, even if they don't say "evals" explicitly. Use for making sure an AI app works correctly, catching regressions after prompt changes, debugging why an agent started behaving differently, or validating output quality before shipping.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Eval-Driven Development with pixie
|
|
7
|
+
|
|
8
|
+
This skill is about doing the work, not describing it. When a user asks you to set up evals for their app, you should be reading their code, editing their files, running commands, and producing a working test pipeline — not writing a plan for them to follow later.
|
|
9
|
+
|
|
10
|
+
The loop is: understand the app → instrument it → write the test file → build a dataset → run the tests → investigate failures → iterate. In practice the stages blur and you'll be going back and forth, but this ordering helps: write all the files (instrumentation, test file, MEMORY.md) before running any commands. That way your work survives even if an execution step hits a snag.
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## Stage 1: Understand the Application
|
|
15
|
+
|
|
16
|
+
Before touching any code, spend time actually reading the source. The code will tell you more than asking the user would, and it puts you in a much better position to make good decisions about what and how to evaluate.
|
|
17
|
+
|
|
18
|
+
What you're looking for:
|
|
19
|
+
|
|
20
|
+
- The entry point and the main function(s) that do the LLM-powered work
|
|
21
|
+
- Every place external data flows into a prompt — user input, retrieved documents, database results, API responses, system prompts
|
|
22
|
+
- The final output (what the user sees or what gets returned)
|
|
23
|
+
- Any intermediate steps that might be worth evaluating separately (e.g. a retrieval step)
|
|
24
|
+
|
|
25
|
+
Write your findings down in a `MEMORY.md` file in the project (or `.claude/memory/eval-notes.md`) as you go. Include:
|
|
26
|
+
|
|
27
|
+
- How to run the app
|
|
28
|
+
- Which function(s) you'll instrument and what their `eval_input` / `eval_output` will look like
|
|
29
|
+
- The use cases the app handles
|
|
30
|
+
- Your eval plan: what to measure and which evaluators make sense
|
|
31
|
+
|
|
32
|
+
This file is how your understanding persists across sessions. Keep it updated as you learn more.
|
|
33
|
+
|
|
34
|
+
If something is genuinely unclear from the code, ask the user — but most questions answer themselves once you've read the code carefully.
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Stage 2: Decide What to Evaluate
|
|
39
|
+
|
|
40
|
+
Now that you understand the app, you can make thoughtful choices about what to measure:
|
|
41
|
+
|
|
42
|
+
- **What quality dimension matters most?** Factual accuracy for QA apps, output format for structured extraction, relevance for RAG, safety for user-facing text.
|
|
43
|
+
- **Which span to evaluate:** the whole pipeline (`root`) or just the LLM call (`last_llm_call`)? If you're debugging retrieval, you might evaluate at a different point than if you're checking final answer quality.
|
|
44
|
+
- **Which evaluators fit:** see `references/pixie-api.md` → Evaluators. For factual QA: `FactualityEval`. For structured output: `ValidJSONEval` / `JSONDiffEval`. For RAG pipelines: `ContextRelevancyEval` / `FaithfulnessEval`.
|
|
45
|
+
- **Pass criteria:** `ScoreThreshold(threshold=0.7, pct=0.8)` means 80% of cases must score ≥ 0.7. Think about what "good enough" looks like for this app.
|
|
46
|
+
- **Expected outputs:** `FactualityEval` needs them. Format evaluators usually don't.
|
|
47
|
+
|
|
48
|
+
Update your MEMORY.md with the plan before writing any code.
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## Stage 3: Instrument the Application
|
|
53
|
+
|
|
54
|
+
Edit the app's source files to add pixie instrumentation. The goal is to make every run capture its inputs and outputs as observable spans, so you can later replay those runs as eval cases.
|
|
55
|
+
|
|
56
|
+
### Add `enable_storage()` at startup
|
|
57
|
+
|
|
58
|
+
Somewhere in the app's entry point — main function or module top-level — call:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from pixie import enable_storage
|
|
62
|
+
enable_storage() # creates SQLite DB, registers handler — idempotent
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
This is what actually persists traces to disk. Without it, `@observe` decorators will still fire but nothing gets saved.
|
|
66
|
+
|
|
67
|
+
### Wrap the function(s) you want to evaluate
|
|
68
|
+
|
|
69
|
+
`@observe` on a function captures all its kwargs as `eval_input` and its return value as `eval_output`:
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
import pixie.instrumentation as px
|
|
73
|
+
|
|
74
|
+
@px.observe(name="answer_question")
|
|
75
|
+
def answer_question(question: str, context: str) -> str:
|
|
76
|
+
...
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
For more control, use the context manager:
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
with px.start_observation(input={"question": question, "context": context}, name="answer_question") as obs:
|
|
83
|
+
result = run_pipeline(question, context)
|
|
84
|
+
obs.set_output(result)
|
|
85
|
+
obs.set_metadata("retrieved_chunks", len(chunks))
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Wrap at the outermost boundary that represents one "test case" — for a RAG app that's probably `answer_question(question, context)`, not the internal LLM call. The dataset items will have the same shape as whatever this function receives and returns.
|
|
89
|
+
|
|
90
|
+
After instrumentation, call `px.flush()` at the end of runs to make sure all spans are written before you try to save them to a dataset.
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## Stage 4: Write the Eval Test File
|
|
95
|
+
|
|
96
|
+
Write the test file before building the dataset. This might seem backwards, but it forces you to decide what you're actually measuring before you start collecting data — otherwise the data collection has no direction.
|
|
97
|
+
|
|
98
|
+
Create `tests/test_<feature>.py`. The pattern is: a `runnable` adapter that calls your app function, plus an async test function that calls `assert_dataset_pass`:
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from pixie import enable_storage
|
|
102
|
+
from pixie.evals import assert_dataset_pass, FactualityEval, ScoreThreshold
|
|
103
|
+
from pixie.evals import last_llm_call # or: from pixie.evals import root
|
|
104
|
+
|
|
105
|
+
from myapp import answer_question
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def runnable(eval_input):
|
|
109
|
+
"""Replays one dataset item through the app. enable_storage() here ensures traces are captured."""
|
|
110
|
+
enable_storage()
|
|
111
|
+
answer_question(**eval_input) # or answer_question(eval_input) if it's a plain string
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
async def test_factuality():
|
|
115
|
+
await assert_dataset_pass(
|
|
116
|
+
runnable=runnable,
|
|
117
|
+
dataset_name="<dataset-name>",
|
|
118
|
+
evaluators=[FactualityEval()],
|
|
119
|
+
pass_criteria=ScoreThreshold(threshold=0.7, pct=0.8),
|
|
120
|
+
from_trace=last_llm_call, # tells the harness which span's output to evaluate
|
|
121
|
+
)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Note that `enable_storage()` belongs inside the `runnable`, not at module level in the test file — it needs to fire on each invocation so the trace is captured for that specific run.
|
|
125
|
+
|
|
126
|
+
The test runner is `pixie-test` (not `pytest` or `python -m pixie test` — those won't set up the async environment correctly):
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
pixie-test # run all test_*.py in current directory
|
|
130
|
+
pixie-test tests/ # specify path
|
|
131
|
+
pixie-test -k factuality # filter by name
|
|
132
|
+
pixie-test -v # verbose: shows per-case scores and reasoning
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Stage 5: Build the Dataset
|
|
138
|
+
|
|
139
|
+
Create the dataset first, then populate it by running the app:
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
pixie dataset create <dataset-name>
|
|
143
|
+
pixie dataset list # verify it exists
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### Option A: Capture from real runs (the natural starting point)
|
|
147
|
+
|
|
148
|
+
Run the app with representative inputs, then save each trace to the dataset:
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
# Run the app (enable_storage() must be active)
|
|
152
|
+
python -c "from myapp import main; main('What is the capital of France?')"
|
|
153
|
+
|
|
154
|
+
# Save the root span to the dataset
|
|
155
|
+
pixie dataset save <dataset-name>
|
|
156
|
+
|
|
157
|
+
# Or specifically save the last LLM call:
|
|
158
|
+
pixie dataset save <dataset-name> --select last_llm_call
|
|
159
|
+
|
|
160
|
+
# Add context:
|
|
161
|
+
pixie dataset save <dataset-name> --notes "basic geography question"
|
|
162
|
+
|
|
163
|
+
# Attach expected output for evaluators like FactualityEval:
|
|
164
|
+
echo '"Paris"' | pixie dataset save <dataset-name> --expected-output
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
Try to cover the range of inputs you actually care about: normal cases, edge cases, things the app might plausibly get wrong (empty input, ambiguous queries, no-answer cases).
|
|
168
|
+
|
|
169
|
+
### Option B: Build programmatically
|
|
170
|
+
|
|
171
|
+
When you want to bulk-load items or add expected outputs directly:
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
from pixie.dataset.store import DatasetStore
|
|
175
|
+
from pixie.storage.evaluable import Evaluable
|
|
176
|
+
|
|
177
|
+
store = DatasetStore()
|
|
178
|
+
store.create("<dataset-name>")
|
|
179
|
+
store.append("<dataset-name>", Evaluable(
|
|
180
|
+
eval_input={"question": "What is the capital of France?", "context": "Paris is the capital..."},
|
|
181
|
+
eval_output="Paris is the capital of France.",
|
|
182
|
+
expected_output="Paris",
|
|
183
|
+
))
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## Stage 6: Run the Tests
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
pixie-test tests/ -v
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
The `-v` flag shows per-case scores and reasoning, which makes it much easier to see what's passing and what isn't. Check that the pass rates look reasonable given your `ScoreThreshold`.
|
|
195
|
+
|
|
196
|
+
---
|
|
197
|
+
|
|
198
|
+
## Stage 7: Investigate Failures
|
|
199
|
+
|
|
200
|
+
When tests fail, the goal is to understand _why_, not to adjust thresholds until things pass.
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
pixie-test -v # start here — shows score and reasoning per case
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
If you need to dig into a specific trace, look up the `trace_id` from the dataset:
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
from pixie.dataset.store import DatasetStore
|
|
210
|
+
store = DatasetStore()
|
|
211
|
+
ds = store.get("<dataset-name>")
|
|
212
|
+
for i, item in enumerate(ds.items):
|
|
213
|
+
print(i, item.eval_metadata) # trace_id is here if saved via pixie dataset save
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
Then inspect the full span tree:
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
import asyncio
|
|
220
|
+
from pixie.storage.store import ObservationStore
|
|
221
|
+
|
|
222
|
+
async def inspect(trace_id: str):
|
|
223
|
+
store = ObservationStore()
|
|
224
|
+
roots = await store.get_trace(trace_id)
|
|
225
|
+
for root in roots:
|
|
226
|
+
print(root.to_text()) # full span tree: inputs, outputs, LLM messages
|
|
227
|
+
|
|
228
|
+
asyncio.run(inspect("the-trace-id-here"))
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
Common patterns to look for:
|
|
232
|
+
|
|
233
|
+
| Symptom | Likely cause |
|
|
234
|
+
| -------------------------------- | ----------------------------------------------- |
|
|
235
|
+
| Output is factually wrong | Prompt or retrieved context is bad |
|
|
236
|
+
| Output is right but score is low | Wrong `expected_output`, or criteria too strict |
|
|
237
|
+
| Score 0.0 with error details | Evaluator crashed (missing API key, etc.) |
|
|
238
|
+
| All cases fail at same point | `@observe` is on the wrong function |
|
|
239
|
+
|
|
240
|
+
Once you've diagnosed the issue, make a targeted change — to the code, prompt, dataset item, or pass criteria — and re-run. Always finish by giving the user the exact command to verify:
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
pixie-test tests/test_<feature>.py -v
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
---
|
|
247
|
+
|
|
248
|
+
## Memory Template
|
|
249
|
+
|
|
250
|
+
```markdown
|
|
251
|
+
## Project: <name>
|
|
252
|
+
|
|
253
|
+
### Entry point
|
|
254
|
+
|
|
255
|
+
`python chatbot.py` or `answer_question(question, context)` etc.
|
|
256
|
+
|
|
257
|
+
### Instrumented spans
|
|
258
|
+
|
|
259
|
+
- `answer_question(question, context)` — @observe wraps the full pipeline
|
|
260
|
+
- eval_input: {"question": str, "context": str}
|
|
261
|
+
- eval_output: str (the answer)
|
|
262
|
+
|
|
263
|
+
### Datasets
|
|
264
|
+
|
|
265
|
+
- `qa-golden-set`: N items, factual QA, includes expected_output
|
|
266
|
+
|
|
267
|
+
### Eval plan
|
|
268
|
+
|
|
269
|
+
- Evaluator: FactualityEval
|
|
270
|
+
- Pass criteria: ScoreThreshold(0.7, pct=0.8)
|
|
271
|
+
- Test file: tests/test_qa.py::test_factuality
|
|
272
|
+
|
|
273
|
+
### Known issues / findings
|
|
274
|
+
|
|
275
|
+
- ...
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
---
|
|
279
|
+
|
|
280
|
+
## Reference
|
|
281
|
+
|
|
282
|
+
See `references/pixie-api.md` for all CLI commands, evaluator signatures, and the Python dataset/store API.
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill_name": "eval-driven-dev",
|
|
3
|
+
"evals": [
|
|
4
|
+
{
|
|
5
|
+
"id": 1,
|
|
6
|
+
"prompt": "I have a RAG chatbot project at PROJECT_DIR (chatbot.py). The app runs without any API key needed. Please set up the full eval pipeline using pixie: instrument the code with pixie, actually run the app to capture traces, save them to a dataset with pixie, and write a test file.",
|
|
7
|
+
"expected_output": "The agent should: (1) read chatbot.py, (2) add enable_storage() and @observe/start_observation instrumentation, (3) actually run the app to generate traces (python -c or python chatbot.py), (4) run 'pixie dataset create' to create a dataset, (5) run 'pixie dataset save' to save traces from the run, (6) write a test file with assert_dataset_pass, (7) create a MEMORY.md.",
|
|
8
|
+
"files": ["evals/sample-projects/rag-chatbot-mock/chatbot.py"],
|
|
9
|
+
"expectations": [
|
|
10
|
+
"The agent edited chatbot.py to add @observe or start_observation wrapping answer_question",
|
|
11
|
+
"The agent added enable_storage() in chatbot.py (not just px.init())",
|
|
12
|
+
"The agent created a test file (test_*.py) with assert_dataset_pass or assert_pass",
|
|
13
|
+
"The test file calls enable_storage() inside the runnable (not at module level in the test)",
|
|
14
|
+
"The agent wrote code or commands to create a named dataset (DatasetStore().create() or 'pixie dataset create' CLI)",
|
|
15
|
+
"The agent wrote code or commands to save traces to the dataset ('pixie dataset save' CLI or DatasetStore().append())",
|
|
16
|
+
"The agent created a MEMORY.md or notes file in the project directory"
|
|
17
|
+
]
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": 2,
|
|
21
|
+
"prompt": "My email classifier app is at PROJECT_DIR (extractor.py). It extracts JSON with category, priority, and summary from emails. No API key needed to run it. Please add proper eval-based testing using pixie: instrument it, run it on some sample inputs, save the traces as a dataset, and write tests that check the JSON structure.",
|
|
22
|
+
"expected_output": "The agent should: (1) read extractor.py, (2) add @observe/enable_storage() instrumentation, (3) actually run the classifier on several emails, (4) use pixie dataset create + pixie dataset save to capture those runs into a dataset, (5) add expected_output to dataset items, (6) write a test file with ValidJSONEval or custom field-checking evaluator + assert_dataset_pass, (7) create MEMORY.md.",
|
|
23
|
+
"files": ["evals/sample-projects/email-classifier-mock/extractor.py"],
|
|
24
|
+
"expectations": [
|
|
25
|
+
"The agent edited extractor.py to add @observe or start_observation to extract_from_email",
|
|
26
|
+
"The agent added enable_storage() to extractor.py (not just px.init())",
|
|
27
|
+
"The agent actually ran the extractor on sample emails to generate traces",
|
|
28
|
+
"The agent used 'pixie dataset create' or DatasetStore().create() to create a named dataset",
|
|
29
|
+
"The agent used 'pixie dataset save' (CLI or equivalent) to save traces into the dataset",
|
|
30
|
+
"The agent wrote a test file using ValidJSONEval, JSONDiffEval, or a custom field evaluator",
|
|
31
|
+
"The agent created a MEMORY.md or notes file in the project directory"
|
|
32
|
+
]
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
"id": 3,
|
|
36
|
+
"prompt": "My pixie eval tests are failing. Project is at PROJECT_DIR. The test in tests/test_qa.py runs assert_dataset_pass against 'qa-golden-set' and I'm getting low scores. Please investigate — look at the dataset, the test code, the app code, find the root cause, fix it, and show me how to re-run.",
|
|
37
|
+
"expected_output": "The agent should: (1) read all three files (test_qa.py, qa_app.py, qa-golden-set.json), (2) identify that eval_output is null in the dataset so FactualityEval is scoring nulls, (3) fix either the dataset or the test logic, (4) explicitly show the pixie-test command to re-run and verify.",
|
|
38
|
+
"files": [
|
|
39
|
+
"evals/sample-projects/qa-app-with-tests/qa_app.py",
|
|
40
|
+
"evals/sample-projects/qa-app-with-tests/tests/test_qa.py",
|
|
41
|
+
"evals/sample-projects/qa-app-with-tests/pixie_datasets/qa-golden-set.json"
|
|
42
|
+
],
|
|
43
|
+
"expectations": [
|
|
44
|
+
"The agent read tests/test_qa.py, qa_app.py, and the dataset file",
|
|
45
|
+
"The agent identified that eval_output is null in the dataset items as the root cause",
|
|
46
|
+
"The agent made a concrete fix (populated eval_output in dataset OR rewrote test to run the app live)",
|
|
47
|
+
"The agent showed the 'pixie-test' command to re-run and verify the fix",
|
|
48
|
+
"The agent explained why the score was low (FactualityEval scoring null output vs expected answers)"
|
|
49
|
+
]
|
|
50
|
+
}
|
|
51
|
+
]
|
|
52
|
+
}
|
pixie_qa-0.1.0/.claude/skills/eval-driven-dev/evals/sample-projects/email-classifier/extractor.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Extract structured information from customer support emails using GPT-4."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
from openai import OpenAI
|
|
6
|
+
|
|
7
|
+
SYSTEM_PROMPT = """You are a support ticket classifier. Given a customer support email,
|
|
8
|
+
extract the following fields as JSON:
|
|
9
|
+
- category: one of "billing", "technical", "account", "general"
|
|
10
|
+
- priority: one of "low", "medium", "high"
|
|
11
|
+
- summary: a single sentence summarizing the issue
|
|
12
|
+
|
|
13
|
+
Respond with valid JSON only, no extra text."""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def extract_from_email(email_text: str) -> dict:
    """Classify a customer support email into structured fields.

    Sends the email to GPT-4o-mini with the classification system prompt
    and parses the JSON object the model returns.

    Raises:
        json.JSONDecodeError: if the model reply is not valid JSON.
    """
    client = OpenAI()
    chat_messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": email_text},
    ]
    # temperature=0 keeps the classification deterministic across runs.
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        temperature=0,
    )
    return json.loads(completion.choices[0].message.content)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ---- ad-hoc testing (replace with proper eval tests) ----
|
|
32
|
+
# ---- ad-hoc testing (replace with proper eval tests) ----
if __name__ == "__main__":
    # Manual smoke test over a few representative support emails.
    demo_emails = (
        "Hi, my subscription was charged twice this month. Please refund the duplicate charge ASAP.",
        "The app keeps crashing when I try to upload files larger than 10MB. This is urgent.",
        "Can you tell me how to reset my password? I can't find the option in settings.",
    )
    for message in demo_emails:
        print(extract_from_email(message))
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Email classifier — mock version that works without an API key.
|
|
2
|
+
|
|
3
|
+
This mock classifies emails using simple keyword rules.
|
|
4
|
+
Suitable for running eval infrastructure without needing OPENAI_API_KEY.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def extract_from_email(email_text: str) -> dict:
    """Classify a support email using keyword heuristics (no LLM call).

    Returns a dict with:
    - category: "billing" | "technical" | "account" | "general"
    - priority: "low" | "medium" | "high"
    - summary: one-sentence summary

    (Mock: keyword rules only, so no OPENAI_API_KEY is needed.)
    """
    lowered = email_text.lower()

    # First matching rule wins; unmatched emails fall through to "general".
    category_rules = (
        ("billing", ("charge", "refund", "invoice", "payment", "billing", "subscription")),
        ("technical", ("crash", "error", "bug", "broken", "not working", "upload", "download")),
        ("account", ("password", "login", "account", "username", "sign in", "reset")),
    )
    category = "general"
    for label, keywords in category_rules:
        if any(kw in lowered for kw in keywords):
            category = label
            break

    high_markers = ("urgent", "asap", "immediately", "critical", "crashing", "duplicate charge")
    medium_markers = ("soon", "when possible", "annoying", "frustrating")
    if any(kw in lowered for kw in high_markers):
        priority = "high"
    elif any(kw in lowered for kw in medium_markers):
        priority = "medium"
    else:
        priority = "low"

    # Summary: the lead sentence, capped at 100 chars; fallback for empty input.
    lead = email_text.strip().split(".")[0].strip()
    summary = lead[:100] if lead else "Customer support request."

    return {"category": category, "priority": priority, "summary": summary}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
if __name__ == "__main__":
    # Manual smoke test: classify a handful of representative emails.
    demo_emails = (
        "Hi, my subscription was charged twice this month. Please refund the duplicate charge ASAP.",
        "The app keeps crashing when I try to upload files larger than 10MB. This is urgent.",
        "Can you tell me how to reset my password? I can't find the option in settings.",
        "Just wondering when your mobile app will support dark mode.",
    )
    for message in demo_emails:
        print(json.dumps(extract_from_email(message), indent=2))
        print()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pixie-qa
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "qa-golden-set",
|
|
3
|
+
"items": [
|
|
4
|
+
{
|
|
5
|
+
"eval_input": {"question": "What is the capital of France?", "context": "France is a country in Western Europe. Its capital city is Paris, which is also the largest city."},
|
|
6
|
+
"eval_output": null,
|
|
7
|
+
"eval_metadata": {"notes": "basic geography"},
|
|
8
|
+
"expected_output": "Paris"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"eval_input": {"question": "Who wrote Romeo and Juliet?", "context": "Romeo and Juliet is a tragedy written by the English playwright William Shakespeare, believed to have been written between 1594 and 1596."},
|
|
12
|
+
"eval_output": null,
|
|
13
|
+
"eval_metadata": {"notes": "literature question"},
|
|
14
|
+
"expected_output": "William Shakespeare"
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"eval_input": {"question": "What is the boiling point of water?", "context": "Water boils at 100 degrees Celsius (212 degrees Fahrenheit) at standard atmospheric pressure (1 atm)."},
|
|
18
|
+
"eval_output": null,
|
|
19
|
+
"eval_metadata": {"notes": "science fact"},
|
|
20
|
+
"expected_output": "100 degrees Celsius (212 degrees Fahrenheit)"
|
|
21
|
+
}
|
|
22
|
+
]
|
|
23
|
+
}
|
pixie_qa-0.1.0/.claude/skills/eval-driven-dev/evals/sample-projects/qa-app-with-tests/qa_app.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Q&A app instrumented with pixie — has some eval tests that are currently failing."""
|
|
2
|
+
|
|
3
|
+
import pixie.instrumentation as px
|
|
4
|
+
from pixie import enable_storage
|
|
5
|
+
from anthropic import Anthropic
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@px.observe(name="answer_question")
def answer_question(question: str, context: str = "") -> str:
    """Answer a question with Claude, optionally grounded in context.

    The @px.observe decorator records the kwargs as eval_input and the
    returned answer as eval_output for later replay from a dataset.
    """
    if context:
        prompt = f"Context: {context}\n\nQuestion: {question}"
    else:
        prompt = question

    client = Anthropic()
    reply = client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=200,
        messages=[{"role": "user", "content": prompt}],
    )
    return reply.content[0].text
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def main(question: str, context: str = "") -> str:
    """Entry point: turn on trace storage, then answer one question."""
    # enable_storage() is idempotent, so calling it per invocation is safe.
    enable_storage()
    answer = answer_question(question=question, context=context)
    return answer
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Eval-based tests for the Q&A app — currently failing."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from pixie import enable_storage
|
|
5
|
+
from pixie.evals import assert_dataset_pass, FactualityEval, ScoreThreshold
|
|
6
|
+
|
|
7
|
+
from qa_app import answer_question
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def runnable(eval_input):
    """Replay one dataset item through the app so its trace is captured."""
    enable_storage()
    if isinstance(eval_input, dict):
        question = eval_input.get("question", "")
        context = eval_input.get("context", "")
    else:
        question = str(eval_input)
        context = ""
    answer_question(question=question, context=context)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
async def test_factuality():
    """Check answers against expected outputs using FactualityEval.

    Passes when at least 80% of dataset cases score >= 0.7.
    """
    criteria = ScoreThreshold(threshold=0.7, pct=0.8)
    await assert_dataset_pass(
        runnable=runnable,
        dataset_name="qa-golden-set",
        evaluators=[FactualityEval()],
        pass_criteria=criteria,
    )
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Simple RAG chatbot that answers questions using retrieved doc chunks."""
|
|
2
|
+
|
|
3
|
+
from anthropic import Anthropic
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def retrieve_docs(query: str) -> list[str]:
    """Return document chunks relevant to *query* (stubbed keyword match).

    A production version would query a vector database; here the first
    topic keyword found in the query selects a fixed chunk list.
    """
    corpus = {
        "capital": ["Paris is the capital of France.", "Berlin is the capital of Germany."],
        "population": ["France has a population of about 68 million.", "Germany has about 84 million people."],
        "language": ["French is spoken in France.", "German is spoken in Germany and Austria."],
    }
    lowered = query.lower()
    hits = [chunks for keyword, chunks in corpus.items() if keyword in lowered]
    return hits[0] if hits else ["No relevant documents found."]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def answer_question(question: str) -> str:
    """Answer *question* with Claude, grounded in retrieved doc chunks."""
    chunks = retrieve_docs(question)
    joined_context = "\n".join(chunks)

    client = Anthropic()
    prompt = f"Context:\n{joined_context}\n\nQuestion: {question}"
    reply = client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=300,
        system="You are a helpful assistant. Answer questions based only on the provided context.",
        messages=[{"role": "user", "content": prompt}],
    )
    return reply.content[0].text
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def main():
    """Run the chatbot over a few demo questions, printing each Q/A pair."""
    demo_questions = (
        "What is the capital of France?",
        "What language do people speak in Germany?",
        "What is the population of France?",
    )
    for question in demo_questions:
        print(f"Q: {question}")
        print(f"A: {answer_question(question)}")
        print()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Script entry point: run the demo questions when executed directly.
if __name__ == "__main__":
    main()
|