pixie-qa 0.1.10__tar.gz → 0.1.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/PKG-INFO +3 -3
  2. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/README.md +2 -2
  3. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pyproject.toml +1 -1
  4. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/skills/eval-driven-dev/SKILL.md +6 -6
  5. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/skills/eval-driven-dev/resources/check_version.py +3 -5
  6. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/.github/copilot-instructions.md +0 -0
  7. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/.github/workflows/publish.yml +0 -0
  8. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/.gitignore +0 -0
  9. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/LICENSE +0 -0
  10. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/async-handler-processing.md +0 -0
  11. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/autoevals-adapters.md +0 -0
  12. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/cli-dataset-commands.md +0 -0
  13. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/dataset-management.md +0 -0
  14. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/deep-research-demo.md +0 -0
  15. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/eval-harness.md +0 -0
  16. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/expected-output-in-evals.md +0 -0
  17. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/instrumentation-module-implementation.md +0 -0
  18. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/loud-failure-mode.md +0 -0
  19. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/manual-instrumentation-usability.md +0 -0
  20. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/observation-store-implementation.md +0 -0
  21. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/pixie-directory-and-skill-improvements.md +0 -0
  22. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/pixie-test-e2e-suite.md +0 -0
  23. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/root-package-exports-and-trace-id.md +0 -0
  24. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/scorecard-branding-and-skill-version-check.md +0 -0
  25. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/scorecard-eval-detail-dialog.md +0 -0
  26. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/skill-v2-and-rootdir-discovery.md +0 -0
  27. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/test-scorecard.md +0 -0
  28. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/changelogs/usability-utils.md +0 -0
  29. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/docs/package.md +0 -0
  30. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/__init__.py +0 -0
  31. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/cli/__init__.py +0 -0
  32. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/cli/dataset_command.py +0 -0
  33. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/cli/main.py +0 -0
  34. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/cli/test_command.py +0 -0
  35. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/config.py +0 -0
  36. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/dataset/__init__.py +0 -0
  37. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/dataset/models.py +0 -0
  38. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/dataset/store.py +0 -0
  39. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/evals/__init__.py +0 -0
  40. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/evals/criteria.py +0 -0
  41. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/evals/eval_utils.py +0 -0
  42. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/evals/evaluation.py +0 -0
  43. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/evals/runner.py +0 -0
  44. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/evals/scorecard.py +0 -0
  45. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/evals/scorers.py +0 -0
  46. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/evals/trace_capture.py +0 -0
  47. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/evals/trace_helpers.py +0 -0
  48. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/favicon.png +0 -0
  49. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/instrumentation/__init__.py +0 -0
  50. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/instrumentation/context.py +0 -0
  51. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/instrumentation/handler.py +0 -0
  52. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/instrumentation/handlers.py +0 -0
  53. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/instrumentation/instrumentors.py +0 -0
  54. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/instrumentation/observation.py +0 -0
  55. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/instrumentation/processor.py +0 -0
  56. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/instrumentation/queue.py +0 -0
  57. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/instrumentation/spans.py +0 -0
  58. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/storage/__init__.py +0 -0
  59. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/storage/evaluable.py +0 -0
  60. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/storage/piccolo_conf.py +0 -0
  61. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/storage/piccolo_migrations/__init__.py +0 -0
  62. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/storage/serialization.py +0 -0
  63. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/storage/store.py +0 -0
  64. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/storage/tables.py +0 -0
  65. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/pixie/storage/tree.py +0 -0
  66. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/skills/eval-driven-dev/references/pixie-api.md +0 -0
  67. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/specs/agent-skill-1.md +0 -0
  68. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/specs/agent-skill.md +0 -0
  69. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/specs/autoevals-adapters.md +0 -0
  70. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/specs/dataset-management.md +0 -0
  71. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/specs/evals-harness.md +0 -0
  72. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/specs/expected-output-in-evals.md +0 -0
  73. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/specs/instrumentation.md +0 -0
  74. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/specs/manual-instrumentation-usability.md +0 -0
  75. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/specs/storage.md +0 -0
  76. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/specs/usability-utils.md +0 -0
  77. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/__init__.py +0 -0
  78. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/__init__.py +0 -0
  79. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/cli/__init__.py +0 -0
  80. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/cli/e2e_cases.json +0 -0
  81. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/cli/e2e_fixtures/conftest.py +0 -0
  82. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json +0 -0
  83. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/cli/e2e_fixtures/mock_evaluators.py +0 -0
  84. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/cli/e2e_fixtures/test_customer_faq.py +0 -0
  85. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/cli/test_dataset_command.py +0 -0
  86. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/cli/test_e2e_pixie_test.py +0 -0
  87. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/cli/test_main.py +0 -0
  88. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/dataset/__init__.py +0 -0
  89. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/dataset/test_models.py +0 -0
  90. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/dataset/test_store.py +0 -0
  91. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/evals/__init__.py +0 -0
  92. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/evals/test_criteria.py +0 -0
  93. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/evals/test_eval_utils.py +0 -0
  94. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/evals/test_evaluation.py +0 -0
  95. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/evals/test_runner.py +0 -0
  96. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/evals/test_scorecard.py +0 -0
  97. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/evals/test_scorers.py +0 -0
  98. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/evals/test_trace_capture.py +0 -0
  99. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/evals/test_trace_helpers.py +0 -0
  100. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/instrumentation/__init__.py +0 -0
  101. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/instrumentation/conftest.py +0 -0
  102. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/instrumentation/test_context.py +0 -0
  103. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/instrumentation/test_handler.py +0 -0
  104. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/instrumentation/test_integration.py +0 -0
  105. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/instrumentation/test_observation.py +0 -0
  106. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/instrumentation/test_processor.py +0 -0
  107. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/instrumentation/test_queue.py +0 -0
  108. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/instrumentation/test_spans.py +0 -0
  109. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/instrumentation/test_storage_handler.py +0 -0
  110. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/observation_store/__init__.py +0 -0
  111. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/observation_store/conftest.py +0 -0
  112. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/observation_store/test_evaluable.py +0 -0
  113. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/observation_store/test_serialization.py +0 -0
  114. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/observation_store/test_store.py +0 -0
  115. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/observation_store/test_tree.py +0 -0
  116. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/test_config.py +0 -0
  117. {pixie_qa-0.1.10 → pixie_qa-0.1.11}/tests/pixie/test_init.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pixie-qa
3
- Version: 0.1.10
3
+ Version: 0.1.11
4
4
  Summary: Automated quality assurance for AI applications
5
5
  Project-URL: Homepage, https://github.com/yiouli/pixie-qa
6
6
  Project-URL: Repository, https://github.com/yiouli/pixie-qa
@@ -66,11 +66,11 @@ Description-Content-Type: text/markdown
66
66
 
67
67
  # pixie-qa
68
68
 
69
- An agent skill for **eval-driven development** of LLM-powered applications.
69
+ An agent skill that makes your coding agent the QA engineer for LLM applications.
70
70
 
71
71
  ## What the Skill Does
72
72
 
73
- The `eval-driven-dev` skill guides your coding agent through the full QA loop for LLM applications:
73
+ The `qa-eval` skill guides your coding agent through the full eval-based QA loop for LLM applications:
74
74
 
75
75
  1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
76
76
  2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
@@ -1,10 +1,10 @@
1
1
  # pixie-qa
2
2
 
3
- An agent skill for **eval-driven development** of LLM-powered applications.
3
+ An agent skill that makes your coding agent the QA engineer for LLM applications.
4
4
 
5
5
  ## What the Skill Does
6
6
 
7
- The `eval-driven-dev` skill guides your coding agent through the full QA loop for LLM applications:
7
+ The `qa-eval` skill guides your coding agent through the full eval-based QA loop for LLM applications:
8
8
 
9
9
  1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
10
10
  2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "pixie-qa"
3
- version = "0.1.10"
3
+ version = "0.1.11"
4
4
  description = "Automated quality assurance for AI applications"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -1,23 +1,23 @@
1
1
  ---
2
2
  name: eval-driven-dev
3
- description: Instrument Python LLM apps, build golden datasets, write eval-based tests, run them, and root-cause failures — covering the full eval-driven development cycle. Make sure to use this skill whenever a user is developing, testing, QA-ing, evaluating, or benchmarking a Python project that calls an LLM, even if they don't say "evals" explicitly. Use for making sure an AI app works correctly, catching regressions after prompt changes, debugging why an agent started behaving differently, or validating output quality before shipping.
3
+ description: Add instrumentation, build golden datasets, write eval-based tests, run them, root-cause failures, and iterate. Ensure your Python LLM application works correctly. Make sure to use this skill whenever a user is developing, testing, QA-ing, evaluating, or benchmarking a Python project that calls an LLM. Use for making sure an LLM application works correctly, catching regressions after prompt changes, fixing unexpected behavior, or validating output quality before shipping.
4
4
  license: MIT
5
5
  compatibility: Python 3.11+
6
6
  metadata:
7
- version: 0.1.10
7
+ version: 0.1.11
8
8
  ---
9
9
 
10
- # Eval-Driven Development with pixie
10
+ # Evaluation-Driven Development for Python LLM Applications
11
11
 
12
12
  This skill is about doing the work, not describing it. When a user asks you to set up evals for their app, you should be reading their code, editing their files, running commands, and producing a working test pipeline — not writing a plan for them to follow later.
13
13
 
14
14
  ## Startup checks (always first)
15
15
 
16
- Before doing anything else, perform these two steps in order:
16
+ This skill uses the Python `pixie-qa` package. Before doing anything else, perform these two steps:
17
17
 
18
18
  ### 1. Upgrade pixie-qa
19
19
 
20
- Always attempt to upgrade the `pixie-qa` package in the user's environment regardless of whether the skill itself is outdated. Detect the package manager from the project (check for `uv.lock`, `poetry.lock`, `requirements.txt`, or a plain `pip` environment) and run the appropriate upgrade command:
20
+ Attempt to upgrade the `pixie-qa` package in the user's environment. Detect the package manager from the project (check for `uv.lock`, `poetry.lock`, `requirements.txt`, or a plain `pip` environment) and run the appropriate upgrade command:
21
21
 
22
22
  - **uv**: `uv add pixie-qa --upgrade` (or `uv sync --upgrade-package pixie-qa`)
23
23
  - **poetry**: `poetry add pixie-qa@latest`
@@ -27,7 +27,7 @@ If the upgrade fails (e.g., no network, version conflict), log the error and con
27
27
 
28
28
  ### 2. Check skill version
29
29
 
30
- After upgrading pixie-qa, check whether a newer version of this skill itself is available by running the script `check_version.py`.
30
+ Check whether a newer version of this skill itself is available by running the script `check_version.py`.
31
31
 
32
32
  If there is a newer version, reinstall the skill with `npx skills add` before proceeding.
33
33
 
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env python3
2
- """Check whether the eval-driven-dev skill is outdated and update it if needed."""
2
+ """Check whether the qa-eval skill is outdated and update it if needed."""
3
3
 
4
4
  from __future__ import annotations
5
5
 
@@ -9,9 +9,7 @@ from urllib.error import URLError
9
9
  from urllib.request import urlopen
10
10
 
11
11
  SKILL_REPO = "/yiouli/pixie-qa/"
12
- SKILL_URL = (
13
- f"https://raw.githubusercontent.com{SKILL_REPO}main/skills/eval-driven-dev/SKILL.md"
14
- )
12
+ SKILL_URL = f"https://raw.githubusercontent.com{SKILL_REPO}main/skills/qa-eval/SKILL.md"
15
13
 
16
14
  _RE_FRONTMATTER = re.compile(r"^---\s*\n(.*?)\n---", re.DOTALL)
17
15
  _RE_NAME = re.compile(r"^name:\s*(.+)$", re.MULTILINE)
@@ -57,7 +55,7 @@ def _normalise_version(version: str) -> tuple[int, ...]:
57
55
 
58
56
  def main() -> int:
59
57
  resource_dir = Path(__file__).resolve().parent
60
- skill_dir = resource_dir.parent # skills/eval-driven-dev/
58
+ skill_dir = resource_dir.parent # skills/qa-eval/
61
59
 
62
60
  local_data = _load_local_version(skill_dir)
63
61
  local_version = local_data.get("version", "0.0.0")
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes