pixie-qa 0.4.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. {pixie_qa-0.4.0 → pixie_qa-0.5.0}/PKG-INFO +15 -11
  2. {pixie_qa-0.4.0 → pixie_qa-0.5.0}/README.md +14 -9
  3. pixie_qa-0.5.0/pixie/__init__.py +90 -0
  4. pixie_qa-0.5.0/pixie/assets/webui.html +64 -0
  5. pixie_qa-0.5.0/pixie/cli/__init__.py +11 -0
  6. {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/cli/analyze_command.py +3 -3
  7. pixie_qa-0.5.0/pixie/cli/format_command.py +223 -0
  8. pixie_qa-0.5.0/pixie/cli/main.py +202 -0
  9. pixie_qa-0.5.0/pixie/cli/test_command.py +178 -0
  10. pixie_qa-0.5.0/pixie/cli/trace_command.py +128 -0
  11. {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/config.py +8 -8
  12. pixie_qa-0.5.0/pixie/eval/__init__.py +143 -0
  13. pixie_qa-0.5.0/pixie/eval/evaluable.py +100 -0
  14. {pixie_qa-0.4.0/pixie/evals → pixie_qa-0.5.0/pixie/eval}/evaluation.py +8 -14
  15. {pixie_qa-0.4.0/pixie/evals → pixie_qa-0.5.0/pixie/eval}/llm_evaluator.py +35 -19
  16. {pixie_qa-0.4.0/pixie/evals → pixie_qa-0.5.0/pixie/eval}/rate_limiter.py +1 -1
  17. {pixie_qa-0.4.0/pixie/evals → pixie_qa-0.5.0/pixie/eval}/scorers.py +53 -50
  18. pixie_qa-0.5.0/pixie/harness/__init__.py +8 -0
  19. pixie_qa-0.5.0/pixie/harness/runnable.py +133 -0
  20. pixie_qa-0.5.0/pixie/harness/runner.py +813 -0
  21. pixie_qa-0.5.0/pixie/instrumentation/__init__.py +99 -0
  22. pixie_qa-0.5.0/pixie/instrumentation/llm_tracing.py +818 -0
  23. pixie_qa-0.5.0/pixie/instrumentation/wrap.py +323 -0
  24. pixie_qa-0.5.0/pixie/web/__init__.py +7 -0
  25. {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/web/app.py +15 -4
  26. {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/web/watcher.py +6 -3
  27. {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pyproject.toml +3 -4
  28. pixie_qa-0.4.0/pixie/__init__.py +0 -113
  29. pixie_qa-0.4.0/pixie/assets/webui.html +0 -64
  30. pixie_qa-0.4.0/pixie/cli/__init__.py +0 -6
  31. pixie_qa-0.4.0/pixie/cli/dag_command.py +0 -75
  32. pixie_qa-0.4.0/pixie/cli/dataset_command.py +0 -193
  33. pixie_qa-0.4.0/pixie/cli/main.py +0 -456
  34. pixie_qa-0.4.0/pixie/cli/test_command.py +0 -257
  35. pixie_qa-0.4.0/pixie/cli/trace_command.py +0 -294
  36. pixie_qa-0.4.0/pixie/dag/__init__.py +0 -400
  37. pixie_qa-0.4.0/pixie/dag/trace_check.py +0 -183
  38. pixie_qa-0.4.0/pixie/dataset/__init__.py +0 -11
  39. pixie_qa-0.4.0/pixie/dataset/models.py +0 -21
  40. pixie_qa-0.4.0/pixie/dataset/store.py +0 -212
  41. pixie_qa-0.4.0/pixie/evals/__init__.py +0 -184
  42. pixie_qa-0.4.0/pixie/evals/criteria.py +0 -61
  43. pixie_qa-0.4.0/pixie/evals/dataset_runner.py +0 -495
  44. pixie_qa-0.4.0/pixie/evals/eval_utils.py +0 -334
  45. pixie_qa-0.4.0/pixie/evals/scorecard.py +0 -252
  46. pixie_qa-0.4.0/pixie/evals/trace_capture.py +0 -70
  47. pixie_qa-0.4.0/pixie/evals/trace_helpers.py +0 -57
  48. pixie_qa-0.4.0/pixie/instrumentation/__init__.py +0 -80
  49. pixie_qa-0.4.0/pixie/instrumentation/context.py +0 -86
  50. pixie_qa-0.4.0/pixie/instrumentation/handler.py +0 -72
  51. pixie_qa-0.4.0/pixie/instrumentation/handlers.py +0 -105
  52. pixie_qa-0.4.0/pixie/instrumentation/instrumentors.py +0 -47
  53. pixie_qa-0.4.0/pixie/instrumentation/observation.py +0 -217
  54. pixie_qa-0.4.0/pixie/instrumentation/processor.py +0 -366
  55. pixie_qa-0.4.0/pixie/instrumentation/queue.py +0 -88
  56. pixie_qa-0.4.0/pixie/instrumentation/spans.py +0 -165
  57. pixie_qa-0.4.0/pixie/storage/__init__.py +0 -27
  58. pixie_qa-0.4.0/pixie/storage/evaluable.py +0 -140
  59. pixie_qa-0.4.0/pixie/storage/piccolo_conf.py +0 -10
  60. pixie_qa-0.4.0/pixie/storage/piccolo_migrations/__init__.py +0 -1
  61. pixie_qa-0.4.0/pixie/storage/serialization.py +0 -227
  62. pixie_qa-0.4.0/pixie/storage/store.py +0 -231
  63. pixie_qa-0.4.0/pixie/storage/tables.py +0 -21
  64. pixie_qa-0.4.0/pixie/storage/tree.py +0 -199
  65. pixie_qa-0.4.0/pixie/web/__init__.py +0 -1
  66. {pixie_qa-0.4.0 → pixie_qa-0.5.0}/.gitignore +0 -0
  67. {pixie_qa-0.4.0 → pixie_qa-0.5.0}/LICENSE +0 -0
  68. {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/assets/mock-data.json +0 -0
  69. {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/cli/init_command.py +0 -0
  70. {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/cli/start_command.py +0 -0
  71. {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/favicon.png +0 -0
  72. /pixie_qa-0.4.0/pixie/evals/test_result.py → /pixie_qa-0.5.0/pixie/harness/run_result.py +0 -0
  73. {pixie_qa-0.4.0 → pixie_qa-0.5.0}/pixie/web/server.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pixie-qa
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: Automated quality assurance for AI applications
5
5
  Project-URL: Homepage, https://github.com/yiouli/pixie-qa
6
6
  Project-URL: Repository, https://github.com/yiouli/pixie-qa
@@ -44,7 +44,6 @@ Requires-Dist: openai>=2.29.0
44
44
  Requires-Dist: openinference-instrumentation>=0.1.44
45
45
  Requires-Dist: opentelemetry-api>=1.27.0
46
46
  Requires-Dist: opentelemetry-sdk>=1.27.0
47
- Requires-Dist: piccolo[sqlite]>=1.33.0
48
47
  Requires-Dist: pydantic>=2.0
49
48
  Requires-Dist: python-dotenv>=1.2.2
50
49
  Requires-Dist: starlette>=1.0.0
@@ -70,20 +69,19 @@ Description-Content-Type: text/markdown
70
69
 
71
70
  # pixie-qa
72
71
 
73
- An agent skill that make coding agent the QA engineer for LLM applications.
72
+ An agent skill that makes coding agents the QA engineer for LLM applications.
74
73
 
75
74
  ## What the Skill Does
76
75
 
77
76
  The `qa-eval` skill guides your coding agent through the full eval-based QA loop for LLM applications:
78
77
 
79
78
  1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
80
- 2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
81
- 3. **Build a dataset** — save representative traces as test cases with `pixie dataset save`
82
- 4. **Write eval tests** — generate `test_*.py` files with `assert_dataset_pass` and appropriate evaluators
83
- 5. **Validate datasets** — `pixie dataset validate [dir_or_dataset_path]` to catch schema/config errors early
84
- 6. **Run the tests** — `pixie test` to run all evals and report per-case scores
85
- 7. **Analyse results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
86
- 8. **Investigate failures** — look up the stored trace for each failure, diagnose, fix, repeat
79
+ 2. **Instrument it** — use `wrap()` for data-object tracing and OpenInference auto-instrumentation for LLM span capture
80
+ 3. **Build a dataset** — create JSON datasets of representative inputs and expected outputs
81
+ 4. **Write eval tests** — generate `test_*.py` files with appropriate evaluators
82
+ 5. **Run the tests** — `pixie test` to run all evals and report per-case scores
83
+ 6. **Analyse results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
84
+ 7. **Investigate failures** — diagnose failures, fix, repeat
87
85
 
88
86
  ## Getting Started
89
87
 
@@ -105,7 +103,13 @@ Your coding agent will read your code, instrument it, build a dataset from a few
105
103
 
106
104
  ## Python Package
107
105
 
108
- The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. For the package API and CLI reference, see [docs/package.md](docs/package.md).
106
+ The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. API docs are auto-generated by `pdoc3` into [docs/pixie/index.md](docs/pixie/index.md) via pre-commit. The markdown renderer uses [scripts/pdoc_templates/text.mako](scripts/pdoc_templates/text.mako) so async functions and methods are explicitly shown as `async def` in signatures.
107
+
108
+ Install hooks once per clone:
109
+
110
+ ```bash
111
+ uv run pre-commit install
112
+ ```
109
113
 
110
114
  ## Web UI
111
115
 
@@ -1,19 +1,18 @@
1
1
  # pixie-qa
2
2
 
3
- An agent skill that make coding agent the QA engineer for LLM applications.
3
+ An agent skill that makes coding agents the QA engineer for LLM applications.
4
4
 
5
5
  ## What the Skill Does
6
6
 
7
7
  The `qa-eval` skill guides your coding agent through the full eval-based QA loop for LLM applications:
8
8
 
9
9
  1. **Understand the code** — read the codebase, trace the data flow, learn what the code is supposed to do
10
- 2. **Instrument it** — add `enable_storage()` and `@observe` so every run is captured to a local SQLite database
11
- 3. **Build a dataset** — save representative traces as test cases with `pixie dataset save`
12
- 4. **Write eval tests** — generate `test_*.py` files with `assert_dataset_pass` and appropriate evaluators
13
- 5. **Validate datasets** — `pixie dataset validate [dir_or_dataset_path]` to catch schema/config errors early
14
- 6. **Run the tests** — `pixie test` to run all evals and report per-case scores
15
- 7. **Analyse results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
16
- 8. **Investigate failures** — look up the stored trace for each failure, diagnose, fix, repeat
10
+ 2. **Instrument it** — use `wrap()` for data-object tracing and OpenInference auto-instrumentation for LLM span capture
11
+ 3. **Build a dataset** — create JSON datasets of representative inputs and expected outputs
12
+ 4. **Write eval tests** — generate `test_*.py` files with appropriate evaluators
13
+ 5. **Run the tests** — `pixie test` to run all evals and report per-case scores
14
+ 6. **Analyse results** — `pixie analyze <test_id>` to get LLM-generated analysis of test results
15
+ 7. **Investigate failures** — diagnose failures, fix, repeat
17
16
 
18
17
  ## Getting Started
19
18
 
@@ -35,7 +34,13 @@ Your coding agent will read your code, instrument it, build a dataset from a few
35
34
 
36
35
  ## Python Package
37
36
 
38
- The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. For the package API and CLI reference, see [docs/package.md](docs/package.md).
37
+ The `pixie-qa` Python package (imported as `pixie`) is what Claude installs and uses inside your project. API docs are auto-generated by `pdoc3` into [docs/pixie/index.md](docs/pixie/index.md) via pre-commit. The markdown renderer uses [scripts/pdoc_templates/text.mako](scripts/pdoc_templates/text.mako) so async functions and methods are explicitly shown as `async def` in signatures.
38
+
39
+ Install hooks once per clone:
40
+
41
+ ```bash
42
+ uv run pre-commit install
43
+ ```
39
44
 
40
45
  ## Web UI
41
46
 
@@ -0,0 +1,90 @@
1
+ """pixie — automated quality assurance for AI applications.
2
+
3
+ Re-exports the full public API so users can ``from pixie import ...``
4
+ for every commonly used symbol without needing submodule paths.
5
+ """
6
+
7
+ from pixie.eval.evaluable import Evaluable, TestCase
8
+ from pixie.eval.evaluation import Evaluation, Evaluator, evaluate
9
+ from pixie.eval.llm_evaluator import create_llm_evaluator
10
+ from pixie.eval.scorers import (
11
+ AnswerCorrectness,
12
+ AnswerRelevancy,
13
+ AutoevalsAdapter,
14
+ Battle,
15
+ ClosedQA,
16
+ ContextRelevancy,
17
+ EmbeddingSimilarity,
18
+ ExactMatch,
19
+ Factuality,
20
+ Faithfulness,
21
+ Humor,
22
+ JSONDiff,
23
+ LevenshteinMatch,
24
+ ListContains,
25
+ Moderation,
26
+ NumericDiff,
27
+ Possible,
28
+ Security,
29
+ Sql,
30
+ Summary,
31
+ Translation,
32
+ ValidJSON,
33
+ )
34
+
35
+ # -- Harness ------------------------------------------------------------------
36
+ from pixie.harness.runnable import Runnable
37
+
38
+ # -- Instrumentation ----------------------------------------------------------
39
+ from pixie.instrumentation.llm_tracing import (
40
+ add_handler,
41
+ enable_llm_tracing,
42
+ flush,
43
+ remove_handler,
44
+ )
45
+ from pixie.instrumentation.wrap import (
46
+ WrappedData,
47
+ wrap,
48
+ )
49
+
50
+ __all__ = [
51
+ # Instrumentation
52
+ "WrappedData",
53
+ "flush",
54
+ "enable_llm_tracing",
55
+ "add_handler",
56
+ "remove_handler",
57
+ "wrap",
58
+ # Harness
59
+ "Runnable",
60
+ # Eval data models
61
+ "Evaluable",
62
+ "TestCase",
63
+ "Evaluation",
64
+ "Evaluator",
65
+ "evaluate",
66
+ "create_llm_evaluator",
67
+ # Pre-made evaluators (autoevals adapters)
68
+ "AnswerCorrectness",
69
+ "AnswerRelevancy",
70
+ "AutoevalsAdapter",
71
+ "Battle",
72
+ "ClosedQA",
73
+ "ContextRelevancy",
74
+ "EmbeddingSimilarity",
75
+ "ExactMatch",
76
+ "Factuality",
77
+ "Faithfulness",
78
+ "Humor",
79
+ "JSONDiff",
80
+ "LevenshteinMatch",
81
+ "ListContains",
82
+ "Moderation",
83
+ "NumericDiff",
84
+ "Possible",
85
+ "Security",
86
+ "Sql",
87
+ "Summary",
88
+ "Translation",
89
+ "ValidJSON",
90
+ ]