pixie-qa 0.1.11__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/PKG-INFO +2 -1
  2. pixie_qa-0.2.0/changelogs/observe-sensitive-field-stripping.md +22 -0
  3. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/__init__.py +2 -0
  4. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/cli/main.py +76 -0
  5. pixie_qa-0.2.0/pixie/cli/trace_command.py +186 -0
  6. pixie_qa-0.2.0/pixie/evals/llm_evaluator.py +207 -0
  7. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/observation.py +8 -2
  8. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pyproject.toml +2 -1
  9. pixie_qa-0.2.0/skills/eval-driven-dev/SKILL.md +361 -0
  10. pixie_qa-0.2.0/skills/eval-driven-dev/references/dataset-generation.md +235 -0
  11. pixie_qa-0.2.0/skills/eval-driven-dev/references/eval-tests.md +240 -0
  12. pixie_qa-0.2.0/skills/eval-driven-dev/references/instrumentation.md +174 -0
  13. pixie_qa-0.2.0/skills/eval-driven-dev/references/investigation.md +146 -0
  14. pixie_qa-0.2.0/skills/eval-driven-dev/references/pixie-api.md +257 -0
  15. pixie_qa-0.2.0/skills/eval-driven-dev/references/run-harness-patterns.md +282 -0
  16. pixie_qa-0.2.0/skills/eval-driven-dev/references/understanding-app.md +201 -0
  17. pixie_qa-0.2.0/skills/eval-driven-dev/resources/check_version.py +126 -0
  18. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/test_e2e_pixie_test.py +1 -1
  19. pixie_qa-0.2.0/tests/pixie/cli/test_trace_command.py +324 -0
  20. pixie_qa-0.2.0/tests/pixie/evals/test_llm_evaluator.py +235 -0
  21. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/test_scorecard.py +3 -3
  22. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/test_observation.py +46 -0
  23. pixie_qa-0.1.11/skills/eval-driven-dev/SKILL.md +0 -870
  24. pixie_qa-0.1.11/skills/eval-driven-dev/references/pixie-api.md +0 -195
  25. pixie_qa-0.1.11/skills/eval-driven-dev/resources/check_version.py +0 -84
  26. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/.github/copilot-instructions.md +0 -0
  27. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/.github/workflows/publish.yml +0 -0
  28. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/.gitignore +0 -0
  29. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/LICENSE +0 -0
  30. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/README.md +0 -0
  31. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/async-handler-processing.md +0 -0
  32. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/autoevals-adapters.md +0 -0
  33. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/cli-dataset-commands.md +0 -0
  34. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/dataset-management.md +0 -0
  35. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/deep-research-demo.md +0 -0
  36. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/eval-harness.md +0 -0
  37. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/expected-output-in-evals.md +0 -0
  38. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/instrumentation-module-implementation.md +0 -0
  39. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/loud-failure-mode.md +0 -0
  40. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/manual-instrumentation-usability.md +0 -0
  41. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/observation-store-implementation.md +0 -0
  42. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/pixie-directory-and-skill-improvements.md +0 -0
  43. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/pixie-test-e2e-suite.md +0 -0
  44. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/root-package-exports-and-trace-id.md +0 -0
  45. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/scorecard-branding-and-skill-version-check.md +0 -0
  46. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/scorecard-eval-detail-dialog.md +0 -0
  47. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/skill-v2-and-rootdir-discovery.md +0 -0
  48. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/test-scorecard.md +0 -0
  49. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/usability-utils.md +0 -0
  50. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/docs/package.md +0 -0
  51. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/cli/__init__.py +0 -0
  52. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/cli/dataset_command.py +0 -0
  53. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/cli/test_command.py +0 -0
  54. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/config.py +0 -0
  55. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/dataset/__init__.py +0 -0
  56. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/dataset/models.py +0 -0
  57. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/dataset/store.py +0 -0
  58. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/__init__.py +0 -0
  59. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/criteria.py +0 -0
  60. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/eval_utils.py +0 -0
  61. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/evaluation.py +0 -0
  62. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/runner.py +0 -0
  63. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/scorecard.py +0 -0
  64. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/scorers.py +0 -0
  65. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/trace_capture.py +0 -0
  66. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/trace_helpers.py +0 -0
  67. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/favicon.png +0 -0
  68. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/__init__.py +0 -0
  69. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/context.py +0 -0
  70. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/handler.py +0 -0
  71. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/handlers.py +0 -0
  72. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/instrumentors.py +0 -0
  73. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/processor.py +0 -0
  74. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/queue.py +0 -0
  75. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/spans.py +0 -0
  76. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/storage/__init__.py +0 -0
  77. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/storage/evaluable.py +0 -0
  78. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/storage/piccolo_conf.py +0 -0
  79. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/storage/piccolo_migrations/__init__.py +0 -0
  80. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/storage/serialization.py +0 -0
  81. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/storage/store.py +0 -0
  82. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/storage/tables.py +0 -0
  83. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/storage/tree.py +0 -0
  84. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/agent-skill-1.md +0 -0
  85. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/agent-skill.md +0 -0
  86. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/autoevals-adapters.md +0 -0
  87. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/dataset-management.md +0 -0
  88. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/evals-harness.md +0 -0
  89. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/expected-output-in-evals.md +0 -0
  90. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/instrumentation.md +0 -0
  91. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/manual-instrumentation-usability.md +0 -0
  92. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/storage.md +0 -0
  93. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/usability-utils.md +0 -0
  94. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/__init__.py +0 -0
  95. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/__init__.py +0 -0
  96. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/__init__.py +0 -0
  97. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/e2e_cases.json +0 -0
  98. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/e2e_fixtures/conftest.py +0 -0
  99. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json +0 -0
  100. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/e2e_fixtures/mock_evaluators.py +0 -0
  101. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/e2e_fixtures/test_customer_faq.py +0 -0
  102. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/test_dataset_command.py +0 -0
  103. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/test_main.py +0 -0
  104. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/dataset/__init__.py +0 -0
  105. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/dataset/test_models.py +0 -0
  106. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/dataset/test_store.py +0 -0
  107. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/__init__.py +0 -0
  108. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/test_criteria.py +0 -0
  109. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/test_eval_utils.py +0 -0
  110. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/test_evaluation.py +0 -0
  111. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/test_runner.py +0 -0
  112. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/test_scorers.py +0 -0
  113. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/test_trace_capture.py +0 -0
  114. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/test_trace_helpers.py +0 -0
  115. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/__init__.py +0 -0
  116. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/conftest.py +0 -0
  117. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/test_context.py +0 -0
  118. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/test_handler.py +0 -0
  119. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/test_integration.py +0 -0
  120. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/test_processor.py +0 -0
  121. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/test_queue.py +0 -0
  122. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/test_spans.py +0 -0
  123. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/test_storage_handler.py +0 -0
  124. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/observation_store/__init__.py +0 -0
  125. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/observation_store/conftest.py +0 -0
  126. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/observation_store/test_evaluable.py +0 -0
  127. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/observation_store/test_serialization.py +0 -0
  128. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/observation_store/test_store.py +0 -0
  129. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/observation_store/test_tree.py +0 -0
  130. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/test_config.py +0 -0
  131. {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/test_init.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pixie-qa
3
- Version: 0.1.11
3
+ Version: 0.2.0
4
4
  Summary: Automated quality assurance for AI applications
5
5
  Project-URL: Homepage, https://github.com/yiouli/pixie-qa
6
6
  Project-URL: Repository, https://github.com/yiouli/pixie-qa
@@ -40,6 +40,7 @@ Classifier: Topic :: Software Development :: Testing
40
40
  Requires-Python: >=3.11
41
41
  Requires-Dist: autoevals>=0.1.0
42
42
  Requires-Dist: jsonpickle>=4.0.0
43
+ Requires-Dist: openai>=2.29.0
43
44
  Requires-Dist: openinference-instrumentation>=0.1.44
44
45
  Requires-Dist: opentelemetry-api>=1.27.0
45
46
  Requires-Dist: opentelemetry-sdk>=1.27.0
@@ -0,0 +1,22 @@
1
+ # @observe: Strip `self` and `cls` from captured input
2
+
3
+ ## What changed
4
+
5
+ The `@observe` decorator now automatically removes `self` and `cls` from the
6
+ captured function arguments before serialization. Previously, decorating a
7
+ method with `@observe` would serialize the entire instance (including API keys,
8
+ client objects, and other sensitive state) into `eval_input` via jsonpickle.
9
+
10
+ ## Files affected
11
+
12
+ - `pixie/instrumentation/observation.py` — `sync_wrapper` and `async_wrapper`
13
+ now pop `self`/`cls` from `bound.arguments` before calling `_serialize()`.
14
+ - `tests/pixie/instrumentation/test_observation.py` — Two new tests:
15
+ `test_self_excluded_from_input` and `test_cls_excluded_from_input`.
16
+
17
+ ## Migration notes
18
+
19
+ No API changes. This is a backward-compatible fix. Existing `@observe` usage on
20
+ functions (not methods) is unaffected. Methods that previously leaked `self` into
21
+ `eval_input` will now produce cleaner, smaller input data containing only the
22
+ semantic arguments.
@@ -17,6 +17,7 @@ from pixie.evals.eval_utils import (
17
17
  run_and_evaluate,
18
18
  )
19
19
  from pixie.evals.evaluation import Evaluation, Evaluator, evaluate
20
+ from pixie.evals.llm_evaluator import create_llm_evaluator
20
21
  from pixie.evals.scorers import (
21
22
  AnswerCorrectnessEval,
22
23
  AnswerRelevancyEval,
@@ -96,6 +97,7 @@ __all__ = [
96
97
  "assert_dataset_pass",
97
98
  "assert_pass",
98
99
  "capture_traces",
100
+ "create_llm_evaluator",
99
101
  "evaluate",
100
102
  "last_llm_call",
101
103
  "root",
@@ -30,6 +30,7 @@ from pixie.cli.dataset_command import (
30
30
  dataset_save,
31
31
  format_dataset_table,
32
32
  )
33
+ from pixie.cli.trace_command import trace_last, trace_list, trace_show
33
34
  from pixie.config import get_config
34
35
  from pixie.dataset.store import DatasetStore
35
36
  from pixie.storage.evaluable import UNSET, _Unset
@@ -88,6 +89,58 @@ def _build_parser() -> argparse.ArgumentParser:
88
89
  help="Optional notes to attach to the evaluable metadata",
89
90
  )
90
91
 
92
+ # -- pixie trace ---------------------------------------------------------
93
+ trace_parser = subparsers.add_parser("trace", help="Inspect captured traces")
94
+ trace_sub = trace_parser.add_subparsers(dest="trace_action", help="Trace actions")
95
+
96
+ # pixie trace list [--limit N] [--errors]
97
+ trace_list_parser = trace_sub.add_parser("list", help="List recent traces")
98
+ trace_list_parser.add_argument(
99
+ "--limit",
100
+ type=int,
101
+ default=10,
102
+ help="Maximum number of traces to show (default: 10)",
103
+ )
104
+ trace_list_parser.add_argument(
105
+ "--errors",
106
+ action="store_true",
107
+ default=False,
108
+ help="Show only traces with errors",
109
+ )
110
+
111
+ # pixie trace show <trace_id> [-v] [--json]
112
+ trace_show_parser = trace_sub.add_parser("show", help="Show span tree for a trace")
113
+ trace_show_parser.add_argument(
114
+ "trace_id",
115
+ help="Trace ID (or prefix, minimum 8 characters)",
116
+ )
117
+ trace_show_parser.add_argument(
118
+ "-v",
119
+ "--verbose",
120
+ action="store_true",
121
+ default=False,
122
+ help="Show full input/output data for each span",
123
+ )
124
+ trace_show_parser.add_argument(
125
+ "--json",
126
+ dest="as_json",
127
+ action="store_true",
128
+ default=False,
129
+ help="Output as JSON",
130
+ )
131
+
132
+ # pixie trace last [--json]
133
+ trace_last_parser = trace_sub.add_parser(
134
+ "last", help="Show the most recent trace (verbose)"
135
+ )
136
+ trace_last_parser.add_argument(
137
+ "--json",
138
+ dest="as_json",
139
+ action="store_true",
140
+ default=False,
141
+ help="Output as JSON",
142
+ )
143
+
91
144
  # -- pixie test ----------------------------------------------------------
92
145
  test_parser = subparsers.add_parser("test", help="Run pixie eval tests")
93
146
  test_parser.add_argument(
@@ -213,6 +266,29 @@ def main(argv: list[str] | None = None) -> int:
213
266
  print(f"Error: {exc}", file=sys.stderr) # noqa: T201
214
267
  return 1
215
268
 
269
+ elif args.command == "trace":
270
+ if args.trace_action is None:
271
+ parser.parse_args(["trace", "--help"])
272
+ return 1
273
+
274
+ try:
275
+ if args.trace_action == "list":
276
+ return trace_list(
277
+ limit=args.limit,
278
+ errors_only=args.errors,
279
+ )
280
+ elif args.trace_action == "show":
281
+ return trace_show(
282
+ trace_id=args.trace_id,
283
+ verbose=args.verbose,
284
+ as_json=args.as_json,
285
+ )
286
+ elif args.trace_action == "last":
287
+ return trace_last(as_json=args.as_json)
288
+ except Exception as exc:
289
+ print(f"Error: {exc}", file=sys.stderr) # noqa: T201
290
+ return 1
291
+
216
292
  elif args.command == "test":
217
293
  from pixie.cli.test_command import main as test_main
218
294
 
@@ -0,0 +1,186 @@
1
+ """``pixie trace`` CLI subcommands — list, show, and last.
2
+
3
+ Provides read-only inspection of captured traces via the
4
+ :class:`~pixie.storage.store.ObservationStore`.
5
+
6
+ Commands::
7
+
8
+ pixie trace list [--limit N] [--errors]
9
+ pixie trace show <trace_id> [-v | --verbose] [--json]
10
+ pixie trace last [--json]
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import asyncio
16
+ import json
17
+ from datetime import datetime
18
+ from typing import Any
19
+
20
+ from piccolo.engine.sqlite import SQLiteEngine
21
+
22
+ from pixie.config import get_config
23
+ from pixie.storage.store import ObservationStore
24
+ from pixie.storage.tree import ObservationNode
25
+
26
+
27
def _make_store() -> ObservationStore:
    """Build an ObservationStore on the SQLite database named in the config.

    Returns:
        An ``ObservationStore`` whose engine is a ``SQLiteEngine`` opened at
        the ``db_path`` reported by ``get_config()``.
    """
    db_path = get_config().db_path
    return ObservationStore(engine=SQLiteEngine(path=db_path))
32
+
33
+
34
+ def _format_datetime(value: Any) -> str:
35
+ """Format a datetime value to a human-readable string."""
36
+ if isinstance(value, datetime):
37
+ return value.strftime("%Y-%m-%d %H:%M")
38
+ if isinstance(value, str):
39
+ try:
40
+ dt = datetime.fromisoformat(value)
41
+ return dt.strftime("%Y-%m-%d %H:%M")
42
+ except (ValueError, TypeError):
43
+ return str(value)
44
+ return str(value) if value is not None else ""
45
+
46
+
47
async def _trace_list(limit: int, errors_only: bool) -> list[dict[str, Any]]:
    """Fetch trace summaries from the store.

    Args:
        limit: Maximum number of summaries requested from the store.
        errors_only: When true, keep only summaries whose ``has_error``
            entry is truthy.

    Returns:
        The trace summaries. The error filter runs after the store has
        applied ``limit``, so fewer than ``limit`` rows may come back.
    """
    summaries = await _make_store().list_traces(limit=limit)
    if not errors_only:
        return summaries
    return [summary for summary in summaries if summary.get("has_error")]
54
+
55
+
56
async def _trace_show(
    trace_id: str,
    verbose: bool,
    as_json: bool,
) -> str:
    """Fetch one trace and render it as text or JSON.

    Args:
        trace_id: Full trace ID, or a prefix of one.
        verbose: Render each root node with ``to_text`` instead of the
            compact names-and-timing view.
        as_json: Return the serialized spans as a JSON array; takes
            precedence over ``verbose``.

    Returns:
        The rendered trace, or a message starting with ``"Error:"`` when the
        ID matches no trace, matches several, or the trace has no spans.
    """
    store = _make_store()

    # Prefix matching is done against the 500 summaries the store returns.
    recent = await store.list_traces(limit=500)
    candidates = [
        summary["trace_id"]
        for summary in recent
        if summary["trace_id"].startswith(trace_id)
    ]
    if not candidates:
        return f"Error: No trace found matching '{trace_id}'"
    if len(candidates) > 1:
        ids = "\n ".join(candidates[:10])
        return f"Error: Multiple traces match '{trace_id}'. Be more specific:\n {ids}"
    full_id = candidates[0]

    tree = await store.get_trace(full_id)
    if not tree:
        return f"Error: No spans found for trace '{full_id}'"

    if as_json:
        spans_data: list[dict[str, Any]] = []
        for node in tree:
            spans_data.extend(_collect_serialized(node))
        return json.dumps(spans_data, indent=2, default=str)

    # Both text modes share the header; only the per-root renderer differs.
    rendered = [f"[trace_id: {full_id}]\n"]
    if verbose:
        rendered.extend(root_node.to_text(indent=0) for root_node in tree)
    else:
        rendered.extend(_compact_text(root_node, indent=0) for root_node in tree)
    return "\n".join(rendered)
96
+
97
+
98
async def _trace_last(as_json: bool) -> str:
    """Render the single trace returned by ``list_traces(limit=1)``, verbosely.

    Args:
        as_json: Forwarded to ``_trace_show`` to request JSON output.

    Returns:
        The output of ``_trace_show`` for that trace, or
        ``"No traces found."`` when the store returns nothing.
    """
    latest = await _make_store().list_traces(limit=1)
    if latest:
        return await _trace_show(latest[0]["trace_id"], verbose=True, as_json=as_json)
    return "No traces found."
106
+
107
+
108
def _compact_text(node: ObservationNode, indent: int = 0) -> str:
    """Render a node and its descendants as a compact outline.

    LLM spans show the request model, provider and duration, plus a token
    line when either token count is positive. Other spans show their name
    (or ``(unnamed)``) and duration. Children are rendered recursively one
    indent level deeper.

    Args:
        node: Root of the subtree to render.
        indent: Nesting depth used to build the line prefix.

    Returns:
        The newline-joined outline.
    """
    from pixie.instrumentation.spans import LLMSpan

    pad = " " * indent
    span = node.span
    if isinstance(span, LLMSpan):
        rendered = [
            f"{pad}{span.request_model} [{span.provider}, {span.duration_ms:.0f}ms]"
        ]
        if span.input_tokens > 0 or span.output_tokens > 0:
            rendered.append(
                f"{pad} tokens: {span.input_tokens} in / {span.output_tokens} out"
            )
    else:
        label = span.name or "(unnamed)"
        rendered = [f"{pad}{label} [{span.duration_ms:.0f}ms]"]

    rendered.extend(_compact_text(child, indent + 1) for child in node.children)
    return "\n".join(rendered)
131
+
132
+
133
def _collect_serialized(node: ObservationNode) -> list[dict[str, Any]]:
    """Flatten a subtree into serialized spans, parent before children.

    Args:
        node: Root of the subtree to serialize.

    Returns:
        ``serialize_span`` output for ``node`` followed by the flattened
        output of each child, in child order.
    """
    from pixie.storage.serialization import serialize_span

    own = serialize_span(node.span)
    descendants = (
        entry for child in node.children for entry in _collect_serialized(child)
    )
    return [own, *descendants]
141
+
142
+
143
def trace_list(limit: int = 10, errors_only: bool = False) -> int:
    """Entry point for ``pixie trace list``.

    Prints a fixed-width table of trace summaries, or ``No traces found.``
    when there are none.

    Args:
        limit: Maximum number of traces requested.
        errors_only: Show only traces flagged with an error.

    Returns:
        Always ``0``.
    """
    summaries = asyncio.run(_trace_list(limit, errors_only))
    if not summaries:
        print("No traces found.")  # noqa: T201
        return 0

    print(  # noqa: T201
        f"{'TRACE_ID':<34}{'ROOT SPAN':<25}{'STARTED':<20}"
        f"{'SPANS':>6}{'ERRORS':>7}"
    )
    for summary in summaries:
        trace_id = summary["trace_id"]
        root_name = summary.get("root_name") or "(unknown)"
        started = _format_datetime(summary.get("started_at"))
        span_count = summary.get("observation_count", 0)
        error_flag = "yes" if summary.get("has_error") else ""
        print(  # noqa: T201
            f"{trace_id:<34}{root_name:<25}{started:<20}"
            f"{span_count:>6}{error_flag:>7}"
        )
    return 0
169
+
170
+
171
def trace_show(
    trace_id: str,
    verbose: bool = False,
    as_json: bool = False,
) -> int:
    """Entry point for ``pixie trace show``.

    Args:
        trace_id: Full trace ID or a prefix of one.
        verbose: Show the full per-span rendering instead of the compact one.
        as_json: Print the spans as JSON.

    Returns:
        ``1`` when the rendered output is an ``Error:`` message, else ``0``.
    """
    rendered = asyncio.run(_trace_show(trace_id, verbose, as_json))
    print(rendered)  # noqa: T201
    if rendered.startswith("Error:"):
        return 1
    return 0
180
+
181
+
182
def trace_last(as_json: bool = False) -> int:
    """Entry point for ``pixie trace last``.

    Args:
        as_json: Print the spans as JSON instead of text.

    Returns:
        ``1`` when the output is an ``Error:`` message (``_trace_last``
        delegates to ``_trace_show``, which reports failures that way),
        else ``0``. An empty store prints ``No traces found.`` and
        returns ``0``.
    """
    output = asyncio.run(_trace_last(as_json))
    print(output)  # noqa: T201
    # Mirror trace_show: a propagated error message must not exit with success.
    return 1 if output.startswith("Error:") else 0
@@ -0,0 +1,207 @@
1
+ """Factory for custom LLM-as-judge evaluators from prompt templates.
2
+
3
+ Usage::
4
+
5
+ from pixie import create_llm_evaluator
6
+
7
+ concise_voice_style = create_llm_evaluator(
8
+ name="ConciseVoiceStyle",
9
+ prompt_template=\"\"\"
10
+ You are evaluating whether a voice agent response is concise and
11
+ phone-friendly.
12
+
13
+ User said: {eval_input}
14
+ Agent responded: {eval_output}
15
+ Expected behavior: {expected_output}
16
+
17
+ Score 1.0 if the response is concise (under 3 sentences), directly
18
+ addresses the question, and uses conversational language suitable for
19
+ a phone call. Score 0.0 if it's verbose, off-topic, or uses
20
+ written-style formatting.
21
+ \"\"\",
22
+ )
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import json
28
+ import logging
29
+ import re
30
+ from typing import Any
31
+
32
+ from openai import OpenAI
33
+
34
+ from pixie.evals.evaluation import Evaluation
35
+ from pixie.storage.evaluable import Evaluable, _Unset
36
+ from pixie.storage.tree import ObservationNode
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+ # Default model for LLM-as-judge calls
41
+ _DEFAULT_MODEL = "gpt-4o-mini"
42
+
43
+ # Regex to detect nested field access like {eval_input[key]} in templates
44
+ _NESTED_ACCESS_RE = re.compile(
45
+ r"\{(eval_input|eval_output|expected_output)\["
46
+ )
47
+
48
+
49
+ def _value_to_str(value: Any) -> str:
50
+ """Convert an eval value to a string for template substitution."""
51
+ if value is None:
52
+ return ""
53
+ if isinstance(value, _Unset):
54
+ return "(not provided)"
55
+ if isinstance(value, (dict, list)):
56
+ return json.dumps(value, default=str)
57
+ return str(value)
58
+
59
+
60
+ def _parse_score(text: str) -> tuple[float, str]:
61
+ """Extract a 0-1 score and reasoning from LLM response text.
62
+
63
+ Looks for patterns like "Score: 0.8", "score: 1.0", "0.7/1.0", or
64
+ a bare float on its own line. Returns (score, reasoning).
65
+ """
66
+ # Try "Score: X" pattern first
67
+ match = re.search(r"[Ss]core\s*[:=]\s*([01](?:\.\d+)?)", text)
68
+ if match:
69
+ score = float(match.group(1))
70
+ return min(max(score, 0.0), 1.0), text.strip()
71
+
72
+ # Try "X/1" or "X/1.0" pattern
73
+ match = re.search(r"([01](?:\.\d+)?)\s*/\s*1(?:\.0)?", text)
74
+ if match:
75
+ score = float(match.group(1))
76
+ return min(max(score, 0.0), 1.0), text.strip()
77
+
78
+ # Try bare float on a line
79
+ match = re.search(r"^([01](?:\.\d+)?)\s*$", text, re.MULTILINE)
80
+ if match:
81
+ score = float(match.group(1))
82
+ return min(max(score, 0.0), 1.0), text.strip()
83
+
84
+ # Fallback: couldn't parse score
85
+ logger.warning("Could not parse score from LLM response: %s", text[:200])
86
+ return 0.0, f"Failed to parse score. Raw response: {text.strip()}"
87
+
88
+
89
+ class _LLMEvaluator:
90
+ """Evaluator that uses an LLM to judge quality via a prompt template."""
91
+
92
+ def __init__(
93
+ self,
94
+ name: str,
95
+ prompt_template: str,
96
+ model: str,
97
+ client: Any | None,
98
+ ) -> None:
99
+ self._name = name
100
+ self._prompt_template = prompt_template
101
+ self._model = model
102
+ self._client = client
103
+
104
+ @property
105
+ def name(self) -> str:
106
+ """Return the evaluator's display name."""
107
+ return self._name
108
+
109
+ def _get_client(self) -> Any:
110
+ """Get or create the OpenAI client."""
111
+ if self._client is not None:
112
+ return self._client
113
+
114
+ return OpenAI()
115
+
116
+ def _render_prompt(self, evaluable: Evaluable) -> str:
117
+ """Fill in the template with evaluable fields."""
118
+ expected = evaluable.expected_output
119
+ rendered = self._prompt_template.format(
120
+ eval_input=_value_to_str(evaluable.eval_input),
121
+ eval_output=_value_to_str(evaluable.eval_output),
122
+ expected_output=_value_to_str(expected),
123
+ )
124
+ return rendered + "\n\nRespond with 'Score: X.X' followed by reasoning."
125
+
126
+ async def __call__(
127
+ self,
128
+ evaluable: Evaluable,
129
+ *,
130
+ trace: list[ObservationNode] | None = None,
131
+ ) -> Evaluation:
132
+ """Run the LLM judge and parse the score."""
133
+ import asyncio
134
+
135
+ prompt = self._render_prompt(evaluable)
136
+ client = self._get_client()
137
+
138
+ response = await asyncio.to_thread(
139
+ client.chat.completions.create,
140
+ model=self._model,
141
+ messages=[
142
+ {
143
+ "role": "system",
144
+ "content": (
145
+ "You are an evaluation judge. Score the following on "
146
+ "a scale of 0.0 to 1.0. Always include 'Score: X.X' "
147
+ "in your response, followed by your reasoning."
148
+ ),
149
+ },
150
+ {"role": "user", "content": prompt},
151
+ ],
152
+ temperature=0.0,
153
+ )
154
+
155
+ text = response.choices[0].message.content or ""
156
+ score, reasoning = _parse_score(text)
157
+
158
+ return Evaluation(
159
+ score=score,
160
+ reasoning=reasoning,
161
+ details={"evaluator": self._name, "model": self._model},
162
+ )
163
+
164
+
165
def create_llm_evaluator(
    name: str,
    prompt_template: str,
    *,
    model: str = _DEFAULT_MODEL,
    client: Any | None = None,
) -> _LLMEvaluator:
    """Build an LLM-as-judge evaluator from a prompt template.

    The template may use these placeholders, each filled from the matching
    :class:`~pixie.storage.evaluable.Evaluable` field:

    - ``{eval_input}`` — the evaluable's input
    - ``{eval_output}`` — the evaluable's output
    - ``{expected_output}`` — the evaluable's expected output

    Args:
        name: Display name for the evaluator.
        prompt_template: Template containing any of the placeholders above.
        model: Model name for the judge call (default: ``gpt-4o-mini``).
        client: Optional pre-configured OpenAI client instance.

    Returns:
        The configured evaluator callable.

    Raises:
        ValueError: If the template indexes into a placeholder, e.g.
            ``{eval_input[key]}``; only top-level placeholders are supported.
    """
    nested = _NESTED_ACCESS_RE.search(prompt_template)
    if nested is not None:
        field = nested.group(1)
        raise ValueError(
            f"Nested field access like '{{{field}[...]}}' is not "
            f"supported in prompt templates. Use '{{{field}}}' "
            f"instead — dict values are serialized to JSON automatically."
        )
    return _LLMEvaluator(
        name=name,
        prompt_template=prompt_template,
        model=model,
        client=client,
    )
@@ -58,7 +58,10 @@ def observe(
58
58
  sig = inspect.signature(fn)
59
59
  bound = sig.bind(*args, **kwargs)
60
60
  bound.apply_defaults()
61
- serialized_input = _serialize(dict(bound.arguments))
61
+ arguments = dict(bound.arguments)
62
+ arguments.pop("self", None)
63
+ arguments.pop("cls", None)
64
+ serialized_input = _serialize(arguments)
62
65
 
63
66
  with start_observation(
64
67
  input=serialized_input, name=span_name
@@ -76,7 +79,10 @@ def observe(
76
79
  sig = inspect.signature(fn)
77
80
  bound = sig.bind(*args, **kwargs)
78
81
  bound.apply_defaults()
79
- serialized_input = _serialize(dict(bound.arguments))
82
+ arguments = dict(bound.arguments)
83
+ arguments.pop("self", None)
84
+ arguments.pop("cls", None)
85
+ serialized_input = _serialize(arguments)
80
86
 
81
87
  with start_observation(
82
88
  input=serialized_input, name=span_name
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "pixie-qa"
3
- version = "0.1.11"
3
+ version = "0.2.0"
4
4
  description = "Automated quality assurance for AI applications"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -25,6 +25,7 @@ dependencies = [
25
25
  "pydantic>=2.0",
26
26
  "jsonpickle>=4.0.0",
27
27
  "python-dotenv>=1.2.2",
28
+ "openai>=2.29.0",
28
29
  ]
29
30
 
30
31
  [project.urls]