pixie-qa 0.1.11__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/PKG-INFO +2 -1
- pixie_qa-0.2.0/changelogs/observe-sensitive-field-stripping.md +22 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/__init__.py +2 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/cli/main.py +76 -0
- pixie_qa-0.2.0/pixie/cli/trace_command.py +186 -0
- pixie_qa-0.2.0/pixie/evals/llm_evaluator.py +207 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/observation.py +8 -2
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pyproject.toml +2 -1
- pixie_qa-0.2.0/skills/eval-driven-dev/SKILL.md +361 -0
- pixie_qa-0.2.0/skills/eval-driven-dev/references/dataset-generation.md +235 -0
- pixie_qa-0.2.0/skills/eval-driven-dev/references/eval-tests.md +240 -0
- pixie_qa-0.2.0/skills/eval-driven-dev/references/instrumentation.md +174 -0
- pixie_qa-0.2.0/skills/eval-driven-dev/references/investigation.md +146 -0
- pixie_qa-0.2.0/skills/eval-driven-dev/references/pixie-api.md +257 -0
- pixie_qa-0.2.0/skills/eval-driven-dev/references/run-harness-patterns.md +282 -0
- pixie_qa-0.2.0/skills/eval-driven-dev/references/understanding-app.md +201 -0
- pixie_qa-0.2.0/skills/eval-driven-dev/resources/check_version.py +126 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/test_e2e_pixie_test.py +1 -1
- pixie_qa-0.2.0/tests/pixie/cli/test_trace_command.py +324 -0
- pixie_qa-0.2.0/tests/pixie/evals/test_llm_evaluator.py +235 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/test_scorecard.py +3 -3
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/test_observation.py +46 -0
- pixie_qa-0.1.11/skills/eval-driven-dev/SKILL.md +0 -870
- pixie_qa-0.1.11/skills/eval-driven-dev/references/pixie-api.md +0 -195
- pixie_qa-0.1.11/skills/eval-driven-dev/resources/check_version.py +0 -84
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/.github/copilot-instructions.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/.github/workflows/publish.yml +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/.gitignore +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/LICENSE +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/README.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/async-handler-processing.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/autoevals-adapters.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/cli-dataset-commands.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/dataset-management.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/deep-research-demo.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/eval-harness.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/expected-output-in-evals.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/instrumentation-module-implementation.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/loud-failure-mode.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/manual-instrumentation-usability.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/observation-store-implementation.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/pixie-directory-and-skill-improvements.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/pixie-test-e2e-suite.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/root-package-exports-and-trace-id.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/scorecard-branding-and-skill-version-check.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/scorecard-eval-detail-dialog.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/skill-v2-and-rootdir-discovery.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/test-scorecard.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/changelogs/usability-utils.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/docs/package.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/cli/__init__.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/cli/dataset_command.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/cli/test_command.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/config.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/dataset/__init__.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/dataset/models.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/dataset/store.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/__init__.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/criteria.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/eval_utils.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/evaluation.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/runner.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/scorecard.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/scorers.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/trace_capture.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/evals/trace_helpers.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/favicon.png +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/__init__.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/context.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/handler.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/handlers.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/instrumentors.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/processor.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/queue.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/instrumentation/spans.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/storage/__init__.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/storage/evaluable.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/storage/piccolo_conf.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/storage/piccolo_migrations/__init__.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/storage/serialization.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/storage/store.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/storage/tables.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/pixie/storage/tree.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/agent-skill-1.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/agent-skill.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/autoevals-adapters.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/dataset-management.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/evals-harness.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/expected-output-in-evals.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/instrumentation.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/manual-instrumentation-usability.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/storage.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/specs/usability-utils.md +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/__init__.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/__init__.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/__init__.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/e2e_cases.json +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/e2e_fixtures/conftest.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/e2e_fixtures/datasets/customer-faq.json +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/e2e_fixtures/mock_evaluators.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/e2e_fixtures/test_customer_faq.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/test_dataset_command.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/cli/test_main.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/dataset/__init__.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/dataset/test_models.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/dataset/test_store.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/__init__.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/test_criteria.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/test_eval_utils.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/test_evaluation.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/test_runner.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/test_scorers.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/test_trace_capture.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/evals/test_trace_helpers.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/__init__.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/conftest.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/test_context.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/test_handler.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/test_integration.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/test_processor.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/test_queue.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/test_spans.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/instrumentation/test_storage_handler.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/observation_store/__init__.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/observation_store/conftest.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/observation_store/test_evaluable.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/observation_store/test_serialization.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/observation_store/test_store.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/observation_store/test_tree.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/test_config.py +0 -0
- {pixie_qa-0.1.11 → pixie_qa-0.2.0}/tests/pixie/test_init.py +0 -0
```diff
--- pixie_qa-0.1.11/PKG-INFO
+++ pixie_qa-0.2.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pixie-qa
-Version: 0.1.11
+Version: 0.2.0
 Summary: Automated quality assurance for AI applications
 Project-URL: Homepage, https://github.com/yiouli/pixie-qa
 Project-URL: Repository, https://github.com/yiouli/pixie-qa
@@ -40,6 +40,7 @@ Classifier: Topic :: Software Development :: Testing
 Requires-Python: >=3.11
 Requires-Dist: autoevals>=0.1.0
 Requires-Dist: jsonpickle>=4.0.0
+Requires-Dist: openai>=2.29.0
 Requires-Dist: openinference-instrumentation>=0.1.44
 Requires-Dist: opentelemetry-api>=1.27.0
 Requires-Dist: opentelemetry-sdk>=1.27.0
```
```diff
--- /dev/null
+++ pixie_qa-0.2.0/changelogs/observe-sensitive-field-stripping.md
@@ -0,0 +1,22 @@
+# @observe: Strip `self` and `cls` from captured input
+
+## What changed
+
+The `@observe` decorator now automatically removes `self` and `cls` from the
+captured function arguments before serialization. Previously, decorating a
+method with `@observe` would serialize the entire instance (including API keys,
+client objects, and other sensitive state) into `eval_input` via jsonpickle.
+
+## Files affected
+
+- `pixie/instrumentation/observation.py` — `sync_wrapper` and `async_wrapper`
+  now pop `self`/`cls` from `bound.arguments` before calling `_serialize()`.
+- `tests/pixie/instrumentation/test_observation.py` — Two new tests:
+  `test_self_excluded_from_input` and `test_cls_excluded_from_input`.
+
+## Migration notes
+
+No API changes. This is a backward-compatible fix. Existing `@observe` usage on
+functions (not methods) is unaffected. Methods that previously leaked `self` into
+`eval_input` will now produce cleaner, smaller input data containing only the
+semantic arguments.
```
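A minimal sketch of the behavior this changelog describes, using the import path from the "Files affected" list above; the `SupportAgent` class is hypothetical:

```python
from pixie.instrumentation.observation import observe


class SupportAgent:
    def __init__(self, api_key: str) -> None:
        self.api_key = api_key  # sensitive state living on the instance

    @observe
    def answer(self, question: str) -> str:
        return f"Answering: {question}"


agent = SupportAgent(api_key="sk-not-a-real-key")
agent.answer("How do I reset my password?")
# 0.1.11: eval_input serialized the whole SupportAgent instance, api_key included.
# 0.2.0:  eval_input contains only {"question": "How do I reset my password?"}.
```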
```diff
--- pixie_qa-0.1.11/pixie/__init__.py
+++ pixie_qa-0.2.0/pixie/__init__.py
@@ -17,6 +17,7 @@ from pixie.evals.eval_utils import (
     run_and_evaluate,
 )
 from pixie.evals.evaluation import Evaluation, Evaluator, evaluate
+from pixie.evals.llm_evaluator import create_llm_evaluator
 from pixie.evals.scorers import (
     AnswerCorrectnessEval,
     AnswerRelevancyEval,
@@ -96,6 +97,7 @@ __all__ = [
     "assert_dataset_pass",
     "assert_pass",
     "capture_traces",
+    "create_llm_evaluator",
     "evaluate",
     "last_llm_call",
     "root",
```
```diff
--- pixie_qa-0.1.11/pixie/cli/main.py
+++ pixie_qa-0.2.0/pixie/cli/main.py
@@ -30,6 +30,7 @@ from pixie.cli.dataset_command import (
     dataset_save,
     format_dataset_table,
 )
+from pixie.cli.trace_command import trace_last, trace_list, trace_show
 from pixie.config import get_config
 from pixie.dataset.store import DatasetStore
 from pixie.storage.evaluable import UNSET, _Unset
@@ -88,6 +89,58 @@ def _build_parser() -> argparse.ArgumentParser:
         help="Optional notes to attach to the evaluable metadata",
     )
 
+    # -- pixie trace ---------------------------------------------------------
+    trace_parser = subparsers.add_parser("trace", help="Inspect captured traces")
+    trace_sub = trace_parser.add_subparsers(dest="trace_action", help="Trace actions")
+
+    # pixie trace list [--limit N] [--errors]
+    trace_list_parser = trace_sub.add_parser("list", help="List recent traces")
+    trace_list_parser.add_argument(
+        "--limit",
+        type=int,
+        default=10,
+        help="Maximum number of traces to show (default: 10)",
+    )
+    trace_list_parser.add_argument(
+        "--errors",
+        action="store_true",
+        default=False,
+        help="Show only traces with errors",
+    )
+
+    # pixie trace show <trace_id> [-v] [--json]
+    trace_show_parser = trace_sub.add_parser("show", help="Show span tree for a trace")
+    trace_show_parser.add_argument(
+        "trace_id",
+        help="Trace ID (or prefix, minimum 8 characters)",
+    )
+    trace_show_parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        default=False,
+        help="Show full input/output data for each span",
+    )
+    trace_show_parser.add_argument(
+        "--json",
+        dest="as_json",
+        action="store_true",
+        default=False,
+        help="Output as JSON",
+    )
+
+    # pixie trace last [--json]
+    trace_last_parser = trace_sub.add_parser(
+        "last", help="Show the most recent trace (verbose)"
+    )
+    trace_last_parser.add_argument(
+        "--json",
+        dest="as_json",
+        action="store_true",
+        default=False,
+        help="Output as JSON",
+    )
+
     # -- pixie test ----------------------------------------------------------
     test_parser = subparsers.add_parser("test", help="Run pixie eval tests")
     test_parser.add_argument(
@@ -213,6 +266,29 @@ def main(argv: list[str] | None = None) -> int:
             print(f"Error: {exc}", file=sys.stderr)  # noqa: T201
             return 1
 
+    elif args.command == "trace":
+        if args.trace_action is None:
+            parser.parse_args(["trace", "--help"])
+            return 1
+
+        try:
+            if args.trace_action == "list":
+                return trace_list(
+                    limit=args.limit,
+                    errors_only=args.errors,
+                )
+            elif args.trace_action == "show":
+                return trace_show(
+                    trace_id=args.trace_id,
+                    verbose=args.verbose,
+                    as_json=args.as_json,
+                )
+            elif args.trace_action == "last":
+                return trace_last(as_json=args.as_json)
+        except Exception as exc:
+            print(f"Error: {exc}", file=sys.stderr)  # noqa: T201
+            return 1
+
     elif args.command == "test":
         from pixie.cli.test_command import main as test_main
 
```
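Since `main` accepts an `argv` list (per the signature in the hunk header above), the new subcommands can also be exercised without a shell; a small sketch, with a made-up trace-ID prefix:

```python
from pixie.cli.main import main

# Equivalent to running `pixie trace list --limit 5 --errors` from a shell;
# returns the process exit code (0 on success, 1 on error).
exit_code = main(["trace", "list", "--limit", "5", "--errors"])

# Verbose span tree for a trace matched by ID prefix (the prefix is hypothetical).
main(["trace", "show", "0af1b2c3", "-v"])

# Most recent trace, rendered as JSON.
main(["trace", "last", "--json"])
```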
```diff
--- /dev/null
+++ pixie_qa-0.2.0/pixie/cli/trace_command.py
@@ -0,0 +1,186 @@
+"""``pixie trace`` CLI subcommands — list, show, and last.
+
+Provides read-only inspection of captured traces via the
+:class:`~pixie.storage.store.ObservationStore`.
+
+Commands::
+
+    pixie trace list [--limit N] [--errors]
+    pixie trace show <trace_id> [-v | --verbose] [--json]
+    pixie trace last [--json]
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+from datetime import datetime
+from typing import Any
+
+from piccolo.engine.sqlite import SQLiteEngine
+
+from pixie.config import get_config
+from pixie.storage.store import ObservationStore
+from pixie.storage.tree import ObservationNode
+
+
+def _make_store() -> ObservationStore:
+    """Create an ObservationStore from config."""
+    config = get_config()
+    engine = SQLiteEngine(path=config.db_path)
+    return ObservationStore(engine=engine)
+
+
+def _format_datetime(value: Any) -> str:
+    """Format a datetime value to a human-readable string."""
+    if isinstance(value, datetime):
+        return value.strftime("%Y-%m-%d %H:%M")
+    if isinstance(value, str):
+        try:
+            dt = datetime.fromisoformat(value)
+            return dt.strftime("%Y-%m-%d %H:%M")
+        except (ValueError, TypeError):
+            return str(value)
+    return str(value) if value is not None else ""
+
+
+async def _trace_list(limit: int, errors_only: bool) -> list[dict[str, Any]]:
+    """Fetch trace summaries from the store."""
+    store = _make_store()
+    traces = await store.list_traces(limit=limit)
+    if errors_only:
+        traces = [t for t in traces if t.get("has_error")]
+    return traces
+
+
+async def _trace_show(
+    trace_id: str,
+    verbose: bool,
+    as_json: bool,
+) -> str:
+    """Fetch and render a single trace."""
+    store = _make_store()
+    # Support prefix matching
+    traces = await store.list_traces(limit=500)
+    matched = [t for t in traces if t["trace_id"].startswith(trace_id)]
+    if not matched:
+        return f"Error: No trace found matching '{trace_id}'"
+    if len(matched) > 1:
+        ids = "\n ".join(t["trace_id"] for t in matched[:10])
+        return f"Error: Multiple traces match '{trace_id}'. Be more specific:\n {ids}"
+    full_id = matched[0]["trace_id"]
+
+    tree = await store.get_trace(full_id)
+    if not tree:
+        return f"Error: No spans found for trace '{full_id}'"
+
+    if as_json:
+        spans_data = []
+        for node in tree:
+            spans_data.extend(_collect_serialized(node))
+        return json.dumps(spans_data, indent=2, default=str)
+
+    # Text rendering — to_text already handles both compact and verbose
+    # For compact mode, we use a stripped-down version
+    if verbose:
+        lines = [f"[trace_id: {full_id}]\n"]
+        for root_node in tree:
+            lines.append(root_node.to_text(indent=0))
+        return "\n".join(lines)
+
+    # Compact mode: just names and timing
+    lines = [f"[trace_id: {full_id}]\n"]
+    for root_node in tree:
+        lines.append(_compact_text(root_node, indent=0))
+    return "\n".join(lines)
+
+
+async def _trace_last(as_json: bool) -> str:
+    """Show the most recent trace in verbose mode."""
+    store = _make_store()
+    traces = await store.list_traces(limit=1)
+    if not traces:
+        return "No traces found."
+    trace_id = traces[0]["trace_id"]
+    return await _trace_show(trace_id, verbose=True, as_json=as_json)
+
+
+def _compact_text(node: ObservationNode, indent: int = 0) -> str:
+    """Render a compact text view (names and timing only)."""
+    from pixie.instrumentation.spans import LLMSpan
+
+    prefix = " " * indent
+    lines: list[str] = []
+    if isinstance(node.span, LLMSpan):
+        span = node.span
+        header = (
+            f"{prefix}{span.request_model} [{span.provider}, {span.duration_ms:.0f}ms]"
+        )
+        lines.append(header)
+        token_parts: list[str] = []
+        if span.input_tokens > 0 or span.output_tokens > 0:
+            token_parts.append(f"{span.input_tokens} in / {span.output_tokens} out")
+            lines.append(f"{prefix} tokens: {' '.join(token_parts)}")
+    else:
+        name = node.span.name or "(unnamed)"
+        lines.append(f"{prefix}{name} [{node.span.duration_ms:.0f}ms]")
+
+    for child in node.children:
+        lines.append(_compact_text(child, indent + 1))
+    return "\n".join(lines)
+
+
+def _collect_serialized(node: ObservationNode) -> list[dict[str, Any]]:
+    """Recursively collect serialized spans from a tree."""
+    from pixie.storage.serialization import serialize_span
+
+    result: list[dict[str, Any]] = [serialize_span(node.span)]
+    for child in node.children:
+        result.extend(_collect_serialized(child))
+    return result
+
+
+def trace_list(limit: int = 10, errors_only: bool = False) -> int:
+    """Entry point for ``pixie trace list``."""
+    traces = asyncio.run(_trace_list(limit, errors_only))
+    if not traces:
+        print("No traces found.")  # noqa: T201
+        return 0
+
+    # Table header
+    header = (
+        f"{'TRACE_ID':<34}"
+        f"{'ROOT SPAN':<25}"
+        f"{'STARTED':<20}"
+        f"{'SPANS':>6}"
+        f"{'ERRORS':>7}"
+    )
+    print(header)  # noqa: T201
+    for t in traces:
+        row = (
+            f"{t['trace_id']:<34}"
+            f"{(t.get('root_name') or '(unknown)'):<25}"
+            f"{_format_datetime(t.get('started_at')):<20}"
+            f"{t.get('observation_count', 0):>6}"
+            f"{('yes' if t.get('has_error') else ''):>7}"
+        )
+        print(row)  # noqa: T201
+    return 0
+
+
+def trace_show(
+    trace_id: str,
+    verbose: bool = False,
+    as_json: bool = False,
+) -> int:
+    """Entry point for ``pixie trace show``."""
+    output = asyncio.run(_trace_show(trace_id, verbose, as_json))
+    print(output)  # noqa: T201
+    return 1 if output.startswith("Error:") else 0
+
+
+def trace_last(as_json: bool = False) -> int:
+    """Entry point for ``pixie trace last``."""
+    output = asyncio.run(_trace_last(as_json))
+    print(output)  # noqa: T201
+    return 0
```
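The three entry points are plain synchronous functions (each wraps its async helper in `asyncio.run`), so they can also be called directly from a script or test; a sketch with a hypothetical trace-ID prefix:

```python
from pixie.cli.trace_command import trace_last, trace_list, trace_show

trace_list(limit=20, errors_only=True)         # table of recent failing traces
status = trace_show("a1b2c3d4", verbose=True)  # returns 1 if the prefix is missing or ambiguous
trace_last(as_json=True)                       # most recent trace as JSON
```

Note that prefix matching only scans the 500 most recent traces (`list_traces(limit=500)` above), so older traces need their full ID.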
```diff
--- /dev/null
+++ pixie_qa-0.2.0/pixie/evals/llm_evaluator.py
@@ -0,0 +1,207 @@
+"""Factory for custom LLM-as-judge evaluators from prompt templates.
+
+Usage::
+
+    from pixie import create_llm_evaluator
+
+    concise_voice_style = create_llm_evaluator(
+        name="ConciseVoiceStyle",
+        prompt_template=\"\"\"
+        You are evaluating whether a voice agent response is concise and
+        phone-friendly.
+
+        User said: {eval_input}
+        Agent responded: {eval_output}
+        Expected behavior: {expected_output}
+
+        Score 1.0 if the response is concise (under 3 sentences), directly
+        addresses the question, and uses conversational language suitable for
+        a phone call. Score 0.0 if it's verbose, off-topic, or uses
+        written-style formatting.
+        \"\"\",
+    )
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+from typing import Any
+
+from openai import OpenAI
+
+from pixie.evals.evaluation import Evaluation
+from pixie.storage.evaluable import Evaluable, _Unset
+from pixie.storage.tree import ObservationNode
+
+logger = logging.getLogger(__name__)
+
+# Default model for LLM-as-judge calls
+_DEFAULT_MODEL = "gpt-4o-mini"
+
+# Regex to detect nested field access like {eval_input[key]} in templates
+_NESTED_ACCESS_RE = re.compile(
+    r"\{(eval_input|eval_output|expected_output)\["
+)
+
+
+def _value_to_str(value: Any) -> str:
+    """Convert an eval value to a string for template substitution."""
+    if value is None:
+        return ""
+    if isinstance(value, _Unset):
+        return "(not provided)"
+    if isinstance(value, (dict, list)):
+        return json.dumps(value, default=str)
+    return str(value)
+
+
+def _parse_score(text: str) -> tuple[float, str]:
+    """Extract a 0-1 score and reasoning from LLM response text.
+
+    Looks for patterns like "Score: 0.8", "score: 1.0", "0.7/1.0", or
+    a bare float on its own line. Returns (score, reasoning).
+    """
+    # Try "Score: X" pattern first
+    match = re.search(r"[Ss]core\s*[:=]\s*([01](?:\.\d+)?)", text)
+    if match:
+        score = float(match.group(1))
+        return min(max(score, 0.0), 1.0), text.strip()
+
+    # Try "X/1" or "X/1.0" pattern
+    match = re.search(r"([01](?:\.\d+)?)\s*/\s*1(?:\.0)?", text)
+    if match:
+        score = float(match.group(1))
+        return min(max(score, 0.0), 1.0), text.strip()
+
+    # Try bare float on a line
+    match = re.search(r"^([01](?:\.\d+)?)\s*$", text, re.MULTILINE)
+    if match:
+        score = float(match.group(1))
+        return min(max(score, 0.0), 1.0), text.strip()
+
+    # Fallback: couldn't parse score
+    logger.warning("Could not parse score from LLM response: %s", text[:200])
+    return 0.0, f"Failed to parse score. Raw response: {text.strip()}"
+
+
+class _LLMEvaluator:
+    """Evaluator that uses an LLM to judge quality via a prompt template."""
+
+    def __init__(
+        self,
+        name: str,
+        prompt_template: str,
+        model: str,
+        client: Any | None,
+    ) -> None:
+        self._name = name
+        self._prompt_template = prompt_template
+        self._model = model
+        self._client = client
+
+    @property
+    def name(self) -> str:
+        """Return the evaluator's display name."""
+        return self._name
+
+    def _get_client(self) -> Any:
+        """Get or create the OpenAI client."""
+        if self._client is not None:
+            return self._client
+
+        return OpenAI()
+
+    def _render_prompt(self, evaluable: Evaluable) -> str:
+        """Fill in the template with evaluable fields."""
+        expected = evaluable.expected_output
+        rendered = self._prompt_template.format(
+            eval_input=_value_to_str(evaluable.eval_input),
+            eval_output=_value_to_str(evaluable.eval_output),
+            expected_output=_value_to_str(expected),
+        )
+        return rendered + "\n\nRespond with 'Score: X.X' followed by reasoning."
+
+    async def __call__(
+        self,
+        evaluable: Evaluable,
+        *,
+        trace: list[ObservationNode] | None = None,
+    ) -> Evaluation:
+        """Run the LLM judge and parse the score."""
+        import asyncio
+
+        prompt = self._render_prompt(evaluable)
+        client = self._get_client()
+
+        response = await asyncio.to_thread(
+            client.chat.completions.create,
+            model=self._model,
+            messages=[
+                {
+                    "role": "system",
+                    "content": (
+                        "You are an evaluation judge. Score the following on "
+                        "a scale of 0.0 to 1.0. Always include 'Score: X.X' "
+                        "in your response, followed by your reasoning."
+                    ),
+                },
+                {"role": "user", "content": prompt},
+            ],
+            temperature=0.0,
+        )
+
+        text = response.choices[0].message.content or ""
+        score, reasoning = _parse_score(text)
+
+        return Evaluation(
+            score=score,
+            reasoning=reasoning,
+            details={"evaluator": self._name, "model": self._model},
+        )
+
+
+def create_llm_evaluator(
+    name: str,
+    prompt_template: str,
+    *,
+    model: str = _DEFAULT_MODEL,
+    client: Any | None = None,
+) -> _LLMEvaluator:
+    """Create a custom LLM-as-judge evaluator from a prompt template.
+
+    The template may reference these variables (populated from the
+    :class:`~pixie.storage.evaluable.Evaluable` fields):
+
+    - ``{eval_input}`` — the evaluable's input
+    - ``{eval_output}`` — the evaluable's output
+    - ``{expected_output}`` — the evaluable's expected output
+
+    Args:
+        name: Display name for the evaluator (shown in scorecard).
+        prompt_template: A string template with ``{eval_input}``,
+            ``{eval_output}``, and/or ``{expected_output}`` placeholders.
+        model: OpenAI model name (default: ``gpt-4o-mini``).
+        client: Optional pre-configured OpenAI client instance.
+
+    Returns:
+        An evaluator callable satisfying the ``Evaluator`` protocol.
+
+    Raises:
+        ValueError: If the template uses nested field access like
+            ``{eval_input[key]}`` (only top-level placeholders are supported).
+    """
+    match = _NESTED_ACCESS_RE.search(prompt_template)
+    if match:
+        raise ValueError(
+            f"Nested field access like '{{{match.group(1)}[...]}}' is not "
+            f"supported in prompt templates. Use '{{{match.group(1)}}}' "
+            f"instead — dict values are serialized to JSON automatically."
+        )
+    return _LLMEvaluator(
+        name=name,
+        prompt_template=prompt_template,
+        model=model,
+        client=client,
+    )
```
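To exercise the judge without a network call, the `client` parameter can be any object exposing the `chat.completions.create(...)` surface used in `__call__` above. A sketch; everything here besides `create_llm_evaluator` and `_parse_score` is hypothetical:

```python
from types import SimpleNamespace

from pixie.evals.llm_evaluator import _parse_score, create_llm_evaluator


class StubClient:
    """Fake OpenAI client that returns a canned judgment."""

    def __init__(self, reply: str) -> None:
        message = SimpleNamespace(content=reply)
        response = SimpleNamespace(choices=[SimpleNamespace(message=message)])
        self.chat = SimpleNamespace(
            completions=SimpleNamespace(create=lambda **kwargs: response)
        )


judge = create_llm_evaluator(
    name="ConcisenessJudge",
    prompt_template="User said: {eval_input}\nAgent responded: {eval_output}",
    client=StubClient("Score: 0.9\nShort and on-topic."),
)

# The score parser can also be sanity-checked on its own:
assert _parse_score("Score: 0.9\nShort and on-topic.")[0] == 0.9
```

Awaiting `judge(some_evaluable)` would then yield an `Evaluation` with `score == 0.9` without touching the OpenAI API.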
```diff
--- pixie_qa-0.1.11/pixie/instrumentation/observation.py
+++ pixie_qa-0.2.0/pixie/instrumentation/observation.py
@@ -58,7 +58,10 @@ def observe(
         sig = inspect.signature(fn)
         bound = sig.bind(*args, **kwargs)
         bound.apply_defaults()
-        serialized_input = _serialize(bound.arguments)
+        arguments = dict(bound.arguments)
+        arguments.pop("self", None)
+        arguments.pop("cls", None)
+        serialized_input = _serialize(arguments)
 
         with start_observation(
             input=serialized_input, name=span_name
@@ -76,7 +79,10 @@ def observe(
         sig = inspect.signature(fn)
         bound = sig.bind(*args, **kwargs)
         bound.apply_defaults()
-        serialized_input = _serialize(bound.arguments)
+        arguments = dict(bound.arguments)
+        arguments.pop("self", None)
+        arguments.pop("cls", None)
+        serialized_input = _serialize(arguments)
 
         with start_observation(
             input=serialized_input, name=span_name
```
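The change works because `inspect.Signature.bind` on the undecorated function treats the receiver as an ordinary `self` argument; a standalone sketch of the popping logic, independent of pixie:

```python
import inspect


class Greeter:
    def greet(self, name: str, punctuation: str = "!") -> str:
        return f"hi {name}{punctuation}"


sig = inspect.signature(Greeter.greet)  # unbound: 'self' is a parameter
bound = sig.bind(Greeter(), "Ada")      # at call time, args include the instance
bound.apply_defaults()

arguments = dict(bound.arguments)
arguments.pop("self", None)  # drop the instance before serialization
arguments.pop("cls", None)   # likewise for classmethods
print(arguments)  # {'name': 'Ada', 'punctuation': '!'}
```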
```diff
--- pixie_qa-0.1.11/pyproject.toml
+++ pixie_qa-0.2.0/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "pixie-qa"
-version = "0.1.11"
+version = "0.2.0"
 description = "Automated quality assurance for AI applications"
 readme = "README.md"
 requires-python = ">=3.11"
@@ -25,6 +25,7 @@ dependencies = [
     "pydantic>=2.0",
     "jsonpickle>=4.0.0",
     "python-dotenv>=1.2.2",
+    "openai>=2.29.0",
 ]
 
 [project.urls]
```