trajectly 0.3.0rc1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trajectly-0.3.0rc1/LICENSE +21 -0
- trajectly-0.3.0rc1/PKG-INFO +88 -0
- trajectly-0.3.0rc1/README.md +61 -0
- trajectly-0.3.0rc1/pyproject.toml +69 -0
- trajectly-0.3.0rc1/setup.cfg +4 -0
- trajectly-0.3.0rc1/src/sitecustomize.py +10 -0
- trajectly-0.3.0rc1/src/trajectly/__init__.py +5 -0
- trajectly-0.3.0rc1/src/trajectly/__main__.py +5 -0
- trajectly-0.3.0rc1/src/trajectly/abstraction/__init__.py +20 -0
- trajectly-0.3.0rc1/src/trajectly/abstraction/pipeline.py +152 -0
- trajectly-0.3.0rc1/src/trajectly/abstraction/predicates.py +68 -0
- trajectly-0.3.0rc1/src/trajectly/benchmark.py +78 -0
- trajectly-0.3.0rc1/src/trajectly/canonical.py +17 -0
- trajectly-0.3.0rc1/src/trajectly/cli.py +312 -0
- trajectly-0.3.0rc1/src/trajectly/constants.py +55 -0
- trajectly-0.3.0rc1/src/trajectly/contracts.py +570 -0
- trajectly-0.3.0rc1/src/trajectly/diff/__init__.py +4 -0
- trajectly-0.3.0rc1/src/trajectly/diff/engine.py +212 -0
- trajectly-0.3.0rc1/src/trajectly/diff/lcs.py +29 -0
- trajectly-0.3.0rc1/src/trajectly/diff/models.py +29 -0
- trajectly-0.3.0rc1/src/trajectly/diff/structural.py +55 -0
- trajectly-0.3.0rc1/src/trajectly/engine.py +1090 -0
- trajectly-0.3.0rc1/src/trajectly/engine_common.py +76 -0
- trajectly-0.3.0rc1/src/trajectly/errors.py +56 -0
- trajectly-0.3.0rc1/src/trajectly/events.py +101 -0
- trajectly-0.3.0rc1/src/trajectly/fixtures.py +216 -0
- trajectly-0.3.0rc1/src/trajectly/normalize/__init__.py +19 -0
- trajectly-0.3.0rc1/src/trajectly/normalize/canonical.py +115 -0
- trajectly-0.3.0rc1/src/trajectly/normalize/version.py +5 -0
- trajectly-0.3.0rc1/src/trajectly/plugins/__init__.py +5 -0
- trajectly-0.3.0rc1/src/trajectly/plugins/cloud_exporter.py +127 -0
- trajectly-0.3.0rc1/src/trajectly/plugins/interfaces.py +24 -0
- trajectly-0.3.0rc1/src/trajectly/plugins/loader.py +39 -0
- trajectly-0.3.0rc1/src/trajectly/redaction.py +31 -0
- trajectly-0.3.0rc1/src/trajectly/refinement/__init__.py +10 -0
- trajectly-0.3.0rc1/src/trajectly/refinement/checker.py +167 -0
- trajectly-0.3.0rc1/src/trajectly/refinement/skeleton.py +33 -0
- trajectly-0.3.0rc1/src/trajectly/replay_guard.py +267 -0
- trajectly-0.3.0rc1/src/trajectly/report/__init__.py +3 -0
- trajectly-0.3.0rc1/src/trajectly/report/renderers.py +91 -0
- trajectly-0.3.0rc1/src/trajectly/report/schema.py +112 -0
- trajectly-0.3.0rc1/src/trajectly/runtime.py +123 -0
- trajectly-0.3.0rc1/src/trajectly/schema.py +171 -0
- trajectly-0.3.0rc1/src/trajectly/sdk/__init__.py +71 -0
- trajectly-0.3.0rc1/src/trajectly/sdk/adapters.py +344 -0
- trajectly-0.3.0rc1/src/trajectly/sdk/context.py +450 -0
- trajectly-0.3.0rc1/src/trajectly/shrink/__init__.py +3 -0
- trajectly-0.3.0rc1/src/trajectly/shrink/ddmin.py +85 -0
- trajectly-0.3.0rc1/src/trajectly/specs/__init__.py +70 -0
- trajectly-0.3.0rc1/src/trajectly/specs/compat_v02.py +316 -0
- trajectly-0.3.0rc1/src/trajectly/specs/migrate.py +102 -0
- trajectly-0.3.0rc1/src/trajectly/specs/v03.py +394 -0
- trajectly-0.3.0rc1/src/trajectly/trace/__init__.py +26 -0
- trajectly-0.3.0rc1/src/trajectly/trace/io.py +70 -0
- trajectly-0.3.0rc1/src/trajectly/trace/meta.py +20 -0
- trajectly-0.3.0rc1/src/trajectly/trace/models.py +76 -0
- trajectly-0.3.0rc1/src/trajectly/trace/validate.py +81 -0
- trajectly-0.3.0rc1/src/trajectly/trt/__init__.py +8 -0
- trajectly-0.3.0rc1/src/trajectly/trt/runner.py +316 -0
- trajectly-0.3.0rc1/src/trajectly/trt/types.py +18 -0
- trajectly-0.3.0rc1/src/trajectly/trt/witness.py +54 -0
- trajectly-0.3.0rc1/src/trajectly.egg-info/PKG-INFO +88 -0
- trajectly-0.3.0rc1/src/trajectly.egg-info/SOURCES.txt +65 -0
- trajectly-0.3.0rc1/src/trajectly.egg-info/dependency_links.txt +1 -0
- trajectly-0.3.0rc1/src/trajectly.egg-info/entry_points.txt +2 -0
- trajectly-0.3.0rc1/src/trajectly.egg-info/requires.txt +9 -0
- trajectly-0.3.0rc1/src/trajectly.egg-info/top_level.txt +2 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Trajectly
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: trajectly
|
|
3
|
+
Version: 0.3.0rc1
|
|
4
|
+
Summary: Trajectory Refinement Testing (TRT) for deterministic agent CI
|
|
5
|
+
Author-email: Ahmed Ashmawy <awashmawy@proton.me>
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: agents,testing,regression,llm,ci
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Topic :: Software Development :: Testing
|
|
15
|
+
Requires-Python: >=3.11
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: typer<1,>=0.12
|
|
19
|
+
Requires-Dist: PyYAML<7,>=6
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: pytest<9,>=8; extra == "dev"
|
|
22
|
+
Requires-Dist: pytest-cov<6,>=5; extra == "dev"
|
|
23
|
+
Requires-Dist: ruff<1,>=0.6; extra == "dev"
|
|
24
|
+
Requires-Dist: mypy<2,>=1.11; extra == "dev"
|
|
25
|
+
Requires-Dist: types-PyYAML<7,>=6.0; extra == "dev"
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# Trajectly
|
|
29
|
+
|
|
30
|
+
Deterministic regression testing for AI agents. Record a baseline, enforce contracts, catch regressions before they ship.
|
|
31
|
+
|
|
32
|
+
## Install
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install trajectly
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## 30-Second Example
|
|
39
|
+
|
|
40
|
+
Trajectly works in three steps: **record** a known-good baseline, **run** against it later, and **get a verdict**.
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# Clone the repo and install dev dependencies
|
|
44
|
+
git clone https://github.com/trajectly/trajectly.git
|
|
45
|
+
cd trajectly
|
|
46
|
+
pip install -e ".[dev]"
|
|
47
|
+
|
|
48
|
+
# Set your OpenAI key (the example calls gpt-4o-mini)
|
|
49
|
+
export OPENAI_API_KEY="sk-..."
|
|
50
|
+
|
|
51
|
+
# 1. Record the baseline
|
|
52
|
+
cd examples
|
|
53
|
+
trajectly record specs/trt-support-triage-baseline.agent.yaml
|
|
54
|
+
|
|
55
|
+
# 2. Run the regression variant against it
|
|
56
|
+
trajectly run specs/trt-support-triage-regression.agent.yaml
|
|
57
|
+
|
|
58
|
+
# 3. See what broke
|
|
59
|
+
trajectly report
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
The report shows exactly **which step** failed, **why** (the regression calls `unsafe_export`, which is denied by policy), and gives you a **deterministic repro command**.
|
|
63
|
+
|
|
64
|
+
## How It Works
|
|
65
|
+
|
|
66
|
+
1. **Record** -- run your agent normally. Trajectly captures every tool call and LLM response as a trace.
|
|
67
|
+
2. **Replay** -- re-run the agent. Trajectly replays recorded LLM responses from fixtures so results are deterministic.
|
|
68
|
+
3. **Compare** -- Trajectly checks the new trace against the baseline:
|
|
69
|
+
- **Contracts**: are only allowed tools called? Are denied tools blocked?
|
|
70
|
+
- **Refinement**: does the new call sequence preserve the baseline sequence?
|
|
71
|
+
4. **Verdict** -- PASS or FAIL with the exact failure step (witness index), violation code, and a copy-paste repro command.
|
|
72
|
+
|
|
73
|
+
## Examples
|
|
74
|
+
|
|
75
|
+
| Example | Provider | Tools | What it tests |
|
|
76
|
+
|---------|----------|-------|---------------|
|
|
77
|
+
| [Ticket Classifier](docs/tutorial-support-triage.md) | OpenAI | `fetch_ticket`, `store_triage` | Simple 2-tool agent with contract enforcement |
|
|
78
|
+
| [Code Review Bot](docs/tutorial-code-review-bot.md) | Gemini | `fetch_pr`, `lint_code`, `post_review` | Multi-tool sequence with policy guardrails |
|
|
79
|
+
|
|
80
|
+
## Documentation
|
|
81
|
+
|
|
82
|
+
- [Full documentation](docs/trajectly.md) -- concepts, CLI reference, spec format, SDK reference
|
|
83
|
+
- [Tutorial: Ticket Classifier](docs/tutorial-support-triage.md) -- step-by-step simple example
|
|
84
|
+
- [Tutorial: Code Review Bot](docs/tutorial-code-review-bot.md) -- step-by-step medium example
|
|
85
|
+
|
|
86
|
+
## License
|
|
87
|
+
|
|
88
|
+
MIT
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# Trajectly
|
|
2
|
+
|
|
3
|
+
Deterministic regression testing for AI agents. Record a baseline, enforce contracts, catch regressions before they ship.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install trajectly
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## 30-Second Example
|
|
12
|
+
|
|
13
|
+
Trajectly works in three steps: **record** a known-good baseline, **run** against it later, and **get a verdict**.
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# Clone the repo and install dev dependencies
|
|
17
|
+
git clone https://github.com/trajectly/trajectly.git
|
|
18
|
+
cd trajectly
|
|
19
|
+
pip install -e ".[dev]"
|
|
20
|
+
|
|
21
|
+
# Set your OpenAI key (the example calls gpt-4o-mini)
|
|
22
|
+
export OPENAI_API_KEY="sk-..."
|
|
23
|
+
|
|
24
|
+
# 1. Record the baseline
|
|
25
|
+
cd examples
|
|
26
|
+
trajectly record specs/trt-support-triage-baseline.agent.yaml
|
|
27
|
+
|
|
28
|
+
# 2. Run the regression variant against it
|
|
29
|
+
trajectly run specs/trt-support-triage-regression.agent.yaml
|
|
30
|
+
|
|
31
|
+
# 3. See what broke
|
|
32
|
+
trajectly report
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
The report shows exactly **which step** failed, **why** (the regression calls `unsafe_export`, which is denied by policy), and gives you a **deterministic repro command**.
|
|
36
|
+
|
|
37
|
+
## How It Works
|
|
38
|
+
|
|
39
|
+
1. **Record** -- run your agent normally. Trajectly captures every tool call and LLM response as a trace.
|
|
40
|
+
2. **Replay** -- re-run the agent. Trajectly replays recorded LLM responses from fixtures so results are deterministic.
|
|
41
|
+
3. **Compare** -- Trajectly checks the new trace against the baseline:
|
|
42
|
+
- **Contracts**: are only allowed tools called? Are denied tools blocked?
|
|
43
|
+
- **Refinement**: does the new call sequence preserve the baseline sequence?
|
|
44
|
+
4. **Verdict** -- PASS or FAIL with the exact failure step (witness index), violation code, and a copy-paste repro command.
|
|
45
|
+
|
|
46
|
+
## Examples
|
|
47
|
+
|
|
48
|
+
| Example | Provider | Tools | What it tests |
|
|
49
|
+
|---------|----------|-------|---------------|
|
|
50
|
+
| [Ticket Classifier](docs/tutorial-support-triage.md) | OpenAI | `fetch_ticket`, `store_triage` | Simple 2-tool agent with contract enforcement |
|
|
51
|
+
| [Code Review Bot](docs/tutorial-code-review-bot.md) | Gemini | `fetch_pr`, `lint_code`, `post_review` | Multi-tool sequence with policy guardrails |
|
|
52
|
+
|
|
53
|
+
## Documentation
|
|
54
|
+
|
|
55
|
+
- [Full documentation](docs/trajectly.md) -- concepts, CLI reference, spec format, SDK reference
|
|
56
|
+
- [Tutorial: Ticket Classifier](docs/tutorial-support-triage.md) -- step-by-step simple example
|
|
57
|
+
- [Tutorial: Code Review Bot](docs/tutorial-code-review-bot.md) -- step-by-step medium example
|
|
58
|
+
|
|
59
|
+
## License
|
|
60
|
+
|
|
61
|
+
MIT
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "trajectly"
|
|
7
|
+
version = "0.3.0rc1"
|
|
8
|
+
description = "Trajectory Refinement Testing (TRT) for deterministic agent CI"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Ahmed Ashmawy", email = "awashmawy@proton.me"}
|
|
14
|
+
]
|
|
15
|
+
keywords = ["agents", "testing", "regression", "llm", "ci"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Software Development :: Testing",
|
|
24
|
+
]
|
|
25
|
+
dependencies = [
|
|
26
|
+
"typer>=0.12,<1",
|
|
27
|
+
"PyYAML>=6,<7",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
dev = [
|
|
32
|
+
"pytest>=8,<9",
|
|
33
|
+
"pytest-cov>=5,<6",
|
|
34
|
+
"ruff>=0.6,<1",
|
|
35
|
+
"mypy>=1.11,<2",
|
|
36
|
+
"types-PyYAML>=6.0,<7",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[project.scripts]
|
|
40
|
+
trajectly = "trajectly.cli:app"
|
|
41
|
+
|
|
42
|
+
[tool.setuptools]
|
|
43
|
+
package-dir = {"" = "src"}
|
|
44
|
+
py-modules = ["sitecustomize"]
|
|
45
|
+
|
|
46
|
+
[tool.setuptools.packages.find]
|
|
47
|
+
where = ["src"]
|
|
48
|
+
|
|
49
|
+
[tool.pytest.ini_options]
|
|
50
|
+
pythonpath = ["src"]
|
|
51
|
+
testpaths = ["tests"]
|
|
52
|
+
|
|
53
|
+
[tool.ruff]
|
|
54
|
+
line-length = 120
|
|
55
|
+
target-version = "py311"
|
|
56
|
+
|
|
57
|
+
[tool.ruff.lint]
|
|
58
|
+
select = ["E", "F", "I", "B", "UP", "N", "RUF"]
|
|
59
|
+
ignore = ["B008"]
|
|
60
|
+
|
|
61
|
+
[tool.mypy]
|
|
62
|
+
python_version = "3.11"
|
|
63
|
+
strict = true
|
|
64
|
+
warn_unused_ignores = true
|
|
65
|
+
warn_redundant_casts = true
|
|
66
|
+
warn_unreachable = true
|
|
67
|
+
show_error_codes = true
|
|
68
|
+
pretty = true
|
|
69
|
+
packages = ["trajectly"]
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
# Import-time hook for subprocess replays: because this lives in
# `sitecustomize`, the guard is switched on before any user code executes.
# It only fires when the environment variable is exactly "1".
if os.environ.get("TRAJECTLY_REPLAY_GUARD") == "1":
    from trajectly.replay_guard import activate

    activate()
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from trajectly.abstraction.pipeline import AbstractionConfig, AbstractTrace, Token, build_abstract_trace
|
|
2
|
+
from trajectly.abstraction.predicates import (
|
|
3
|
+
contains_email,
|
|
4
|
+
contains_phone,
|
|
5
|
+
extract_domains,
|
|
6
|
+
extract_numeric_values,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
# Keep this export surface stable for downstream integrations importing
|
|
10
|
+
# abstraction helpers directly from `trajectly.abstraction`.
|
|
11
|
+
__all__ = [
|
|
12
|
+
"AbstractTrace",
|
|
13
|
+
"AbstractionConfig",
|
|
14
|
+
"Token",
|
|
15
|
+
"build_abstract_trace",
|
|
16
|
+
"contains_email",
|
|
17
|
+
"contains_phone",
|
|
18
|
+
"extract_domains",
|
|
19
|
+
"extract_numeric_values",
|
|
20
|
+
]
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Deterministic abstraction pipeline (Definition 2 in trt_theory.md).
|
|
2
|
+
|
|
3
|
+
Implements ``build_abstract_trace``, which maps a concrete event trace to an
|
|
4
|
+
abstract representation consisting of a token stream and a predicate bag.
|
|
5
|
+
|
|
6
|
+
**Determinism (Theorem 2 precondition):** The pipeline iterates events by
|
|
7
|
+
index, applies a fixed token-mapping function per event type, and accumulates
|
|
8
|
+
predicates in a single pass. Output keys are sorted (``tool_calls_by_name``,
|
|
9
|
+
``domains``) so the abstract trace is identical for identical inputs regardless
|
|
10
|
+
of Python dict insertion order.
|
|
11
|
+
|
|
12
|
+
**Abstraction homomorphism:** ``alpha(T, c) = (Tokens, Predicates)`` where
|
|
13
|
+
each token preserves the event index and causal kind, and predicates aggregate
|
|
14
|
+
over the full trace.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from dataclasses import dataclass, field
|
|
20
|
+
from typing import Any, Literal
|
|
21
|
+
|
|
22
|
+
from trajectly.abstraction.predicates import (
|
|
23
|
+
contains_email,
|
|
24
|
+
contains_phone,
|
|
25
|
+
extract_domains,
|
|
26
|
+
extract_numeric_values,
|
|
27
|
+
)
|
|
28
|
+
from trajectly.events import TraceEvent
|
|
29
|
+
|
|
30
|
+
TokenKind = Literal[
|
|
31
|
+
"CALL",
|
|
32
|
+
"RESULT",
|
|
33
|
+
"LLM_REQUEST",
|
|
34
|
+
"LLM_RESPONSE",
|
|
35
|
+
"MESSAGE",
|
|
36
|
+
"OBSERVATION",
|
|
37
|
+
"ERROR",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass(slots=True)
|
|
42
|
+
class Token:
|
|
43
|
+
event_index: int
|
|
44
|
+
kind: TokenKind
|
|
45
|
+
name: str
|
|
46
|
+
payload: dict[str, Any]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass(slots=True)
|
|
50
|
+
class AbstractionConfig:
|
|
51
|
+
ignore_call_tools: list[str] = field(default_factory=list)
|
|
52
|
+
enable_pii_detection: bool = True
|
|
53
|
+
enable_domain_extraction: bool = True
|
|
54
|
+
enable_numeric_extraction: bool = True
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass(slots=True)
|
|
58
|
+
class AbstractTrace:
|
|
59
|
+
tokens: list[Token]
|
|
60
|
+
predicates: dict[str, Any]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _token_from_event(event: TraceEvent, event_index: int, ignore_call_tools: set[str]) -> Token | None:
|
|
64
|
+
# Event-to-token mapping is intentionally conservative: only stable,
|
|
65
|
+
# contract-relevant event types feed TRT abstraction.
|
|
66
|
+
payload = dict(event.payload)
|
|
67
|
+
if event.event_type == "tool_called":
|
|
68
|
+
tool_name = str(payload.get("tool_name", "unknown"))
|
|
69
|
+
if tool_name in ignore_call_tools:
|
|
70
|
+
return None
|
|
71
|
+
return Token(event_index=event_index, kind="CALL", name=tool_name, payload=payload)
|
|
72
|
+
if event.event_type == "tool_returned":
|
|
73
|
+
tool_name = str(payload.get("tool_name", "unknown"))
|
|
74
|
+
return Token(event_index=event_index, kind="RESULT", name=tool_name, payload=payload)
|
|
75
|
+
if event.event_type == "llm_called":
|
|
76
|
+
provider = str(payload.get("provider", "unknown"))
|
|
77
|
+
model = str(payload.get("model", "unknown"))
|
|
78
|
+
return Token(event_index=event_index, kind="LLM_REQUEST", name=f"{provider}:{model}", payload=payload)
|
|
79
|
+
if event.event_type == "llm_returned":
|
|
80
|
+
provider = str(payload.get("provider", "unknown"))
|
|
81
|
+
model = str(payload.get("model", "unknown"))
|
|
82
|
+
return Token(event_index=event_index, kind="LLM_RESPONSE", name=f"{provider}:{model}", payload=payload)
|
|
83
|
+
if event.event_type == "agent_step":
|
|
84
|
+
name = str(payload.get("name", "step"))
|
|
85
|
+
return Token(event_index=event_index, kind="MESSAGE", name=name, payload=payload)
|
|
86
|
+
if event.event_type == "run_finished":
|
|
87
|
+
return Token(event_index=event_index, kind="OBSERVATION", name="run_finished", payload=payload)
|
|
88
|
+
return None
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def build_abstract_trace(
    events: list[TraceEvent],
    *,
    config: AbstractionConfig | None = None,
) -> AbstractTrace:
    """Build the abstract trace (tokens plus predicates) for ``events``.

    Events are tokenized in index order; events without a token mapping are
    skipped. Predicates are then accumulated in one pass over the tokens. The
    predicate bag always has the same keys, and ``tool_calls_by_name`` and
    ``domains`` are emitted in sorted order.
    """
    settings = config or AbstractionConfig()
    skipped_tools = set(settings.ignore_call_tools)

    tokens: list[Token] = [
        tok
        for position, event in enumerate(events)
        if (tok := _token_from_event(event, position, skipped_tools)) is not None
    ]

    call_counts: dict[str, int] = {}
    seen_domains: set[str] = set()
    numbers: list[float] = []
    email_found = False
    phone_found = False
    refunds = 0

    for tok in tokens:
        if tok.kind == "CALL":
            call_counts[tok.name] = call_counts.get(tok.name, 0) + 1
            # Any tool whose name mentions "refund" counts toward refund_count.
            if "refund" in tok.name.lower():
                refunds += 1

        if settings.enable_domain_extraction:
            seen_domains.update(extract_domains(tok.payload))
        if settings.enable_numeric_extraction:
            numbers.extend(extract_numeric_values(tok.payload))
        if settings.enable_pii_detection:
            # Short-circuit: once a kind of PII is seen, stop scanning for it.
            email_found = email_found or contains_email(tok.payload)
            phone_found = phone_found or contains_phone(tok.payload)

    # Key order is kept fixed so report payloads stay deterministic.
    predicates: dict[str, Any] = {
        "tool_calls_total": sum(call_counts.values()),
        "tool_calls_by_name": dict(sorted(call_counts.items())),
        "domains": sorted(seen_domains),
        "pii": {"email": email_found, "phone": phone_found},
        "max_numeric_value": max(numbers) if numbers else None,
        "refund_count": refunds,
    }
    return AbstractTrace(tokens=tokens, predicates=predicates)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
__all__ = [
|
|
148
|
+
"AbstractTrace",
|
|
149
|
+
"AbstractionConfig",
|
|
150
|
+
"Token",
|
|
151
|
+
"build_abstract_trace",
|
|
152
|
+
]
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from typing import Any
|
|
6
|
+
from urllib.parse import urlparse
|
|
7
|
+
|
|
8
|
+
EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
|
|
9
|
+
PHONE_RE = re.compile(r"\b(?:\+?1[-.\s]?)?(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b")
|
|
10
|
+
URL_RE = re.compile(r"https?://[^\s)]+")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _walk_strings(value: Any) -> Iterable[str]:
|
|
14
|
+
# Predicate extraction intentionally walks only serializable payload-like
|
|
15
|
+
# shapes to keep abstraction deterministic and side-effect free.
|
|
16
|
+
if isinstance(value, str):
|
|
17
|
+
yield value
|
|
18
|
+
return
|
|
19
|
+
if isinstance(value, dict):
|
|
20
|
+
for item in value.values():
|
|
21
|
+
yield from _walk_strings(item)
|
|
22
|
+
return
|
|
23
|
+
if isinstance(value, list | tuple):
|
|
24
|
+
for item in value:
|
|
25
|
+
yield from _walk_strings(item)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def contains_email(value: Any) -> bool:
    """Return True if any string inside ``value`` matches ``EMAIL_RE``."""
    for text in _walk_strings(value):
        if EMAIL_RE.search(text) is not None:
            return True
    return False
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def contains_phone(value: Any) -> bool:
    """Return True if any string inside ``value`` matches ``PHONE_RE``."""
    for text in _walk_strings(value):
        if PHONE_RE.search(text) is not None:
            return True
    return False
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def extract_domains(value: Any) -> list[str]:
    """Return the sorted, lower-cased host names found in ``value``.

    Each string in ``value`` is tried both as a URL in its entirety and via
    every ``http(s)://`` substring matched by ``URL_RE``. Candidates that
    ``urlparse`` rejects are skipped, so one malformed string in a payload
    cannot abort extraction for the whole trace.
    """
    domains: set[str] = set()
    for text in _walk_strings(value):
        for candidate in (text, *URL_RE.findall(text)):
            try:
                host = urlparse(candidate).hostname
            except ValueError:
                # urlparse raises on malformed authorities, e.g. an unbalanced
                # IPv6 bracket such as "http://[broken".
                continue
            if host:
                domains.add(host.lower())
    return sorted(domains)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def extract_numeric_values(value: Any) -> list[float]:
|
|
49
|
+
numbers: list[float] = []
|
|
50
|
+
if isinstance(value, int | float):
|
|
51
|
+
return [float(value)]
|
|
52
|
+
if isinstance(value, dict):
|
|
53
|
+
for item in value.values():
|
|
54
|
+
numbers.extend(extract_numeric_values(item))
|
|
55
|
+
return numbers
|
|
56
|
+
if isinstance(value, list | tuple):
|
|
57
|
+
for item in value:
|
|
58
|
+
numbers.extend(extract_numeric_values(item))
|
|
59
|
+
return numbers
|
|
60
|
+
return numbers
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
__all__ = [
|
|
64
|
+
"contains_email",
|
|
65
|
+
"contains_phone",
|
|
66
|
+
"extract_domains",
|
|
67
|
+
"extract_numeric_values",
|
|
68
|
+
]
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""
|
|
2
|
+
TRT performance benchmark harness (QA-T007).
|
|
3
|
+
|
|
4
|
+
Runs TRT run_specs in a minimal workspace repeatedly and reports wall-clock times.
|
|
5
|
+
Deterministic and offline-safe (replay-only; no network).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import tempfile
|
|
11
|
+
import time
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from trajectly.constants import EXIT_SUCCESS
|
|
16
|
+
from trajectly.engine import initialize_workspace, record_specs, run_specs
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _write(path: Path, body: str) -> None:
|
|
20
|
+
path.write_text(body.strip() + "\n", encoding="utf-8")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _setup_workspace(root: Path) -> Path:
    """Create a minimal TRT workspace under ``root`` and record its baseline.

    Writes a trivial agent script and a single spec, then records the spec.
    Returns the spec path; raises ``RuntimeError`` if recording does not exit
    with ``EXIT_SUCCESS``.
    """
    initialize_workspace(root)

    agent_path = root / "agent.py"
    spec_path = root / "bench.agent.yaml"
    _write(agent_path, "print('ok')")
    _write(
        spec_path,
        """
schema_version: "0.3"
name: bench
command: python agent.py
workdir: .
strict: true
""",
    )

    recorded = record_specs(targets=[str(spec_path)], project_root=root)
    if recorded.exit_code == EXIT_SUCCESS:
        return spec_path
    raise RuntimeError(f"record_specs failed: {recorded.errors}")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def run_benchmark(iterations: int = 5) -> dict[str, Any]:
    """Time ``run_specs`` over ``iterations`` runs in a fresh temp workspace.

    Args:
        iterations: Number of timed runs; must be at least 1.

    Returns:
        A dict with ``runs`` (one ``{"wall_s": ...}`` entry per run) and
        ``summary`` (``n``, ``mean_s``, ``min_s``, ``max_s``), all times in
        seconds rounded to 6 decimal places.

    Raises:
        ValueError: If ``iterations`` is less than 1.
        RuntimeError: If baseline recording or any timed run fails.
    """
    # Validate before doing any work: with zero runs the mean/min/max are
    # undefined, and the baseline recording would be wasted.
    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations}")

    times_s: list[float] = []
    with tempfile.TemporaryDirectory(prefix="trajectly_bench_") as tmp:
        root = Path(tmp)
        spec = _setup_workspace(root)
        for _ in range(iterations):
            t0 = time.perf_counter()
            outcome = run_specs(targets=[str(spec)], project_root=root)
            t1 = time.perf_counter()
            # A failing run invalidates the benchmark, so abort immediately.
            if outcome.exit_code != EXIT_SUCCESS:
                raise RuntimeError(f"run_specs failed: {outcome.errors}")
            times_s.append(t1 - t0)

    n = len(times_s)
    return {
        "runs": [{"wall_s": round(t, 6)} for t in times_s],
        "summary": {
            "n": n,
            "mean_s": round(sum(times_s) / n, 6),
            "min_s": round(min(times_s), 6),
            "max_s": round(max(times_s), 6),
        },
    }
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def to_md(data: dict[str, Any]) -> str:
    """Render the ``summary`` section of a benchmark result as Markdown."""
    summary = data["summary"]
    parts = [
        "## TRT benchmark summary\n\n",
        f"- **Runs:** {summary['n']}\n",
        f"- **Mean:** {summary['mean_s']} s\n",
        f"- **Min / Max:** {summary['min_s']} s / {summary['max_s']} s\n",
    ]
    return "".join(parts)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from trajectly.normalize.canonical import (
|
|
2
|
+
DEFAULT_CANONICAL_NORMALIZER,
|
|
3
|
+
CanonicalNormalizer,
|
|
4
|
+
canonical_dumps,
|
|
5
|
+
normalize_for_json,
|
|
6
|
+
sha256_of_data,
|
|
7
|
+
sha256_of_subset,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"DEFAULT_CANONICAL_NORMALIZER",
|
|
12
|
+
"CanonicalNormalizer",
|
|
13
|
+
"canonical_dumps",
|
|
14
|
+
"normalize_for_json",
|
|
15
|
+
"sha256_of_data",
|
|
16
|
+
"sha256_of_subset",
|
|
17
|
+
]
|