opentraces-schema 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opentraces_schema-0.1.0/.gitignore +24 -0
- opentraces_schema-0.1.0/CHANGELOG.md +34 -0
- opentraces_schema-0.1.0/FIELD-MAPPINGS.md +159 -0
- opentraces_schema-0.1.0/PKG-INFO +68 -0
- opentraces_schema-0.1.0/RATIONALE-0.1.0.md +264 -0
- opentraces_schema-0.1.0/README.md +57 -0
- opentraces_schema-0.1.0/VERSION-POLICY.md +39 -0
- opentraces_schema-0.1.0/pyproject.toml +25 -0
- opentraces_schema-0.1.0/src/opentraces_schema/__init__.py +43 -0
- opentraces_schema-0.1.0/src/opentraces_schema/models.py +242 -0
- opentraces_schema-0.1.0/src/opentraces_schema/version.py +3 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
.venv/
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.pyc
|
|
4
|
+
*.egg-info/
|
|
5
|
+
dist/
|
|
6
|
+
build/
|
|
7
|
+
.pytest_cache/
|
|
8
|
+
*.egg-link
|
|
9
|
+
web/site/node_modules/
|
|
10
|
+
web/site/.next/
|
|
11
|
+
web/viewer/node_modules/
|
|
12
|
+
web/coming-soon/.vercel/
|
|
13
|
+
.gstack/
|
|
14
|
+
.opentraces/
|
|
15
|
+
|
|
16
|
+
.desloppify/
|
|
17
|
+
|
|
18
|
+
# Private research (not shipped in OSS release)
|
|
19
|
+
kb/
|
|
20
|
+
.opentraces/staging/
|
|
21
|
+
.opentraces/config.json
|
|
22
|
+
.agents/skills/opentraces
|
|
23
|
+
.claude
|
|
24
|
+
.opentraces
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to the opentraces-schema package will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
|
6
|
+
This project uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html) with
|
|
7
|
+
schema-specific semantics described in VERSION-POLICY.md.
|
|
8
|
+
|
|
9
|
+
## [Unreleased]
|
|
10
|
+
|
|
11
|
+
## [0.1.0] - 2026-03-27
|
|
12
|
+
|
|
13
|
+
### Added
|
|
14
|
+
- Initial schema release with 15 Pydantic v2 models
|
|
15
|
+
- `TraceRecord` top-level model: one JSONL line per complete agent session
|
|
16
|
+
- `Step` model oriented around TAO (Thought-Action-Observation) loops, not conversational turns
|
|
17
|
+
- `Outcome` model with RL-ready signals: `success`, `signal_source`, `signal_confidence` (derived/inferred/annotated)
|
|
18
|
+
- `Attribution` block (experimental) bridging trajectory data and code attribution per Agent Trace spec
|
|
19
|
+
- Sub-agent hierarchy via `Step.parent_step`, `Step.agent_role`, `Step.subagent_trajectory_ref`
|
|
20
|
+
- `Step.call_type` (main/subagent/warmup) for filtering cache-priming calls
|
|
21
|
+
- System prompt deduplication via hash-keyed `system_prompts` dict on `TraceRecord`
|
|
22
|
+
- `SecurityMetadata` with 3-tier classification (1=open, 2=guarded, 3=strict)
|
|
23
|
+
- Content hashing (SHA-256) on `TraceRecord` for cross-upload deduplication
|
|
24
|
+
- `AttributionRange.content_hash` using murmur3 for cross-refactor tracking
|
|
25
|
+
- `Observation.output_summary` for lightweight filtering without loading full tool results
|
|
26
|
+
- `TokenUsage` with `prefix_reuse_tokens`, `cache_read_tokens`, `cache_write_tokens`
|
|
27
|
+
- `Metrics` model with session-level aggregates and `estimated_cost_usd`
|
|
28
|
+
- `Environment` and `VCS` models for runtime context and reproducibility
|
|
29
|
+
- `Task` model with `source`, `repository`, `base_commit`
|
|
30
|
+
- `Agent` model using `provider/model-name` convention from models.dev
|
|
31
|
+
- `Snippet` model for extracted code blocks linked to source steps
|
|
32
|
+
|
|
33
|
+
### Design References
|
|
34
|
+
- See [RATIONALE-0.1.0.md](RATIONALE-0.1.0.md) for the design basis of each decision in this version
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# Field Mappings: opentraces -> Downstream Formats
|
|
2
|
+
|
|
3
|
+
Reference tables for converting opentraces TraceRecord to downstream schemas.
|
|
4
|
+
These mappings are implemented in `src/opentraces/exporters/` but documented here
|
|
5
|
+
for ML researchers who want to write their own converters.
|
|
6
|
+
|
|
7
|
+
## opentraces -> ATIF v1.6
|
|
8
|
+
|
|
9
|
+
ATIF (Agent Trajectory Interchange Format) is designed for SFT and RL training pipelines.
|
|
10
|
+
The export is lossy: opentraces fields with no ATIF equivalent are dropped.
|
|
11
|
+
|
|
12
|
+
### Root Level
|
|
13
|
+
|
|
14
|
+
| opentraces | ATIF v1.6 | Notes |
|
|
15
|
+
|-----------|-----------|-------|
|
|
16
|
+
| `schema_version` | `schema_version: "ATIF-v1.6"` | Hardcoded |
|
|
17
|
+
| `session_id` | `session_id` | Direct |
|
|
18
|
+
| `agent.name` | `agent.name` | Direct |
|
|
19
|
+
| `agent.version` | `agent.version` | Direct |
|
|
20
|
+
| `agent.model` | `agent.model_name` | Rename |
|
|
21
|
+
| `tool_definitions` | `agent.tool_definitions` | Direct |
|
|
22
|
+
| `trace_id` | - | Dropped (ATIF uses session_id only) |
|
|
23
|
+
| `content_hash` | - | Dropped |
|
|
24
|
+
| `timestamp_start` | - | Dropped (per-step timestamps preserved) |
|
|
25
|
+
| `timestamp_end` | - | Dropped |
|
|
26
|
+
| `environment` | - | Dropped |
|
|
27
|
+
| `outcome` | - | Dropped |
|
|
28
|
+
| `dependencies` | - | Dropped |
|
|
29
|
+
| `metrics` | - | Dropped (per-step metrics preserved) |
|
|
30
|
+
| `security` | - | Dropped |
|
|
31
|
+
| `attribution` | - | Dropped |
|
|
32
|
+
| `system_prompts` | - | Dropped (ATIF stores inline per step) |
|
|
33
|
+
| `metadata` | - | Dropped |
|
|
34
|
+
|
|
35
|
+
### Step Level
|
|
36
|
+
|
|
37
|
+
| opentraces Step | ATIF Step | Notes |
|
|
38
|
+
|----------------|-----------|-------|
|
|
39
|
+
| `step_index` | `step_id` | Renumbered sequentially from 1 at export time |
|
|
40
|
+
| `role` | `source` | Direct (both schemas use the same `system`/`user`/`agent` values) |
|
|
41
|
+
| `content` | `message` | Direct, omitted if None |
|
|
42
|
+
| `reasoning_content` | `reasoning_content` | Direct |
|
|
43
|
+
| `model` | `model_name` | Rename |
|
|
44
|
+
| `timestamp` | `timestamp` | Direct |
|
|
45
|
+
| `system_prompt_hash` | - | Dropped |
|
|
46
|
+
| `agent_role` | - | Dropped |
|
|
47
|
+
| `parent_step` | - | Dropped |
|
|
48
|
+
| `call_type` | - | Dropped |
|
|
49
|
+
| `subagent_trajectory_ref` | - | Dropped (ATIF supports this but we don't populate it) |
|
|
50
|
+
| `tools_available` | - | Dropped |
|
|
51
|
+
| `snippets` | - | Dropped |
|
|
52
|
+
|
|
53
|
+
### Tool Calls
|
|
54
|
+
|
|
55
|
+
| opentraces ToolCall | ATIF ToolCallSchema | Notes |
|
|
56
|
+
|--------------------|---------------------|-------|
|
|
57
|
+
| `tool_call_id` | `tool_call_id` | Direct |
|
|
58
|
+
| `tool_name` | `function_name` | Rename |
|
|
59
|
+
| `input` (dict) | `arguments` (dict) | Direct (ATIF accepts dict) |
|
|
60
|
+
| `duration_ms` | - | Dropped |
|
|
61
|
+
|
|
62
|
+
### Observations
|
|
63
|
+
|
|
64
|
+
opentraces stores observations as a flat list on each Step. ATIF wraps them
|
|
65
|
+
in a singular `observation` object with a `results` array.
|
|
66
|
+
|
|
67
|
+
| opentraces Observation | ATIF ObservationResult | Notes |
|
|
68
|
+
|-----------------------|-----------------------|-------|
|
|
69
|
+
| `source_call_id` | `source_call_id` | Direct |
|
|
70
|
+
| `content` | `content` | Direct |
|
|
71
|
+
| `output_summary` | - | Dropped |
|
|
72
|
+
| `error` | `content` | Mapped as `[error: {value}]` string |
|
|
73
|
+
|
|
74
|
+
**Structure transformation:**
|
|
75
|
+
```
|
|
76
|
+
opentraces: step.observations = [Obs1, Obs2]
|
|
77
|
+
ATIF: step.observation = {"results": [Result1, Result2]}
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
When a step has zero observations, the `observation` key is omitted entirely.
|
|
81
|
+
|
|
82
|
+
### Token Usage
|
|
83
|
+
|
|
84
|
+
| opentraces TokenUsage | ATIF MetricsSchema | Notes |
|
|
85
|
+
|-----------------------|-------------------|-------|
|
|
86
|
+
| `input_tokens` | `prompt_tokens` | Rename |
|
|
87
|
+
| `output_tokens` | `completion_tokens` | Rename |
|
|
88
|
+
| `cache_read_tokens` | `cached_tokens` | Rename |
|
|
89
|
+
| `cache_write_tokens` | - | Dropped (no ATIF equivalent) |
|
|
90
|
+
| `prefix_reuse_tokens` | - | Dropped (opentraces-only metric) |
|
|
91
|
+
|
|
92
|
+
ATIF also supports `cost_usd`, `prompt_token_ids`, `completion_token_ids`, and
|
|
93
|
+
`logprobs`, but these are not available from CLI-level agent traces and are
|
|
94
|
+
omitted from the export.
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## opentraces -> ADP (Agent Data Protocol)
|
|
99
|
+
|
|
100
|
+
ADP is designed as a training interlingua for SFT across multiple agent harnesses.
|
|
101
|
+
The export flattens opentraces' hierarchical steps into ADP's alternating
|
|
102
|
+
action/observation list.
|
|
103
|
+
|
|
104
|
+
*Exporter implementation planned. Mapping table below is a reference for
|
|
105
|
+
researchers writing their own converters.*
|
|
106
|
+
|
|
107
|
+
### Core Mapping
|
|
108
|
+
|
|
109
|
+
| opentraces | ADP | Notes |
|
|
110
|
+
|-----------|-----|-------|
|
|
111
|
+
| `session_id` | `Trajectory.id` | Direct |
|
|
112
|
+
| Step(role=agent, tool_calls=[tc]) | `APIAction(function=tc.tool_name, kwargs=tc.input)` | Each tool call becomes a separate APIAction |
|
|
113
|
+
| Step(role=agent, content=code) | `CodeAction(language=..., content=...)` | Only if step contains executable code |
|
|
114
|
+
| Step(role=agent, content=text) | `MessageAction(content=text)` | Agent messages without tool calls |
|
|
115
|
+
| Observation(content=text) | `TextObservation(source="environment", content=text)` | Tool results |
|
|
116
|
+
| Step(role=user) | `TextObservation(source="user", content=text)` | User messages |
|
|
117
|
+
| `reasoning_content` | `APIAction.description` or `CodeAction.description` | Reasoning attached to the action |
|
|
118
|
+
| `metadata` | `Trajectory.details` | Flexible dict |
|
|
119
|
+
|
|
120
|
+
### Fields Dropped by ADP Export
|
|
121
|
+
|
|
122
|
+
All of: `attribution`, `security`, `environment`, `outcome`, `dependencies`,
|
|
123
|
+
`metrics`, `system_prompts`, `tool_definitions`, `content_hash`, `token_usage`,
|
|
124
|
+
`snippets`, and the hierarchy fields (`parent_step`, `agent_role`, `call_type`).
|
|
125
|
+
|
|
126
|
+
ADP's key strength is simplicity: 3 action types + 2 observation types cover
|
|
127
|
+
coding, browsing, tool use, and SWE. The trade-off is losing all the metadata
|
|
128
|
+
that makes opentraces traces useful for analytics, RL reward modeling,
|
|
129
|
+
attribution, and security auditing.
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## opentraces -> OTel GenAI (future)
|
|
134
|
+
|
|
135
|
+
OTel GenAI Semantic Conventions represent traces as span trees, which is a
|
|
136
|
+
fundamentally different structure from our step arrays. Each opentraces Step
|
|
137
|
+
would become a span, with tool calls as child spans.
|
|
138
|
+
|
|
139
|
+
*Exporter implementation planned for v0.2.*
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Notes for Converter Authors
|
|
144
|
+
|
|
145
|
+
1. **step_id renumbering**: ATIF uses 1-indexed step_id. opentraces step_index
|
|
146
|
+
may be 0- or 1-indexed depending on the parser. Always renumber at export time.
|
|
147
|
+
|
|
148
|
+
2. **observation wrapping**: ATIF uses singular `observation` with `results[]`.
|
|
149
|
+
opentraces uses plural `observations[]`. Don't just rename the field.
|
|
150
|
+
|
|
151
|
+
3. **token_usage partial mapping**: opentraces tracks 5 token sub-fields, ATIF
|
|
152
|
+
tracks 3. The two cache fields unique to opentraces (cache_write, prefix_reuse)
|
|
153
|
+
are our key differentiator for cost analysis and are dropped on ATIF export.
|
|
154
|
+
|
|
155
|
+
4. **content=None steps**: Steps that are pure tool calls (no text content) should
|
|
156
|
+
omit the `message`/`content` field, not set it to empty string.
|
|
157
|
+
|
|
158
|
+
5. **dangling tool calls**: Observations with `error="no_result"` indicate tool
|
|
159
|
+
calls that never received a response. Map these to a descriptive error string.
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: opentraces-schema
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Schema models for the opentraces.ai agent trace JSONL format
|
|
5
|
+
Project-URL: Homepage, https://opentraces.ai
|
|
6
|
+
Project-URL: Repository, https://github.com/JayFarei/opentraces
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Requires-Dist: pydantic>=2.0
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# opentraces-schema
|
|
13
|
+
|
|
14
|
+
Pydantic v2 models for the opentraces.ai JSONL trace format.
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install -e packages/opentraces-schema
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Usage
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
from opentraces_schema import TraceRecord, SCHEMA_VERSION
|
|
26
|
+
|
|
27
|
+
record = TraceRecord(
|
|
28
|
+
trace_id="abc-123",
|
|
29
|
+
session_id="sess-456",
|
|
30
|
+
agent={"name": "claude-code", "version": "1.0.32"},
|
|
31
|
+
)
|
|
32
|
+
line = record.to_jsonl_line()
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Version
|
|
36
|
+
|
|
37
|
+
The schema version (`0.1.0`) lives in `src/opentraces_schema/version.py` as the
|
|
38
|
+
single source of truth. See [VERSION-POLICY.md](VERSION-POLICY.md) for semver
|
|
39
|
+
semantics and the bump checklist.
|
|
40
|
+
|
|
41
|
+
## Schema Rationale
|
|
42
|
+
|
|
43
|
+
Every version of the schema ships with a rationale document explaining why each
|
|
44
|
+
model and field exists, grounded in public standards (ATIF, Agent Trace, ADP, OTel)
|
|
45
|
+
and empirical observations from real agent traces.
|
|
46
|
+
|
|
47
|
+
The current rationale is [RATIONALE-0.1.0.md](RATIONALE-0.1.0.md). Each new version
|
|
48
|
+
will have its own rationale file linked from the [CHANGELOG](CHANGELOG.md).
|
|
49
|
+
|
|
50
|
+
## Contributing
|
|
51
|
+
|
|
52
|
+
Schema feedback, questions, and proposals are welcome via
|
|
53
|
+
[GitHub Issues](https://github.com/JayFarei/opentraces/issues). When suggesting
|
|
54
|
+
a schema change, please include:
|
|
55
|
+
|
|
56
|
+
- **What** field or model you would add, change, or remove
|
|
57
|
+
- **Why** it matters for your use case (training, analytics, attribution, etc.)
|
|
58
|
+
- **How** it relates to existing standards (ATIF, Agent Trace, ADP, OTel) if applicable
|
|
59
|
+
|
|
60
|
+
Breaking changes (field renames, removals, type changes) require a major version bump.
|
|
61
|
+
New optional fields and models are minor bumps. See [VERSION-POLICY.md](VERSION-POLICY.md)
|
|
62
|
+
for details, including the pre-1.0 exception under which 0.x minor bumps may contain breaking changes.
|
|
63
|
+
|
|
64
|
+
## Documentation
|
|
65
|
+
|
|
66
|
+
- [CHANGELOG.md](CHANGELOG.md) - What changed in each version
|
|
67
|
+
- [VERSION-POLICY.md](VERSION-POLICY.md) - What version numbers mean for a schema package
|
|
68
|
+
- [RATIONALE-0.1.0.md](RATIONALE-0.1.0.md) - Design rationale for v0.1.0
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
# Schema Rationale: opentraces-schema v0.1.0
|
|
2
|
+
|
|
3
|
+
Why this schema is what it is. Each section connects a design decision to the
|
|
4
|
+
standards, empirical observations, and constraints that motivated it.
|
|
5
|
+
|
|
6
|
+
Standards referenced:
|
|
7
|
+
- [Agent Trace spec](https://github.com/nichochar/agent-trace) (Cursor RFC, CC BY 4.0)
|
|
8
|
+
- [ATIF v1.6](https://github.com/harbor-ai/agent-trajectory-interchange-format) (Agent Trajectory Interchange Format)
|
|
9
|
+
- [ADP](https://arxiv.org/abs/2510.24702) (Agent Data Protocol)
|
|
10
|
+
- [OTel GenAI Semantic Conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
## TraceRecord (Top-Level)
|
|
14
|
+
|
|
15
|
+
### Why an independent schema, not ATIF-native
|
|
16
|
+
|
|
17
|
+
Three existing standards each serve a different purpose:
|
|
18
|
+
- **ATIF** optimizes for training pipelines (token IDs, logprobs)
|
|
19
|
+
- **Agent Trace** captures attribution only (who wrote which lines)
|
|
20
|
+
- **OTel** captures observability only (latency, error rates)
|
|
21
|
+
|
|
22
|
+
No single standard covers trajectory + attribution + security + environment.
|
|
23
|
+
This schema bridges all three as a superset, with `opentraces export --format atif`
|
|
24
|
+
planned for standards-compatible output.
|
|
25
|
+
|
|
26
|
+
### Why content_hash (SHA-256)
|
|
27
|
+
|
|
28
|
+
With sharded JSONL upload (one file per push, never append to existing shards),
|
|
29
|
+
dedup must happen at record level. Existing community trace datasets on HuggingFace
|
|
30
|
+
use content hashing for deduplication at upload time.
|
|
31
|
+
|
|
32
|
+
SHA-256 chosen over cheaper hashes because traces are large enough that hash
|
|
33
|
+
computation time is negligible vs I/O. The `compute_content_hash()` method excludes
|
|
34
|
+
`content_hash` and `trace_id` so re-parsing identical source data produces the same
|
|
35
|
+
hash regardless of the random UUID assigned.
|
|
36
|
+
|
|
37
|
+
### Why session_id + trace_id (two IDs)
|
|
38
|
+
|
|
39
|
+
- `session_id`: the agent's native session identifier, stable across re-parsing
|
|
40
|
+
- `trace_id`: random UUID for database-level uniqueness
|
|
41
|
+
|
|
42
|
+
Agent sessions group many API calls under a single session ID, but a given session
|
|
43
|
+
may be re-ingested multiple times (e.g. re-exported after a schema upgrade). Separate
|
|
44
|
+
IDs allow re-ingestion without collision while preserving join keys to the original
|
|
45
|
+
agent session.
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
## Steps: TAO-Loop Oriented
|
|
49
|
+
|
|
50
|
+
### Why `steps` not `turns`
|
|
51
|
+
|
|
52
|
+
Each Step represents one LLM API call (request + response), not a conversational turn.
|
|
53
|
+
Multi-agent coding sessions routinely involve 50-100+ API calls spanning multiple
|
|
54
|
+
parallel sub-agents within a single user-visible "conversation." Conversational turns
|
|
55
|
+
would collapse this hierarchy into a flat sequence, losing the architectural signal
|
|
56
|
+
needed for caching analysis and training data segmentation.
|
|
57
|
+
|
|
58
|
+
Both ATIF and ADP use step-based models. The TAO (Thought-Action-Observation) loop
|
|
59
|
+
has converged as the canonical trajectory primitive across agent frameworks
|
|
60
|
+
(OpenHands, SWE-Agent, AgentLab) and community datasets.
|
|
61
|
+
|
|
62
|
+
### Why `role: "agent"` not `"assistant"`
|
|
63
|
+
|
|
64
|
+
ATIF and the broader agent community convention use `system | user | agent`. "Agent"
|
|
65
|
+
is semantically accurate for autonomous coding agents that reason, act, and observe
|
|
66
|
+
in loops, as opposed to chat assistants that respond to single prompts.
|
|
67
|
+
|
|
68
|
+
### Why parent_step + agent_role + subagent_trajectory_ref
|
|
69
|
+
|
|
70
|
+
Multi-agent coding systems exhibit a hierarchical phase structure:
|
|
71
|
+
|
|
72
|
+
1. **Warm-up**: cache priming calls with no reasoning
|
|
73
|
+
2. **Main agent**: full system prompt, full tool set
|
|
74
|
+
3. **Explore**: parallel sub-agents with fresh context, reduced tool sets, role-specific prompts
|
|
75
|
+
4. **Plan**: receives only summarized explore findings, not raw context
|
|
76
|
+
5. **Execute**: main agent follows plan as checklist
|
|
77
|
+
|
|
78
|
+
Sub-agents receive fresh context (not the parent's) and a subset of tools. This
|
|
79
|
+
hierarchical context isolation creates stable prefixes within each sub-agent loop,
|
|
80
|
+
which is why multi-agent architectures achieve high prefix cache reuse rates.
|
|
81
|
+
|
|
82
|
+
Three fields capture this hierarchy:
|
|
83
|
+
- `parent_step`: tree edge (which main-agent step spawned this sub-agent)
|
|
84
|
+
- `agent_role`: phase label (main, explore, plan) for filtering without reading system prompts
|
|
85
|
+
- `subagent_trajectory_ref`: links to a separate TraceRecord when sub-agent trajectories are stored independently
|
|
86
|
+
|
|
87
|
+
ATIF provides the `subagent_trajectory_ref` pattern for multi-agent delegation.
|
|
88
|
+
|
|
89
|
+
### Why call_type: main | subagent | warmup
|
|
90
|
+
|
|
91
|
+
Multi-agent systems include warm-up calls that exist purely to seed the KV cache.
|
|
92
|
+
They contain no reasoning and produce empty or minimal output, but are architecturally
|
|
93
|
+
significant: they explain why later calls achieve high prefix reuse.
|
|
94
|
+
|
|
95
|
+
For training data, warm-up calls should be filterable (they add noise to SFT datasets).
|
|
96
|
+
For caching analysis, they are essential. `call_type` enables both use cases.
|
|
97
|
+
|
|
98
|
+
### Why system_prompt_hash + top-level system_prompts dict
|
|
99
|
+
|
|
100
|
+
In multi-agent sessions, system prompts can be 20K+ tokens and repeat identically
|
|
101
|
+
across every call within a sub-agent phase. Storing inline would multiply storage
|
|
102
|
+
dramatically for long sessions.
|
|
103
|
+
|
|
104
|
+
Hash-keyed deduplication: store each unique system prompt once in a top-level dict,
|
|
105
|
+
reference by hash in each step. This separates the queryable step metadata from
|
|
106
|
+
the bulk content.
|
|
107
|
+
|
|
108
|
+
### Why reasoning_content as explicit field
|
|
109
|
+
|
|
110
|
+
Extended thinking and chain-of-thought content is returned in a separate field by
|
|
111
|
+
LLM APIs that support it. Benchmark evaluations suggest that including hidden
|
|
112
|
+
reasoning in training data improves downstream task performance.
|
|
113
|
+
|
|
114
|
+
A dedicated `reasoning_content` field preserves the API-level separation for training
|
|
115
|
+
pipelines that may want to include or exclude chain-of-thought independently of the
|
|
116
|
+
main content.
|
|
117
|
+
|
|
118
|
+
### Why output_summary on Observation
|
|
119
|
+
|
|
120
|
+
In multi-agent architectures, downstream sub-agents (e.g. plan phase) receive only
|
|
121
|
+
summarized findings from upstream sub-agents (e.g. explore phase), not raw results.
|
|
122
|
+
This is an information bottleneck pattern.
|
|
123
|
+
|
|
124
|
+
`output_summary` serves the same purpose for trace consumers: scan summaries to assess
|
|
125
|
+
relevance without loading full tool outputs (which can be megabytes for file reads or
|
|
126
|
+
grep results over large codebases).
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
## ToolCall and Observation: Separated
|
|
130
|
+
|
|
131
|
+
### Why tool calls and observations are separate lists
|
|
132
|
+
|
|
133
|
+
Training pipelines (SFT, RLHF) depend on clean tool_call / tool_result separation
|
|
134
|
+
for learning tool selection and result interpretation as distinct capabilities.
|
|
135
|
+
ATIF and ADP both maintain this separation. Observations link back via `source_call_id`
|
|
136
|
+
for 1:1 matching. Dangling tool calls (agent requested a tool but no result was
|
|
137
|
+
recorded) are marked with `error: "no_result"` rather than dropped.
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
## TokenUsage: Cache-Aware
|
|
141
|
+
|
|
142
|
+
### Why prefix_reuse_tokens, cache_read_tokens, cache_write_tokens
|
|
143
|
+
|
|
144
|
+
Prefix reuse is the dominant cost driver for multi-agent architectures. Empirical
|
|
145
|
+
measurements show that well-structured hierarchical agents achieve 90%+ prefix reuse
|
|
146
|
+
(representing 80%+ cost savings), while agents with dynamic mid-prompt mutations
|
|
147
|
+
achieve under 50%, and template-based agents under 5%.
|
|
148
|
+
|
|
149
|
+
| Architecture pattern | Typical prefix reuse | Implication |
|
|
150
|
+
|---------------------|---------------------|-------------|
|
|
151
|
+
| Hierarchical multi-agent (stable prefixes) | 90%+ | Well-structured, cache-friendly |
|
|
152
|
+
| Dynamic memory mutation | ~40-50% | Memory updates break prefix alignment |
|
|
153
|
+
| Template-based prompting | <5% | Variable insertion near prompt start destroys caching |
|
|
154
|
+
|
|
155
|
+
Per-step cache breakdown enables phase-level cost analysis and cross-architecture
|
|
156
|
+
comparisons.
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
## Outcome: RL-Ready Signals
|
|
160
|
+
|
|
161
|
+
### Why success + signal_source + signal_confidence
|
|
162
|
+
|
|
163
|
+
The training community needs trajectory-level reward signals for RLHF, DPO, and RLVR.
|
|
164
|
+
Most existing community trace datasets lack outcome fields entirely, making them
|
|
165
|
+
unsuitable for reward modeling without manual annotation.
|
|
166
|
+
|
|
167
|
+
Three confidence tiers communicate trustworthiness:
|
|
168
|
+
- **derived**: deterministic extraction (e.g. `committed` from git state)
|
|
169
|
+
- **inferred**: heuristic-based (e.g. success from test output patterns)
|
|
170
|
+
- **annotated**: human or CI label
|
|
171
|
+
|
|
172
|
+
This lets training pipelines filter by confidence: use only `derived` signals for
|
|
173
|
+
high-confidence reward, include `inferred` for larger but noisier datasets.
|
|
174
|
+
|
|
175
|
+
### Why committed + commit_sha
|
|
176
|
+
|
|
177
|
+
The cheapest deterministic quality signal available: did the agent's changes get
|
|
178
|
+
committed to git? Derivable from git state with zero annotation cost, zero LLM
|
|
179
|
+
enrichment. Aligns with the project principle: zero required annotation, all
|
|
180
|
+
enrichment in v0.1.0 is deterministic.
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
## Attribution: Embedded Agent Trace Block
|
|
184
|
+
|
|
185
|
+
### Why embed attribution rather than link externally
|
|
186
|
+
|
|
187
|
+
The Agent Trace spec stores attribution in a separate file from trajectories.
|
|
188
|
+
This schema bridges both: trajectory (process) AND attribution (output) in a single
|
|
189
|
+
record. Embedding keeps traces self-contained for dataset consumption, where each
|
|
190
|
+
JSONL line should be independently useful without external file lookups.
|
|
191
|
+
|
|
192
|
+
Marked `experimental: true` because attribution confidence varies by session
|
|
193
|
+
complexity: single-file edits produce high-confidence attribution, multi-file
|
|
194
|
+
refactors with interleaved tool calls produce lower confidence.
|
|
195
|
+
|
|
196
|
+
### Why murmur3 for AttributionRange.content_hash
|
|
197
|
+
|
|
198
|
+
The Agent Trace spec uses `algorithm:value` format (e.g. `murmur3:9f2e8a1b`) for
|
|
199
|
+
position-independent content tracking. murmur3 is fast and non-cryptographic,
|
|
200
|
+
sufficient for detecting code movement across refactors and file renames.
|
|
201
|
+
|
|
202
|
+
SHA-256 is used at the trace level (collision resistance needed for dedup integrity),
|
|
203
|
+
murmur3 at the line-range level (speed needed, no security requirement).
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
## SecurityMetadata: 3-Tier
|
|
207
|
+
|
|
208
|
+
### Why a 3-tier security model
|
|
209
|
+
|
|
210
|
+
- **Tier 1** (open): full content, suitable for public repos and open-source projects
|
|
211
|
+
- **Tier 2** (guarded): redacted secrets, anonymized paths/usernames
|
|
212
|
+
- **Tier 3** (strict): structural metadata only, no content
|
|
213
|
+
|
|
214
|
+
Existing trace-sharing tools typically offer a single redaction mode: everything is
|
|
215
|
+
processed the same way. A tier system enables per-project configuration with
|
|
216
|
+
per-session override, so a user working on both open-source and proprietary code
|
|
217
|
+
can publish traces from both with appropriate protection levels.
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
## Environment and VCS
|
|
221
|
+
|
|
222
|
+
### Why capture OS, shell, language_ecosystem
|
|
223
|
+
|
|
224
|
+
Reproducibility: the same task on macOS vs Linux may produce different agent behavior
|
|
225
|
+
(different shell commands, different tool availability, different file paths).
|
|
226
|
+
Filtering: researchers can select traces by ecosystem (Python, TypeScript, Rust)
|
|
227
|
+
to build domain-specific training datasets without parsing file extensions from
|
|
228
|
+
tool call arguments.
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
## Deliberate Exclusions
|
|
232
|
+
|
|
233
|
+
### No token IDs or logprobs
|
|
234
|
+
|
|
235
|
+
ATIF v1.6 includes `prompt_token_ids`, `completion_token_ids`, and `logprobs` per
|
|
236
|
+
step, enabling RL without retokenization drift. These fields are not available from
|
|
237
|
+
agent CLI tools (Claude Code, Cursor, Codex CLI, Gemini CLI) because they intercept
|
|
238
|
+
at the application layer, not the inference layer.
|
|
239
|
+
|
|
240
|
+
Planned for inclusion when agent APIs expose them. Until then, ATIF export
|
|
241
|
+
(`opentraces export --format atif`) will bridge this gap for training pipelines
|
|
242
|
+
that need token-level data from other sources.
|
|
243
|
+
|
|
244
|
+
### No OTel span IDs
|
|
245
|
+
|
|
246
|
+
OTel provides distributed tracing metadata (trace_id, span_id, parent_span_id)
|
|
247
|
+
designed for production observability across microservices. Our `trace_id` is
|
|
248
|
+
session-scoped, not request-scoped. OTel interop is planned via export, not
|
|
249
|
+
native embedding, because the primary consumers of this schema are training
|
|
250
|
+
pipelines and researchers, not production monitoring dashboards.
|
|
251
|
+
|
|
252
|
+
### No AGENTS.md content
|
|
253
|
+
|
|
254
|
+
AGENTS.md is a project-level instruction file for agents, not trace data. While
|
|
255
|
+
it provides context for understanding agent behavior, embedding it in every trace
|
|
256
|
+
record would be wasteful. It may be referenced via the `metadata` dict in future
|
|
257
|
+
versions if there is demand.
|
|
258
|
+
|
|
259
|
+
### No LLM enrichment fields
|
|
260
|
+
|
|
261
|
+
Fields like `task_description`, `domain_tags`, and `task_type` are not in v0.1.0
|
|
262
|
+
because they require LLM inference to generate. The project principle is zero
|
|
263
|
+
required annotation: all enrichment in v0.1.0 is deterministic. LLM-enriched
|
|
264
|
+
metadata may be added as optional fields in a future minor version.
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# opentraces-schema
|
|
2
|
+
|
|
3
|
+
Pydantic v2 models for the opentraces.ai JSONL trace format.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install -e packages/opentraces-schema
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from opentraces_schema import TraceRecord, SCHEMA_VERSION
|
|
15
|
+
|
|
16
|
+
record = TraceRecord(
|
|
17
|
+
trace_id="abc-123",
|
|
18
|
+
session_id="sess-456",
|
|
19
|
+
agent={"name": "claude-code", "version": "1.0.32"},
|
|
20
|
+
)
|
|
21
|
+
line = record.to_jsonl_line()
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Version
|
|
25
|
+
|
|
26
|
+
The schema version (`0.1.0`) lives in `src/opentraces_schema/version.py` as the
|
|
27
|
+
single source of truth. See [VERSION-POLICY.md](VERSION-POLICY.md) for semver
|
|
28
|
+
semantics and the bump checklist.
|
|
29
|
+
|
|
30
|
+
## Schema Rationale
|
|
31
|
+
|
|
32
|
+
Every version of the schema ships with a rationale document explaining why each
|
|
33
|
+
model and field exists, grounded in public standards (ATIF, Agent Trace, ADP, OTel)
|
|
34
|
+
and empirical observations from real agent traces.
|
|
35
|
+
|
|
36
|
+
The current rationale is [RATIONALE-0.1.0.md](RATIONALE-0.1.0.md). Each new version
|
|
37
|
+
will have its own rationale file linked from the [CHANGELOG](CHANGELOG.md).
|
|
38
|
+
|
|
39
|
+
## Contributing
|
|
40
|
+
|
|
41
|
+
Schema feedback, questions, and proposals are welcome via
|
|
42
|
+
[GitHub Issues](https://github.com/JayFarei/opentraces/issues). When suggesting
|
|
43
|
+
a schema change, please include:
|
|
44
|
+
|
|
45
|
+
- **What** field or model you would add, change, or remove
|
|
46
|
+
- **Why** it matters for your use case (training, analytics, attribution, etc.)
|
|
47
|
+
- **How** it relates to existing standards (ATIF, Agent Trace, ADP, OTel) if applicable
|
|
48
|
+
|
|
49
|
+
Breaking changes (field renames, removals, type changes) require a major version bump.
|
|
50
|
+
New optional fields and models are minor bumps. See [VERSION-POLICY.md](VERSION-POLICY.md)
|
|
51
|
+
for details, including the pre-1.0 exception under which 0.x minor bumps may contain breaking changes.
|
|
52
|
+
|
|
53
|
+
## Documentation
|
|
54
|
+
|
|
55
|
+
- [CHANGELOG.md](CHANGELOG.md) - What changed in each version
|
|
56
|
+
- [VERSION-POLICY.md](VERSION-POLICY.md) - What version numbers mean for a schema package
|
|
57
|
+
- [RATIONALE-0.1.0.md](RATIONALE-0.1.0.md) - Design rationale for v0.1.0
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Version Policy
|
|
2
|
+
|
|
3
|
+
opentraces-schema follows Semantic Versioning (semver) with schema-specific semantics.
|
|
4
|
+
|
|
5
|
+
## What the version numbers mean
|
|
6
|
+
|
|
7
|
+
- **MAJOR** (X.0.0): Breaking changes to existing fields. Renaming, removing, or
|
|
8
|
+
changing the type of an existing field. Consumers must update parsers.
|
|
9
|
+
- **MINOR** (0.X.0): New optional fields, new models, new enum values added to
|
|
10
|
+
existing Literal types. Existing parsers continue to work without changes.
|
|
11
|
+
- **PATCH** (0.0.X): Docstring fixes, validation constraint adjustments that do
|
|
12
|
+
not change the serialized format, bug fixes in computed fields.
|
|
13
|
+
|
|
14
|
+
## Pre-1.0 stability
|
|
15
|
+
|
|
16
|
+
During 0.x development, MINOR bumps may include breaking changes. The schema is
|
|
17
|
+
not yet stable. Pin to exact versions (`opentraces-schema==0.1.0`) rather than
|
|
18
|
+
ranges until 1.0.
|
|
19
|
+
|
|
20
|
+
## Where the version lives
|
|
21
|
+
|
|
22
|
+
The single source of truth is `src/opentraces_schema/version.py`. The `SCHEMA_VERSION`
|
|
23
|
+
constant is used by:
|
|
24
|
+
|
|
25
|
+
- `pyproject.toml` via hatch dynamic versioning
|
|
26
|
+
- `TraceRecord.schema_version` default
|
|
27
|
+
- `Attribution.version` default
|
|
28
|
+
|
|
29
|
+
Note: `TraceRecord.schema_version` and `Attribution.version` are currently coupled
|
|
30
|
+
to the same `SCHEMA_VERSION`. If the attribution spec needs to version independently
|
|
31
|
+
in the future, this coupling can be broken by introducing a separate constant.
|
|
32
|
+
|
|
33
|
+
## Bump checklist
|
|
34
|
+
|
|
35
|
+
1. Update `SCHEMA_VERSION` in `src/opentraces_schema/version.py`
|
|
36
|
+
2. Add an entry to `CHANGELOG.md` under `[Unreleased]`, then move it under the new version header
|
|
37
|
+
3. Create `RATIONALE-{VERSION}.md` documenting design decisions for the new version
|
|
38
|
+
4. Link the rationale file from the CHANGELOG entry
|
|
39
|
+
5. Tag the commit: `git tag schema-v{VERSION}`
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "opentraces-schema"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Schema models for the opentraces.ai agent trace JSONL format"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
dependencies = [
|
|
13
|
+
"pydantic>=2.0",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[project.urls]
|
|
17
|
+
Homepage = "https://opentraces.ai"
|
|
18
|
+
Repository = "https://github.com/JayFarei/opentraces"
|
|
19
|
+
|
|
20
|
+
[tool.hatch.version]
|
|
21
|
+
path = "src/opentraces_schema/version.py"
|
|
22
|
+
pattern = "SCHEMA_VERSION = \"(?P<version>[^\"]+)\""
|
|
23
|
+
|
|
24
|
+
[tool.hatch.build.targets.wheel]
|
|
25
|
+
packages = ["src/opentraces_schema"]
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""opentraces-schema: Pydantic models for the opentraces.ai JSONL trace format."""
|
|
2
|
+
|
|
3
|
+
from .models import (
|
|
4
|
+
Agent,
|
|
5
|
+
Attribution,
|
|
6
|
+
AttributionConversation,
|
|
7
|
+
AttributionFile,
|
|
8
|
+
AttributionRange,
|
|
9
|
+
Environment,
|
|
10
|
+
Metrics,
|
|
11
|
+
Observation,
|
|
12
|
+
Outcome,
|
|
13
|
+
SecurityMetadata,
|
|
14
|
+
Snippet,
|
|
15
|
+
Step,
|
|
16
|
+
Task,
|
|
17
|
+
TokenUsage,
|
|
18
|
+
ToolCall,
|
|
19
|
+
TraceRecord,
|
|
20
|
+
VCS,
|
|
21
|
+
)
|
|
22
|
+
from .version import SCHEMA_VERSION
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"Agent",
|
|
26
|
+
"Attribution",
|
|
27
|
+
"AttributionConversation",
|
|
28
|
+
"AttributionFile",
|
|
29
|
+
"AttributionRange",
|
|
30
|
+
"Environment",
|
|
31
|
+
"Metrics",
|
|
32
|
+
"Observation",
|
|
33
|
+
"Outcome",
|
|
34
|
+
"SCHEMA_VERSION",
|
|
35
|
+
"SecurityMetadata",
|
|
36
|
+
"Snippet",
|
|
37
|
+
"Step",
|
|
38
|
+
"Task",
|
|
39
|
+
"TokenUsage",
|
|
40
|
+
"ToolCall",
|
|
41
|
+
"TraceRecord",
|
|
42
|
+
"VCS",
|
|
43
|
+
]
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
"""Pydantic v2 models for the opentraces.ai JSONL trace schema.
|
|
2
|
+
|
|
3
|
+
This module defines the complete schema for enriched agent session traces.
|
|
4
|
+
Each TraceRecord represents one complete agent session or task unit.
|
|
5
|
+
|
|
6
|
+
The schema is informed by ATIF v1.6, ADP, Agent Trace spec, and field patterns
|
|
7
|
+
found in existing HF datasets (nlile, Nebius, CoderForge).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import hashlib
|
|
13
|
+
import json
|
|
14
|
+
from typing import Any, Literal
|
|
15
|
+
|
|
16
|
+
from pydantic import BaseModel, Field
|
|
17
|
+
|
|
18
|
+
from .version import SCHEMA_VERSION
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Task(BaseModel):
    """Task-level metadata used for filtering and grouping traces.

    Every field is optional and defaults to ``None``.
    """

    description: str | None = None
    # Origin of the task; the field description lists the expected values.
    source: str | None = Field(
        default=None,
        description="user_prompt, cli_arg, skill, etc.",
    )
    # Documented as owner/repo; no format validation is applied here.
    repository: str | None = Field(default=None, description="owner/repo format")
    base_commit: str | None = None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class Agent(BaseModel):
    """Identity of the agent that produced a trace.

    ``name`` is the only required field. ``model`` is documented as using
    the provider/model-name convention.
    """

    # Required: no default is supplied.
    name: str = Field(
        ...,
        description="Agent identifier: claude-code, cursor, codex, etc.",
    )
    version: str | None = None
    model: str | None = Field(
        default=None,
        description="provider/model-name, e.g. anthropic/claude-sonnet-4-20250514",
    )
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class VCS(BaseModel):
    """Version-control metadata for the session.

    ``type`` is ``'none'`` when the session did not run inside a git repo,
    which is also the default.
    """

    type: Literal["git", "none"] = "none"
    base_commit: str | None = None
    branch: str | None = None
    diff: str | None = Field(
        default=None,
        description="Unified diff string or null",
    )
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class Environment(BaseModel):
    """Runtime environment details, kept for filtering and reproducibility."""

    os: str | None = None
    shell: str | None = None
    # A fresh VCS() (type="none") is created per instance when omitted.
    vcs: VCS = Field(default_factory=VCS)
    # Empty list by default; each instance gets its own list.
    language_ecosystem: list[str] = Field(default_factory=list)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class ToolCall(BaseModel):
    """One tool invocation issued within a step.

    ``tool_call_id`` and ``tool_name`` are required.
    """

    tool_call_id: str
    tool_name: str
    # Arguments passed to the tool; defaults to a new empty dict.
    input: dict[str, Any] = Field(default_factory=dict)
    duration_ms: int | None = None
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class Observation(BaseModel):
    """Result of a tool call.

    ``source_call_id`` (required) links the observation back to the call
    that produced it.
    """

    source_call_id: str
    content: str | None = None
    output_summary: str | None = Field(
        default=None,
        description="Lightweight preview of tool result",
    )
    error: str | None = Field(
        default=None,
        description="Error info, e.g. 'no_result' for dangling tool calls",
    )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class Snippet(BaseModel):
    """A code block pulled out of tool results or agent responses.

    Only ``file_path`` is required; the remaining fields default to ``None``.
    """

    file_path: str
    start_line: int | None = None
    end_line: int | None = None
    language: str | None = None
    text: str | None = None
    source_step: int | None = Field(
        default=None,
        description="Step index that produced this snippet",
    )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class TokenUsage(BaseModel):
    """Token counts for a single step, used for cost and efficiency analysis.

    Every counter defaults to ``0``.
    """

    input_tokens: int = 0
    output_tokens: int = 0

    # Cache- and prefix-related counters.
    cache_read_tokens: int = 0
    cache_write_tokens: int = 0
    prefix_reuse_tokens: int = 0
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class Step(BaseModel):
    """One LLM API call (request plus response) in the TAO loop.

    A step is a single thought-action-observation cycle rather than a
    conversational turn, in line with ATIF's step-based model.
    ``step_index`` and ``role`` are required; everything else has a default.
    """

    step_index: int
    role: Literal["system", "user", "agent"]
    content: str | None = None
    reasoning_content: str | None = Field(
        default=None,
        description="Chain-of-thought / extended thinking",
    )
    model: str | None = None
    system_prompt_hash: str | None = Field(
        default=None,
        description="Key into top-level system_prompts map",
    )

    # Sub-agent hierarchy and call classification.
    agent_role: str | None = Field(
        default=None,
        description="main, explore, plan, etc.",
    )
    parent_step: int | None = Field(
        default=None,
        description="Step index of parent for sub-agent hierarchy",
    )
    call_type: Literal["main", "subagent", "warmup"] | None = None
    subagent_trajectory_ref: str | None = Field(
        default=None,
        description="Session ID of sub-agent trajectory",
    )

    # Collections default to new empty lists per instance.
    tools_available: list[str] = Field(default_factory=list)
    tool_calls: list[ToolCall] = Field(default_factory=list)
    observations: list[Observation] = Field(default_factory=list)
    snippets: list[Snippet] = Field(default_factory=list)

    # Defaults to a TokenUsage() with all counters at zero.
    token_usage: TokenUsage = Field(default_factory=TokenUsage)
    timestamp: str | None = None
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class Outcome(BaseModel):
    """Outcome signals for a session, intended for RL / reward modeling.

    ``signal_confidence`` states how trustworthy the signal is:

    - ``derived``: deterministic extraction (e.g. committed from git)
    - ``inferred``: heuristic-based (e.g. success from test output patterns)
    - ``annotated``: human or CI annotation
    """

    success: bool | None = None
    signal_source: str = "deterministic"
    signal_confidence: Literal["derived", "inferred", "annotated"] = "derived"
    description: str | None = None
    patch: str | None = Field(
        default=None,
        description="Unified diff produced by the session",
    )
    committed: bool = False
    commit_sha: str | None = None
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
class AttributionRange(BaseModel):
    """A span of lines attributed to an agent conversation.

    ``start_line`` and ``end_line`` are required; no ordering between them
    is validated here.
    """

    start_line: int
    end_line: int
    content_hash: str | None = Field(
        default=None,
        description="murmur3 hash for cross-refactor tracking",
    )
    confidence: Literal["high", "medium", "low"] | None = None
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class AttributionConversation(BaseModel):
    """Ties attributed code ranges to the conversation that produced them."""

    # String-to-string mapping describing the contributor; empty by default.
    contributor: dict[str, str] = Field(
        description="e.g. {type: 'ai', model_id: 'anthropic/claude-sonnet-4-20250514'}",
        default_factory=dict,
    )
    url: str | None = Field(
        default=None,
        description="opentraces://trace_id/step_N",
    )
    ranges: list[AttributionRange] = Field(default_factory=list)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class AttributionFile(BaseModel):
    """Attribution data scoped to one file.

    ``path`` is required; ``conversations`` defaults to a new empty list.
    """

    path: str
    conversations: list[AttributionConversation] = Field(default_factory=list)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
class Attribution(BaseModel):
    """Embedded attribution block compatible with the Agent Trace spec.

    Connects the trajectory (process) to the attribution (output) by
    recording which files and line ranges the agent session produced.

    Marked experimental in v0.1 - confidence varies by session complexity.
    """

    # Defaults to the package-wide SCHEMA_VERSION constant.
    version: str = SCHEMA_VERSION
    experimental: bool = True
    files: list[AttributionFile] = Field(default_factory=list)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class Metrics(BaseModel):
    """Session-level aggregates for analytics and cost modeling."""

    # Counters default to zero.
    total_steps: int = 0
    total_input_tokens: int = 0
    total_output_tokens: int = 0

    total_duration_s: float | None = None
    # When provided, validated to lie within the closed interval [0.0, 1.0].
    cache_hit_rate: float | None = Field(default=None, ge=0.0, le=1.0)
    estimated_cost_usd: float | None = None
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class SecurityMetadata(BaseModel):
    """Summary of the security processing applied to a trace.

    Records whether the trace was scanned and how many items were
    flagged/redacted. Defaults describe an unscanned trace.
    """

    scanned: bool = False
    flags_reviewed: int = 0
    redactions_applied: int = 0
    classifier_version: str | None = None
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
class TraceRecord(BaseModel):
    """Top-level record for one complete agent session trace.

    A JSONL file holds one TraceRecord per line. The schema joins
    trajectory data (ATIF/ADP) with code attribution (Agent Trace spec),
    so a single record captures both process and output.

    ``trace_id``, ``session_id`` and ``agent`` are required; every other
    field has a default.
    """

    # Defaults to the package-wide SCHEMA_VERSION constant.
    schema_version: str = SCHEMA_VERSION
    trace_id: str
    session_id: str
    # Left as None until to_jsonl_line() (or the caller) fills it in.
    content_hash: str | None = None
    timestamp_start: str | None = None
    timestamp_end: str | None = None

    task: Task = Field(default_factory=Task)
    agent: Agent
    environment: Environment = Field(default_factory=Environment)
    system_prompts: dict[str, str] = Field(
        description="Deduplicated system prompts keyed by hash",
        default_factory=dict,
    )
    tool_definitions: list[dict[str, Any]] = Field(default_factory=list)
    steps: list[Step] = Field(default_factory=list)
    outcome: Outcome = Field(default_factory=Outcome)
    dependencies: list[str] = Field(default_factory=list)
    metrics: Metrics = Field(default_factory=Metrics)
    security: SecurityMetadata = Field(default_factory=SecurityMetadata)
    attribution: Attribution | None = None
    metadata: dict[str, Any] = Field(default_factory=dict)

    def compute_content_hash(self) -> str:
        """Return the SHA-256 hex digest of the trace content.

        ``content_hash`` and ``trace_id`` are left out of the hashed
        payload, so re-parsing identical content yields the same digest
        no matter which random UUID was assigned. Used for deduplication.
        """
        payload = self.model_dump(exclude={"trace_id", "content_hash"})
        # sort_keys gives a key-order-independent serialization; default=str
        # stringifies any value json cannot encode natively.
        canonical = json.dumps(payload, sort_keys=True, default=str)
        digest = hashlib.sha256(canonical.encode())
        return digest.hexdigest()

    def to_jsonl_line(self) -> str:
        """Serialize the record to a single JSONL line.

        The content hash is recomputed and stored on ``self.content_hash``
        before serialization, so the instance is updated as a side effect.
        """
        fresh_hash = self.compute_content_hash()
        self.content_hash = fresh_hash
        return self.model_dump_json()
|