@alis-build/harness-eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +700 -0
- package/dist/adapters/claude-code/index.d.ts +3 -0
- package/dist/adapters/claude-code/index.js +2 -0
- package/dist/build-DsVJ_UeU.js +1396 -0
- package/dist/build-DsVJ_UeU.js.map +1 -0
- package/dist/cardinality-DlE44e-4.js +31 -0
- package/dist/cardinality-DlE44e-4.js.map +1 -0
- package/dist/claude-code-ycT0JQZF.js +563 -0
- package/dist/claude-code-ycT0JQZF.js.map +1 -0
- package/dist/cli/bin.d.ts +1 -0
- package/dist/cli/bin.js +623 -0
- package/dist/cli/bin.js.map +1 -0
- package/dist/config/loader.d.ts +2 -0
- package/dist/config/loader.js +2 -0
- package/dist/index-6Z17eKZx.d.ts +72 -0
- package/dist/index.d.ts +725 -0
- package/dist/index.js +5 -0
- package/dist/loader-BCnFJ8rm.js +717 -0
- package/dist/loader-BCnFJ8rm.js.map +1 -0
- package/dist/loader-DTvoVfN0.d.ts +33 -0
- package/dist/rolldown-runtime-D7D4PA-g.js +13 -0
- package/dist/runner/suite.d.ts +2 -0
- package/dist/runner/suite.js +2 -0
- package/dist/suite-BoOvK_lq.d.ts +7 -0
- package/dist/suite-chj0j22j.js +684 -0
- package/dist/suite-chj0j22j.js.map +1 -0
- package/dist/types-B9H4IZtA.d.ts +305 -0
- package/dist/types-BQol062t.d.ts +292 -0
- package/package.json +74 -0
- package/schemas/eval-interchange-agent-trace.schema.json +322 -0
- package/schemas/eval-interchange-proto-instance.schema.json +106 -0
- package/schemas/eval-interchange.schema.json +140 -0
- package/schemas/eval-run-envelope.schema.json +2195 -0
- package/schemas/trajectory-view.schema.json +441 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
+
"$id": "https://raw.githubusercontent.com/alis-build/harness-eval-ts/main/schemas/eval-interchange.schema.json#ProtoTrajectoryInstance",
|
|
4
|
+
"title": "ProtoTrajectoryInstance",
|
|
5
|
+
"description": "Proto-compatible evaluation instance with JSON-string tool_input.",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"predicted_trajectory": {
|
|
9
|
+
"$ref": "#/$defs/__schema0"
|
|
10
|
+
},
|
|
11
|
+
"reference_trajectory": {
|
|
12
|
+
"$ref": "#/$defs/__schema4"
|
|
13
|
+
},
|
|
14
|
+
"prompt": {
|
|
15
|
+
"$ref": "#/$defs/__schema5"
|
|
16
|
+
},
|
|
17
|
+
"response": {
|
|
18
|
+
"$ref": "#/$defs/__schema7"
|
|
19
|
+
},
|
|
20
|
+
"reference": {
|
|
21
|
+
"$ref": "#/$defs/__schema9"
|
|
22
|
+
}
|
|
23
|
+
},
|
|
24
|
+
"required": [
|
|
25
|
+
"predicted_trajectory"
|
|
26
|
+
],
|
|
27
|
+
"additionalProperties": false,
|
|
28
|
+
"$defs": {
|
|
29
|
+
"__schema0": {
|
|
30
|
+
"description": "Predicted trajectory in wire format.",
|
|
31
|
+
"$ref": "#/$defs/InterchangeTrajectory"
|
|
32
|
+
},
|
|
33
|
+
"__schema1": {
|
|
34
|
+
"type": "array",
|
|
35
|
+
"items": {
|
|
36
|
+
"$ref": "#/$defs/InterchangeToolCall"
|
|
37
|
+
},
|
|
38
|
+
"description": "Ordered tool calls in the trajectory."
|
|
39
|
+
},
|
|
40
|
+
"InterchangeToolCall": {
|
|
41
|
+
"type": "object",
|
|
42
|
+
"properties": {
|
|
43
|
+
"tool_name": {
|
|
44
|
+
"$ref": "#/$defs/__schema2"
|
|
45
|
+
},
|
|
46
|
+
"tool_input": {
|
|
47
|
+
"$ref": "#/$defs/__schema3"
|
|
48
|
+
}
|
|
49
|
+
},
|
|
50
|
+
"required": [
|
|
51
|
+
"tool_name",
|
|
52
|
+
"tool_input"
|
|
53
|
+
],
|
|
54
|
+
"additionalProperties": false,
|
|
55
|
+
"title": "InterchangeToolCall",
|
|
56
|
+
"description": "Tool call in interchange wire format."
|
|
57
|
+
},
|
|
58
|
+
"__schema2": {
|
|
59
|
+
"type": "string",
|
|
60
|
+
"description": "Tool name as emitted by the agent."
|
|
61
|
+
},
|
|
62
|
+
"__schema3": {
|
|
63
|
+
"type": "string",
|
|
64
|
+
"description": "JSON-serialized tool arguments (wire format)."
|
|
65
|
+
},
|
|
66
|
+
"InterchangeTrajectory": {
|
|
67
|
+
"type": "object",
|
|
68
|
+
"properties": {
|
|
69
|
+
"tool_calls": {
|
|
70
|
+
"$ref": "#/$defs/__schema1"
|
|
71
|
+
}
|
|
72
|
+
},
|
|
73
|
+
"required": [
|
|
74
|
+
"tool_calls"
|
|
75
|
+
],
|
|
76
|
+
"additionalProperties": false,
|
|
77
|
+
"title": "InterchangeTrajectory",
|
|
78
|
+
"description": "Ordered sequence of tool calls."
|
|
79
|
+
},
|
|
80
|
+
"__schema4": {
|
|
81
|
+
"description": "Reference trajectory in wire format.",
|
|
82
|
+
"$ref": "#/$defs/InterchangeTrajectory"
|
|
83
|
+
},
|
|
84
|
+
"__schema5": {
|
|
85
|
+
"description": "Eval prompt.",
|
|
86
|
+
"$ref": "#/$defs/__schema6"
|
|
87
|
+
},
|
|
88
|
+
"__schema6": {
|
|
89
|
+
"type": "string"
|
|
90
|
+
},
|
|
91
|
+
"__schema7": {
|
|
92
|
+
"description": "Final response.",
|
|
93
|
+
"$ref": "#/$defs/__schema8"
|
|
94
|
+
},
|
|
95
|
+
"__schema8": {
|
|
96
|
+
"type": "string"
|
|
97
|
+
},
|
|
98
|
+
"__schema9": {
|
|
99
|
+
"description": "Reference answer text.",
|
|
100
|
+
"$ref": "#/$defs/__schema10"
|
|
101
|
+
},
|
|
102
|
+
"__schema10": {
|
|
103
|
+
"type": "string"
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
}
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
+
"$id": "https://raw.githubusercontent.com/alis-build/harness-eval-ts/main/schemas/eval-interchange.schema.json",
|
|
4
|
+
"title": "EvalDatasetRow",
|
|
5
|
+
"description": "Flattened row for tabular or JSONL dataset consumption.",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"prompt": {
|
|
9
|
+
"$ref": "#/$defs/__schema0"
|
|
10
|
+
},
|
|
11
|
+
"response": {
|
|
12
|
+
"$ref": "#/$defs/__schema2"
|
|
13
|
+
},
|
|
14
|
+
"reference": {
|
|
15
|
+
"$ref": "#/$defs/__schema4"
|
|
16
|
+
},
|
|
17
|
+
"predicted_trajectory": {
|
|
18
|
+
"$ref": "#/$defs/__schema6"
|
|
19
|
+
},
|
|
20
|
+
"reference_trajectory": {
|
|
21
|
+
"$ref": "#/$defs/__schema9"
|
|
22
|
+
},
|
|
23
|
+
"latency_in_seconds": {
|
|
24
|
+
"$ref": "#/$defs/__schema11"
|
|
25
|
+
},
|
|
26
|
+
"failure": {
|
|
27
|
+
"$ref": "#/$defs/__schema12"
|
|
28
|
+
},
|
|
29
|
+
"human_ratings": {
|
|
30
|
+
"$ref": "#/$defs/__schema15"
|
|
31
|
+
}
|
|
32
|
+
},
|
|
33
|
+
"required": [
|
|
34
|
+
"predicted_trajectory",
|
|
35
|
+
"latency_in_seconds",
|
|
36
|
+
"failure"
|
|
37
|
+
],
|
|
38
|
+
"additionalProperties": false,
|
|
39
|
+
"$defs": {
|
|
40
|
+
"__schema0": {
|
|
41
|
+
"description": "Eval prompt sent to the agent.",
|
|
42
|
+
"$ref": "#/$defs/__schema1"
|
|
43
|
+
},
|
|
44
|
+
"__schema1": {
|
|
45
|
+
"type": "string"
|
|
46
|
+
},
|
|
47
|
+
"__schema2": {
|
|
48
|
+
"description": "Final agent response text.",
|
|
49
|
+
"$ref": "#/$defs/__schema3"
|
|
50
|
+
},
|
|
51
|
+
"__schema3": {
|
|
52
|
+
"type": "string"
|
|
53
|
+
},
|
|
54
|
+
"__schema4": {
|
|
55
|
+
"description": "Reference answer text when provided.",
|
|
56
|
+
"$ref": "#/$defs/__schema5"
|
|
57
|
+
},
|
|
58
|
+
"__schema5": {
|
|
59
|
+
"type": "string"
|
|
60
|
+
},
|
|
61
|
+
"__schema6": {
|
|
62
|
+
"type": "array",
|
|
63
|
+
"items": {
|
|
64
|
+
"$ref": "#/$defs/TabularToolCall"
|
|
65
|
+
},
|
|
66
|
+
"description": "Predicted tool-call trajectory with structured tool_input."
|
|
67
|
+
},
|
|
68
|
+
"TabularToolCall": {
|
|
69
|
+
"type": "object",
|
|
70
|
+
"properties": {
|
|
71
|
+
"tool_name": {
|
|
72
|
+
"$ref": "#/$defs/__schema7"
|
|
73
|
+
},
|
|
74
|
+
"tool_input": {
|
|
75
|
+
"$ref": "#/$defs/__schema8"
|
|
76
|
+
}
|
|
77
|
+
},
|
|
78
|
+
"required": [
|
|
79
|
+
"tool_name",
|
|
80
|
+
"tool_input"
|
|
81
|
+
],
|
|
82
|
+
"additionalProperties": false,
|
|
83
|
+
"title": "TabularToolCall",
|
|
84
|
+
"description": "Tool call with structured tool_input for JSONL/tabular export."
|
|
85
|
+
},
|
|
86
|
+
"__schema7": {
|
|
87
|
+
"type": "string",
|
|
88
|
+
"description": "Tool name as emitted by the agent."
|
|
89
|
+
},
|
|
90
|
+
"__schema8": {
|
|
91
|
+
"description": "Tool arguments as a structured object for tabular consumption."
|
|
92
|
+
},
|
|
93
|
+
"__schema9": {
|
|
94
|
+
"description": "Reference tool-call trajectory when provided.",
|
|
95
|
+
"$ref": "#/$defs/__schema10"
|
|
96
|
+
},
|
|
97
|
+
"__schema10": {
|
|
98
|
+
"type": "array",
|
|
99
|
+
"items": {
|
|
100
|
+
"$ref": "#/$defs/TabularToolCall"
|
|
101
|
+
}
|
|
102
|
+
},
|
|
103
|
+
"__schema11": {
|
|
104
|
+
"type": "number",
|
|
105
|
+
"description": "Session latency in seconds."
|
|
106
|
+
},
|
|
107
|
+
"__schema12": {
|
|
108
|
+
"anyOf": [
|
|
109
|
+
{
|
|
110
|
+
"$ref": "#/$defs/__schema13"
|
|
111
|
+
},
|
|
112
|
+
{
|
|
113
|
+
"$ref": "#/$defs/__schema14"
|
|
114
|
+
}
|
|
115
|
+
],
|
|
116
|
+
"description": "1 when the harness run failed, 0 on success."
|
|
117
|
+
},
|
|
118
|
+
"__schema13": {
|
|
119
|
+
"type": "number",
|
|
120
|
+
"const": 0
|
|
121
|
+
},
|
|
122
|
+
"__schema14": {
|
|
123
|
+
"type": "number",
|
|
124
|
+
"const": 1
|
|
125
|
+
},
|
|
126
|
+
"__schema15": {
|
|
127
|
+
"description": "Human ratings keyed by metric name for judge calibration.",
|
|
128
|
+
"$ref": "#/$defs/__schema16"
|
|
129
|
+
},
|
|
130
|
+
"__schema16": {
|
|
131
|
+
"type": "object",
|
|
132
|
+
"propertyNames": {
|
|
133
|
+
"type": "string"
|
|
134
|
+
},
|
|
135
|
+
"additionalProperties": {
|
|
136
|
+
"type": "number"
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
}
|