@alis-build/harness-eval 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -4
- package/dist/adapters/claude-code/index.d.ts +1 -1
- package/dist/adapters/claude-code/index.js +1 -1
- package/dist/{claude-code-ycT0JQZF.js → claude-code-DZ4Vkgp6.js} +35 -6
- package/dist/{claude-code-ycT0JQZF.js.map → claude-code-DZ4Vkgp6.js.map} +1 -1
- package/dist/cli/bin.js +109 -12
- package/dist/cli/bin.js.map +1 -1
- package/dist/config/loader.d.ts +1 -1
- package/dist/config/loader.js +1 -1
- package/dist/{index-6Z17eKZx.d.ts → index-V22PrR0p.d.ts} +2 -1
- package/dist/index.d.ts +270 -152
- package/dist/index.js +124 -5
- package/dist/index.js.map +1 -0
- package/dist/{loader-DTvoVfN0.d.ts → loader-C9yQHUPC.d.ts} +19 -2
- package/dist/{loader-BCnFJ8rm.js → loader-DcI0KfRX.js} +291 -4
- package/dist/loader-DcI0KfRX.js.map +1 -0
- package/dist/{build-DsVJ_UeU.js → projections-BcX7w-f6.js} +486 -243
- package/dist/projections-BcX7w-f6.js.map +1 -0
- package/dist/runner/suite.d.ts +1 -1
- package/dist/runner/suite.js +1 -1
- package/dist/{suite-BoOvK_lq.d.ts → suite-DPJMIEbu.d.ts} +7 -2
- package/dist/{suite-chj0j22j.js → suite-Dlzl-HI0.js} +58 -4
- package/dist/suite-Dlzl-HI0.js.map +1 -0
- package/dist/{types-BQol062t.d.ts → types-CD3TwOtZ.d.ts} +151 -10
- package/package.json +4 -2
- package/schemas/eval-interchange-instances.schema.json +196 -0
- package/schemas/eval-interchange.schema.json +65 -52
- package/schemas/eval-run-envelope.schema.json +182 -425
- package/dist/build-DsVJ_UeU.js.map +0 -1
- package/dist/loader-BCnFJ8rm.js.map +0 -1
- package/dist/suite-chj0j22j.js.map +0 -1
- package/schemas/eval-interchange-agent-trace.schema.json +0 -322
- package/schemas/eval-interchange-proto-instance.schema.json +0 -106
|
@@ -2,132 +2,145 @@
|
|
|
2
2
|
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
3
|
"$id": "https://raw.githubusercontent.com/alis-build/harness-eval-ts/main/schemas/eval-interchange.schema.json",
|
|
4
4
|
"title": "EvalDatasetRow",
|
|
5
|
-
"description": "Flattened row for
|
|
5
|
+
"description": "Flattened row for trajectory projection JSONL.",
|
|
6
6
|
"type": "object",
|
|
7
7
|
"properties": {
|
|
8
|
-
"
|
|
8
|
+
"caseId": {
|
|
9
9
|
"$ref": "#/$defs/__schema0"
|
|
10
10
|
},
|
|
11
|
-
"
|
|
11
|
+
"repetitionIndex": {
|
|
12
|
+
"$ref": "#/$defs/__schema1"
|
|
13
|
+
},
|
|
14
|
+
"prompt": {
|
|
12
15
|
"$ref": "#/$defs/__schema2"
|
|
13
16
|
},
|
|
14
|
-
"
|
|
17
|
+
"response": {
|
|
15
18
|
"$ref": "#/$defs/__schema4"
|
|
16
19
|
},
|
|
17
|
-
"
|
|
20
|
+
"evaluationInstance": {
|
|
18
21
|
"$ref": "#/$defs/__schema6"
|
|
19
22
|
},
|
|
20
|
-
"
|
|
21
|
-
"$ref": "#/$defs/
|
|
22
|
-
},
|
|
23
|
-
"latency_in_seconds": {
|
|
24
|
-
"$ref": "#/$defs/__schema11"
|
|
23
|
+
"latencySeconds": {
|
|
24
|
+
"$ref": "#/$defs/__schema12"
|
|
25
25
|
},
|
|
26
26
|
"failure": {
|
|
27
|
-
"$ref": "#/$defs/
|
|
27
|
+
"$ref": "#/$defs/__schema13"
|
|
28
28
|
},
|
|
29
|
-
"
|
|
30
|
-
"$ref": "#/$defs/
|
|
29
|
+
"humanRatings": {
|
|
30
|
+
"$ref": "#/$defs/__schema16"
|
|
31
31
|
}
|
|
32
32
|
},
|
|
33
33
|
"required": [
|
|
34
|
-
"
|
|
35
|
-
"
|
|
34
|
+
"caseId",
|
|
35
|
+
"repetitionIndex",
|
|
36
|
+
"latencySeconds",
|
|
36
37
|
"failure"
|
|
37
38
|
],
|
|
38
39
|
"additionalProperties": false,
|
|
39
40
|
"$defs": {
|
|
40
41
|
"__schema0": {
|
|
41
|
-
"
|
|
42
|
-
"
|
|
42
|
+
"type": "string",
|
|
43
|
+
"description": "Test case id."
|
|
43
44
|
},
|
|
44
45
|
"__schema1": {
|
|
45
|
-
"type": "
|
|
46
|
+
"type": "integer",
|
|
47
|
+
"minimum": -9007199254740991,
|
|
48
|
+
"maximum": 9007199254740991,
|
|
49
|
+
"description": "Repetition index."
|
|
46
50
|
},
|
|
47
51
|
"__schema2": {
|
|
48
|
-
"description": "
|
|
52
|
+
"description": "Eval prompt sent to the agent.",
|
|
49
53
|
"$ref": "#/$defs/__schema3"
|
|
50
54
|
},
|
|
51
55
|
"__schema3": {
|
|
52
56
|
"type": "string"
|
|
53
57
|
},
|
|
54
58
|
"__schema4": {
|
|
55
|
-
"description": "
|
|
59
|
+
"description": "Final agent response text.",
|
|
56
60
|
"$ref": "#/$defs/__schema5"
|
|
57
61
|
},
|
|
58
62
|
"__schema5": {
|
|
59
63
|
"type": "string"
|
|
60
64
|
},
|
|
61
65
|
"__schema6": {
|
|
62
|
-
"
|
|
63
|
-
"
|
|
64
|
-
"$ref": "#/$defs/TabularToolCall"
|
|
65
|
-
},
|
|
66
|
-
"description": "Predicted tool-call trajectory with structured tool_input."
|
|
66
|
+
"description": "Vertex EvaluationInstance wire object.",
|
|
67
|
+
"$ref": "#/$defs/EvaluationInstanceJson"
|
|
67
68
|
},
|
|
68
|
-
"
|
|
69
|
+
"EvaluationInstanceJson": {
|
|
69
70
|
"type": "object",
|
|
70
71
|
"properties": {
|
|
71
|
-
"
|
|
72
|
+
"prompt": {
|
|
72
73
|
"$ref": "#/$defs/__schema7"
|
|
73
74
|
},
|
|
74
|
-
"
|
|
75
|
-
"$ref": "#/$defs/
|
|
75
|
+
"response": {
|
|
76
|
+
"$ref": "#/$defs/__schema10"
|
|
77
|
+
},
|
|
78
|
+
"reference": {
|
|
79
|
+
"$ref": "#/$defs/__schema11"
|
|
76
80
|
}
|
|
77
81
|
},
|
|
78
|
-
"required": [
|
|
79
|
-
"tool_name",
|
|
80
|
-
"tool_input"
|
|
81
|
-
],
|
|
82
82
|
"additionalProperties": false,
|
|
83
|
-
"title": "
|
|
84
|
-
"description": "
|
|
83
|
+
"title": "EvaluationInstanceJson",
|
|
84
|
+
"description": "Vertex EvaluationInstance wire format (agentEvalData omitted in v1)."
|
|
85
85
|
},
|
|
86
86
|
"__schema7": {
|
|
87
|
-
"
|
|
88
|
-
"
|
|
87
|
+
"description": "Eval prompt.",
|
|
88
|
+
"$ref": "#/$defs/InstanceData"
|
|
89
|
+
},
|
|
90
|
+
"InstanceData": {
|
|
91
|
+
"type": "object",
|
|
92
|
+
"properties": {
|
|
93
|
+
"text": {
|
|
94
|
+
"$ref": "#/$defs/__schema8"
|
|
95
|
+
}
|
|
96
|
+
},
|
|
97
|
+
"additionalProperties": false,
|
|
98
|
+
"title": "InstanceData",
|
|
99
|
+
"description": "EvaluationInstance prompt/response/reference text wrapper."
|
|
89
100
|
},
|
|
90
101
|
"__schema8": {
|
|
91
|
-
"description": "
|
|
102
|
+
"description": "Plain text instance data.",
|
|
103
|
+
"$ref": "#/$defs/__schema9"
|
|
92
104
|
},
|
|
93
105
|
"__schema9": {
|
|
94
|
-
"
|
|
95
|
-
"$ref": "#/$defs/__schema10"
|
|
106
|
+
"type": "string"
|
|
96
107
|
},
|
|
97
108
|
"__schema10": {
|
|
98
|
-
"
|
|
99
|
-
"
|
|
100
|
-
"$ref": "#/$defs/TabularToolCall"
|
|
101
|
-
}
|
|
109
|
+
"description": "Final agent response.",
|
|
110
|
+
"$ref": "#/$defs/InstanceData"
|
|
102
111
|
},
|
|
103
112
|
"__schema11": {
|
|
113
|
+
"description": "Reference answer text.",
|
|
114
|
+
"$ref": "#/$defs/InstanceData"
|
|
115
|
+
},
|
|
116
|
+
"__schema12": {
|
|
104
117
|
"type": "number",
|
|
105
118
|
"description": "Session latency in seconds."
|
|
106
119
|
},
|
|
107
|
-
"
|
|
120
|
+
"__schema13": {
|
|
108
121
|
"anyOf": [
|
|
109
122
|
{
|
|
110
|
-
"$ref": "#/$defs/
|
|
123
|
+
"$ref": "#/$defs/__schema14"
|
|
111
124
|
},
|
|
112
125
|
{
|
|
113
|
-
"$ref": "#/$defs/
|
|
126
|
+
"$ref": "#/$defs/__schema15"
|
|
114
127
|
}
|
|
115
128
|
],
|
|
116
129
|
"description": "1 when the harness run failed, 0 on success."
|
|
117
130
|
},
|
|
118
|
-
"
|
|
131
|
+
"__schema14": {
|
|
119
132
|
"type": "number",
|
|
120
133
|
"const": 0
|
|
121
134
|
},
|
|
122
|
-
"
|
|
135
|
+
"__schema15": {
|
|
123
136
|
"type": "number",
|
|
124
137
|
"const": 1
|
|
125
138
|
},
|
|
126
|
-
"
|
|
139
|
+
"__schema16": {
|
|
127
140
|
"description": "Human ratings keyed by metric name for judge calibration.",
|
|
128
|
-
"$ref": "#/$defs/
|
|
141
|
+
"$ref": "#/$defs/__schema17"
|
|
129
142
|
},
|
|
130
|
-
"
|
|
143
|
+
"__schema17": {
|
|
131
144
|
"type": "object",
|
|
132
145
|
"propertyNames": {
|
|
133
146
|
"type": "string"
|