@alis-build/harness-eval 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/README.md +17 -4
  2. package/dist/adapters/claude-code/index.d.ts +1 -1
  3. package/dist/adapters/claude-code/index.js +1 -1
  4. package/dist/{claude-code-ycT0JQZF.js → claude-code-DZ4Vkgp6.js} +35 -6
  5. package/dist/{claude-code-ycT0JQZF.js.map → claude-code-DZ4Vkgp6.js.map} +1 -1
  6. package/dist/cli/bin.js +109 -12
  7. package/dist/cli/bin.js.map +1 -1
  8. package/dist/config/loader.d.ts +1 -1
  9. package/dist/config/loader.js +1 -1
  10. package/dist/{index-6Z17eKZx.d.ts → index-V22PrR0p.d.ts} +2 -1
  11. package/dist/index.d.ts +270 -152
  12. package/dist/index.js +124 -5
  13. package/dist/index.js.map +1 -0
  14. package/dist/{loader-DTvoVfN0.d.ts → loader-C9yQHUPC.d.ts} +19 -2
  15. package/dist/{loader-BCnFJ8rm.js → loader-DcI0KfRX.js} +291 -4
  16. package/dist/loader-DcI0KfRX.js.map +1 -0
  17. package/dist/{build-DsVJ_UeU.js → projections-BcX7w-f6.js} +486 -243
  18. package/dist/projections-BcX7w-f6.js.map +1 -0
  19. package/dist/runner/suite.d.ts +1 -1
  20. package/dist/runner/suite.js +1 -1
  21. package/dist/{suite-BoOvK_lq.d.ts → suite-DPJMIEbu.d.ts} +7 -2
  22. package/dist/{suite-chj0j22j.js → suite-Dlzl-HI0.js} +58 -4
  23. package/dist/suite-Dlzl-HI0.js.map +1 -0
  24. package/dist/{types-BQol062t.d.ts → types-CD3TwOtZ.d.ts} +151 -10
  25. package/package.json +4 -2
  26. package/schemas/eval-interchange-instances.schema.json +196 -0
  27. package/schemas/eval-interchange.schema.json +65 -52
  28. package/schemas/eval-run-envelope.schema.json +182 -425
  29. package/dist/build-DsVJ_UeU.js.map +0 -1
  30. package/dist/loader-BCnFJ8rm.js.map +0 -1
  31. package/dist/suite-chj0j22j.js.map +0 -1
  32. package/schemas/eval-interchange-agent-trace.schema.json +0 -322
  33. package/schemas/eval-interchange-proto-instance.schema.json +0 -106
@@ -2,132 +2,145 @@
2
2
  "$schema": "https://json-schema.org/draft/2020-12/schema",
3
3
  "$id": "https://raw.githubusercontent.com/alis-build/harness-eval-ts/main/schemas/eval-interchange.schema.json",
4
4
  "title": "EvalDatasetRow",
5
- "description": "Flattened row for tabular or JSONL dataset consumption.",
5
+ "description": "Flattened row for trajectory projection JSONL.",
6
6
  "type": "object",
7
7
  "properties": {
8
- "prompt": {
8
+ "caseId": {
9
9
  "$ref": "#/$defs/__schema0"
10
10
  },
11
- "response": {
11
+ "repetitionIndex": {
12
+ "$ref": "#/$defs/__schema1"
13
+ },
14
+ "prompt": {
12
15
  "$ref": "#/$defs/__schema2"
13
16
  },
14
- "reference": {
17
+ "response": {
15
18
  "$ref": "#/$defs/__schema4"
16
19
  },
17
- "predicted_trajectory": {
20
+ "evaluationInstance": {
18
21
  "$ref": "#/$defs/__schema6"
19
22
  },
20
- "reference_trajectory": {
21
- "$ref": "#/$defs/__schema9"
22
- },
23
- "latency_in_seconds": {
24
- "$ref": "#/$defs/__schema11"
23
+ "latencySeconds": {
24
+ "$ref": "#/$defs/__schema12"
25
25
  },
26
26
  "failure": {
27
- "$ref": "#/$defs/__schema12"
27
+ "$ref": "#/$defs/__schema13"
28
28
  },
29
- "human_ratings": {
30
- "$ref": "#/$defs/__schema15"
29
+ "humanRatings": {
30
+ "$ref": "#/$defs/__schema16"
31
31
  }
32
32
  },
33
33
  "required": [
34
- "predicted_trajectory",
35
- "latency_in_seconds",
34
+ "caseId",
35
+ "repetitionIndex",
36
+ "latencySeconds",
36
37
  "failure"
37
38
  ],
38
39
  "additionalProperties": false,
39
40
  "$defs": {
40
41
  "__schema0": {
41
- "description": "Eval prompt sent to the agent.",
42
- "$ref": "#/$defs/__schema1"
42
+ "type": "string",
43
+ "description": "Test case id."
43
44
  },
44
45
  "__schema1": {
45
- "type": "string"
46
+ "type": "integer",
47
+ "minimum": -9007199254740991,
48
+ "maximum": 9007199254740991,
49
+ "description": "Repetition index."
46
50
  },
47
51
  "__schema2": {
48
- "description": "Final agent response text.",
52
+ "description": "Eval prompt sent to the agent.",
49
53
  "$ref": "#/$defs/__schema3"
50
54
  },
51
55
  "__schema3": {
52
56
  "type": "string"
53
57
  },
54
58
  "__schema4": {
55
- "description": "Reference answer text when provided.",
59
+ "description": "Final agent response text.",
56
60
  "$ref": "#/$defs/__schema5"
57
61
  },
58
62
  "__schema5": {
59
63
  "type": "string"
60
64
  },
61
65
  "__schema6": {
62
- "type": "array",
63
- "items": {
64
- "$ref": "#/$defs/TabularToolCall"
65
- },
66
- "description": "Predicted tool-call trajectory with structured tool_input."
66
+ "description": "Vertex EvaluationInstance wire object.",
67
+ "$ref": "#/$defs/EvaluationInstanceJson"
67
68
  },
68
- "TabularToolCall": {
69
+ "EvaluationInstanceJson": {
69
70
  "type": "object",
70
71
  "properties": {
71
- "tool_name": {
72
+ "prompt": {
72
73
  "$ref": "#/$defs/__schema7"
73
74
  },
74
- "tool_input": {
75
- "$ref": "#/$defs/__schema8"
75
+ "response": {
76
+ "$ref": "#/$defs/__schema10"
77
+ },
78
+ "reference": {
79
+ "$ref": "#/$defs/__schema11"
76
80
  }
77
81
  },
78
- "required": [
79
- "tool_name",
80
- "tool_input"
81
- ],
82
82
  "additionalProperties": false,
83
- "title": "TabularToolCall",
84
- "description": "Tool call with structured tool_input for JSONL/tabular export."
83
+ "title": "EvaluationInstanceJson",
84
+ "description": "Vertex EvaluationInstance wire format (agentEvalData omitted in v1)."
85
85
  },
86
86
  "__schema7": {
87
- "type": "string",
88
- "description": "Tool name as emitted by the agent."
87
+ "description": "Eval prompt.",
88
+ "$ref": "#/$defs/InstanceData"
89
+ },
90
+ "InstanceData": {
91
+ "type": "object",
92
+ "properties": {
93
+ "text": {
94
+ "$ref": "#/$defs/__schema8"
95
+ }
96
+ },
97
+ "additionalProperties": false,
98
+ "title": "InstanceData",
99
+ "description": "EvaluationInstance prompt/response/reference text wrapper."
89
100
  },
90
101
  "__schema8": {
91
- "description": "Tool arguments as a structured object for tabular consumption."
102
+ "description": "Plain text instance data.",
103
+ "$ref": "#/$defs/__schema9"
92
104
  },
93
105
  "__schema9": {
94
- "description": "Reference tool-call trajectory when provided.",
95
- "$ref": "#/$defs/__schema10"
106
+ "type": "string"
96
107
  },
97
108
  "__schema10": {
98
- "type": "array",
99
- "items": {
100
- "$ref": "#/$defs/TabularToolCall"
101
- }
109
+ "description": "Final agent response.",
110
+ "$ref": "#/$defs/InstanceData"
102
111
  },
103
112
  "__schema11": {
113
+ "description": "Reference answer text.",
114
+ "$ref": "#/$defs/InstanceData"
115
+ },
116
+ "__schema12": {
104
117
  "type": "number",
105
118
  "description": "Session latency in seconds."
106
119
  },
107
- "__schema12": {
120
+ "__schema13": {
108
121
  "anyOf": [
109
122
  {
110
- "$ref": "#/$defs/__schema13"
123
+ "$ref": "#/$defs/__schema14"
111
124
  },
112
125
  {
113
- "$ref": "#/$defs/__schema14"
126
+ "$ref": "#/$defs/__schema15"
114
127
  }
115
128
  ],
116
129
  "description": "1 when the harness run failed, 0 on success."
117
130
  },
118
- "__schema13": {
131
+ "__schema14": {
119
132
  "type": "number",
120
133
  "const": 0
121
134
  },
122
- "__schema14": {
135
+ "__schema15": {
123
136
  "type": "number",
124
137
  "const": 1
125
138
  },
126
- "__schema15": {
139
+ "__schema16": {
127
140
  "description": "Human ratings keyed by metric name for judge calibration.",
128
- "$ref": "#/$defs/__schema16"
141
+ "$ref": "#/$defs/__schema17"
129
142
  },
130
- "__schema16": {
143
+ "__schema17": {
131
144
  "type": "object",
132
145
  "propertyNames": {
133
146
  "type": "string"