npm - @alis-build/harness-eval - Versions diffs - 0.1.0 → 0.1.2 - Mend

@alis-build/harness-eval 0.1.0 → 0.1.2

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (33)

package/README.md +17 -4
package/dist/adapters/claude-code/index.d.ts +1 -1
package/dist/adapters/claude-code/index.js +1 -1
package/dist/{claude-code-ycT0JQZF.js → claude-code-DZ4Vkgp6.js} +35 -6
package/dist/{claude-code-ycT0JQZF.js.map → claude-code-DZ4Vkgp6.js.map} +1 -1
package/dist/cli/bin.js +109 -12
package/dist/cli/bin.js.map +1 -1
package/dist/config/loader.d.ts +1 -1
package/dist/config/loader.js +1 -1
package/dist/{index-6Z17eKZx.d.ts → index-V22PrR0p.d.ts} +2 -1
package/dist/index.d.ts +270 -152
package/dist/index.js +124 -5
package/dist/index.js.map +1 -0
package/dist/{loader-DTvoVfN0.d.ts → loader-C9yQHUPC.d.ts} +19 -2
package/dist/{loader-BCnFJ8rm.js → loader-DcI0KfRX.js} +291 -4
package/dist/loader-DcI0KfRX.js.map +1 -0
package/dist/{build-DsVJ_UeU.js → projections-BcX7w-f6.js} +486 -243
package/dist/projections-BcX7w-f6.js.map +1 -0
package/dist/runner/suite.d.ts +1 -1
package/dist/runner/suite.js +1 -1
package/dist/{suite-BoOvK_lq.d.ts → suite-DPJMIEbu.d.ts} +7 -2
package/dist/{suite-chj0j22j.js → suite-Dlzl-HI0.js} +58 -4
package/dist/suite-Dlzl-HI0.js.map +1 -0
package/dist/{types-BQol062t.d.ts → types-CD3TwOtZ.d.ts} +151 -10
package/package.json +4 -2
package/schemas/eval-interchange-instances.schema.json +196 -0
package/schemas/eval-interchange.schema.json +65 -52
package/schemas/eval-run-envelope.schema.json +182 -425
package/dist/build-DsVJ_UeU.js.map +0 -1
package/dist/loader-BCnFJ8rm.js.map +0 -1
package/dist/suite-chj0j22j.js.map +0 -1
package/schemas/eval-interchange-agent-trace.schema.json +0 -322
package/schemas/eval-interchange-proto-instance.schema.json +0 -106

package/schemas/eval-interchange.schema.json (changed)

@@ -2,132 +2,145 @@
   "$schema": "https://json-schema.org/draft/2020-12/schema",
   "$id": "https://raw.githubusercontent.com/alis-build/harness-eval-ts/main/schemas/eval-interchange.schema.json",
   "title": "EvalDatasetRow",
-  "description": "Flattened row for tabular or JSONL dataset consumption.",
+  "description": "Flattened row for trajectory projection JSONL.",
   "type": "object",
   "properties": {
-    "prompt": {
+    "caseId": {
       "$ref": "#/$defs/__schema0"
     },
-    "response": {
+    "repetitionIndex": {
+      "$ref": "#/$defs/__schema1"
+    },
+    "prompt": {
       "$ref": "#/$defs/__schema2"
     },
-    "reference": {
+    "response": {
       "$ref": "#/$defs/__schema4"
     },
-    "predicted_trajectory": {
+    "evaluationInstance": {
       "$ref": "#/$defs/__schema6"
     },
-    "reference_trajectory": {
-      "$ref": "#/$defs/__schema9"
-    },
-    "latency_in_seconds": {
-      "$ref": "#/$defs/__schema11"
+    "latencySeconds": {
+      "$ref": "#/$defs/__schema12"
     },
     "failure": {
-      "$ref": "#/$defs/__schema12"
+      "$ref": "#/$defs/__schema13"
     },
-    "human_ratings": {
-      "$ref": "#/$defs/__schema15"
+    "humanRatings": {
+      "$ref": "#/$defs/__schema16"
     }
   },
   "required": [
-    "predicted_trajectory",
-    "latency_in_seconds",
+    "caseId",
+    "repetitionIndex",
+    "latencySeconds",
     "failure"
   ],
   "additionalProperties": false,
   "$defs": {
     "__schema0": {
-      "description": "Eval prompt sent to the agent.",
-      "$ref": "#/$defs/__schema1"
+      "type": "string",
+      "description": "Test case id."
     },
     "__schema1": {
-      "type": "string"
+      "type": "integer",
+      "minimum": -9007199254740991,
+      "maximum": 9007199254740991,
+      "description": "Repetition index."
     },
     "__schema2": {
-      "description": "Final agent response text.",
+      "description": "Eval prompt sent to the agent.",
       "$ref": "#/$defs/__schema3"
     },
     "__schema3": {
       "type": "string"
     },
     "__schema4": {
-      "description": "Reference answer text when provided.",
+      "description": "Final agent response text.",
       "$ref": "#/$defs/__schema5"
     },
     "__schema5": {
       "type": "string"
     },
     "__schema6": {
-      "type": "array",
-      "items": {
-        "$ref": "#/$defs/TabularToolCall"
-      },
-      "description": "Predicted tool-call trajectory with structured tool_input."
+      "description": "Vertex EvaluationInstance wire object.",
+      "$ref": "#/$defs/EvaluationInstanceJson"
     },
-    "TabularToolCall": {
+    "EvaluationInstanceJson": {
       "type": "object",
       "properties": {
-        "tool_name": {
+        "prompt": {
           "$ref": "#/$defs/__schema7"
         },
-        "tool_input": {
-          "$ref": "#/$defs/__schema8"
+        "response": {
+          "$ref": "#/$defs/__schema10"
+        },
+        "reference": {
+          "$ref": "#/$defs/__schema11"
         }
       },
-      "required": [
-        "tool_name",
-        "tool_input"
-      ],
       "additionalProperties": false,
-      "title": "TabularToolCall",
-      "description": "Tool call with structured tool_input for JSONL/tabular export."
+      "title": "EvaluationInstanceJson",
+      "description": "Vertex EvaluationInstance wire format (agentEvalData omitted in v1)."
     },
     "__schema7": {
-      "type": "string",
-      "description": "Tool name as emitted by the agent."
+      "description": "Eval prompt.",
+      "$ref": "#/$defs/InstanceData"
+    },
+    "InstanceData": {
+      "type": "object",
+      "properties": {
+        "text": {
+          "$ref": "#/$defs/__schema8"
+        }
+      },
+      "additionalProperties": false,
+      "title": "InstanceData",
+      "description": "EvaluationInstance prompt/response/reference text wrapper."
     },
     "__schema8": {
-      "description": "Tool arguments as a structured object for tabular consumption."
+      "description": "Plain text instance data.",
+      "$ref": "#/$defs/__schema9"
     },
     "__schema9": {
-      "description": "Reference tool-call trajectory when provided.",
-      "$ref": "#/$defs/__schema10"
+      "type": "string"
     },
     "__schema10": {
-      "type": "array",
-      "items": {
-        "$ref": "#/$defs/TabularToolCall"
-      }
+      "description": "Final agent response.",
+      "$ref": "#/$defs/InstanceData"
     },
     "__schema11": {
+      "description": "Reference answer text.",
+      "$ref": "#/$defs/InstanceData"
+    },
+    "__schema12": {
       "type": "number",
       "description": "Session latency in seconds."
     },
-    "__schema12": {
+    "__schema13": {
       "anyOf": [
         {
-          "$ref": "#/$defs/__schema13"
+          "$ref": "#/$defs/__schema14"
         },
         {
-          "$ref": "#/$defs/__schema14"
+          "$ref": "#/$defs/__schema15"
         }
       ],
       "description": "1 when the harness run failed, 0 on success."
     },
-    "__schema13": {
+    "__schema14": {
       "type": "number",
       "const": 0
     },
-    "__schema14": {
+    "__schema15": {
       "type": "number",
       "const": 1
     },
-    "__schema15": {
+    "__schema16": {
       "description": "Human ratings keyed by metric name for judge calibration.",
-      "$ref": "#/$defs/__schema16"
+      "$ref": "#/$defs/__schema17"
     },
-    "__schema16": {
+    "__schema17": {
       "type": "object",
       "propertyNames": {
         "type": "string"