@tangle-network/agent-eval 0.12.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,211 @@
1
+ import { z } from 'zod';
2
+ import { OpenAPIObject } from 'openapi3-ts/oas31';
3
+ import * as hono_types from 'hono/types';
4
+ import { ServerType } from '@hono/node-server';
5
+ import { Hono } from 'hono';
6
+
7
+ declare const RubricDimensionSchema: z.ZodObject<{
8
+ id: z.ZodString;
9
+ description: z.ZodString;
10
+ weight: z.ZodDefault<z.ZodNumber>;
11
+ min: z.ZodDefault<z.ZodNumber>;
12
+ max: z.ZodDefault<z.ZodNumber>;
13
+ }, z.core.$strip>;
14
+ declare const FailureModeSchema: z.ZodObject<{
15
+ id: z.ZodString;
16
+ description: z.ZodString;
17
+ }, z.core.$strip>;
18
+ declare const RubricSchema: z.ZodObject<{
19
+ name: z.ZodString;
20
+ description: z.ZodString;
21
+ systemPrompt: z.ZodString;
22
+ dimensions: z.ZodArray<z.ZodObject<{
23
+ id: z.ZodString;
24
+ description: z.ZodString;
25
+ weight: z.ZodDefault<z.ZodNumber>;
26
+ min: z.ZodDefault<z.ZodNumber>;
27
+ max: z.ZodDefault<z.ZodNumber>;
28
+ }, z.core.$strip>>;
29
+ failureModes: z.ZodDefault<z.ZodArray<z.ZodObject<{
30
+ id: z.ZodString;
31
+ description: z.ZodString;
32
+ }, z.core.$strip>>>;
33
+ wins: z.ZodDefault<z.ZodArray<z.ZodObject<{
34
+ id: z.ZodString;
35
+ description: z.ZodString;
36
+ }, z.core.$strip>>>;
37
+ }, z.core.$strip>;
38
+ declare const JudgeRequestSchema: z.ZodObject<{
39
+ rubricName: z.ZodOptional<z.ZodString>;
40
+ rubric: z.ZodOptional<z.ZodObject<{
41
+ name: z.ZodString;
42
+ description: z.ZodString;
43
+ systemPrompt: z.ZodString;
44
+ dimensions: z.ZodArray<z.ZodObject<{
45
+ id: z.ZodString;
46
+ description: z.ZodString;
47
+ weight: z.ZodDefault<z.ZodNumber>;
48
+ min: z.ZodDefault<z.ZodNumber>;
49
+ max: z.ZodDefault<z.ZodNumber>;
50
+ }, z.core.$strip>>;
51
+ failureModes: z.ZodDefault<z.ZodArray<z.ZodObject<{
52
+ id: z.ZodString;
53
+ description: z.ZodString;
54
+ }, z.core.$strip>>>;
55
+ wins: z.ZodDefault<z.ZodArray<z.ZodObject<{
56
+ id: z.ZodString;
57
+ description: z.ZodString;
58
+ }, z.core.$strip>>>;
59
+ }, z.core.$strip>>;
60
+ content: z.ZodString;
61
+ context: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
62
+ model: z.ZodOptional<z.ZodString>;
63
+ }, z.core.$strip>;
64
+ declare const JudgeResultSchema: z.ZodObject<{
65
+ composite: z.ZodNumber;
66
+ dimensions: z.ZodRecord<z.ZodString, z.ZodNumber>;
67
+ failureModes: z.ZodDefault<z.ZodArray<z.ZodString>>;
68
+ wins: z.ZodDefault<z.ZodArray<z.ZodString>>;
69
+ rationale: z.ZodString;
70
+ rubricVersion: z.ZodString;
71
+ model: z.ZodString;
72
+ durationMs: z.ZodNumber;
73
+ }, z.core.$strip>;
74
+ declare const RubricInfoSchema: z.ZodObject<{
75
+ name: z.ZodString;
76
+ description: z.ZodString;
77
+ dimensions: z.ZodArray<z.ZodObject<{
78
+ id: z.ZodString;
79
+ description: z.ZodString;
80
+ weight: z.ZodNumber;
81
+ }, z.core.$strip>>;
82
+ failureModes: z.ZodDefault<z.ZodArray<z.ZodString>>;
83
+ rubricVersion: z.ZodString;
84
+ }, z.core.$strip>;
85
+ declare const ListRubricsResponseSchema: z.ZodObject<{
86
+ rubrics: z.ZodArray<z.ZodObject<{
87
+ name: z.ZodString;
88
+ description: z.ZodString;
89
+ dimensions: z.ZodArray<z.ZodObject<{
90
+ id: z.ZodString;
91
+ description: z.ZodString;
92
+ weight: z.ZodNumber;
93
+ }, z.core.$strip>>;
94
+ failureModes: z.ZodDefault<z.ZodArray<z.ZodString>>;
95
+ rubricVersion: z.ZodString;
96
+ }, z.core.$strip>>;
97
+ }, z.core.$strip>;
98
+ declare const VersionResponseSchema: z.ZodObject<{
99
+ package: z.ZodString;
100
+ version: z.ZodString;
101
+ wireVersion: z.ZodString;
102
+ apiSurface: z.ZodArray<z.ZodString>;
103
+ }, z.core.$strip>;
104
+ declare const HealthResponseSchema: z.ZodObject<{
105
+ status: z.ZodLiteral<"ok">;
106
+ uptimeSec: z.ZodNumber;
107
+ }, z.core.$strip>;
108
+ declare const ErrorResponseSchema: z.ZodObject<{
109
+ error: z.ZodObject<{
110
+ code: z.ZodString;
111
+ message: z.ZodString;
112
+ details: z.ZodOptional<z.ZodUnknown>;
113
+ }, z.core.$strip>;
114
+ }, z.core.$strip>;
115
+ type RubricDimension = z.infer<typeof RubricDimensionSchema>;
116
+ type FailureMode = z.infer<typeof FailureModeSchema>;
117
+ type Rubric = z.infer<typeof RubricSchema>;
118
+ type JudgeRequest = z.infer<typeof JudgeRequestSchema>;
119
+ type JudgeResult = z.infer<typeof JudgeResultSchema>;
120
+ type RubricInfo = z.infer<typeof RubricInfoSchema>;
121
+ type ListRubricsResponse = z.infer<typeof ListRubricsResponseSchema>;
122
+ type VersionResponse = z.infer<typeof VersionResponseSchema>;
123
+ type ErrorResponse = z.infer<typeof ErrorResponseSchema>;
124
+ /**
125
+ * Bump on any breaking change to a request/response schema.
126
+ * Non-breaking (additive) changes don't require a bump.
127
+ */
128
+ declare const WIRE_VERSION = "1.0.0";
129
+ /**
130
+ * Stable hash of a rubric. Used to make scores comparable across runs:
131
+ * if the rubricVersion matches, the rubric was identical.
132
+ */
133
+ declare function hashRubric(rubric: Rubric): string;
134
+
135
+ /** Caller-fixable error. The transport renders this to 4xx + ErrorResponse. */
136
+ declare class WireError extends Error {
137
+ readonly code: string;
138
+ readonly status: number;
139
+ readonly details?: unknown | undefined;
140
+ constructor(code: string, message: string, status?: number, details?: unknown | undefined);
141
+ }
142
+ declare function handleJudge(req: JudgeRequest): Promise<JudgeResult>;
143
+ declare function handleListRubrics(): ListRubricsResponse;
144
+ declare function handleVersion(): VersionResponse;
145
+
146
+ /**
147
+ * Built-in rubrics shipped with agent-eval.
148
+ *
149
+ * A rubric is a set of scoring axes plus a system prompt that tells the
150
+ * judging LLM how to grade against those axes. Built-in rubrics are
151
+ * curated for use cases that recur across Tangle projects — call them
152
+ * by name from any client.
153
+ *
154
+ * Adding a rubric:
155
+ * 1. Define the Rubric object below with a clear `description` and
156
+ * named `dimensions`.
157
+ * 2. Register it in `BUILTIN_RUBRICS` at the bottom.
158
+ * 3. Add a test in `tests/wire/rubrics.test.ts`.
159
+ *
160
+ * Custom rubrics: callers pass `rubric` inline to /v1/judge instead of
161
+ * `rubricName` — see schemas.ts.
162
+ */
163
+
164
+ declare const BUILTIN_RUBRICS: Record<string, Rubric>;
165
+ /** Get a built-in rubric by name, or undefined. */
166
+ declare function getBuiltinRubric(name: string): Rubric | undefined;
167
+ /** List built-in rubrics with their stable versions. */
168
+ declare function listBuiltinRubrics(): {
169
+ name: string;
170
+ description: string;
171
+ dimensions: {
172
+ id: string;
173
+ description: string;
174
+ weight: number;
175
+ }[];
176
+ failureModes: string[];
177
+ rubricVersion: string;
178
+ }[];
179
+
180
+ declare function buildOpenApi(packageVersion: string): OpenAPIObject;
181
+
182
+ declare function createApp(): Hono<hono_types.BlankEnv, hono_types.BlankSchema, "/">;
183
+ interface ServeOptions {
184
+ /** Default 5005. */
185
+ port?: number;
186
+ /** Default '127.0.0.1'. Set to '0.0.0.0' to listen on all interfaces. */
187
+ host?: string;
188
+ }
189
+ declare function startServer(opts?: ServeOptions): ServerType;
190
+
191
+ interface RpcRequest {
192
+ method: 'judge' | 'listRubrics' | 'version';
193
+ params?: unknown;
194
+ }
195
+ interface RpcSuccess {
196
+ result: unknown;
197
+ }
198
+ interface RpcError {
199
+ error: {
200
+ code: string;
201
+ message: string;
202
+ details?: unknown;
203
+ };
204
+ }
205
+ declare function dispatchRpc(req: RpcRequest): Promise<RpcSuccess | RpcError>;
206
+ /** Read one JSON request from stdin, write one JSON response to stdout. */
207
+ declare function runRpcOnce(method?: string): Promise<number>;
208
+ /** Read JSONL requests from stdin, write JSONL responses to stdout. */
209
+ declare function runRpcBatch(method?: string): Promise<number>;
210
+
211
+ export { BUILTIN_RUBRICS, type ErrorResponse, ErrorResponseSchema, type FailureMode, FailureModeSchema, HealthResponseSchema, type JudgeRequest, JudgeRequestSchema, type JudgeResult, JudgeResultSchema, type ListRubricsResponse, ListRubricsResponseSchema, type Rubric, type RubricDimension, RubricDimensionSchema, type RubricInfo, RubricInfoSchema, RubricSchema, type ServeOptions, type VersionResponse, VersionResponseSchema, WIRE_VERSION, WireError, buildOpenApi, createApp, dispatchRpc, getBuiltinRubric, handleJudge, handleListRubrics, handleVersion, hashRubric, listBuiltinRubrics, runRpcBatch, runRpcOnce, startServer };
@@ -0,0 +1,56 @@
1
+ import {
2
+ BUILTIN_RUBRICS,
3
+ ErrorResponseSchema,
4
+ FailureModeSchema,
5
+ HealthResponseSchema,
6
+ JudgeRequestSchema,
7
+ JudgeResultSchema,
8
+ ListRubricsResponseSchema,
9
+ RubricDimensionSchema,
10
+ RubricInfoSchema,
11
+ RubricSchema,
12
+ VersionResponseSchema,
13
+ WIRE_VERSION,
14
+ WireError,
15
+ buildOpenApi,
16
+ createApp,
17
+ dispatchRpc,
18
+ getBuiltinRubric,
19
+ handleJudge,
20
+ handleListRubrics,
21
+ handleVersion,
22
+ hashRubric,
23
+ listBuiltinRubrics,
24
+ runRpcBatch,
25
+ runRpcOnce,
26
+ startServer
27
+ } from "../chunk-OZPRSK4A.js";
28
+ import "../chunk-ITN4YOZY.js";
29
+ export {
30
+ BUILTIN_RUBRICS,
31
+ ErrorResponseSchema,
32
+ FailureModeSchema,
33
+ HealthResponseSchema,
34
+ JudgeRequestSchema,
35
+ JudgeResultSchema,
36
+ ListRubricsResponseSchema,
37
+ RubricDimensionSchema,
38
+ RubricInfoSchema,
39
+ RubricSchema,
40
+ VersionResponseSchema,
41
+ WIRE_VERSION,
42
+ WireError,
43
+ buildOpenApi,
44
+ createApp,
45
+ dispatchRpc,
46
+ getBuiltinRubric,
47
+ handleJudge,
48
+ handleListRubrics,
49
+ handleVersion,
50
+ hashRubric,
51
+ listBuiltinRubrics,
52
+ runRpcBatch,
53
+ runRpcOnce,
54
+ startServer
55
+ };
56
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.12.0",
3
+ "version": "0.14.0",
4
4
  "description": "Trace-first evaluation framework for Tangle agents. Core (spans, pipelines, sandbox harness, OTLP export), trust (dataset, red-team, calibration, behavior DSL), builder-of-builders (three-layer eval, resumable sessions, meta-runtime correlation), and frontier (meta-eval correlation study, Process Reward Modeling, bisector).",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -20,8 +20,16 @@
20
20
  "types": "./dist/telemetry/file.d.ts",
21
21
  "import": "./dist/telemetry/file.js",
22
22
  "default": "./dist/telemetry/file.js"
23
+ },
24
+ "./wire": {
25
+ "types": "./dist/wire/index.d.ts",
26
+ "import": "./dist/wire/index.js",
27
+ "default": "./dist/wire/index.js"
23
28
  }
24
29
  },
30
+ "bin": {
31
+ "agent-eval": "./dist/cli.js"
32
+ },
25
33
  "files": [
26
34
  "dist"
27
35
  ],
@@ -29,11 +37,16 @@
29
37
  "access": "public"
30
38
  },
31
39
  "dependencies": {
40
+ "@asteasolutions/zod-to-openapi": "^8.5.0",
32
41
  "@ax-llm/ax": "^19.0.25",
33
- "@tangle-network/tcloud": "^0.2.0"
42
+ "@hono/node-server": "^2.0.0",
43
+ "@tangle-network/tcloud": "^0.2.0",
44
+ "hono": "^4.12.15",
45
+ "zod": "^4.3.6"
34
46
  },
35
47
  "devDependencies": {
36
48
  "@types/node": "^25.6.0",
49
+ "openapi3-ts": "^4.5.0",
37
50
  "tsup": "^8.0.0",
38
51
  "typescript": "^5.7.0",
39
52
  "vitest": "^3.0.0"
@@ -47,6 +60,7 @@
47
60
  "dev": "tsup --watch",
48
61
  "test": "vitest run",
49
62
  "test:watch": "vitest",
50
- "typecheck": "tsc --noEmit"
63
+ "typecheck": "tsc --noEmit",
64
+ "openapi": "node dist/cli.js openapi --out dist/openapi.json"
51
65
  }
52
66
  }