npm - @tangle-network/agent-eval - Versions diffs - 0.11.1 → 0.13.0 - Mend

@tangle-network/agent-eval 0.11.1 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

package/README.md +96 -11
package/dist/chunk-ITN4YOZY.js +215 -0
package/dist/chunk-ITN4YOZY.js.map +1 -0
package/dist/chunk-OZPRSK4A.js +594 -0
package/dist/chunk-OZPRSK4A.js.map +1 -0
package/dist/cli.d.ts +1 -0
package/dist/cli.js +104 -0
package/dist/cli.js.map +1 -0
package/dist/index.d.ts +597 -4
package/dist/index.js +908 -241
package/dist/index.js.map +1 -1
package/dist/sink-fetch-C0B8ximv.d.ts +101 -0
package/dist/telemetry/file.d.ts +19 -0
package/dist/telemetry/file.js +40 -0
package/dist/telemetry/file.js.map +1 -0
package/dist/telemetry/index.d.ts +38 -0
package/dist/telemetry/index.js +128 -0
package/dist/telemetry/index.js.map +1 -0
package/dist/wire/index.d.ts +211 -0
package/dist/wire/index.js +56 -0
package/dist/wire/index.js.map +1 -0
package/package.json +27 -3

package/dist/chunk-OZPRSK4A.js ADDED Viewed

@@ -0,0 +1,594 @@
+import {
+  callLlmJson
+} from "./chunk-ITN4YOZY.js";
+// src/wire/schemas.ts
+import { extendZodWithOpenApi } from "@asteasolutions/zod-to-openapi";
+import { z } from "zod";
+extendZodWithOpenApi(z); // Called once up front; every schema below calls `.openapi(...)`.
+var RubricDimensionSchema = z.object({ // One scoring axis of a rubric: id, description, weight and score bounds.
+  id: z.string().min(1).describe('Short stable id like "buyer_quality" \u2014 used as the key in scoring output.'),
+  description: z.string().min(1).describe("One-line plain-English meaning. Read by humans reviewing low scores."),
+  weight: z.number().min(0).default(1).describe("Relative weight in the composite score. Default 1; 0 disables."),
+  min: z.number().default(0).describe("Lower bound of valid score for this dimension."), // NOTE(review): nothing here enforces min <= max.
+  max: z.number().default(1).describe("Upper bound of valid score for this dimension.")
+}).openapi("RubricDimension");
+var FailureModeSchema = z.object({ // An id/description pair; RubricSchema reuses it for both `failureModes` and `wins`.
+  id: z.string().min(1).describe('Short stable id like "ai-cadence" \u2014 used in detection lists.'),
+  description: z.string().min(1).describe("Plain-English description of the failure pattern.")
+}).openapi("FailureMode");
+var RubricSchema = z.object({ // Full rubric definition: judge prompt, at least one dimension, optional failure modes and wins.
+  name: z.string().min(1).describe('Stable name like "anti-slop" \u2014 used by clients to invoke this rubric.'),
+  description: z.string().min(1).describe("What this rubric measures. Shown in /v1/rubrics listing."),
+  systemPrompt: z.string().min(1).describe(
+    'Instructs the judging LLM. Should explain the persona (e.g. "senior engineer reviewing voice"), what to score on, and what to return.'
+  ),
+  dimensions: z.array(RubricDimensionSchema).min(1).describe("Scoring axes. The composite score is a weighted sum of these."),
+  failureModes: z.array(FailureModeSchema).default([]).describe("Patterns to detect; each detected mode appears in the result.failureModes list."),
+  wins: z.array(FailureModeSchema).default([]).describe("Positive patterns; each detected one appears in the result.wins list.") // Same element shape as failureModes.
+}).openapi("Rubric");
+var JudgeRequestSchema = z.object({ // Request body for a judge call: content plus exactly one rubric source.
+  rubricName: z.string().optional().describe("Use a built-in rubric by name. Mutually exclusive with `rubric`."),
+  rubric: RubricSchema.optional().describe(
+    "Inline rubric definition. Mutually exclusive with `rubricName`."
+  ),
+  content: z.string().min(1).describe("The text being judged \u2014 a tweet, a blog post, a code snippet, anything stringly."),
+  context: z.record(z.string(), z.unknown()).optional().describe(
+    "Free-form metadata for the rubric to use \u2014 analytics, source URL, author, etc. Surfaced to the LLM."
+  ),
+  model: z.string().optional().describe('Override the judge model (default routes via tcloud). e.g. "claude-opus-4-7".')
+}).refine((v) => Boolean(v.rubricName) !== Boolean(v.rubric), { // XOR on truthiness: an empty-string rubricName counts as absent.
+  message: "Provide exactly one of `rubricName` or `rubric`."
+}).openapi("JudgeRequest");
+var JudgeResultSchema = z.object({ // Shape of a judge result; field names match the object returned by handleJudge.
+  composite: z.number().min(0).max(1).describe("Weighted combination of dimension scores in 0..1. The single number to gate on."),
+  dimensions: z.record(z.string(), z.number()).describe("Per-dimension score, keyed by RubricDimension.id."),
+  failureModes: z.array(z.string()).default([]).describe("Failure-mode ids detected in the content (subset of rubric.failureModes ids)."),
+  wins: z.array(z.string()).default([]).describe("Win ids detected in the content (subset of rubric.wins ids)."),
+  rationale: z.string().describe("Plain-English explanation of the score. Surfaced to the human reviewer."),
+  rubricVersion: z.string().describe(
+    "Stable hash of the rubric used. Scores are only comparable across runs when this matches."
+  ),
+  model: z.string().describe("Model that produced the judgement, for reproducibility."),
+  durationMs: z.number().int().nonnegative().describe("End-to-end wall time for this call.")
+}).openapi("JudgeResult");
+var RubricInfoSchema = z.object({ // Listing summary of one rubric; carries no system prompt and no `wins` field.
+  name: z.string().describe("Pass this to /v1/judge as `rubricName`."),
+  description: z.string().describe("What this rubric measures."),
+  dimensions: z.array(z.object({ id: z.string(), description: z.string(), weight: z.number() })).describe("The scoring axes this rubric uses, with weights."),
+  failureModes: z.array(z.string()).default([]).describe("Failure-mode ids this rubric detects."),
+  rubricVersion: z.string().describe("Stable hash \u2014 match this to compare scores across runs.")
+}).openapi("RubricInfo");
+var ListRubricsResponseSchema = z.object({ // Response envelope for the rubric listing.
+  rubrics: z.array(RubricInfoSchema)
+}).openapi("ListRubricsResponse");
+var VersionResponseSchema = z.object({ // Shape of the version response; field names match the object returned by handleVersion.
+  package: z.string().describe('Package name (always "@tangle-network/agent-eval").'),
+  version: z.string().describe("Semver of the running server. Match your client to this."),
+  wireVersion: z.string().describe(
+    "Wire-protocol semver. Bumps separately from package version when the schema changes."
+  ),
+  apiSurface: z.array(z.string()).describe("List of supported method names.")
+}).openapi("VersionResponse");
+var HealthResponseSchema = z.object({ // Liveness payload: a fixed "ok" status plus uptime in seconds.
+  status: z.literal("ok"),
+  uptimeSec: z.number()
+}).openapi("HealthResponse");
+var ErrorResponseSchema = z.object({ // Error envelope: `{ error: { code, message, details? } }`.
+  error: z.object({
+    code: z.string().describe('Machine-readable code: "validation_error", "rubric_not_found", "judge_error".'), // NOTE(review): this file also emits "internal_error", "unknown_method" and "parse_error".
+    message: z.string().describe("Human-readable message."),
+    details: z.unknown().optional().describe("Optional structured detail.")
+  }).describe("Errors are always wrapped in this shape across all endpoints.")
+}).openapi("ErrorResponse");
+var WIRE_VERSION = "1.0.0"; // Wire-protocol version: returned by handleVersion and embedded in the OpenAPI description.
+function hashRubric(rubric) {
+  const stable = JSON.stringify(rubric, Object.keys(rubric).sort());
+  let h = 5381;
+  for (let i = 0; i < stable.length; i++) {
+    h = h * 33 ^ stable.charCodeAt(i);
+  }
+  return `${rubric.name}@${(h >>> 0).toString(16).padStart(8, "0")}`;
+}
+// src/wire/rubrics.ts
+var ANTI_SLOP = { // Built-in "anti-slop" rubric: three dimensions, seven failure modes, four wins.
+  name: "anti-slop",
+  description: "Voice and signal quality for content aimed at senior engineers. Catches AI cadence, marketing tone, and engagement-bait shapes.",
+  systemPrompt: `You are evaluating a piece of content written for senior engineers and technical founders.
+You score three things:
+- buyer_quality (0..1): would a senior engineer in the target ICP find this worth their attention? High = specific, earned, technically interesting. Low = generic, hyped, off-target.
+- voice (0..1): does it read like a person who built the thing, or like AI/marketing copy?
+- signal (0..1): does it contain a non-obvious detail, constraint, or claim a reader couldn't get from the public docs?
+Detect failure modes (return ids matching):
+- ai-cadence: rule-of-three openings, em-dash flourish, "Let me explain", "Here's the thing", AI rhythm
+- marketing-tone: "We're excited to announce", "thrilled", "delighted", "game-changer", buzzword stack
+- vague-claim: technical claim without a specific component, file, or measurement
+- no-hook: opening doesn't earn attention from the target reader
+- engagement-bait: "agree?", "thoughts?", listicles, controversy-fishing, hook-detail-pitch
+- off-icp: content shape would attract motivational/grift/hype audiences instead of buyers
+- stale-claim: repeats a positioning line we've used many times this month
+Detect wins (return ids matching):
+- specific-component: names a real file, component, or measurement
+- earned-detail: shares a non-obvious detail not derivable from public docs
+- constraint-articulated: names a real tradeoff and the side chosen
+- honest-failure: describes a real failure mode and what was done about it
+Return ONLY JSON matching the response schema. Be conservative \u2014 most content has 0-1 wins and 1-2 failure modes, not many of each.`,
+  dimensions: [ // Ids match the three scores named in systemPrompt; weights are 0.5 / 0.3 / 0.2.
+    {
+      id: "buyer_quality",
+      description: "Would the target buyer find this worth attention?",
+      weight: 0.5,
+      min: 0,
+      max: 1
+    },
+    {
+      id: "voice",
+      description: "Does it sound like a builder, not AI or marketing?",
+      weight: 0.3,
+      min: 0,
+      max: 1
+    },
+    {
+      id: "signal",
+      description: "Non-obvious detail, constraint, or claim?",
+      weight: 0.2,
+      min: 0,
+      max: 1
+    }
+  ],
+  failureModes: [ // Ids mirror the failure-mode list in systemPrompt.
+    { id: "ai-cadence", description: "AI-rhythm openings and transitions" },
+    { id: "marketing-tone", description: "Buzzwords, hype, corporate-PR voice" },
+    { id: "vague-claim", description: "Technical claim without specifics" },
+    { id: "no-hook", description: "Opening fails to earn attention" },
+    { id: "engagement-bait", description: "Listicle/controversy/agree-pattern" },
+    { id: "off-icp", description: "Voice attracts the wrong audience" },
+    { id: "stale-claim", description: "Reuses an over-used positioning line" }
+  ],
+  wins: [ // Ids mirror the wins list in systemPrompt.
+    { id: "specific-component", description: "Names a real file/component/number" },
+    { id: "earned-detail", description: "Detail not in public docs" },
+    { id: "constraint-articulated", description: "Names a real tradeoff" },
+    { id: "honest-failure", description: "Describes a real failure honestly" }
+  ]
+};
+var BUILTIN_RUBRICS = { // Registry of built-in rubrics keyed by rubric name; read by getBuiltinRubric and listBuiltinRubrics.
+  "anti-slop": ANTI_SLOP
+};
+function getBuiltinRubric(name) {
+  return BUILTIN_RUBRICS[name];
+}
+function listBuiltinRubrics() {
+  return Object.values(BUILTIN_RUBRICS).map((r) => ({
+    name: r.name,
+    description: r.description,
+    dimensions: r.dimensions.map((d) => ({
+      id: d.id,
+      description: d.description,
+      weight: d.weight
+    })),
+    failureModes: r.failureModes.map((f) => f.id),
+    rubricVersion: hashRubric(r)
+  }));
+}
+// src/wire/handlers.ts
+import { readFileSync } from "fs";
+import { dirname, resolve } from "path";
+import { fileURLToPath } from "url";
+/**
+ * Error type used by the wire handlers. Besides the message it carries a
+ * machine-readable `code`, an HTTP `status` (400 unless given) and optional
+ * structured `details`.
+ */
+var WireError = class extends Error {
+  code;
+  status;
+  details;
+  /**
+   * @param {string} code - Machine-readable error code.
+   * @param {string} message - Human-readable message.
+   * @param {number} [status=400] - HTTP status to respond with.
+   * @param {unknown} [details] - Optional structured detail.
+   */
+  constructor(code, message, status = 400, details) {
+    super(message);
+    Object.assign(this, { code, status, details, name: "WireError" });
+  }
+};
+function judgeOutputSchema(rubric) {
+  return {
+    name: "JudgeOutput",
+    schema: {
+      type: "object",
+      additionalProperties: false,
+      properties: {
+        dimensions: {
+          type: "object",
+          additionalProperties: false,
+          properties: Object.fromEntries(
+            rubric.dimensions.map((d) => [
+              d.id,
+              { type: "number", minimum: d.min, maximum: d.max }
+            ])
+          ),
+          required: rubric.dimensions.map((d) => d.id)
+        },
+        failureModes: {
+          type: "array",
+          items: { type: "string", enum: rubric.failureModes.map((f) => f.id) }
+        },
+        wins: {
+          type: "array",
+          items: { type: "string", enum: rubric.wins.map((w) => w.id) }
+        },
+        rationale: { type: "string" }
+      },
+      required: ["dimensions", "rationale"]
+    }
+  };
+}
+function compositeScore(dimensions, rubric) {
+  let weighted = 0;
+  let totalWeight = 0;
+  for (const dim of rubric.dimensions) {
+    const raw = dimensions[dim.id] ?? 0;
+    const range = dim.max - dim.min || 1;
+    const normalized = Math.max(0, Math.min(1, (raw - dim.min) / range));
+    weighted += normalized * dim.weight;
+    totalWeight += dim.weight;
+  }
+  return totalWeight > 0 ? weighted / totalWeight : 0;
+}
+function buildJudgePrompt(content, context) {
+  const ctx = context && Object.keys(context).length ? JSON.stringify(context) : "";
+  return [
+    `CONTENT TO JUDGE:`,
+    content,
+    "",
+    ctx ? `CONTEXT (metadata, analytics, etc.):` : "",
+    ctx ? ctx : ""
+  ].filter(Boolean).join("\n");
+}
+var DEFAULT_JUDGE_MODEL = "claude-sonnet-4-6";
+async function handleJudge(req) {
+  let rubric;
+  if (req.rubricName) {
+    const found = getBuiltinRubric(req.rubricName);
+    if (!found) {
+      throw new WireError("rubric_not_found", `No built-in rubric named "${req.rubricName}".`, 404);
+    }
+    rubric = found;
+  } else if (req.rubric) {
+    rubric = req.rubric;
+  } else {
+    throw new WireError("validation_error", "Provide either `rubricName` or `rubric`.", 422);
+  }
+  const startedAt = Date.now();
+  const model = req.model ?? DEFAULT_JUDGE_MODEL;
+  const { value, result } = await callLlmJson({
+    model,
+    messages: [
+      { role: "system", content: rubric.systemPrompt },
+      { role: "user", content: buildJudgePrompt(req.content, req.context) }
+    ],
+    jsonSchema: judgeOutputSchema(rubric),
+    temperature: 0,
+    timeoutMs: 6e4
+  });
+  if (!value || typeof value !== "object" || !value.dimensions) {
+    throw new WireError("judge_error", "Judge returned malformed output.", 500, value);
+  }
+  const composite = compositeScore(value.dimensions, rubric);
+  const durationMs = Date.now() - startedAt;
+  return {
+    composite,
+    dimensions: value.dimensions,
+    failureModes: value.failureModes ?? [],
+    wins: value.wins ?? [],
+    rationale: value.rationale,
+    rubricVersion: hashRubric(rubric),
+    model: result.model,
+    durationMs
+  };
+}
+function handleListRubrics() {
+  return { rubrics: listBuiltinRubrics() };
+}
+var CACHED_VERSION;
+function readPackageVersion() {
+  if (CACHED_VERSION) return CACHED_VERSION;
+  const here = dirname(fileURLToPath(import.meta.url));
+  const candidates = [
+    resolve(here, "..", "..", "package.json"),
+    // src/wire → repo root
+    resolve(here, "..", "package.json")
+    // dist → repo root
+  ];
+  for (const path of candidates) {
+    try {
+      const pkg = JSON.parse(readFileSync(path, "utf-8"));
+      if (pkg.version) {
+        CACHED_VERSION = pkg.version;
+        return pkg.version;
+      }
+    } catch {
+    }
+  }
+  return "0.0.0-unknown";
+}
+function handleVersion() {
+  return {
+    package: "@tangle-network/agent-eval",
+    version: readPackageVersion(),
+    wireVersion: WIRE_VERSION,
+    apiSurface: ["judge", "listRubrics", "version"]
+  };
+}
+// src/wire/openapi.ts
+import { OpenApiGeneratorV31, OpenAPIRegistry } from "@asteasolutions/zod-to-openapi";
+function buildOpenApi(packageVersion) {
+  const registry = new OpenAPIRegistry();
+  registry.register("JudgeRequest", JudgeRequestSchema);
+  registry.register("JudgeResult", JudgeResultSchema);
+  registry.register("ListRubricsResponse", ListRubricsResponseSchema);
+  registry.register("VersionResponse", VersionResponseSchema);
+  registry.register("HealthResponse", HealthResponseSchema);
+  registry.register("ErrorResponse", ErrorResponseSchema);
+  registry.registerPath({
+    method: "post",
+    path: "/v1/judge",
+    summary: "Score a piece of content against a rubric",
+    description: "Runs the judging LLM with the named (or inline) rubric and returns dimension scores, detected failure modes, wins, and a composite score in 0..1.",
+    request: {
+      body: {
+        content: {
+          "application/json": { schema: JudgeRequestSchema }
+        }
+      }
+    },
+    responses: {
+      200: {
+        description: "Successful judgement",
+        content: { "application/json": { schema: JudgeResultSchema } }
+      },
+      400: {
+        description: "Validation error",
+        content: { "application/json": { schema: ErrorResponseSchema } }
+      },
+      404: {
+        description: "Rubric not found",
+        content: { "application/json": { schema: ErrorResponseSchema } }
+      },
+      500: {
+        description: "Judge error",
+        content: { "application/json": { schema: ErrorResponseSchema } }
+      }
+    }
+  });
+  registry.registerPath({
+    method: "get",
+    path: "/v1/rubrics",
+    summary: "List built-in rubrics",
+    description: "Returns every rubric registered server-side, with their dimensions and stable rubricVersion hash.",
+    responses: {
+      200: {
+        description: "Listing",
+        content: { "application/json": { schema: ListRubricsResponseSchema } }
+      }
+    }
+  });
+  registry.registerPath({
+    method: "get",
+    path: "/v1/version",
+    summary: "Server and wire-protocol version",
+    description: "Match your client version to `version`; check `wireVersion` for compatibility.",
+    responses: {
+      200: {
+        description: "Version info",
+        content: { "application/json": { schema: VersionResponseSchema } }
+      }
+    }
+  });
+  registry.registerPath({
+    method: "get",
+    path: "/healthz",
+    summary: "Liveness check",
+    responses: {
+      200: {
+        description: "OK",
+        content: { "application/json": { schema: HealthResponseSchema } }
+      }
+    }
+  });
+  const generator = new OpenApiGeneratorV31(registry.definitions);
+  return generator.generateDocument({
+    openapi: "3.1.0",
+    info: {
+      title: "@tangle-network/agent-eval \u2014 wire protocol",
+      version: packageVersion,
+      description: `HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.
+Wire-protocol version: ${WIRE_VERSION}. Bumps on breaking changes to request/response schemas.`,
+      contact: { name: "Tangle Network", url: "https://github.com/tangle-network/agent-eval" },
+      license: { name: "MIT" }
+    },
+    servers: [{ url: "http://localhost:5005", description: "Local agent-eval serve" }]
+  });
+}
+// src/wire/server.ts
+import { serve } from "@hono/node-server";
+import { Hono } from "hono";
+import { cors } from "hono/cors";
+var STARTED_AT = Date.now();
+function createApp() {
+  const app = new Hono();
+  app.use("*", cors());
+  app.onError((err, c) => {
+    if (err instanceof WireError) {
+      return c.json(
+        { error: { code: err.code, message: err.message, details: err.details } },
+        err.status
+      );
+    }
+    console.error("[agent-eval] unhandled error:", err);
+    return c.json(
+      { error: { code: "internal_error", message: "Internal server error." } },
+      500
+    );
+  });
+  app.get(
+    "/healthz",
+    (c) => c.json({ status: "ok", uptimeSec: (Date.now() - STARTED_AT) / 1e3 })
+  );
+  app.get("/v1/version", (c) => c.json(handleVersion()));
+  app.get("/v1/rubrics", (c) => c.json(handleListRubrics()));
+  app.post("/v1/judge", async (c) => {
+    const raw = await c.req.json().catch(() => null);
+    if (raw == null) {
+      throw new WireError("validation_error", "Request body must be JSON.", 400);
+    }
+    const parsed = JudgeRequestSchema.safeParse(raw);
+    if (!parsed.success) {
+      throw new WireError(
+        "validation_error",
+        "Request did not match JudgeRequest schema.",
+        400,
+        parsed.error.issues
+      );
+    }
+    const result = await handleJudge(parsed.data);
+    return c.json(result);
+  });
+  app.get("/openapi.json", (c) => c.json(buildOpenApi(handleVersion().version)));
+  return app;
+}
+function startServer(opts = {}) {
+  const app = createApp();
+  const port = opts.port ?? 5005;
+  const host = opts.host ?? "127.0.0.1";
+  return serve({ fetch: app.fetch, port, hostname: host }, ({ address, port: actualPort }) => {
+    console.log(`[agent-eval] serving on http://${address}:${actualPort}`);
+  });
+}
+// src/wire/rpc.ts
+async function dispatchRpc(req) {
+  try {
+    switch (req.method) {
+      case "judge": {
+        const parsed = JudgeRequestSchema.safeParse(req.params);
+        if (!parsed.success) {
+          return {
+            error: {
+              code: "validation_error",
+              message: "params did not match JudgeRequest schema.",
+              details: parsed.error.issues
+            }
+          };
+        }
+        return { result: await handleJudge(parsed.data) };
+      }
+      case "listRubrics":
+        return { result: handleListRubrics() };
+      case "version":
+        return { result: handleVersion() };
+      default:
+        return {
+          error: {
+            code: "unknown_method",
+            message: `No such method: ${req.method}`
+          }
+        };
+    }
+  } catch (err) {
+    if (err instanceof WireError) {
+      return { error: { code: err.code, message: err.message, details: err.details } };
+    }
+    const message = err instanceof Error ? err.message : String(err);
+    return { error: { code: "internal_error", message } };
+  }
+}
+async function readAll(stream) {
+  const chunks = [];
+  for await (const chunk of stream) {
+    chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+  }
+  return Buffer.concat(chunks).toString("utf-8");
+}
+async function runRpcOnce(method) {
+  const raw = await readAll(process.stdin);
+  let req;
+  try {
+    const body = JSON.parse(raw);
+    req = method ? { method, params: body } : body;
+  } catch (err) {
+    process.stdout.write(
+      JSON.stringify({
+        error: {
+          code: "parse_error",
+          message: `stdin was not valid JSON: ${err instanceof Error ? err.message : String(err)}`
+        }
+      }) + "\n"
+    );
+    return 1;
+  }
+  const out = await dispatchRpc(req);
+  process.stdout.write(JSON.stringify(out) + "\n");
+  return "error" in out ? 1 : 0;
+}
+async function runRpcBatch(method) {
+  const raw = await readAll(process.stdin);
+  const lines = raw.split("\n").filter((l) => l.trim().length > 0);
+  let exitCode = 0;
+  for (const line of lines) {
+    let req;
+    try {
+      const body = JSON.parse(line);
+      req = method ? { method, params: body } : body;
+    } catch (err) {
+      process.stdout.write(
+        JSON.stringify({
+          error: {
+            code: "parse_error",
+            message: `line was not valid JSON: ${err instanceof Error ? err.message : String(err)}`
+          }
+        }) + "\n"
+      );
+      exitCode = 1;
+      continue;
+    }
+    const out = await dispatchRpc(req);
+    process.stdout.write(JSON.stringify(out) + "\n");
+    if ("error" in out) exitCode = 1;
+  }
+  return exitCode;
+}
+export {
+  RubricDimensionSchema,
+  FailureModeSchema,
+  RubricSchema,
+  JudgeRequestSchema,
+  JudgeResultSchema,
+  RubricInfoSchema,
+  ListRubricsResponseSchema,
+  VersionResponseSchema,
+  HealthResponseSchema,
+  ErrorResponseSchema,
+  WIRE_VERSION,
+  hashRubric,
+  BUILTIN_RUBRICS,
+  getBuiltinRubric,
+  listBuiltinRubrics,
+  WireError,
+  handleJudge,
+  handleListRubrics,
+  handleVersion,
+  buildOpenApi,
+  createApp,
+  startServer,
+  dispatchRpc,
+  runRpcOnce,
+  runRpcBatch
+};
+//# sourceMappingURL=chunk-OZPRSK4A.js.map