npm - @openwop/openwop-conformance - Versions diffs - 1.3.0 → 1.5.0 - Mend

@openwop/openwop-conformance 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (118) hide show

package/src/scenarios/envelope-completion-distinguishes-truncation.test.ts ADDED Viewed

@@ -0,0 +1,223 @@
+/**
+ * envelope-completion-distinguishes-truncation — RFC 0033 §A + §B + §C
+ * truncation-vs-schema-violation retry-routing distinction.
+ *
+ * Capability-gated on `capabilities.envelopes.reliability.supported: true`
+ * AND `capabilities.envelopes.reliability.completion.distinguishesTruncation: true`
+ * AND the host's test seam. Soft-skip cleanly on hosts that conflate the two
+ * paths (legacy v1.1 behavior).
+ *
+ * Asserts two scenarios:
+ *
+ * 1. **Truncation path** (RFC 0033 §B). Mock LLM stops at `max_tokens` mid-envelope.
+ *    - `envelope.truncated` event fires.
+ *    - `envelope.retry.attempted` fires with `reason: 'truncation'`.
+ *    - The retry's `maxTokens` budget is strictly greater than the initial.
+ *
+ * 2. **Schema-violation path** (RFC 0033 §C). Mock LLM emits malformed JSON.
+ *    - NO `envelope.truncated` event.
+ *    - `envelope.retry.attempted` fires with `reason` ∈ {`schema-violation`, `parse-error`}.
+ *    - The retry's `maxTokens` budget is UNCHANGED from the initial.
+ *
+ * Each scenario reuses an existing per-path fixture rather than adding a new one:
+ * `conformance-envelope-truncated` for the truncation case and
+ * `conformance-envelope-retry-attempted` for the schema-violation case.
+ *
+ * @see RFCS/0033-envelope-completion-contract.md §A + §B + §C
+ * @see spec/v1/ai-envelope.md §"Envelope-completion criteria"
+ */
+import { describe, it, expect } from 'vitest';
+import { driver } from '../lib/driver.js';
+import { pollUntilTerminal } from '../lib/polling.js';
+import { isFixtureAdvertised } from '../lib/fixtures.js';
+// True when OPENWOP_BASE_URL is unset or empty; passed to `describe.skipIf` so
+// every suite in this file is skipped when no base URL is configured.
+const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
+// Node id sent to the mock-AI program seam and to the last-dispatch-budget query.
+const NODE_ID = 'structured-call';
+/**
+ * The slice of the `/.well-known/openwop` discovery document that this file reads.
+ *
+ * The leaf fields are typed `unknown` on purpose: they come from the host and
+ * must be type-checked at runtime before use.
+ */
+interface DiscoveryDoc {
+  capabilities?: {
+    envelopes?: {
+      reliability?: {
+        // RFC 0033 §E advertisement block; absent on hosts that do not advertise it.
+        completion?: {
+          // Expected to be a boolean when the block is advertised.
+          distinguishesTruncation?: unknown;
+          // Expected to be a number in [1, 8] when present.
+          truncationBudgetMultiplier?: unknown;
+        };
+      };
+    };
+  };
+}
+/**
+ * One entry of the `events` array returned by `GET /v1/runs/{runId}/events`,
+ * limited to the fields these tests look at.
+ */
+interface RunEvent {
+  // Event type name, e.g. `envelope.truncated` or `envelope.retry.attempted`.
+  type: string;
+  // Event-specific data; the tests read `payload.reason` from retry events.
+  payload?: Record<string, unknown>;
+  // Presumably the id of the node that emitted the event — not read in this file.
+  nodeId?: string;
+  // Presumably the event's position in the run's event log — not read in this file.
+  sequence: number;
+}
+/**
+ * Fetches the host's discovery document from `/.well-known/openwop`.
+ *
+ * Deliberately best-effort: a transport error, a non-200 status, or a body
+ * that is missing or not a JSON object all yield `null`, so callers can
+ * soft-skip instead of failing.
+ *
+ * @returns The discovery document, or `null` when it is unavailable.
+ */
+async function readDiscovery(): Promise<DiscoveryDoc | null> {
+  try {
+    const res = await driver.get('/.well-known/openwop');
+    if (res.status !== 200) return null;
+    // A 200 with an empty or primitive body must not reach callers as
+    // `undefined`: they only guard against `null` before dereferencing.
+    const body: unknown = res.json;
+    if (typeof body !== 'object' || body === null) return null;
+    return body as DiscoveryDoc;
+  } catch {
+    return null;
+  }
+}
+/**
+ * Seeds the sample host's mock-AI test seam with the scripted responses the
+ * `structured-call` node should receive, one entry per LLM call.
+ *
+ * @param program Scripted responses, in call order.
+ * @returns The HTTP status of the seeding request; callers treat 404 as
+ *   "seam not available" and soft-skip.
+ */
+async function programMock(program: Array<Record<string, unknown>>): Promise<{ status: number }> {
+  const seedRequest = { nodeId: NODE_ID, program };
+  const { status } = await driver.post('/v1/host/sample/test/mock-ai/program', seedRequest);
+  return { status };
+}
+/**
+ * Creates a run of `workflowId`, waits (up to 10 s) for it to reach a terminal
+ * state, then reads back its event log.
+ *
+ * @param workflowId Workflow / fixture id to run.
+ * @returns The run's events plus whatever `pollUntilTerminal` resolved to, or
+ *   `null` when the run could not be created (non-201) or its events could
+ *   not be read (non-200).
+ */
+async function startRunAndRead(workflowId: string): Promise<{ events: RunEvent[]; terminal: unknown } | null> {
+  const created = await driver.post('/v1/runs', { workflowId });
+  if (created.status !== 201) return null;
+  const { runId } = created.json as { runId: string };
+  const terminal = await pollUntilTerminal(runId, { timeoutMs: 10_000 });
+  const eventsPath = `/v1/runs/${encodeURIComponent(runId)}/events`;
+  const listing = await driver.get(eventsPath);
+  if (listing.status !== 200) return null;
+  // A body without an `events` array is treated as an empty log.
+  const listingBody = listing.json as { events?: RunEvent[] } | undefined;
+  const events: RunEvent[] = listingBody?.events ?? [];
+  return { events, terminal };
+}
+/**
+ * Reads the `maxTokens` budget the mock-AI seam recorded for the most recent
+ * dispatch to the `structured-call` node.
+ *
+ * @returns The recorded budget, or `null` when the seam is unavailable
+ *   (non-200), the response has no body, or no budget was recorded.
+ */
+async function lastBudget(): Promise<number | null> {
+  const res = await driver.get(`/v1/host/sample/test/mock-ai/last-dispatch-budget?nodeId=${encodeURIComponent(NODE_ID)}`);
+  if (res.status !== 200) return null;
+  // Tolerate a 200 with no body, as the events reader does, instead of
+  // throwing a TypeError that would mask the soft-skip.
+  const body = res.json as { maxTokens?: number | null } | undefined;
+  return body?.maxTokens ?? null;
+}
+// Advertisement-shape suite: validates the RFC 0033 §E `completion` block only
+// when the host advertises it; an absent document or block is a soft-skip.
+describe.skipIf(HTTP_SKIP)('envelope-completion-distinguishes-truncation: advertisement shape (RFC 0033 §E)', () => {
+  const RFC_SECTION_E = 'RFCS/0033-envelope-completion-contract.md §E';
+  it('capabilities.envelopes.reliability.completion (when present) conforms to RFC 0033 §E', async () => {
+    const discovery = await readDiscovery();
+    if (discovery === null) return;
+    const block = discovery.capabilities?.envelopes?.reliability?.completion;
+    if (block === undefined) return;
+    const flagType = typeof block.distinguishesTruncation;
+    expect(
+      flagType,
+      driver.describe(RFC_SECTION_E, 'completion.distinguishesTruncation MUST be boolean when block is advertised'),
+    ).toBe('boolean');
+    // The multiplier is optional; it is only range-checked when advertised.
+    const multiplier = block.truncationBudgetMultiplier;
+    if (multiplier === undefined) return;
+    const inRange = typeof multiplier === 'number' && multiplier >= 1 && multiplier <= 8;
+    expect(
+      inRange,
+      driver.describe(RFC_SECTION_E, 'truncationBudgetMultiplier MUST be a number in [1, 8] (default 2)'),
+    ).toBe(true);
+  });
+});
+// Workflow id of the fixture run by the truncation-path tests.
+const TRUNCATED_FIXTURE = 'conformance-envelope-truncated';
+// Workflow id of the fixture run by the schema-violation-path tests.
+const SCHEMA_VIOLATION_FIXTURE = 'conformance-envelope-retry-attempted';
+// Truncation-path suite (RFC 0033 §B): the mock stops at `max_tokens`
+// mid-envelope, then succeeds on the retry.
+describe.skipIf(HTTP_SKIP)('envelope-completion-distinguishes-truncation: truncation path (RFC 0033 §B)', () => {
+  const RFC_SECTION_B = 'RFCS/0033-envelope-completion-contract.md §B';
+  // Initial `maxTokens` of the truncation fixture, as stated in the assertion message below.
+  const INITIAL_FIXTURE_BUDGET = 50;
+
+  /**
+   * Shared setup for both tests: checks the gates, programs the mock with a
+   * truncated first response followed by a valid one, and runs the fixture.
+   *
+   * @returns The run's events, or `null` when the test must soft-skip
+   *   (fixture not advertised, capability not advertised, mock seam missing,
+   *   or the run could not be created / read).
+   */
+  async function runTruncationScenario(): Promise<RunEvent[] | null> {
+    if (!isFixtureAdvertised(TRUNCATED_FIXTURE)) return null;
+    const d = await readDiscovery();
+    if (d?.capabilities?.envelopes?.reliability?.completion?.distinguishesTruncation !== true) return null;
+    const seed = await programMock([
+      { stopReason: 'max_tokens', content: '{"partial' },
+      { stopReason: 'end_turn', content: '{"valid":true}' },
+    ]);
+    if (seed.status === 404) return null;
+    const result = await startRunAndRead(TRUNCATED_FIXTURE);
+    return result === null ? null : result.events;
+  }
+
+  it('truncation: emits envelope.truncated + envelope.retry.attempted with reason: "truncation"', async () => {
+    const events = await runTruncationScenario();
+    if (events === null) return;
+    const truncated = events.find((e) => e.type === 'envelope.truncated');
+    expect(truncated, 'envelope.truncated MUST fire on the truncation path').toBeDefined();
+    const retry = events.find((e) => e.type === 'envelope.retry.attempted');
+    expect(retry, 'envelope.retry.attempted MUST fire between attempts').toBeDefined();
+    expect(
+      retry?.payload?.reason,
+      driver.describe(
+        RFC_SECTION_B,
+        'truncation-routed retry MUST carry reason: "truncation" (distinct from schema-violation per RFC 0033 §A precedence rule)',
+      ),
+    ).toBe('truncation');
+  });
+
+  it('truncation: retry budget strictly greater than initial (RFC 0033 §B truncationBudgetMultiplier)', async () => {
+    // If the run could not be created or read, the seam would still report the
+    // budget of an earlier run; skip rather than assert on stale data.
+    const events = await runTruncationScenario();
+    if (events === null) return;
+    const budget = await lastBudget();
+    if (budget === null) return;
+    expect(
+      budget,
+      driver.describe(
+        RFC_SECTION_B,
+        'truncation retry MUST multiply maxTokens by truncationBudgetMultiplier — final budget > initial 50 fixture value',
+      ),
+    ).toBeGreaterThan(INITIAL_FIXTURE_BUDGET);
+  });
+});
+// Schema-violation-path suite (RFC 0033 §C): the mock emits malformed JSON,
+// then a valid envelope on the retry.
+describe.skipIf(HTTP_SKIP)('envelope-completion-distinguishes-truncation: schema-violation path (RFC 0033 §C)', () => {
+  const RFC_SECTION_C = 'RFCS/0033-envelope-completion-contract.md §C';
+  // Loose upper bound for an un-multiplied budget; see the comment in the budget test.
+  const MULTIPLIED_BUDGET_FLOOR = 20_000;
+
+  /**
+   * Shared setup for both tests: checks the gates, programs the mock with a
+   * malformed response followed by a valid one, and runs the fixture.
+   *
+   * The `distinguishesTruncation` gate matches the file header and the
+   * truncation suite: hosts that conflate the two retry paths soft-skip.
+   *
+   * @returns The run's events, or `null` when the test must soft-skip.
+   */
+  async function runSchemaViolationScenario(): Promise<RunEvent[] | null> {
+    if (!isFixtureAdvertised(SCHEMA_VIOLATION_FIXTURE)) return null;
+    const d = await readDiscovery();
+    if (d?.capabilities?.envelopes?.reliability?.completion?.distinguishesTruncation !== true) return null;
+    const seed = await programMock([
+      { content: 'not valid json' },
+      { content: '{"valid":true}' },
+    ]);
+    if (seed.status === 404) return null;
+    const result = await startRunAndRead(SCHEMA_VIOLATION_FIXTURE);
+    return result === null ? null : result.events;
+  }
+
+  it('schema-violation: NO envelope.truncated; envelope.retry.attempted reason ∈ {schema-violation, parse-error}', async () => {
+    const events = await runSchemaViolationScenario();
+    if (events === null) return;
+    const truncated = events.find((e) => e.type === 'envelope.truncated');
+    expect(
+      truncated,
+      driver.describe(
+        RFC_SECTION_C,
+        'schema-violation path MUST NOT emit envelope.truncated (truncation and schema-violation are distinct paths per RFC 0033 §A)',
+      ),
+    ).toBeUndefined();
+    const retry = events.find((e) => e.type === 'envelope.retry.attempted');
+    expect(retry).toBeDefined();
+    const reason = retry?.payload?.reason;
+    expect(
+      reason === 'schema-violation' || reason === 'parse-error',
+      driver.describe(
+        RFC_SECTION_C,
+        'schema-violation-routed retry MUST carry reason ∈ {schema-violation, parse-error}; truncation reason is reserved for the budget-doubling path',
+      ),
+    ).toBe(true);
+  });
+
+  it('schema-violation: retry budget UNCHANGED from initial (no budget multiplication on this path)', async () => {
+    // If the run could not be created or read, the seam would still report the
+    // budget of an earlier run; skip rather than assert on stale data.
+    const events = await runSchemaViolationScenario();
+    if (events === null) return;
+    const budget = await lastBudget();
+    if (budget === null) return;
+    // The schema-violation fixture doesn't set maxTokens explicitly →
+    // budget snapshots whatever the host's default is on each call.
+    // The KEY invariant: the retry call's budget MUST NOT be multiplied
+    // (the truncation path doubles; this path keeps the same). The
+    // budget on the last call equals the budget on the first call.
+    // Without a per-call history hook, we can't strictly compare; we
+    // assert the budget didn't grow into the truncation-path range
+    // (which would be ≥2× the default — typically 8000 for the
+    // sample's structuredOutput dispatch path).
+    expect(
+      budget,
+      driver.describe(
+        RFC_SECTION_C,
+        'schema-violation retry MUST NOT multiply maxTokens — budget stays at the original value (host default)',
+      ),
+    ).toBeLessThan(MULTIPLIED_BUDGET_FLOOR);
+  });
+});

package/src/scenarios/envelope-nl-to-format-engaged.test.ts ADDED Viewed

@@ -0,0 +1,152 @@
+/**
+ * envelope-nl-to-format-engaged — RFC 0032 §B.5 runtime behavior (MAY tier).
+ *
+ * Capability-gated on `capabilities.envelopes.reliability.supported: true`
+ * AND `events[]` includes `envelope.nlToFormat.engaged`. Soft-skip cleanly
+ * on hosts that don't implement NL-to-Format fallback — NL-to-Format is one
+ * of many possible recovery strategies; hosts that don't advertise it don't
+ * need to emit.
+ *
+ * Asserts:
+ *   1. When retry exhaustion triggers the NL-to-Format fallback (per Tam et al.
+ *      mitigation: free-form reasoning in the first call → schema coercion
+ *      in the second call), exactly one `envelope.nlToFormat.engaged` event
+ *      fires.
+ *   2. `originalEnvelopeType` carries the envelope kind the original attempt
+ *      was trying to emit.
+ *   3. `fallbackCalls >= 1` (informational — how many secondary LLM calls
+ *      the host issued to reformat).
+ *   4. The eventual envelope acceptance (when fallback succeeds) records
+ *      normally via downstream RunEventDoc.
+ *
+ * @see RFCS/0032-envelope-reliability-events.md §B.5
+ * @see Tam et al., "Let Me Speak Freely?" — https://arxiv.org/pdf/2408.02442
+ * @see schemas/run-event-payloads.schema.json §envelopeNlToFormatEngaged
+ */
+import { describe, it, expect } from 'vitest';
+import { driver } from '../lib/driver.js';
+import { pollUntilTerminal } from '../lib/polling.js';
+import { isFixtureAdvertised } from '../lib/fixtures.js';
+// True when OPENWOP_BASE_URL is unset or empty; passed to `describe.skipIf` so
+// the suite is skipped when no base URL is configured.
+const HTTP_SKIP = !process.env.OPENWOP_BASE_URL;
+// Workflow id of the fixture this file runs; also the id checked with `isFixtureAdvertised`.
+const FIXTURE = 'conformance-envelope-nl-to-format-engaged';
+// Node id sent to the mock-AI program seam and matched against `node.completed` events.
+const NODE_ID = 'structured-call';
+/**
+ * One entry of the `events` array returned by `GET /v1/runs/{runId}/events`,
+ * limited to the fields these tests look at.
+ */
+interface RunEvent {
+  // Event type name, e.g. `envelope.nlToFormat.engaged` or `node.completed`.
+  type: string;
+  // Event-specific data; the tests read `originalEnvelopeType` and `fallbackCalls` from it.
+  payload?: Record<string, unknown>;
+  // Compared against NODE_ID to find the `node.completed` event of the structured call.
+  nodeId?: string;
+  // Presumably the event's position in the run's event log — not read in this file.
+  sequence: number;
+}
+/**
+ * Seeds the sample host's mock-AI test seam with the scripted responses the
+ * `structured-call` node should receive, one entry per LLM call.
+ *
+ * @param program Scripted responses, in call order.
+ * @returns The HTTP status of the seeding request; callers treat 404 as
+ *   "seam not available" and soft-skip.
+ */
+async function programMock(program: Array<Record<string, unknown>>): Promise<{ status: number }> {
+  const seedRequest = { nodeId: NODE_ID, program };
+  const { status } = await driver.post('/v1/host/sample/test/mock-ai/program', seedRequest);
+  return { status };
+}
+/**
+ * Creates a run of the NL-to-Format fixture, waits (up to 10 s) for it to
+ * reach a terminal state, then reads back its event log.
+ *
+ * @returns The run's events (empty when the body carries no `events` array),
+ *   or `null` when the run could not be created (non-201) or its events could
+ *   not be read (non-200).
+ */
+async function runAndReadEvents(): Promise<RunEvent[] | null> {
+  const created = await driver.post('/v1/runs', { workflowId: FIXTURE });
+  if (created.status !== 201) return null;
+  const { runId } = created.json as { runId: string };
+  await pollUntilTerminal(runId, { timeoutMs: 10_000 });
+  const eventsPath = `/v1/runs/${encodeURIComponent(runId)}/events`;
+  const listing = await driver.get(eventsPath);
+  if (listing.status !== 200) return null;
+  const listingBody = listing.json as { events?: RunEvent[] } | undefined;
+  const events: RunEvent[] = listingBody?.events ?? [];
+  return events;
+}
+// Mock program shared by every test in this file. The mock returns the Nth
+// entry for the Nth LLM call:
+//   - entries 1-3 are free-form natural-language replies, intended to exhaust
+//     the host's retry budget (assumes the budget is spent after three
+//     attempts — TODO confirm against the fixture definition);
+//   - entry 4 is the coerced response the NL-to-Format fallback's secondary
+//     call returns — valid JSON matching the schema. Its `coerced-ok` marker
+//     is what the downstream `node.completed` assertion looks for.
+const NL_THEN_COERCED_PROGRAM = [
+  { content: 'Sure, here is the result: the answer is OK.' },
+  { content: 'Of course! The result you wanted is okay.' },
+  { content: 'I think the result should be ok-ish.' },
+  { content: '{"result":"coerced-ok"}' },
+];
+// Runtime-behaviour suite for the NL-to-Format fallback (RFC 0032 §B.5, MAY tier).
+// Every test soft-skips when the fixture is not advertised, the mock-AI seam
+// is missing (404), or the run could not be created / read.
+describe.skipIf(HTTP_SKIP)('envelope-nl-to-format-engaged: runtime behavior (RFC 0032 §B.5 MAY)', () => {
+  const RFC_SECTION_B5 = 'RFCS/0032-envelope-reliability-events.md §B.5';
+
+  /**
+   * Shared setup: programs the mock with three NL replies plus the coerced
+   * reply, verifies the seam accepted the program, and runs the fixture.
+   *
+   * The 200 check runs in every test, so no test asserts RFC behaviour
+   * against a mock that was never programmed.
+   *
+   * @returns The run's events, or `null` when the test must soft-skip.
+   */
+  async function seedAndRun(): Promise<RunEvent[] | null> {
+    if (!isFixtureAdvertised(FIXTURE)) return null;
+    const seed = await programMock(NL_THEN_COERCED_PROGRAM);
+    if (seed.status === 404) return null;
+    expect(seed.status).toBe(200);
+    return runAndReadEvents();
+  }
+
+  it('when retry exhaustion triggers the NL-to-Format fallback, exactly one `envelope.nlToFormat.engaged` event fires', async () => {
+    const events = await seedAndRun();
+    if (events === null) return;
+    const engagements = events.filter((e) => e.type === 'envelope.nlToFormat.engaged');
+    expect(
+      engagements.length,
+      driver.describe(
+        RFC_SECTION_B5,
+        'exactly one envelope.nlToFormat.engaged event MUST fire when the host detects NL-shape responses after retry exhaustion',
+      ),
+    ).toBe(1);
+  });
+
+  it('`originalEnvelopeType` carries the envelope kind the original attempt targeted', async () => {
+    const events = await seedAndRun();
+    if (events === null) return;
+    const engagement = events.find((e) => e.type === 'envelope.nlToFormat.engaged');
+    expect(engagement).toBeDefined();
+    const originalEnvelopeType = engagement?.payload?.originalEnvelopeType;
+    expect(
+      typeof originalEnvelopeType,
+      driver.describe(
+        RFC_SECTION_B5,
+        'originalEnvelopeType MUST be present and string-typed — derived from the response-schema or wrapping metadata',
+      ),
+    ).toBe('string');
+    // The assertion above guarantees a string here; an empty one is still a failure.
+    expect((originalEnvelopeType as string).length).toBeGreaterThan(0);
+  });
+
+  it('`fallbackCalls >= 1` reports the number of secondary LLM calls used to reformat free-form output into the envelope schema', async () => {
+    const events = await seedAndRun();
+    if (events === null) return;
+    const engagement = events.find((e) => e.type === 'envelope.nlToFormat.engaged');
+    expect(engagement).toBeDefined();
+    const fallbackCalls = engagement?.payload?.fallbackCalls;
+    expect(typeof fallbackCalls).toBe('number');
+    expect(
+      fallbackCalls as number,
+      driver.describe(
+        RFC_SECTION_B5,
+        'fallbackCalls MUST be >= 1 — the fallback fired at least one secondary call to reformat the free-form output',
+      ),
+    ).toBeGreaterThanOrEqual(1);
+  });
+
+  it('the eventual envelope acceptance (when fallback succeeds) records normally via downstream RunEventDoc', async () => {
+    const events = await seedAndRun();
+    if (events === null) return;
+    const nodeCompleted = events.find((e) => e.type === 'node.completed' && e.nodeId === NODE_ID);
+    expect(
+      nodeCompleted,
+      driver.describe(
+        RFC_SECTION_B5,
+        'NL-to-Format fallback success MUST reach node.completed — the coerced envelope flows downstream like any other accepted envelope',
+      ),
+    ).toBeDefined();
+    // Search the serialized payload so the marker is found wherever the host nests the output.
+    const completedPayload = JSON.stringify(nodeCompleted?.payload ?? {});
+    expect(
+      completedPayload.includes('coerced-ok'),
+      driver.describe(
+        RFC_SECTION_B5,
+        'the coerced structured data from the secondary call MUST flow to the downstream RunEventDoc',
+      ),
+    ).toBe(true);
+  });
+});