npm - @demigodmode/pi-web-agent - Versions diffs - 0.2.1 → 0.2.2 - Mend

@demigodmode/pi-web-agent 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76) hide show

package/README.md +63 -199
package/dist/scripts/live-web-eval.d.ts +1 -0
package/dist/scripts/live-web-eval.js +411 -0
package/dist/src/cache/ttl-cache.d.ts +8 -0
package/dist/src/cache/ttl-cache.js +21 -0
package/dist/src/extension.d.ts +2 -0
package/dist/src/extension.js +155 -0
package/dist/src/extract/readability.d.ts +8 -0
package/dist/src/extract/readability.js +93 -0
package/dist/src/fetch/browser-resolution.d.ts +15 -0
package/dist/src/fetch/browser-resolution.js +55 -0
package/dist/src/fetch/headless-fetch.d.ts +18 -0
package/dist/src/fetch/headless-fetch.js +87 -0
package/dist/src/fetch/http-fetch.d.ts +4 -0
package/dist/src/fetch/http-fetch.js +50 -0
package/dist/src/orchestration/index.d.ts +41 -0
package/dist/src/orchestration/index.js +9 -0
package/dist/src/orchestration/research-orchestrator.d.ts +43 -0
package/dist/src/orchestration/research-orchestrator.js +87 -0
package/dist/src/orchestration/research-types.d.ts +41 -0
package/dist/src/orchestration/research-types.js +1 -0
package/dist/src/orchestration/research-worker.d.ts +16 -0
package/dist/src/orchestration/research-worker.js +131 -0
package/dist/src/search/duckduckgo.d.ts +9 -0
package/dist/src/search/duckduckgo.js +52 -0
package/dist/src/tools/web-explore.d.ts +44 -0
package/dist/src/tools/web-explore.js +50 -0
package/dist/src/tools/web-fetch-headless.d.ts +6 -0
package/dist/src/tools/web-fetch-headless.js +14 -0
package/dist/src/tools/web-fetch.d.ts +6 -0
package/dist/src/tools/web-fetch.js +14 -0
package/dist/src/tools/web-search.d.ts +10 -0
package/dist/src/tools/web-search.js +103 -0
package/dist/src/types.d.ts +48 -0
package/dist/src/types.js +7 -0
package/dist/tests/cache/ttl-cache.test.d.ts +1 -0
package/dist/tests/cache/ttl-cache.test.js +19 -0
package/dist/tests/contracts.test.d.ts +1 -0
package/dist/tests/contracts.test.js +65 -0
package/dist/tests/extension.test.d.ts +1 -0
package/dist/tests/extension.test.js +123 -0
package/dist/tests/extract/readability.test.d.ts +1 -0
package/dist/tests/extract/readability.test.js +79 -0
package/dist/tests/fetch/browser-resolution.test.d.ts +1 -0
package/dist/tests/fetch/browser-resolution.test.js +37 -0
package/dist/tests/fetch/headless-fetch.smoke.test.d.ts +1 -0
package/dist/tests/fetch/headless-fetch.smoke.test.js +17 -0
package/dist/tests/fetch/headless-fetch.test.d.ts +1 -0
package/dist/tests/fetch/headless-fetch.test.js +150 -0
package/dist/tests/fetch/http-fetch.test.d.ts +1 -0
package/dist/tests/fetch/http-fetch.test.js +129 -0
package/dist/tests/orchestration/research-orchestrator.test.d.ts +1 -0
package/dist/tests/orchestration/research-orchestrator.test.js +298 -0
package/dist/tests/orchestration/research-worker.test.d.ts +1 -0
package/dist/tests/orchestration/research-worker.test.js +171 -0
package/dist/tests/orchestration/research-workflow.test.d.ts +1 -0
package/dist/tests/orchestration/research-workflow.test.js +119 -0
package/dist/tests/package-manifest.test.d.ts +1 -0
package/dist/tests/package-manifest.test.js +29 -0
package/dist/tests/release-foundation.test.d.ts +1 -0
package/dist/tests/release-foundation.test.js +16 -0
package/dist/tests/release-script.test.d.ts +1 -0
package/dist/tests/release-script.test.js +72 -0
package/dist/tests/search/duckduckgo.test.d.ts +1 -0
package/dist/tests/search/duckduckgo.test.js +103 -0
package/dist/tests/tools/web-explore.test.d.ts +1 -0
package/dist/tests/tools/web-explore.test.js +163 -0
package/dist/tests/tools/web-fetch-headless.test.d.ts +1 -0
package/dist/tests/tools/web-fetch-headless.test.js +31 -0
package/dist/tests/tools/web-fetch.test.d.ts +1 -0
package/dist/tests/tools/web-fetch.test.js +27 -0
package/dist/tests/tools/web-search.test.d.ts +1 -0
package/dist/tests/tools/web-search.test.js +125 -0
package/dist/vitest.config.d.ts +2 -0
package/dist/vitest.config.js +13 -0
package/package.json +5 -1

package/README.md CHANGED Viewed

@@ -1,199 +1,63 @@
-# pi-web-agent
-`@demigodmode/pi-web-agent` is a Pi package for reliable web access.
-It is built around a simple rule: searching for a page is not the same thing as reading it. This package keeps those steps separate, prefers plain HTTP by default, and is designed to say "I couldn't read this reliably" instead of making something up.
-## What it does
-The package is built around three tools:
-- `web_search` finds relevant pages and returns titles, URLs, and snippets
-- `web_fetch` fetches a specific page over plain HTTP and tries to extract readable content
-- `web_fetch_headless` is the explicit browser-based path for pages that need rendering
-The boundary between those tools is intentional.
-`web_search` is for discovery. It should not imply that a page was fetched.
-`web_fetch` is for reading a page over HTTP. If the result looks weak, incomplete, blocked, or too script-heavy, it should return `needs_headless` instead of bluffing.
-`web_fetch_headless` exists for the cases where a browser really is required. It is opt-in only.
-## Why this exists
-A lot of web tooling in coding agents gets fuzzy in exactly the wrong places. Search results get treated like page reads. Browser fallback happens behind the scenes. Failures get softened into fake confidence.
-This package is trying to do the opposite.
-The rules are straightforward:
-- no hidden browser launch
-- no automatic HTTP-to-headless fallback
-- no claiming a page was read when only snippets were available
-- explicit structured failure when the result is incomplete or blocked
-## What makes it different
-The main thing is the contract.
-`web_search` discovers sources.
-`web_fetch` reads over HTTP only.
-`web_fetch_headless` is the explicit browser path.
-That separation is the whole point. It makes failures easier to reason about and avoids the weird behavior where a tool quietly changes execution mode behind your back.
-## Install
-Install it through Pi:
-```bash
-pi install npm:@demigodmode/pi-web-agent
-```
-Update installed packages later with:
-```bash
-pi update
-```
-If you just want to inspect the package from npm directly, the package name is:
-```bash
-npm view @demigodmode/pi-web-agent
-```
-## Current status
-This repo is in early MVP shape, but it is no longer just a design doc.
-Right now it has:
-- a TypeScript project scaffold
-- shared result and status contracts
-- a DuckDuckGo HTML parser for `web_search`
-- an HTTP fetch path with readability-based extraction and conservative escalation to `needs_headless`
-- a real browser-backed `web_fetch_headless` implementation with local browser resolution
-- repo-local Pi extension wiring for development
-- a test suite around parser behavior, contracts, extraction, caching, and tool adapters
-- optional smoke coverage for local installed browsers
-So the project is real and usable, but still early.
-## Example behavior
-These are conceptual examples of the contract the package is aiming to expose.
-### Search
-`web_search("pi coding agent")`
-Returns discovery results like:
-- title
-- URL
-- snippet
-It does not imply the page was fetched.
-### HTTP fetch
-`web_fetch("https://example.com/article")`
-If the page is readable over plain HTTP, it should return extracted content.
-If the page looks too script-heavy, too thin, blocked, or otherwise unreliable, it should return `needs_headless` instead of pretending the extraction is good enough.
-### Explicit headless fetch
-`web_fetch_headless("https://example.com/app")`
-This is the browser-based path for pages that really need rendering.
-This path now launches a local browser explicitly, waits for the rendered page to settle, and then extracts readable content from the rendered HTML.
-## Local development
-Install dependencies:
-```bash
-npm install
-```
-Run tests with coverage:
-```bash
-npm test
-```
-Run the typecheck used as lint:
-```bash
-npm run lint
-```
-Build the project:
-```bash
-npm run build
-```
-To run the optional real-browser smoke test for headless fetch, set `PI_HEADLESS_SMOKE=1` before running Vitest. It stays skipped by default so local browser install differences do not make the normal test suite flaky.
-Coverage is now part of the normal `npm test` flow. Vitest prints a text summary in the terminal and writes the full HTML report to `coverage/`.
-### Trying it in Pi locally
-This repo includes a project-local Pi extension entrypoint at `.pi/extensions/pi-web-agent.ts` for development and hot reload.
-For the published npm package, Pi loads the compiled runtime from `dist/extension.js` via the `pi.extensions` entry in `package.json`.
-After starting Pi in this project, use `/reload` if you change the extension code and want Pi to pick up the latest version.
-## Project layout
-The code is split into small modules on purpose.
-- `src/extension.ts` - package entry surface
-- `src/tools/` - thin tool adapters
-- `src/search/` - search backend logic
-- `src/fetch/` - HTTP and headless fetch logic
-- `src/extract/` - readable-content extraction
-- `src/cache/` - small cache utilities
-- `src/types.ts` - shared contracts
-- `tests/` - parser, contract, extraction, fetch, and adapter tests
-## License
-AGPL-3.0-only. See `LICENSE`.
-## Release process
-1. Update `CHANGELOG.md` under `## Unreleased`.
-2. Run `npm run release:dry-run` to preview the next version.
-3. Run `npm run release` to bump version, rewrite the changelog release heading, create a release commit, and create a tag.
-4. Push the branch and tag.
-5. GitHub Actions publishes the tagged release to npm.
-## Maintainer release notes
-This repo is set up for npm Trusted Publishing from GitHub Actions.
-In npm package settings, add a trusted publisher for:
-- package: `@demigodmode/pi-web-agent`
-- provider: GitHub Actions
-- repository: `demigodmode/pi-web-agent`
-That replaces the old `NPM_TOKEN` secret flow.
-## Near-term next steps
-The next chunk of work is pretty clear:
-- keep tightening weak-content escalation on tricky HTTP targets
-- improve cleanup of noisy rendered-page extraction on busy sites
-- expand fixtures and end-to-end coverage
-- add alternate search backends behind a first-class provider abstraction
+# pi-web-agent
+[![CI](https://github.com/demigodmode/pi-web-agent/actions/workflows/ci.yml/badge.svg)](https://github.com/demigodmode/pi-web-agent/actions/workflows/ci.yml)
+[![npm version](https://img.shields.io/npm/v/@demigodmode/pi-web-agent)](https://www.npmjs.com/package/@demigodmode/pi-web-agent)
+[![Docs](https://img.shields.io/badge/docs-github%20pages-blue)](https://demigodmode.github.io/pi-web-agent/)
+`@demigodmode/pi-web-agent` is a Pi package for web access.
+The whole point is keeping the boundaries straight:
+- `web_search` is for discovery
+- `web_fetch` is for plain HTTP reads
+- `web_fetch_headless` is the explicit browser path
+- `web_explore` is the bounded research path
+That sounds obvious, but a lot of agent tooling gets fuzzy right there. This package is meant to be stricter about what it actually did and more willing to say when a read was not good enough to trust.
+## Install
+```bash
+pi install npm:@demigodmode/pi-web-agent
+```
+Later on, update installed packages with:
+```bash
+pi update
+```
+## Docs
+Docs site:
+- https://demigodmode.github.io/pi-web-agent/
+Work on the docs locally:
+```bash
+npm run docs:dev
+```
+Build the docs:
+```bash
+npm run docs:build
+```
+## Local development
+```bash
+npm install
+npm test
+npm run lint
+npm run build
+```
+For local Pi work, this repo includes `.pi/extensions/pi-web-agent.ts`.
+If Pi is already running, use `/reload` after changes.
+## License
+AGPL-3.0-only. See `LICENSE`.

package/dist/scripts/live-web-eval.d.ts ADDED Viewed

	@@ -0,0 +1 @@
1	+ export {};

package/dist/scripts/live-web-eval.js ADDED Viewed

@@ -0,0 +1,411 @@
+import { mkdir, writeFile } from 'node:fs/promises';
+import path from 'node:path';
+import process from 'node:process';
+import { AuthStorage, createAgentSession, ModelRegistry, SessionManager } from '@mariozechner/pi-coding-agent';
+import { createWebSearchTool } from '../src/tools/web-search.js';
+// Live evaluation prompts. main() sends each entry to runPrompt(), which runs it in
+// its own agent session; `id` and `title` label the case in console output and reports.
+// NOTE(review): ids jump from 'prompt-2' to 'prompt-4' — confirm 'prompt-3' was removed on purpose.
+const PROMPTS = [
+    {
+        id: 'prompt-1',
+        title: 'Playwright installed browser guidance',
+        prompt: 'Find current docs or discussions about Playwright launching an installed Chrome or Edge executable instead of a bundled browser, then summarize the recommended approach.'
+    },
+    {
+        id: 'prompt-2',
+        title: 'Vitest coverage configuration',
+        prompt: 'Find the current Vitest coverage docs and tell me how to enable coverage with the V8 provider in a TypeScript project.'
+    },
+    {
+        id: 'prompt-4',
+        title: 'DuckDuckGo HTML scraping pitfalls',
+        prompt: 'Find two or three current sources on DuckDuckGo HTML scraping in Node.js and tell me what the common parsing pitfalls are.'
+    }
+];
+// Deterministic web_search failure fixtures. runSearchFailureCase() hands each
+// `searchHtml` stub to createWebSearchTool and compares the returned error's
+// code and message against `expectedCode` / `expectedMessage`.
+const SEARCH_FAILURE_CASES = [
+    {
+        id: 'no-results',
+        title: 'NO_RESULTS classification',
+        expectedCode: 'NO_RESULTS',
+        expectedMessage: 'DuckDuckGo returned no usable results for this query.',
+        searchHtml: async () => `
+      <html>
+        <body>
+          <div class="results">
+            <div class="no-results">No results found for your search.</div>
+          </div>
+        </body>
+      </html>
+    `
+    },
+    {
+        id: 'parse-failed',
+        title: 'PARSE_FAILED classification',
+        expectedCode: 'PARSE_FAILED',
+        expectedMessage: 'DuckDuckGo returned a page, but it did not match the expected results format.',
+        searchHtml: async () => `
+      <html>
+        <body>
+          <main>
+            <h1>Unexpected page</h1>
+            <p>Nothing here looks like a search results page.</p>
+          </main>
+        </body>
+      </html>
+    `
+    },
+    {
+        id: 'blocked-html',
+        title: 'BLOCKED classification from challenge HTML',
+        expectedCode: 'BLOCKED',
+        expectedMessage: 'DuckDuckGo search appears to be blocked or rate limited.',
+        searchHtml: async () => `
+      <html>
+        <body>
+          <main>
+            <h1>Are you a robot?</h1>
+            <p>Please verify you are human to continue.</p>
+          </main>
+        </body>
+      </html>
+    `
+    },
+    {
+        id: 'fetch-failed',
+        title: 'FETCH_FAILED classification',
+        expectedCode: 'FETCH_FAILED',
+        expectedMessage: 'DuckDuckGo search request failed: socket hang up',
+        // This stub rejects instead of returning HTML, standing in for a failed request.
+        searchHtml: async () => {
+            throw new Error('socket hang up');
+        }
+    }
+];
+function isoNow() {
+    return new Date().toISOString();
+}
+function safeFileStamp(date = new Date()) {
+    return date.toISOString().replace(/[:.]/g, '-');
+}
+function extractText(value) {
+    if (typeof value === 'string')
+        return value;
+    if (!value || typeof value !== 'object')
+        return '';
+    if (Array.isArray(value)) {
+        return value.map(extractText).filter(Boolean).join('\n');
+    }
+    const record = value;
+    if (typeof record.text === 'string')
+        return record.text;
+    if (typeof record.content === 'string')
+        return record.content;
+    if (Array.isArray(record.content)) {
+        return record.content
+            .map((item) => {
+            if (!item || typeof item !== 'object')
+                return '';
+            const contentItem = item;
+            return typeof contentItem.text === 'string' ? contentItem.text : '';
+        })
+            .filter(Boolean)
+            .join('\n');
+    }
+    const nestedMessage = record.message;
+    if (nestedMessage && typeof nestedMessage === 'object') {
+        const nestedRecord = nestedMessage;
+        if (Array.isArray(nestedRecord.content)) {
+            return extractText(nestedMessage);
+        }
+    }
+    return '';
+}
+function toolDetails(result) {
+    if (!result || typeof result !== 'object')
+        return result;
+    const record = result;
+    return record.details ?? result;
+}
+function isEmptySearchResult(result) {
+    const details = toolDetails(result);
+    if (!details || typeof details !== 'object')
+        return false;
+    const record = details;
+    return record.status === 'ok' && Array.isArray(record.results) && record.results.length === 0;
+}
+function isUnsupportedFetchResult(result) {
+    const details = toolDetails(result);
+    return !!details && typeof details === 'object' && details.status === 'unsupported';
+}
+function isBotCheckHeadlessResult(result) {
+    const details = toolDetails(result);
+    if (!details || typeof details !== 'object')
+        return false;
+    const record = details;
+    const content = record.content;
+    const text = extractText(content);
+    const title = content && typeof content === 'object' && !Array.isArray(content)
+        ? String(content.title ?? '')
+        : '';
+    return /just a moment|security verification|verify you are not a bot/i.test(`${title}\n${text}`);
+}
+function isPostWebExploreGuardResult(result) {
+    const details = toolDetails(result);
+    if (!details || typeof details !== 'object')
+        return false;
+    const record = details;
+    const error = record.error;
+    if (!error || typeof error !== 'object')
+        return false;
+    return error.code === 'POST_WEB_EXPLORE_GUARD';
+}
+function buildMetrics(toolCalls) {
+    const webToolNames = new Set(['web_explore', 'web_search', 'web_fetch', 'web_fetch_headless']);
+    const lowLevelWebToolNames = new Set(['web_search', 'web_fetch', 'web_fetch_headless']);
+    const webToolCalls = toolCalls.filter((call) => webToolNames.has(call.toolName));
+    const firstWebTool = webToolCalls[0];
+    const firstWebExploreIndex = toolCalls.findIndex((call) => call.toolName === 'web_explore');
+    return {
+        webExploreUsed: firstWebExploreIndex !== -1,
+        webExploreFirstWebTool: firstWebTool?.toolName === 'web_explore',
+        totalToolCalls: toolCalls.length,
+        totalWebToolCalls: webToolCalls.length,
+        searchCalls: toolCalls.filter((call) => call.toolName === 'web_search').length,
+        fetchCalls: toolCalls.filter((call) => call.toolName === 'web_fetch').length,
+        headlessCalls: toolCalls.filter((call) => call.toolName === 'web_fetch_headless').length,
+        lowLevelCallsAfterExplore: firstWebExploreIndex === -1
+            ? toolCalls.filter((call) => lowLevelWebToolNames.has(call.toolName) && !isPostWebExploreGuardResult(call.result)).length
+            : toolCalls
+                .slice(firstWebExploreIndex + 1)
+                .filter((call) => lowLevelWebToolNames.has(call.toolName) && !isPostWebExploreGuardResult(call.result)).length,
+        guardedLowLevelCallsAfterExplore: firstWebExploreIndex === -1
+            ? toolCalls.filter((call) => lowLevelWebToolNames.has(call.toolName) && isPostWebExploreGuardResult(call.result)).length
+            : toolCalls
+                .slice(firstWebExploreIndex + 1)
+                .filter((call) => lowLevelWebToolNames.has(call.toolName) && isPostWebExploreGuardResult(call.result)).length,
+        emptySearches: toolCalls.filter((call) => call.toolName === 'web_search' && isEmptySearchResult(call.result)).length,
+        unsupportedFetches: toolCalls.filter((call) => (call.toolName === 'web_fetch' || call.toolName === 'web_fetch_headless') &&
+            isUnsupportedFetchResult(call.result)).length,
+        botCheckHeadlesses: toolCalls.filter((call) => call.toolName === 'web_fetch_headless' && isBotCheckHeadlessResult(call.result)).length
+    };
+}
+function evaluateVerdict(metrics, finalAnswer) {
+    const notes = [];
+    if (!metrics.webExploreUsed) {
+        notes.push('web_explore was not used');
+        return { verdict: 'fail', notes };
+    }
+    if (!metrics.webExploreFirstWebTool) {
+        notes.push('web_explore was not the first web research tool');
+    }
+    if (metrics.lowLevelCallsAfterExplore > 2) {
+        notes.push(`too many low-level calls after web_explore (${metrics.lowLevelCallsAfterExplore})`);
+    }
+    if (metrics.emptySearches > 0) {
+        notes.push(`empty web_search calls observed (${metrics.emptySearches})`);
+    }
+    if (metrics.botCheckHeadlesses > 0) {
+        notes.push(`headless bot-check pages observed (${metrics.botCheckHeadlesses})`);
+    }
+    if (!finalAnswer.trim()) {
+        notes.push('final answer text was empty');
+        return { verdict: 'fail', notes };
+    }
+    const looksClean = metrics.webExploreFirstWebTool &&
+        metrics.lowLevelCallsAfterExplore <= 1 &&
+        metrics.emptySearches === 0 &&
+        metrics.botCheckHeadlesses === 0;
+    if (looksClean) {
+        return { verdict: 'pass', notes };
+    }
+    return { verdict: 'mixed', notes };
+}
+function formatSearchFailureMarkdown(cases) {
+    if (cases.length === 0) {
+        return '## Search failure cases\n\nNone.\n';
+    }
+    const sections = cases
+        .map((testCase) => {
+        const notes = testCase.notes.length > 0 ? testCase.notes.map((note) => `- ${note}`).join('\n') : '- none';
+        return `### ${testCase.title}\n\n` +
+            `Verdict: **${testCase.verdict}**\n\n` +
+            `- expected code: ${testCase.expectedCode}\n` +
+            `- actual code: ${testCase.actualCode}\n` +
+            `- expected message: ${testCase.expectedMessage}\n` +
+            `- actual message: ${testCase.actualMessage}\n\n` +
+            `Notes:\n${notes}\n`;
+    })
+        .join('\n');
+    return `## Search failure cases\n\n${sections}`;
+}
+function formatMarkdown(run) {
+    const sections = run.prompts
+        .map((prompt) => {
+        const tools = prompt.toolCalls
+            .map((call, index) => `  ${index + 1}. ${call.toolName}`)
+            .join('\n');
+        const notes = prompt.notes.length > 0 ? prompt.notes.map((note) => `- ${note}`).join('\n') : '- none';
+        return `## ${prompt.title}\n\n` +
+            `Prompt: ${prompt.prompt}\n\n` +
+            `Verdict: **${prompt.verdict}**\n\n` +
+            `Metrics:\n` +
+            `- web_explore used: ${prompt.metrics.webExploreUsed}\n` +
+            `- web_explore first web tool: ${prompt.metrics.webExploreFirstWebTool}\n` +
+            `- total tool calls: ${prompt.metrics.totalToolCalls}\n` +
+            `- total web tool calls: ${prompt.metrics.totalWebToolCalls}\n` +
+            `- web_search calls: ${prompt.metrics.searchCalls}\n` +
+            `- web_fetch calls: ${prompt.metrics.fetchCalls}\n` +
+            `- web_fetch_headless calls: ${prompt.metrics.headlessCalls}\n` +
+            `- low-level calls after web_explore: ${prompt.metrics.lowLevelCallsAfterExplore}\n` +
+            `- guarded low-level calls after web_explore: ${prompt.metrics.guardedLowLevelCallsAfterExplore}\n` +
+            `- empty searches: ${prompt.metrics.emptySearches}\n` +
+            `- unsupported fetches: ${prompt.metrics.unsupportedFetches}\n` +
+            `- bot-check headlesses: ${prompt.metrics.botCheckHeadlesses}\n\n` +
+            `Tool order:\n${tools || '  none'}\n\n` +
+            `Notes:\n${notes}\n\n` +
+            `Final answer:\n\n${prompt.finalAnswer.trim() || '(empty)'}\n`;
+    })
+        .join('\n---\n\n');
+    return `# live web eval\n\nStarted: ${run.startedAt}\nFinished: ${run.finishedAt}\nCWD: ${run.cwd}\n\n` +
+        `${sections}\n\n---\n\n${formatSearchFailureMarkdown(run.searchFailureCases)}`;
+}
+function evaluateSearchFailureCase(expectedCode, actualCode, expectedMessage, actualMessage) {
+    const notes = [];
+    if (actualCode !== expectedCode) {
+        notes.push(`expected code ${expectedCode} but got ${actualCode}`);
+    }
+    if (actualMessage !== expectedMessage) {
+        notes.push(`expected message \"${expectedMessage}\" but got \"${actualMessage}\"`);
+    }
+    return {
+        verdict: notes.length === 0 ? 'pass' : 'fail',
+        notes
+    };
+}
+/**
+ * Runs one evaluation prompt in a fresh agent session backed by an in-memory
+ * session manager, and records what happened.
+ *
+ * Session events are used to log every tool call (name, args, timestamps, result,
+ * error flag) and to capture the text of the last non-empty assistant message. The
+ * log is then scored with buildMetrics() and evaluateVerdict().
+ *
+ * @param promptCase Entry from PROMPTS; `id`, `title` and `prompt` are read.
+ * @param cwd Working directory passed to createAgentSession.
+ * @param authStorage Passed through to createAgentSession.
+ * @param modelRegistry Passed through to createAgentSession.
+ * @returns Record with the prompt metadata, ISO start/finish times, duration in ms,
+ *   final answer text, tool-call log, metrics, verdict and notes.
+ */
+async function runPrompt(promptCase, cwd, authStorage, modelRegistry) {
+    const startedAt = Date.now();
+    const toolCalls = [];
+    let finalAnswer = '';
+    const { session } = await createAgentSession({
+        cwd,
+        authStorage,
+        modelRegistry,
+        sessionManager: SessionManager.inMemory()
+    });
+    const unsubscribe = session.subscribe((event) => {
+        if (event.type === 'tool_execution_start') {
+            toolCalls.push({
+                toolName: event.toolName,
+                args: event.args,
+                startedAt: isoNow()
+            });
+        }
+        if (event.type === 'tool_execution_end') {
+            // Pair the end event with the most recently started call of the same tool that has no end yet.
+            // NOTE(review): pairing is by tool name only; if two calls of the same tool overlap, a result
+            // could be attached to the wrong entry — confirm whether events expose a call id.
+            const active = [...toolCalls].reverse().find((call) => call.toolName === event.toolName && !call.endedAt);
+            if (active) {
+                active.endedAt = isoNow();
+                active.isError = !!event.isError;
+                active.result = event.result;
+            }
+        }
+        if (event.type === 'message_end' && event.message?.role === 'assistant') {
+            const text = extractText(event.message);
+            // Overwrite on every non-empty assistant message so the last one wins.
+            if (text.trim()) {
+                finalAnswer = text;
+            }
+        }
+    });
+    try {
+        await session.prompt(promptCase.prompt);
+        // No assistant text arrived through events: fall back to the newest assistant message on the session.
+        if (!finalAnswer.trim()) {
+            const reversedMessages = [...session.messages].reverse();
+            const lastAssistant = reversedMessages.find((message) => message?.role === 'assistant');
+            finalAnswer = lastAssistant ? extractText(lastAssistant) : '';
+        }
+    }
+    finally {
+        // Always detach the listener and dispose the session, even when the prompt throws.
+        unsubscribe();
+        session.dispose();
+    }
+    const finishedAt = Date.now();
+    const metrics = buildMetrics(toolCalls);
+    const evaluation = evaluateVerdict(metrics, finalAnswer);
+    return {
+        id: promptCase.id,
+        title: promptCase.title,
+        prompt: promptCase.prompt,
+        startedAt: new Date(startedAt).toISOString(),
+        finishedAt: new Date(finishedAt).toISOString(),
+        durationMs: finishedAt - startedAt,
+        finalAnswer,
+        toolCalls,
+        metrics,
+        verdict: evaluation.verdict,
+        notes: evaluation.notes
+    };
+}
+async function runSearchFailureCase(testCase) {
+    const startedAt = Date.now();
+    const search = createWebSearchTool({ searchHtml: testCase.searchHtml });
+    const result = await search({ query: 'deterministic test query' });
+    const finishedAt = Date.now();
+    const actualCode = result.error?.code ?? 'NO_ERROR';
+    const actualMessage = result.error?.message ?? 'No error message returned.';
+    const evaluation = evaluateSearchFailureCase(testCase.expectedCode, actualCode, testCase.expectedMessage, actualMessage);
+    return {
+        id: testCase.id,
+        title: testCase.title,
+        startedAt: new Date(startedAt).toISOString(),
+        finishedAt: new Date(finishedAt).toISOString(),
+        durationMs: finishedAt - startedAt,
+        expectedCode: testCase.expectedCode,
+        actualCode,
+        expectedMessage: testCase.expectedMessage,
+        actualMessage,
+        verdict: evaluation.verdict,
+        notes: evaluation.notes
+    };
+}
+/**
+ * Script entry point. Runs every prompt in PROMPTS and then every case in
+ * SEARCH_FAILURE_CASES, one at a time, writes the combined run as a JSON file and a
+ * Markdown report under `<cwd>/local_docs/tmp/live-evals`, and prints a short
+ * per-case summary to the console.
+ */
+async function main() {
+    const cwd = process.cwd();
+    const startedAt = isoNow();
+    const authStorage = AuthStorage.create();
+    const modelRegistry = ModelRegistry.create(authStorage);
+    const prompts = [];
+    // Prompts run sequentially; each gets its own session inside runPrompt.
+    for (const promptCase of PROMPTS) {
+        console.log(`Running ${promptCase.id}: ${promptCase.title}`);
+        prompts.push(await runPrompt(promptCase, cwd, authStorage, modelRegistry));
+    }
+    const searchFailureCases = [];
+    for (const testCase of SEARCH_FAILURE_CASES) {
+        console.log(`Running ${testCase.id}: ${testCase.title}`);
+        searchFailureCases.push(await runSearchFailureCase(testCase));
+    }
+    const run = {
+        startedAt,
+        finishedAt: isoNow(),
+        cwd,
+        prompts,
+        searchFailureCases
+    };
+    const outputDir = path.join(cwd, 'local_docs', 'tmp', 'live-evals');
+    await mkdir(outputDir, { recursive: true });
+    // Both report files share one timestamp-based base name.
+    const stamp = safeFileStamp();
+    const jsonPath = path.join(outputDir, `${stamp}.json`);
+    const mdPath = path.join(outputDir, `${stamp}.md`);
+    await writeFile(jsonPath, `${JSON.stringify(run, null, 2)}\n`, 'utf8');
+    await writeFile(mdPath, `${formatMarkdown(run)}\n`, 'utf8');
+    console.log(`\nSaved JSON: ${jsonPath}`);
+    console.log(`Saved Markdown: ${mdPath}`);
+    for (const prompt of run.prompts) {
+        console.log(`\n${prompt.id} -> ${prompt.verdict}`);
+        console.log(`  web_explore first web tool: ${prompt.metrics.webExploreFirstWebTool}`);
+        console.log(`  low-level calls after web_explore: ${prompt.metrics.lowLevelCallsAfterExplore}`);
+        console.log(`  guarded low-level calls after web_explore: ${prompt.metrics.guardedLowLevelCallsAfterExplore}`);
+        console.log(`  empty searches: ${prompt.metrics.emptySearches}`);
+        console.log(`  bot-check headlesses: ${prompt.metrics.botCheckHeadlesses}`);
+    }
+    for (const testCase of run.searchFailureCases) {
+        console.log(`\n${testCase.id} -> ${testCase.verdict}`);
+        console.log(`  expected code: ${testCase.expectedCode}`);
+        console.log(`  actual code: ${testCase.actualCode}`);
+    }
+}
+// Top-level await: loading this module starts the evaluation immediately.
+await main();

package/dist/src/cache/ttl-cache.d.ts ADDED Viewed

@@ -0,0 +1,8 @@
+/**
+ * Builds a string cache key from an ordered list of primitive parts.
+ * NOTE(review): how the parts are joined or encoded is defined in ttl-cache.js and
+ * is not visible in this declaration — confirm before relying on the key format.
+ */
+export declare function createCacheKey(parts: Array<string | number | boolean>): string;
+/**
+ * Creates a cache object exposing `get` and `set` for values of type `T`, keyed by string.
+ * `get` returns `undefined` when it has no value to give for the key.
+ * NOTE(review): presumably `ttlMs` is the entry lifetime in milliseconds and `now` an
+ * optional clock override — confirm against ttl-cache.js.
+ */
+export declare function createTtlCache<T>({ ttlMs, now }: {
+    ttlMs: number;
+    now?: () => number;
+}): {
+    get(key: string): T | undefined;
+    set(key: string, value: T): void;
+};