npm - engram-sdk - Versions diffs - 0.5.1 → 0.5.2 - Mend

engram-sdk 0.5.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

package/README.md +4 -2
package/dist/cli.js +85 -2
package/dist/cli.js.map +1 -1
package/dist/hosted.d.ts.map +1 -1
package/dist/hosted.js +38 -1
package/dist/hosted.js.map +1 -1
package/dist/server.d.ts.map +1 -1
package/dist/server.js +218 -0
package/dist/server.js.map +1 -1
package/dist/telemetry.js +1 -1
package/dist/telemetry.js.map +1 -1
package/package.json +1 -1
package/.mcpregistry_github_token +0 -1
package/.mcpregistry_registry_token +0 -1
package/deploy/fly.toml +0 -26
package/eval-codebase-v2-NOTE.md +0 -22
package/fly.toml +0 -33
package/hackernews-post.md +0 -45
package/hn-posts/2026-02-23.md +0 -64
package/rescore-codebase.ts +0 -184
package/rescore-vscode.ts +0 -142
package/signal-quality-plan.md +0 -23

package/rescore-codebase.ts DELETED Viewed

@@ -1,184 +0,0 @@
-#!/usr/bin/env npx tsx
-/**
- * rescore-codebase.ts — Re-score existing codebase eval results using LLM judge
- * Uses the saved answers + ground truth, just re-runs scoring
- */
-import { readFileSync, writeFileSync } from 'fs';
-import { resolve, dirname } from 'path';
-import { fileURLToPath } from 'url';
-// Directory containing this script; ES modules have no built-in __dirname, so derive it from import.meta.url.
-const __dirname = dirname(fileURLToPath(import.meta.url));
-// Fail fast with an actionable message when HOME is unset, instead of letting resolve()
-// throw a generic TypeError about an undefined path argument.
-const HOME_DIR = process.env.HOME;
-if (!HOME_DIR) {
-  throw new Error('HOME is not set; cannot locate .config/engram/gemini-key');
-}
-// Gemini API key, read once at startup; surrounding whitespace is stripped.
-const GEMINI_KEY = readFileSync(resolve(HOME_DIR, '.config/engram/gemini-key'), 'utf8').trim();
-// Input: previously saved eval results (answers + ground truth) that get re-scored.
-const RESULTS_PATH = resolve(__dirname, 'eval-scale-data/codebase-results-vscode.json');
-// Output: aggregated accuracy/token report.
-const REPORT_PATH = resolve(__dirname, 'eval-scale-data/codebase-report-vscode-v2.json');
-/**
- * Sends one prompt to the Gemini `generateContent` endpoint and returns the text of the
- * first candidate.
- *
- * Makes up to three attempts. An HTTP 429 waits for the number of seconds given in the
- * `retry-after` header (10 s when the header is missing or is not a plain number of
- * seconds); any other non-OK status or a fetch failure waits 2 s before the next attempt.
- *
- * @param prompt - Text sent as the only content part of the request.
- * @param maxTokens - Value passed as `generationConfig.maxOutputTokens`.
- * @returns The first candidate's text, or `''` when the response carries none or every
- *          attempt failed.
- */
-async function geminiCall(prompt: string, maxTokens = 100): Promise<string> {
-  const MAX_ATTEMPTS = 3;
-  const DEFAULT_RETRY_AFTER_S = 10;
-  const ERROR_PAUSE_MS = 2000;
-  const pause = (ms: number) => new Promise<void>(done => setTimeout(done, ms));
-  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
-    try {
-      const response = await fetch(
-        `https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=${GEMINI_KEY}`,
-        {
-          method: 'POST',
-          headers: { 'Content-Type': 'application/json' },
-          body: JSON.stringify({
-            contents: [{ parts: [{ text: prompt }] }],
-            generationConfig: { maxOutputTokens: maxTokens, temperature: 0 },
-          }),
-        }
-      );
-      if (response.status === 429) {
-        // `retry-after` may be an HTTP-date rather than seconds; parseInt then yields NaN,
-        // and a NaN delay makes setTimeout fire immediately, defeating the back-off.
-        const parsed = Number.parseInt(response.headers.get('retry-after') ?? '', 10);
-        const retryAfter = Number.isFinite(parsed) && parsed >= 0 ? parsed : DEFAULT_RETRY_AFTER_S;
-        console.log(`    Rate limited, waiting ${retryAfter}s...`);
-        await pause(retryAfter * 1000);
-        continue;
-      }
-      if (!response.ok) {
-        console.log(`    API error ${response.status}, retrying...`);
-        await pause(ERROR_PAUSE_MS);
-        continue;
-      }
-      const data = await response.json() as any;
-      return data.candidates?.[0]?.content?.parts?.[0]?.text ?? '';
-    } catch (e: unknown) {
-      // Anything can be thrown; only Error instances are guaranteed to have a message.
-      const reason = e instanceof Error ? e.message : String(e);
-      console.log(`    Fetch error: ${reason}, retrying...`);
-      await pause(ERROR_PAUSE_MS);
-    }
-  }
-  // All attempts used up: callers treat the empty string as "no answer".
-  return '';
-}
-/**
- * Asks the LLM judge to grade `answer` against `truth` for the given question.
- *
- * The reply is parsed in two steps: a direct `parseFloat` of the trimmed reply, then a
- * search for a standalone number in the 0–1 range anywhere in the text.
- *
- * @param question - The eval question shown to the judge.
- * @param truth - Ground-truth answer the judge compares against.
- * @param answer - The system's answer being graded.
- * @returns A score in [0, 1], or -1 when no score could be parsed from the reply.
- */
-async function scoreAnswer(question: string, truth: string, answer: string): Promise<number> {
-  const prompt = `You are evaluating an AI's answer about a codebase. Score it from 0.0 to 1.0.
-- 1.0 = Correct and complete
-- 0.7 = Mostly correct, minor gaps
-- 0.5 = Partially correct
-- 0.3 = Mentions something relevant but mostly wrong
-- 0.0 = Wrong or "I don't know"
-Question: ${question}
-Ground Truth: ${truth}
-AI's Answer: ${answer}
-Respond with ONLY a decimal number (e.g. 0.7). Nothing else.`;
-  const response = await geminiCall(prompt);
-  const cleaned = response.trim();
-  // Try direct float parse first
-  const direct = parseFloat(cleaned);
-  if (!isNaN(direct) && direct >= 0 && direct <= 1) return direct;
-  // Fallback: a standalone 0, 0.x, 1 or 1.0 that is not part of a longer number.
-  // The lookarounds stop "10" or "1.5" from being read as 1 and "7/10" as 1,
-  // while still accepting a sentence-ending period as in "0.7.".
-  const match = cleaned.match(/(?<![\d.])(?:0(?:\.\d+)?|1(?:\.0+)?)(?!\.?\d)/);
-  if (match) return parseFloat(match[0]);
-  console.log(`    Failed to parse score: "${cleaned}"`);
-  return -1; // Mark as failed, don't default to 0
-}
-/**
- * Re-scores every saved eval result with the LLM judge and writes an aggregated report.
- *
- * Reads the results array from RESULTS_PATH, scores each system's answer via scoreAnswer(),
- * stores the score back on the result, and checkpoints the rescored array to a
- * `-rescored2.json` sibling of RESULTS_PATH every 5 questions and after the last one.
- * It then prints overall, per-category and per-difficulty accuracy and writes the same
- * figures to REPORT_PATH.
- *
- * A missing answer or an unparseable judge reply counts as score 0 in the averages;
- * unparseable replies are additionally tallied in `failedParses`.
- */
-async function main() {
-  const systems = ['engram', 'cappedContext', 'naiveRag', 'grepSearch'] as const;
-  type SystemName = (typeof systems)[number];
-  type SystemResult = { answer?: string; score?: number; tokensUsed?: number };
-  type EvalRow = {
-    category: string;
-    difficulty: string;
-    question: string;
-    groundTruth: string;
-  } & Partial<Record<SystemName, SystemResult>>;
-  // The file is produced by the eval run itself, so its shape is asserted rather than validated.
-  const results = JSON.parse(readFileSync(RESULTS_PATH, 'utf8')) as EvalRow[];
-  console.log(`Rescoring ${results.length} results...\n`);
-  const progressPath = RESULTS_PATH.replace('.json', '-rescored2.json');
-  const tag = (sys: SystemName) => sys.charAt(0).toUpperCase();
-  let totalScored = 0;
-  let totalFailed = 0;
-  for (let i = 0; i < results.length; i++) {
-    const r = results[i];
-    console.log(`[${i + 1}/${results.length}] (${r.category}/${r.difficulty}) ${r.question.slice(0, 70)}...`);
-    const scores = {} as Record<SystemName, number>;
-    for (const sys of systems) {
-      const entry = r[sys];
-      let score = 0;
-      if (entry?.answer) {
-        const judged = await scoreAnswer(r.question, r.groundTruth, entry.answer);
-        if (judged === -1) {
-          totalFailed++;
-        } else {
-          // Count only answers that actually received a usable score from the judge.
-          totalScored++;
-          score = judged;
-        }
-      }
-      scores[sys] = score;
-      // Always overwrite, so a stale score from the earlier run cannot leak into the averages.
-      if (entry) entry.score = score;
-    }
-    console.log(`  ${systems.map(s => `${tag(s)}:${scores[s].toFixed(2)}`).join(' ')}`);
-    // Save progress every 5 questions
-    if ((i + 1) % 5 === 0 || i === results.length - 1) {
-      writeFileSync(progressPath, JSON.stringify(results, null, 2));
-    }
-  }
-  console.log(`\nScored: ${totalScored}, Failed parses: ${totalFailed}\n`);
-  // Shared aggregation helpers; an empty group averages to 0 instead of NaN.
-  const mean = (vals: number[]) => (vals.length === 0 ? 0 : vals.reduce((a, b) => a + b, 0) / vals.length);
-  const accuracyPct = (rows: EvalRow[], sys: SystemName) =>
-    (mean(rows.map(r => r[sys]?.score ?? 0)) * 100).toFixed(1);
-  const avgTokens = (sys: SystemName) => Math.round(mean(results.map(r => r[sys]?.tokensUsed ?? 0)));
-  // Groups rows by a field, keeping groups in order of first appearance.
-  const groupBy = (field: 'category' | 'difficulty') => {
-    const groups = new Map<string, EvalRow[]>();
-    for (const r of results) {
-      const bucket = groups.get(r[field]);
-      if (bucket) bucket.push(r);
-      else groups.set(r[field], [r]);
-    }
-    return groups;
-  };
-  const printBreakdown = (groups: Map<string, EvalRow[]>) => {
-    for (const [label, rows] of groups) {
-      console.log(`\n  ${label.toUpperCase()} (n=${rows.length}): ${systems.map(s => `${tag(s)}:${accuracyPct(rows, s)}%`).join(' ')}`);
-    }
-  };
-  const breakdownReport = (groups: Map<string, EvalRow[]>) =>
-    Object.fromEntries(
-      [...groups].map(([label, rows]) => [label, Object.fromEntries(systems.map(s => [s, accuracyPct(rows, s)]))])
-    );
-  console.log('=== VS Code Codebase Evaluation Report ===\n');
-  // Report the real question count rather than a hard-coded figure.
-  console.log(`OVERALL (${results.length} questions)`);
-  console.log(`${'System'.padEnd(20)} ${'Accuracy'.padEnd(12)} Avg Tokens`);
-  for (const sys of systems) {
-    console.log(`${sys.padEnd(20)} ${(accuracyPct(results, sys) + '%').padEnd(12)} ${avgTokens(sys)}`);
-  }
-  // Per category
-  const byCategory = groupBy('category');
-  printBreakdown(byCategory);
-  // Per difficulty
-  const byDifficulty = groupBy('difficulty');
-  printBreakdown(byDifficulty);
-  // A zero-token baseline would make the ratio NaN/Infinity; report it as not available.
-  const baselineTokens = avgTokens('cappedContext');
-  const tokenSavings =
-    baselineTokens > 0 ? `${((1 - avgTokens('engram') / baselineTokens) * 100).toFixed(1)}%` : 'n/a';
-  console.log(`\n  Token savings vs capped context: ${tokenSavings}`);
-  // Save report
-  const report = {
-    timestamp: new Date().toISOString(),
-    totalQuestions: results.length,
-    failedParses: totalFailed,
-    overall: Object.fromEntries(systems.map(s => [s, {
-      accuracy: accuracyPct(results, s),
-      avgTokens: avgTokens(s),
-    }])),
-    byCategory: breakdownReport(byCategory),
-    byDifficulty: breakdownReport(byDifficulty),
-    tokenSavingsVsCapped: tokenSavings,
-  };
-  writeFileSync(REPORT_PATH, JSON.stringify(report, null, 2));
-  console.log(`\nReport saved: ${REPORT_PATH}`);
-}
-// Entry point: log an unhandled failure and mark the process as failed so that
-// shells and CI see a non-zero exit status instead of a silent success.
-main().catch((err: unknown) => {
-  console.error(err);
-  process.exitCode = 1;
-});

package/rescore-vscode.ts DELETED Viewed

@@ -1,142 +0,0 @@
-#!/usr/bin/env npx tsx
-/**
- * rescore-vscode.ts -- Re-score the VS Code codebase eval results
- *
- * The original eval generated good answers but the judge scoring returned
- * unparseable responses (all 0s). This script re-runs ONLY the scoring step.
- */
-import { readFileSync, writeFileSync } from 'fs';
-import { homedir } from 'os';
-import { join } from 'path';
-// Pause inserted before every judge call, in milliseconds.
-const RATE_LIMIT_MS = 1500;
-// Gemini API key, loaded from the user's config directory with surrounding whitespace removed.
-const GEMINI_KEY = readFileSync(
-  join(homedir(), '.config/engram/gemini-key'),
-  { encoding: 'utf8' },
-).trim();
-// Directory that holds the eval data files.
-const EVAL_DIR = join(homedir(), '.openclaw/workspace/engram/eval-scale-data');
-// Input: saved eval results to re-score.
-const RESULTS_PATH = join(EVAL_DIR, 'codebase-results-vscode.json');
-// Output: the same results with fresh scores.
-const RESCORED_PATH = join(EVAL_DIR, 'codebase-results-vscode-rescored.json');
-/** Returns a promise that resolves once a `setTimeout` of `ms` milliseconds fires. */
-async function sleep(ms: number) {
-  return new Promise(wake => {
-    setTimeout(wake, ms);
-  });
-}
-/**
- * Runs `fn` and retries it with exponential back-off when it fails with an Error whose
- * message contains "429".
- *
- * Any other failure, or a "429" failure on the final attempt, is rethrown unchanged.
- * The wait before retry number k (1-based) is min(1500 * 2^k, 60000) ms.
- *
- * @param fn - Operation to run; called at most `retries` times.
- * @param retries - Maximum number of attempts.
- * @returns The value `fn` resolves with.
- * @throws Whatever `fn` throws once it is not retried; `Error('Exhausted retries')` only
- *         when `retries` is below 1, because `fn` is then never called.
- */
-async function withRetry<T>(fn: () => Promise<T>, retries = 5): Promise<T> {
-  for (let attempt = 0; attempt < retries; attempt++) {
-    try {
-      return await fn();
-    } catch (err: unknown) {
-      // Narrow before reading `.message`: a thrown non-Error value is never retryable and
-      // must be rethrown as-is rather than risk a TypeError on property access.
-      const rateLimited = err instanceof Error && err.message.includes('429');
-      const isLastAttempt = attempt === retries - 1;
-      if (!rateLimited || isLastAttempt) {
-        throw err;
-      }
-      const backoff = Math.min(1500 * 2 ** (attempt + 1), 60000);
-      console.log(`  [Retry ${attempt + 1}/${retries}] 429, waiting ${backoff}ms...`);
-      await sleep(backoff);
-    }
-  }
-  throw new Error('Exhausted retries');
-}
-/**
- * Performs one `generateContent` request against Gemini and returns the first candidate's text.
- *
- * @param prompt - Text sent as the only content part.
- * @param maxTokens - Value passed as `generationConfig.maxOutputTokens`.
- * @returns The trimmed candidate text, or `''` when the response has no text.
- * @throws Error with message `Gemini <status>: <first 200 chars of body>` on a non-OK response.
- */
-async function geminiCall(prompt: string, maxTokens = 200): Promise<string> {
-  const endpoint = `https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=${GEMINI_KEY}`;
-  const payload = JSON.stringify({
-    contents: [{ parts: [{ text: prompt }] }],
-    generationConfig: { maxOutputTokens: maxTokens, temperature: 0 },
-  });
-  const res = await fetch(endpoint, {
-    method: 'POST',
-    headers: { 'Content-Type': 'application/json' },
-    body: payload,
-  });
-  if (res.ok) {
-    const parsed = await res.json() as any;
-    const text = parsed.candidates?.[0]?.content?.parts?.[0]?.text;
-    return text?.trim() || '';
-  }
-  // Non-OK: surface the status plus a short excerpt of the body in the error message.
-  const errorBody = await res.text();
-  throw new Error(`Gemini ${res.status}: ${errorBody.slice(0, 200)}`);
-}
-/**
- * Has the LLM judge grade `answer` against `groundTruth` for one question.
- *
- * The judge call goes through withRetry() with a 10-token output limit, and the reply is
- * parsed with `parseFloat`.
- *
- * @returns The parsed score when it lies in [0, 1]; otherwise 0, after logging a warning.
- */
-async function scoreAnswer(question: string, groundTruth: string, answer: string): Promise<number> {
-  const prompt = `You are a strict code knowledge evaluator. Score this answer about the VS Code codebase.
-Question: ${question}
-Correct Answer: ${groundTruth}
-Given Answer: ${answer}
-Score from 0.0 to 1.0:
-- 1.0 = Completely correct, mentions the right files/classes/functions
-- 0.7 = Mostly correct, minor details missing
-- 0.5 = Partially correct, gets the general area but misses specifics
-- 0.3 = Vaguely related but mostly wrong
-- 0.0 = Completely wrong or says "insufficient context"
-Respond with ONLY a single number between 0.0 and 1.0. Nothing else.`;
-  const reply = await withRetry(() => geminiCall(prompt, 10));
-  const parsed = parseFloat(reply);
-  const inRange = !Number.isNaN(parsed) && parsed >= 0 && parsed <= 1;
-  if (inRange) {
-    return parsed;
-  }
-  console.log(`  Warning: unparseable score "${reply}", defaulting to 0`);
-  return 0;
-}
-/**
- * Re-scores every saved result for all four systems and prints a summary.
- *
- * Loads the results from RESULTS_PATH, scores each system's answer one call at a time
- * (sleeping RATE_LIMIT_MS before each call), writes the updated array to RESCORED_PATH
- * after every fifth question and once more at the end, then logs overall and
- * per-category accuracy together with average tokens per question.
- */
-async function main() {
-  console.log('=== Re-scoring VS Code Codebase Eval ===\n');
-  const results = JSON.parse(readFileSync(RESULTS_PATH, 'utf8'));
-  console.log(`Loaded ${results.length} results\n`);
-  // Scoring order and the one-letter tag each system gets in the progress line.
-  const systems: Array<[string, string]> = [
-    ['E', 'engram'],
-    ['C', 'cappedContext'],
-    ['R', 'naiveRag'],
-    ['G', 'grepSearch'],
-  ];
-  const persist = () => writeFileSync(RESCORED_PATH, JSON.stringify(results, null, 2));
-  for (let idx = 0; idx < results.length; idx++) {
-    const row = results[idx];
-    const position = idx + 1;
-    console.log(`[${position}/${results.length}] (${row.category}/${row.difficulty}) ${row.question.slice(0, 65)}...`);
-    // Score each system individually (more reliable than 4-at-once)
-    for (const [, key] of systems) {
-      await sleep(RATE_LIMIT_MS);
-      row[key].score = await scoreAnswer(row.question, row.groundTruth, row[key].answer);
-    }
-    const progress = systems.map(([tag, key]) => `${tag}:${row[key].score.toFixed(2)}`).join(' ');
-    console.log(`  ${progress}`);
-    // Save every 5
-    if (position % 5 === 0) {
-      persist();
-    }
-  }
-  persist();
-  console.log(`\nSaved to ${RESCORED_PATH}`);
-  // Print summary
-  const n = results.length;
-  const accuracy = (rows: any[], key: string) =>
-    (rows.reduce((sum: number, r: any) => sum + r[key].score, 0) / rows.length * 100).toFixed(1);
-  const tokensPerQuestion = (key: string) =>
-    Math.round(results.reduce((sum: number, r: any) => sum + r[key].tokensUsed, 0) / n);
-  console.log(`\n=== RESULTS (${n} questions) ===`);
-  console.log(`Engram:         ${accuracy(results, 'engram')}%  (${tokensPerQuestion('engram')} tok/q)`);
-  console.log(`Capped Context: ${accuracy(results, 'cappedContext')}%  (${tokensPerQuestion('cappedContext')} tok/q)`);
-  console.log(`Naive RAG:      ${accuracy(results, 'naiveRag')}%  (${tokensPerQuestion('naiveRag')} tok/q)`);
-  console.log(`Grep Search:    ${accuracy(results, 'grepSearch')}%  (${tokensPerQuestion('grepSearch')} tok/q)`);
-  // By category
-  const categories = [...new Set(results.map((r: any) => r.category))];
-  for (const category of categories) {
-    const inCategory = results.filter((r: any) => r.category === category);
-    console.log(`\n${(category as string).toUpperCase()} (n=${inCategory.length}): E:${accuracy(inCategory, 'engram')}% C:${accuracy(inCategory, 'cappedContext')}% R:${accuracy(inCategory, 'naiveRag')}% G:${accuracy(inCategory, 'grepSearch')}%`);
-  }
-}
-// Entry point: any unhandled failure is logged with a "Fatal:" prefix and ends the process with status 1.
-function reportFatal(err: unknown): never {
-  console.error('Fatal:', err);
-  process.exit(1);
-}
-main().catch(reportFatal);

package/signal-quality-plan.md DELETED Viewed

@@ -1,23 +0,0 @@
-# Signal Quality Plan — More Memories, Better Organization
-Philosophy: More memories is GOOD. The graph handles scale. The problem isn't quantity — it's classification accuracy in the features that scan broadly (alerts, briefing, consolidation).
-## Fix 1: Smarter Status Classification at Intake
-**Problem:** Auto-ingest marks too many things as "pending" — descriptions of future plans in conversation get flagged as commitments even when they're just discussed possibilities.
-**Fix:** Tighten the prompt to distinguish "committed to doing X" from "discussed doing X". Add a post-intake validation in remember() that checks if the content actually represents a real commitment.
-## Fix 2: Salience Calibration
-**Problem:** 84% of memories have salience ≥ 0.6, making the signal useless for differentiation.
-**Fix:** Recalibrate extract.ts salience estimation. Most memories should be 0.3-0.5 (normal), with only truly important ones (decisions, preferences, corrections) at 0.7+.
-## Fix 3: Briefing as MEMORY.md Replacement
-**Problem:** Briefing dumps a flat list of facts. MEMORY.md is curated with sections and narrative. Agents prefer MEMORY.md.
-**Fix:** Make briefing() cluster memories by entity/topic and present them as organized sections. Add a "what changed since last session" section.
-## Fix 4: Alerts Precision
-**Problem:** 238 pending items, most are completed tasks or discussed-but-not-committed plans. False positive rate is too high.
-**Fix:** Add a "fulfilled" detection sweep — if a pending memory's content matches a later completed action, auto-mark it fulfilled. Also add confidence threshold for pending alerts.
-## Fix 5: Consolidation Quality Gate
-**Problem:** Consolidation processes all episodes including low-value ones, wasting LLM tokens.
-**Fix:** Filter episodes by salience before consolidation. Only consolidate episodes with salience ≥ 0.3.