npm - guild-agents - Versions diffs - 1.2.0 → 1.4.0 - Mend

guild-agents 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

package/README.md +16 -0
package/bin/guild.js +73 -0
package/package.json +5 -2
package/src/commands/eval.js +225 -0
package/src/commands/stats.js +147 -0
package/src/commands/workspace.js +38 -1
package/src/templates/skills/build-feature/evals/evals.json +53 -0
package/src/templates/skills/build-feature/evals/triggers.json +16 -0
package/src/templates/skills/council/SKILL.md +27 -6
package/src/templates/skills/council/evals/evals.json +41 -0
package/src/templates/skills/council/evals/triggers.json +16 -0
package/src/templates/skills/create-pr/evals/evals.json +44 -0
package/src/templates/skills/create-pr/evals/triggers.json +16 -0
package/src/templates/skills/debug/SKILL.md +1 -1
package/src/templates/skills/debug/evals/triggers.json +16 -0
package/src/templates/skills/dev-flow/evals/evals.json +36 -0
package/src/templates/skills/dev-flow/evals/triggers.json +16 -0
package/src/templates/skills/guild-specialize/evals/evals.json +54 -0
package/src/templates/skills/guild-specialize/evals/triggers.json +16 -0
package/src/templates/skills/new-feature/evals/evals.json +41 -0
package/src/templates/skills/new-feature/evals/triggers.json +16 -0
package/src/templates/skills/qa-cycle/evals/evals.json +46 -0
package/src/templates/skills/qa-cycle/evals/triggers.json +16 -0
package/src/templates/skills/re-specialize/evals/evals.json +48 -0
package/src/templates/skills/re-specialize/evals/triggers.json +16 -0
package/src/templates/skills/review/evals/evals.json +43 -0
package/src/templates/skills/review/evals/triggers.json +16 -0
package/src/templates/skills/session-end/evals/evals.json +40 -0
package/src/templates/skills/session-end/evals/triggers.json +16 -0
package/src/templates/skills/session-start/evals/evals.json +50 -0
package/src/templates/skills/session-start/evals/triggers.json +16 -0
package/src/templates/skills/status/evals/evals.json +40 -0
package/src/templates/skills/status/evals/triggers.json +16 -0
package/src/templates/skills/tdd/evals/triggers.json +16 -0
package/src/templates/skills/verify/evals/triggers.json +16 -0
package/src/utils/accounting.js +139 -0
package/src/utils/benchmark.js +128 -0
package/src/utils/description-analyzer.js +92 -0
package/src/utils/eval-runner.js +139 -0
package/src/utils/pricing.js +28 -0
package/src/utils/semantic-matcher.js +91 -0
package/src/utils/trigger-matcher.js +64 -0
package/src/utils/trigger-runner.js +132 -0
package/src/utils/workspace.js +89 -0

package/src/utils/pricing.js ADDED Viewed

@@ -0,0 +1,28 @@
+/**
+ * pricing.js — Model pricing table and cost calculation.
+ *
+ * Prices per million tokens (USD).
+ * Source: https://docs.anthropic.com/en/docs/about-claude/models
+ */
+export const DEFAULT_PRICING = {
+  'claude-opus-4-6': { input: 15.00, output: 75.00 },
+  'claude-sonnet-4-5': { input: 3.00, output: 15.00 },
+  'claude-haiku-4-5': { input: 0.80, output: 4.00 },
+};
+const SHORT_NAMES = {
+  'claude-opus-4-6': 'Opus',
+  'claude-sonnet-4-5': 'Sonnet',
+  'claude-haiku-4-5': 'Haiku',
+};
+export function estimateCost(model, inputTokens, outputTokens) {
+  const pricing = DEFAULT_PRICING[model];
+  if (!pricing) return 0;
+  return (inputTokens * pricing.input + outputTokens * pricing.output) / 1_000_000;
+}
+export function getModelShortName(model) {
+  return SHORT_NAMES[model] || model;
+}

package/src/utils/semantic-matcher.js ADDED Viewed

@@ -0,0 +1,91 @@
+/**
+ * semantic-matcher.js — LLM-based trigger scoring via Anthropic Haiku.
+ *
+ * Calls the Anthropic Messages API to score how well a user prompt
+ * matches a skill. Optional complement to the keyword matcher.
+ */
+export const SEMANTIC_MODEL_DEFAULT = 'claude-haiku-4-5-20251001'; // Model ID used when GUILD_SEMANTIC_MODEL is unset or empty.
+const ANTHROPIC_API_URL = 'https://api.anthropic.com/v1/messages'; // Endpoint that scoreMatchSemantic POSTs to.
+const SYSTEM_PROMPT = `You are a skill-routing classifier. Given a user prompt and a skill name + description, score how likely the user wants to trigger this skill.
+Respond with ONLY a JSON object, no other text:
+{"score": <0-100>, "reasoning": "<one sentence>"}
+Score guide:
+- 90-100: Clear, direct match
+- 60-89: Likely match, related intent
+- 30-59: Possible but ambiguous
+- 0-29: Unrelated`; // Sent as the `system` field of every request; asks the model for a 0-100 score plus one sentence of reasoning.
+/**
+ * Scores a prompt against a skill using the Anthropic Messages API.
+ * @param {string} prompt - User prompt to classify
+ * @param {string} skillName - Skill identifier
+ * @param {string} skillDescription - Skill description text
+ * @returns {Promise<{ score: number, reasoning: string, error?: boolean }>}
+ */
+export async function scoreMatchSemantic(prompt, skillName, skillDescription) {
+  const apiKey = process.env.ANTHROPIC_API_KEY;
+  const model = process.env.GUILD_SEMANTIC_MODEL || SEMANTIC_MODEL_DEFAULT;
+  try {
+    const response = await fetch(ANTHROPIC_API_URL, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        'x-api-key': apiKey,
+        'anthropic-version': '2023-06-01',
+      },
+      body: JSON.stringify({
+        model,
+        max_tokens: 100,
+        system: SYSTEM_PROMPT,
+        messages: [
+          {
+            role: 'user',
+            content: `User prompt: "${prompt}"\nSkill: ${skillName}\nDescription: ${skillDescription}`,
+          },
+        ],
+      }),
+    });
+    if (!response.ok) {
+      return { score: 0, reasoning: `API error: ${response.status} ${response.statusText}`, error: true };
+    }
+    const data = await response.json();
+    const text = data.content[0].text;
+    return parseResponse(text);
+  } catch (err) {
+    return { score: 0, reasoning: err.message, error: true };
+  }
+}
+/**
+ * Parses the LLM response, extracting JSON with fallback.
+ * @param {string} text
+ * @returns {{ score: number, reasoning: string, error?: boolean }}
+ */
+function parseResponse(text) {
+  // Try direct parse first
+  try {
+    const parsed = JSON.parse(text);
+    return { score: parsed.score / 100, reasoning: parsed.reasoning };
+  } catch {
+    // Fallback: extract first JSON object from text
+    const match = text.match(/\{[^}]+\}/);
+    if (match) {
+      try {
+        const parsed = JSON.parse(match[0]);
+        return { score: parsed.score / 100, reasoning: parsed.reasoning };
+      } catch {
+        // Fall through
+      }
+    }
+    return { score: 0, reasoning: 'parse-error', error: true };
+  }
+}

package/src/utils/trigger-matcher.js ADDED Viewed

@@ -0,0 +1,64 @@
+/**
+ * trigger-matcher.js — Scores prompts against skill descriptions.
+ *
+ * Uses keyword overlap scoring to determine how well a user prompt
+ * matches a skill's description. No LLM calls — purely programmatic.
+ */
+/**
+ * Tokenizes text into lowercase words, stripping punctuation.
+ * @param {string} text
+ * @returns {string[]}
+ */
+export function tokenize(text) {
+  return text
+    .toLowerCase()
+    .replace(/[—–\-/]/g, ' ')
+    .replace(/[^\w\s]/g, '')
+    .split(/\s+/)
+    .filter(w => w.length > 1);
+}
+const STOP_WORDS = new Set([
+  'the', 'is', 'at', 'in', 'on', 'to', 'of', 'for', 'and', 'or', 'an',
+  'it', 'by', 'as', 'be', 'do', 'if', 'no', 'so', 'up', 'we', 'my',
+  'use', 'when', 'with', 'from', 'this', 'that', 'will', 'can', 'has',
+  'not', 'are', 'was', 'but', 'all', 'any', 'its', 'you', 'your',
+  'skill', 'discipline',
+]);
+/**
+ * Scores how well a prompt matches a description.
+ * Returns 0-1.
+ */
+export function scoreMatch(prompt, description) {
+  const promptTokens = tokenize(prompt).filter(w => !STOP_WORDS.has(w));
+  if (promptTokens.length === 0) return 0;
+  const descTokens = new Set(tokenize(description).filter(w => !STOP_WORDS.has(w)));
+  let matches = 0;
+  for (const token of promptTokens) {
+    if (descTokens.has(token)) {
+      matches++;
+    } else {
+      for (const dt of descTokens) {
+        if (dt.includes(token) || token.includes(dt)) {
+          matches += 0.5;
+          break;
+        }
+      }
+    }
+  }
+  return matches / promptTokens.length;
+}
+/**
+ * Ranks all skills by match score descending.
+ */
+export function rankSkills(prompt, skills) {
+  return skills
+    .map(s => ({ ...s, score: scoreMatch(prompt, s.description) }))
+    .sort((a, b) => b.score - a.score);
+}

package/src/utils/trigger-runner.js ADDED Viewed

@@ -0,0 +1,132 @@
+/**
+ * trigger-runner.js — Loads and executes trigger tests for skills.
+ */
+import { readFileSync, existsSync, readdirSync } from 'fs';
+import { join, dirname } from 'path';
+import { fileURLToPath } from 'url';
+import { rankSkills } from './trigger-matcher.js';
+import { extractFrontmatterBlock, parseYamlFrontmatter } from './workflow-parser.js';
+const __dirname = dirname(fileURLToPath(import.meta.url)); // Directory containing this module, derived from its URL.
+const TEMPLATES_DIR = join(__dirname, '..', 'templates', 'skills'); // ../templates/skills relative to this module.
+/**
+ * Loads triggers.json for a skill template.
+ * @param {string} skillName
+ * @returns {object|null}
+ */
+export function loadTriggers(skillName) {
+  const triggersPath = join(TEMPLATES_DIR, skillName, 'evals', 'triggers.json');
+  if (!existsSync(triggersPath)) return null;
+  return JSON.parse(readFileSync(triggersPath, 'utf8'));
+}
+/**
+ * Loads all skill names and descriptions from templates.
+ * @returns {{ name: string, description: string }[]}
+ */
+export function loadAllSkillDescriptions() {
+  const skillDirs = readdirSync(TEMPLATES_DIR, { withFileTypes: true })
+    .filter(d => d.isDirectory())
+    .map(d => d.name);
+  const skills = [];
+  for (const name of skillDirs) {
+    const skillPath = join(TEMPLATES_DIR, name, 'SKILL.md');
+    if (!existsSync(skillPath)) continue;
+    const content = readFileSync(skillPath, 'utf8');
+    const block = extractFrontmatterBlock(content);
+    if (!block) continue;
+    const fm = parseYamlFrontmatter(block.yaml);
+    if (fm.description) {
+      skills.push({ name, description: fm.description });
+    }
+  }
+  return skills;
+}
+/**
+ * Runs trigger tests for a skill.
+ *
+ * When matcherType is "keyword" and a test has keywordExpected defined,
+ * that value overrides shouldTrigger for accuracy calculation. This lets
+ * tests document the ideal (semantic) expectation while being honest
+ * about what keyword matching can achieve.
+ *
+ * @param {object} triggers - Trigger test config from triggers.json
+ * @param {Array} allSkills - All skill descriptions
+ * @param {object} [options] - Options
+ * @param {boolean} [options.semantic=false] - Use semantic matcher
+ * @param {Function} [options.scoreMatchSemantic] - Semantic scoring function (injected for testability)
+ */
+export async function runTriggerTests(triggers, allSkills, options = {}) {
+  const { semantic = false, scoreMatchSemantic: semanticFn } = options;
+  const threshold = triggers.threshold || 0.3;
+  const isKeyword = !semantic && triggers.matcherType === 'keyword';
+  const results = [];
+  for (const test of triggers.tests) {
+    let actual, score, rank, reasoning;
+    if (semantic && semanticFn) {
+      const targetSkill = allSkills.find(s => s.name === triggers.skill);
+      const semanticResult = await semanticFn(test.prompt, triggers.skill, targetSkill?.description || triggers.description);
+      score = semanticResult.score;
+      actual = score >= threshold;
+      rank = null;
+      reasoning = semanticResult.reasoning;
+    } else {
+      const ranked = rankSkills(test.prompt, allSkills);
+      const targetRank = ranked.findIndex(s => s.name === triggers.skill);
+      score = targetRank >= 0 ? ranked[targetRank].score : 0;
+      actual = targetRank === 0 && score >= threshold;
+      rank = targetRank + 1;
+    }
+    const hasOverride = isKeyword && test.keywordExpected !== undefined;
+    const expected = hasOverride ? test.keywordExpected : test.shouldTrigger;
+    const result = {
+      prompt: test.prompt,
+      expected,
+      actual,
+      score,
+      rank,
+      matcherUsed: semantic ? 'semantic' : 'keyword',
+    };
+    if (reasoning) {
+      result.reasoning = reasoning;
+    }
+    if (hasOverride) {
+      result.semanticExpected = test.shouldTrigger;
+    }
+    results.push(result);
+  }
+  return results;
+}
+/**
+ * Computes precision, recall, and accuracy from trigger test results.
+ */
+export function computeAccuracy(results) {
+  if (results.length === 0) return { precision: 0, recall: 0, accuracy: 0, total: 0, tp: 0, fp: 0, fn: 0, tn: 0 };
+  let tp = 0, fp = 0, fn = 0, tn = 0;
+  for (const r of results) {
+    if (r.expected && r.actual) tp++;
+    else if (!r.expected && r.actual) fp++;
+    else if (r.expected && !r.actual) fn++;
+    else tn++;
+  }
+  const precision = (tp + fp) > 0 ? tp / (tp + fp) : 0;
+  const recall = (tp + fn) > 0 ? tp / (tp + fn) : 0;
+  const accuracy = (tp + tn) / results.length;
+  return { precision, recall, accuracy, total: results.length, tp, fp, fn, tn };
+}

package/src/utils/workspace.js CHANGED Viewed

@@ -1,8 +1,15 @@
 import { existsSync, readFileSync, readdirSync } from 'fs';
 import { join, dirname, resolve } from 'path';
+import { execFileSync } from 'node:child_process';
 export const WORKSPACE_FILE = 'guild-workspace.json';
+export const PRESET_COMMANDS = { // Named presets; each pairs an executable (cmd) with its argument list (args).
+  test:  { cmd: 'npm', args: ['test'] }, // npm test
+  lint:  { cmd: 'npm', args: ['run', 'lint'] }, // npm run lint
+  build: { cmd: 'npm', args: ['run', 'build'] }, // npm run build
+};
 export function findWorkspaceRoot(startDir = process.cwd()) {
   let dir = resolve(startDir);
   while (true) {
@@ -80,3 +87,85 @@ export function generateWorkspaceContext(workspace, currentMemberName) {
   return lines.join('\n');
 }
+export function collectMemberContext(workspace, currentMemberName) {
+  if (!workspace) return '';
+  const siblings = workspace.members.filter(m => m.name !== currentMemberName);
+  if (siblings.length === 0) return '';
+  const lines = [`## Workspace: ${workspace.name}`, ''];
+  for (const member of siblings) {
+    lines.push(`### ${member.name} (sibling — ${member.absolutePath})`);
+    const projectMdPath = join(member.absolutePath, 'PROJECT.md');
+    if (existsSync(projectMdPath)) {
+      const content = readFileSync(projectMdPath, 'utf8');
+      const stackMatch = content.match(/\*\*Stack:\*\*\s*(.+)/);
+      if (stackMatch) {
+        lines.push(`- **Stack:** ${stackMatch[1].trim()}`);
+      }
+    }
+    const claudeMdPath = join(member.absolutePath, 'CLAUDE.md');
+    if (existsSync(claudeMdPath)) {
+      const content = readFileSync(claudeMdPath, 'utf8');
+      const structureMatch = content.match(/## Project structure\n(.+)/);
+      if (structureMatch) {
+        lines.push(`- **Structure:** ${structureMatch[1].trim()}`);
+      }
+    }
+    const sessionMdPath = join(member.absolutePath, 'SESSION.md');
+    if (existsSync(sessionMdPath)) {
+      const content = readFileSync(sessionMdPath, 'utf8');
+      const taskMatch = content.match(/\*\*Current task:\*\*\s*(.+)/);
+      if (taskMatch) {
+        lines.push(`- **Current task:** ${taskMatch[1].trim()}`);
+      }
+    }
+    lines.push(`You can read any file under ${member.absolutePath}/ for deeper analysis.`);
+    lines.push('');
+  }
+  return lines.join('\n').trim();
+}
+export function runInMember(member, cmd, args) {
+  if (!existsSync(member.absolutePath)) {
+    return {
+      member: member.name,
+      status: 'failed',
+      output: `Directory not found: ${member.absolutePath}`,
+      duration: 0,
+    };
+  }
+  const start = Date.now();
+  try {
+    const stdout = execFileSync(cmd, args, {
+      cwd: member.absolutePath,
+      encoding: 'utf8',
+      stdio: ['pipe', 'pipe', 'pipe'],
+    });
+    const duration = Date.now() - start;
+    return {
+      member: member.name,
+      status: 'passed',
+      output: stdout.trim(),
+      duration,
+    };
+  } catch (error) {
+    const duration = Date.now() - start;
+    const stdout = error.stdout || '';
+    const stderr = error.stderr || '';
+    return {
+      member: member.name,
+      status: 'failed',
+      output: (stdout + stderr).trim(),
+      duration,
+    };
+  }
+}