npm - tarsk - Versions diffs - 0.5.41 → 0.5.43 - Mend

tarsk 0.5.41 → 0.5.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (106) hide show

package/dist/bundled-skills/skill-creator/scripts/generate_report.js ADDED Viewed

@@ -0,0 +1,345 @@
+#!/usr/bin/env node
+/** Generate an HTML report from run_loop.js output. */
+const fs = require("fs");
+const path = require("path");
/**
 * Escape the five HTML-significant characters (& < > " ') in a value.
 *
 * @param {*} value - Any value; it is coerced with String() before escaping.
 * @returns {string} Text that is safe to place in element content or a quoted attribute.
 */
function escapeHtml(value) {
  const entityFor = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  // One pass over the text: each special character is swapped for its entity.
  return String(value).replace(/[&<>"']/g, (ch) => entityFor[ch]);
}
/**
 * Sum up how many individual runs behaved as expected across a list of query results.
 *
 * A run counts as correct when the query triggered and was expected to, or did not
 * trigger and was expected not to. Missing `runs`/`triggers` count as 0 and a missing
 * `should_trigger` is treated as true.
 *
 * @param {Iterable<{runs?: number, triggers?: number, should_trigger?: boolean}>} results
 * @returns {[number, number]} `[correct, total]` run counts.
 */
function aggregateRuns(results) {
  const tally = { correct: 0, total: 0 };
  for (const entry of results) {
    const runCount = entry.runs ?? 0;
    const triggerCount = entry.triggers ?? 0;
    const expectsTrigger = entry.should_trigger ?? true;
    tally.total += runCount;
    // For negative queries the correct runs are the ones that did NOT trigger.
    tally.correct += expectsTrigger ? triggerCount : runCount - triggerCount;
  }
  return [tally.correct, tally.total];
}
/**
 * Pick the CSS class used to colour a "correct/total" score badge.
 *
 * @param {number} correct - Number of correct runs.
 * @param {number} total - Number of runs overall.
 * @returns {string} "score-good" at >= 80%, "score-ok" at >= 50%, otherwise "score-bad"
 *   (also used when `total` is not greater than zero).
 */
function scoreClass(correct, total) {
  if (!(total > 0)) {
    return "score-bad";
  }
  const ratio = correct / total;
  // Tiers are checked from best to worst; the first floor the ratio reaches wins.
  const tiers = [
    [0.8, "score-good"],
    [0.5, "score-ok"],
  ];
  const reached = tiers.find(([floor]) => ratio >= floor);
  return reached ? reached[1] : "score-bad";
}
/**
 * Build the complete HTML report page for a description-optimization run.
 *
 * The column set (train queries, then test queries) is taken from the first history
 * entry; every later entry is rendered as one table row against those columns.
 * The row whose `test_passed` (or, when there are no test queries, `train_passed` /
 * `passed`) is highest gets the "best-row" class; ties go to the earliest entry.
 * An empty or missing history yields a page with an empty table instead of throwing.
 * All values taken from `data` are HTML-escaped before being embedded.
 *
 * @param {object} data - Parsed optimizer output. Fields read here: `history`,
 *   `original_description`, `best_description`, `best_score`, `best_test_score`,
 *   `iterations_run`, `train_size`, `test_size`.
 * @param {boolean} [autoRefresh=false] - When true, adds a 5-second meta refresh tag.
 * @param {string} [skillName=""] - Optional skill name used as a title prefix.
 * @returns {string} The full HTML document.
 */
function generateHtml(data, autoRefresh = false, skillName = "") {
  const history = data.history ?? [];
  const titlePrefix = skillName ? escapeHtml(`${skillName} \u2014 `) : "";
  const toQueryInfo = (r) => ({ query: r.query, should_trigger: r.should_trigger ?? true });
  const firstEntry = history[0];
  const trainQueries = firstEntry
    ? (firstEntry.train_results ?? firstEntry.results ?? []).map(toQueryInfo)
    : [];
  const testQueries =
    firstEntry && firstEntry.test_results ? firstEntry.test_results.map(toQueryInfo) : [];
  const refreshTag = autoRefresh ? '    <meta http-equiv="refresh" content="5">\n' : "";
  const htmlParts = [
    `<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
${refreshTag}    <title>${titlePrefix}Skill Description Optimization</title>
    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
    <style>
        body {
            font-family: 'Lora', Georgia, serif;
            max-width: 100%;
            margin: 0 auto;
            padding: 20px;
            background: #faf9f5;
            color: #141413;
        }
        h1 { font-family: 'Poppins', sans-serif; color: #141413; }
        .explainer {
            background: white;
            padding: 15px;
            border-radius: 6px;
            margin-bottom: 20px;
            border: 1px solid #e8e6dc;
            color: #b0aea5;
            font-size: 0.875rem;
            line-height: 1.6;
        }
        .summary {
            background: white;
            padding: 15px;
            border-radius: 6px;
            margin-bottom: 20px;
            border: 1px solid #e8e6dc;
        }
        .summary p { margin: 5px 0; }
        .best { color: #788c5d; font-weight: bold; }
        .table-container {
            overflow-x: auto;
            width: 100%;
        }
        table {
            border-collapse: collapse;
            background: white;
            border: 1px solid #e8e6dc;
            border-radius: 6px;
            font-size: 12px;
            min-width: 100%;
        }
        th, td {
            padding: 8px;
            text-align: left;
            border: 1px solid #e8e6dc;
            white-space: normal;
            word-wrap: break-word;
        }
        th {
            font-family: 'Poppins', sans-serif;
            background: #141413;
            color: #faf9f5;
            font-weight: 500;
        }
        th.test-col {
            background: #6a9bcc;
        }
        th.query-col { min-width: 200px; }
        td.description {
            font-family: monospace;
            font-size: 11px;
            word-wrap: break-word;
            max-width: 400px;
        }
        td.result {
            text-align: center;
            font-size: 16px;
            min-width: 40px;
        }
        td.test-result {
            background: #f0f6fc;
        }
        .pass { color: #788c5d; }
        .fail { color: #c44; }
        .rate {
            font-size: 9px;
            color: #b0aea5;
            display: block;
        }
        tr:hover { background: #faf9f5; }
        .score {
            display: inline-block;
            padding: 2px 6px;
            border-radius: 4px;
            font-weight: bold;
            font-size: 11px;
        }
        .score-good { background: #eef2e8; color: #788c5d; }
        .score-ok { background: #fef3c7; color: #d97706; }
        .score-bad { background: #fceaea; color: #c44; }
        .train-label { color: #b0aea5; font-size: 10px; }
        .test-label { color: #6a9bcc; font-size: 10px; font-weight: bold; }
        .best-row { background: #f5f8f2; }
        th.positive-col { border-bottom: 3px solid #788c5d; }
        th.negative-col { border-bottom: 3px solid #c44; }
        th.test-col.positive-col { border-bottom: 3px solid #788c5d; }
        th.test-col.negative-col { border-bottom: 3px solid #c44; }
        .legend { font-family: 'Poppins', sans-serif; display: flex; gap: 20px; margin-bottom: 10px; font-size: 13px; align-items: center; }
        .legend-item { display: flex; align-items: center; gap: 6px; }
        .legend-swatch { width: 16px; height: 16px; border-radius: 3px; display: inline-block; }
        .swatch-positive { background: #141413; border-bottom: 3px solid #788c5d; }
        .swatch-negative { background: #141413; border-bottom: 3px solid #c44; }
        .swatch-test { background: #6a9bcc; }
        .swatch-train { background: #141413; }
    </style>
</head>
<body>
    <h1>${titlePrefix}Skill Description Optimization</h1>
    <div class="explainer">
        <strong>Optimizing your skill's description.</strong> This page updates automatically as Tarsk tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Tarsk will apply the best-performing description to your skill.
    </div>
`,
  ];
  const bestTestScore = data.best_test_score;
  htmlParts.push(`
    <div class="summary">
        <p><strong>Original:</strong> ${escapeHtml(data.original_description ?? "N/A")}</p>
        <p class="best"><strong>Best:</strong> ${escapeHtml(data.best_description ?? "N/A")}</p>
        <p><strong>Best Score:</strong> ${escapeHtml(data.best_score ?? "N/A")} ${bestTestScore ? "(test)" : "(train)"}</p>
        <p><strong>Iterations:</strong> ${escapeHtml(data.iterations_run ?? 0)} | <strong>Train:</strong> ${escapeHtml(data.train_size ?? "?")} | <strong>Test:</strong> ${escapeHtml(data.test_size ?? "?")}</p>
    </div>
`);
  htmlParts.push(`
    <div class="legend">
        <span style="font-weight:600">Query columns:</span>
        <span class="legend-item"><span class="legend-swatch swatch-positive"></span> Should trigger</span>
        <span class="legend-item"><span class="legend-swatch swatch-negative"></span> Should NOT trigger</span>
        <span class="legend-item"><span class="legend-swatch swatch-train"></span> Train</span>
        <span class="legend-item"><span class="legend-swatch swatch-test"></span> Test</span>
    </div>
`);
  htmlParts.push(`
    <div class="table-container">
    <table>
        <thead>
            <tr>
                <th>Iter</th>
                <th>Train</th>
                <th>Test</th>
                <th class="query-col">Description</th>
`);
  for (const qinfo of trainQueries) {
    const polarity = qinfo.should_trigger ? "positive-col" : "negative-col";
    htmlParts.push(`                <th class="${polarity}">${escapeHtml(qinfo.query)}</th>\n`);
  }
  for (const qinfo of testQueries) {
    const polarity = qinfo.should_trigger ? "positive-col" : "negative-col";
    htmlParts.push(
      `                <th class="test-col ${polarity}">${escapeHtml(qinfo.query)}</th>\n`,
    );
  }
  htmlParts.push(`            </tr>
        </thead>
        <tbody>
`);

  // Pick the best entry by test score when test queries exist, else by train score.
  // Starting from history[0] and using a strict ">" keeps the earliest entry on ties;
  // with an empty history there is simply no best entry (bestIter stays undefined).
  const passedOf = testQueries.length
    ? (h) => h.test_passed ?? 0
    : (h) => h.train_passed ?? h.passed ?? 0;
  let bestEntry = firstEntry;
  for (const h of history) {
    if (passedOf(h) > passedOf(bestEntry)) {
      bestEntry = h;
    }
  }
  const bestIter = bestEntry?.iteration;

  // A Map keyed by query avoids accidental hits on Object.prototype member names.
  const indexByQuery = (rows) => new Map(rows.map((r) => [r.query, r]));

  // Render one pass/fail cell; `extraClass` is "" for train and "test-result " for test.
  const resultCell = (result, extraClass) => {
    const r = result ?? {};
    const didPass = r.pass ?? false;
    const icon = didPass ? "✓" : "✗";
    const cssClass = didPass ? "pass" : "fail";
    const rate = `${escapeHtml(r.triggers ?? 0)}/${escapeHtml(r.runs ?? 0)}`;
    return `                <td class="result ${extraClass}${cssClass}">${icon}<span class="rate">${rate}</span></td>\n`;
  };

  for (const h of history) {
    const iteration = h.iteration ?? "?";
    const description = h.description ?? "";
    const trainResults = h.train_results ?? h.results ?? [];
    const testResults = h.test_results ?? [];
    const trainByQuery = indexByQuery(trainResults);
    const testByQuery = indexByQuery(testResults);
    const [trainCorrect, trainRuns] = aggregateRuns(trainResults);
    const [testCorrect, testRuns] = aggregateRuns(testResults);
    const trainClass = scoreClass(trainCorrect, trainRuns);
    const testClass = scoreClass(testCorrect, testRuns);
    const rowClass = iteration === bestIter ? "best-row" : "";
    htmlParts.push(`            <tr class="${rowClass}">
                <td>${escapeHtml(iteration)}</td>
                <td><span class="score ${trainClass}">${escapeHtml(trainCorrect)}/${escapeHtml(trainRuns)}</span></td>
                <td><span class="score ${testClass}">${escapeHtml(testCorrect)}/${escapeHtml(testRuns)}</span></td>
                <td class="description">${escapeHtml(description)}</td>
`);
    for (const qinfo of trainQueries) {
      htmlParts.push(resultCell(trainByQuery.get(qinfo.query), ""));
    }
    for (const qinfo of testQueries) {
      htmlParts.push(resultCell(testByQuery.get(qinfo.query), "test-result "));
    }
    htmlParts.push("            </tr>\n");
  }
  htmlParts.push(`        </tbody>
    </table>
    </div>
</body>
</html>
`);
  return htmlParts.join("");
}
/**
 * Parse the report generator's command line.
 *
 * Recognised options: `-o` / `--output PATH` and `--skill-name NAME`. The first
 * positional argument is the input path; a bare "-" is accepted as a positional
 * (the usage text documents it as "read from stdin"). Other arguments that start
 * with "-" are ignored.
 *
 * @param {string[]} argv - Full argument vector; parsing starts at index 2.
 * @returns {{input: (string|null), output: (string|null), skillName: string}}
 */
function parseArgs(argv) {
  const args = { input: null, output: null, skillName: "" };
  const positional = [];
  for (let i = 2; i < argv.length; i++) {
    const arg = argv[i];
    if (arg === "-o" || arg === "--output") {
      args.output = argv[++i] ?? null;
    } else if (arg === "--skill-name") {
      args.skillName = argv[++i] ?? "";
    } else if (arg === "-" || !arg.startsWith("-")) {
      // "-" starts with a dash but is the stdin placeholder, not an option.
      positional.push(arg);
    }
  }
  args.input = positional[0] ?? null;
  return args;
}
/**
 * CLI entry point: read the optimizer's JSON output and emit the HTML report.
 *
 * Prints usage and exits with status 1 when no input was given. The report is
 * written to the `-o` path when one was supplied (with a confirmation on stderr),
 * otherwise to stdout.
 */
function main() {
  const { input, output, skillName } = parseArgs(process.argv);
  if (!input) {
    console.error(
      "Usage: node generate_report.js <input.json|-> [-o output.html] [--skill-name NAME]",
    );
    process.exit(1);
  }

  // An input of "-" reads file descriptor 0 (stdin) instead of a named file.
  const source = input === "-" ? 0 : input;
  const data = JSON.parse(fs.readFileSync(source, "utf-8"));
  const htmlOutput = generateHtml(data, false, skillName);

  if (!output) {
    process.stdout.write(htmlOutput);
    return;
  }
  fs.writeFileSync(output, htmlOutput);
  console.error(`Report written to ${output}`);
}
+if (require.main === module) { // Run the CLI only when this file is the entry script, not when it is required.
+  main();
+}
+module.exports = { generateHtml }; // generateHtml is the only export of this module.

package/dist/bundled-skills/skill-creator/scripts/improve_description.js ADDED Viewed

@@ -0,0 +1,263 @@
+#!/usr/bin/env node
+/** Improve a skill description based on eval results. */
+const fs = require("fs");
+const path = require("path");
+const { spawnSync } = require("child_process");
+const { parseSkillMd } = require("./utils.js");
/**
 * Run `tarsk -p --output-format text` synchronously, feeding `prompt` on stdin.
 *
 * @param {string} prompt - Text written to the child's stdin.
 * @param {string} [model] - When truthy, passed through as `--model <model>`.
 * @param {number} [timeoutMs=300000] - Timeout handed to spawnSync.
 * @returns {string} The child's stdout.
 * @throws {Error} The spawn error if the process could not be run, or an Error
 *   carrying the exit status and stderr when the status is not 0.
 */
function callTarsk(prompt, model, timeoutMs = 300000) {
  const cliArgs = ["-p", "--output-format", "text", ...(model ? ["--model", model] : [])];

  // Pass the current environment minus TarskCODE.
  // NOTE(review): presumably this keeps the child from treating itself as a nested
  // session — confirm against the tarsk CLI.
  const env = Object.fromEntries(
    Object.entries(process.env).filter(([key]) => key !== "TarskCODE"),
  );

  const { error, status, stderr, stdout } = spawnSync("tarsk", cliArgs, {
    input: prompt,
    encoding: "utf-8",
    env,
    timeout: timeoutMs,
    maxBuffer: 50 * 1024 * 1024,
  });
  if (error) {
    throw error;
  }
  if (status !== 0) {
    throw new Error(`tarsk -p exited ${status}\nstderr: ${stderr}`);
  }
  return stdout;
}
/**
 * Ask the model for a better skill description based on evaluation failures.
 *
 * Builds a prompt from the current description, the failing queries in
 * `evalResults`, and the previous attempts in `history`, then sends it through
 * `callTarsk`. The reply's `<new_description>` tag content (or the whole reply when
 * the tag is absent) is trimmed and stripped of one leading/trailing quote. If the
 * result is longer than 1024 characters, one follow-up call asks for a shorter
 * rewrite and that rewrite is used as-is.
 *
 * @param {string} skillName - Skill name quoted in the prompt.
 * @param {string} skillContent - Skill body embedded in the prompt for context.
 * @param {string} currentDescription - Description that produced `evalResults`.
 * @param {{results: object[], summary: {passed: number, total: number}}} evalResults
 * @param {object[]} history - Earlier attempts (description, scores, results, note).
 * @param {string} model - Model name forwarded to `callTarsk`.
 * @param {?{summary: {passed: number, total: number}}} [testResults=null] - Optional
 *   held-out results; only the summary is shown in the prompt.
 * @param {?string} [logDir=null] - When set, a JSON transcript is written there.
 * @param {?(number|string)} [iteration=null] - Recorded in the transcript and used in
 *   the log file name ("unknown" when null).
 * @returns {string} The new description.
 */
function improveDescription(
  skillName,
  skillContent,
  currentDescription,
  evalResults,
  history,
  model,
  testResults = null,
  logDir = null,
  iteration = null,
) {
  // Pull the description out of a model reply.
  const unwrap = (reply) => {
    const tagged = reply.match(/<new_description>([\s\S]*?)<\/new_description>/);
    const body = tagged ? tagged[1] : reply;
    return body.trim().replace(/^["']|["']$/g, "");
  };
  // One bullet per failing query.
  const missLines = (rows) =>
    rows.map((r) => `  - "${r.query}" (triggered ${r.triggers}/${r.runs} times)\n`).join("");

  const misses = evalResults.results.filter((r) => !r.pass);
  const failedTriggers = misses.filter((r) => r.should_trigger);
  const falseTriggers = misses.filter((r) => !r.should_trigger);

  const trainScore = `${evalResults.summary.passed}/${evalResults.summary.total}`;
  const scoresSummary = testResults
    ? `Train: ${trainScore}, Test: ${testResults.summary.passed}/${testResults.summary.total}`
    : `Train: ${trainScore}`;

  const sections = [
    `You are optimizing a skill description for a Tarsk Code skill called "${skillName}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that Tarsk sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples.
The description appears in Tarsk's "available_skills" list. When a user sends a query, Tarsk decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones.
Here's the current description:
<current_description>
"${currentDescription}"
</current_description>
Current scores (${scoresSummary}):
<scores_summary>
`,
  ];

  if (failedTriggers.length) {
    sections.push(
      "FAILED TO TRIGGER (should have triggered but didn't):\n",
      missLines(failedTriggers),
      "\n",
    );
  }
  if (falseTriggers.length) {
    sections.push(
      "FALSE TRIGGERS (triggered but shouldn't have):\n",
      missLines(falseTriggers),
      "\n",
    );
  }

  if (history.length) {
    sections.push(
      "PREVIOUS ATTEMPTS (do NOT repeat these — try something structurally different):\n\n",
    );
    for (const attempt of history) {
      const trainPart = `${attempt.train_passed ?? attempt.passed ?? 0}/${attempt.train_total ?? attempt.total ?? 0}`;
      const testPart =
        attempt.test_passed != null
          ? `${attempt.test_passed}/${attempt.test_total ?? "?"}`
          : null;
      const scoreStr = testPart ? `train=${trainPart}, test=${testPart}` : `train=${trainPart}`;
      sections.push(`<attempt ${scoreStr}>\n`, `Description: "${attempt.description}"\n`);
      if (attempt.results) {
        sections.push("Train results:\n");
        for (const r of attempt.results) {
          const status = r.pass ? "PASS" : "FAIL";
          // Queries are cut to 80 characters to keep the history compact.
          sections.push(
            `  [${status}] "${r.query.slice(0, 80)}" (triggered ${r.triggers}/${r.runs})\n`,
          );
        }
      }
      if (attempt.note) {
        sections.push(`Note: ${attempt.note}\n`);
      }
      sections.push("</attempt>\n\n");
    }
  }

  sections.push(`</scores_summary>
Skill content (for context on what the skill does):
<skill_content>
${skillContent}
</skill_content>
Based on the failures, write a new and improved description that is more likely to trigger correctly. When I say "based on the failures", it's a bit of a tricky line to walk because we don't want to overfit to the specific cases you're seeing. So what I DON'T want you to do is produce an ever-expanding list of specific queries that this skill should or shouldn't trigger for. Instead, try to generalize from the failures to broader categories of user intent and situations where this skill would be useful or not useful. The reason for this is twofold:
1. Avoid overfitting
2. The list might get loooong and it's injected into ALL queries and there might be a lot of skills, so we don't want to blow too much space on any given description.
Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy. There is a hard limit of 1024 characters — descriptions over that will be truncated, so stay comfortably under it.
Here are some tips that we've found to work well in writing these descriptions:
- The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does"
- The skill description should focus on the user's intent, what they are trying to achieve, vs. the implementation details of how the skill works.
- The description competes with other skills for Tarsk's attention — make it distinctive and immediately recognizable.
- If you're getting lots of failures after repeated attempts, change things up. Try different sentence structures or wordings.
I'd encourage you to be creative and mix up the style in different iterations since you'll have multiple opportunities to try different approaches and we'll just grab the highest-scoring one at the end.
Please respond with only the new description text in <new_description> tags, nothing else.`);

  const prompt = sections.join("");
  const firstReply = callTarsk(prompt, model);
  let description = unwrap(firstReply);

  // Key order here is the order the transcript is serialized in.
  const transcript = {
    iteration,
    prompt,
    response: firstReply,
    parsed_description: description,
    char_count: description.length,
    over_limit: description.length > 1024,
  };

  if (transcript.over_limit) {
    // One retry: show the model its over-long answer and ask for a shorter one.
    const shortenPrompt = [
      prompt,
      "\n\n---\n\n",
      `A previous attempt produced this description, which at ${description.length} characters `,
      "is over the 1024-character hard limit:\n\n",
      `"${description}"\n\n`,
      "Rewrite it to be under 1024 characters while keeping the most important trigger words ",
      "and intent coverage. Respond with only the new description in <new_description> tags.",
    ].join("");
    const shortenText = callTarsk(shortenPrompt, model);
    const shortened = unwrap(shortenText);
    transcript.rewrite_prompt = shortenPrompt;
    transcript.rewrite_response = shortenText;
    transcript.rewrite_description = shortened;
    transcript.rewrite_char_count = shortened.length;
    description = shortened;
  }
  transcript.final_description = description;

  if (logDir) {
    fs.mkdirSync(logDir, { recursive: true });
    const logFile = path.join(logDir, `improve_iter_${iteration ?? "unknown"}.json`);
    fs.writeFileSync(logFile, JSON.stringify(transcript, null, 2));
  }
  return description;
}
/**
 * Parse the improve_description command line.
 *
 * `--eval-results`, `--skill-path`, `--history` and `--model` each consume the next
 * argument as their value; `--verbose` is a boolean switch. Anything else is ignored.
 *
 * @param {string[]} argv - Full argument vector; parsing starts at index 2.
 * @returns {{evalResults: ?string, skillPath: ?string, history: ?string, model: ?string, verbose: boolean}}
 */
function parseArgs(argv) {
  // Option name -> property that receives the following argument.
  const valueOptions = new Map([
    ["--eval-results", "evalResults"],
    ["--skill-path", "skillPath"],
    ["--history", "history"],
    ["--model", "model"],
  ]);
  const args = {
    evalResults: null,
    skillPath: null,
    history: null,
    model: null,
    verbose: false,
  };
  let index = 2;
  while (index < argv.length) {
    const token = argv[index];
    const target = valueOptions.get(token);
    if (target !== undefined) {
      index += 1;
      args[target] = argv[index];
    } else if (token === "--verbose") {
      args.verbose = true;
    }
    index += 1;
  }
  return args;
}
/**
 * CLI entry point: produce one improved description and print it as JSON.
 *
 * Requires `--eval-results`, `--skill-path` and `--model`; exits with status 1 when
 * one is missing or when the skill directory has no SKILL.md. Prints to stdout an
 * object with the new description and the history extended by the description that
 * was just evaluated. With `--verbose`, progress lines go to stderr.
 */
function main() {
  const args = parseArgs(process.argv);
  const hasAllRequired = [args.evalResults, args.skillPath, args.model].every(Boolean);
  if (!hasAllRequired) {
    console.error(
      "Usage: node improve_description.js --eval-results PATH --skill-path PATH --model MODEL [--history PATH] [--verbose]",
    );
    process.exit(1);
  }

  const skillPath = path.resolve(args.skillPath);
  const skillFile = path.join(skillPath, "SKILL.md");
  if (!fs.existsSync(skillFile)) {
    console.error(`Error: No SKILL.md found at ${skillPath}`);
    process.exit(1);
  }

  const readJson = (file) => JSON.parse(fs.readFileSync(file, "utf-8"));
  const evalResults = readJson(args.evalResults);
  const history = args.history ? readJson(args.history) : [];

  const { name, content } = parseSkillMd(skillPath);
  const currentDescription = evalResults.description;
  const log = (line) => {
    if (args.verbose) {
      console.error(line);
    }
  };
  log(`Current: ${currentDescription}`);
  if (args.verbose) {
    // Kept behind the flag so the summary is only read when it will be printed.
    console.error(`Score: ${evalResults.summary.passed}/${evalResults.summary.total}`);
  }

  const newDescription = improveDescription(
    name,
    content,
    currentDescription,
    evalResults,
    history,
    args.model,
  );
  log(`Improved: ${newDescription}`);

  // The evaluated description is appended to the history in the emitted JSON.
  const evaluatedAttempt = {
    description: currentDescription,
    passed: evalResults.summary.passed,
    failed: evalResults.summary.failed,
    total: evalResults.summary.total,
    results: evalResults.results,
  };
  const output = {
    description: newDescription,
    history: [...history, evaluatedAttempt],
  };
  console.log(JSON.stringify(output, null, 2));
}
+if (require.main === module) { // Run the CLI only when this file is the entry script, not when it is required.
+  main();
+}
+module.exports = { improveDescription, callTarsk }; // Both functions are exported for use by other scripts.