npm - agent-challenge - Versions diffs - 1.0.0 → 1.3.0 - Mend

agent-challenge 1.0.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/package.json +1 -1
package/src/agentchallenge.js +186 -12

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "agent-challenge",
-  "version": "1.0.0",
+  "version": "1.3.0",
   "description": "Drop-in LLM authentication for any API endpoint. Reasoning puzzles that agents solve once, then pass through forever. Stateless HMAC tokens, no database.",
   "main": "src/agentchallenge.js",
   "type": "module",

package/src/agentchallenge.js CHANGED Viewed

@@ -1,5 +1,5 @@
 /**
- * agent-challenge v1.0.0 (JavaScript/Node.js port)
+ * agent-challenge v1.1.0 (JavaScript/Node.js port)
  *
  * LLM-solvable challenge-response authentication for AI agent APIs.
  * 12 static challenge types with fully randomized inputs.
@@ -1361,11 +1361,148 @@ CHALLENGE_TYPES.string_interleave = () => {
   }
 };
+// Chained arithmetic: multi-step chains + knowledge facts — GPT-5.2 100%, GPT-4o 30%
+CHALLENGE_TYPES.chained_arithmetic = () => {
+  const pattern = pick(["add_mul_sub_mod", "mul_add_mul_mod", "add_square_sub_mod", "mul_sub_add_mod"]);
+  if (pattern === "add_mul_sub_mod") {
+    const a = randInt(2,9), b = randInt(2,9), c = randInt(2,5), d = randInt(1,9), m = randInt(3,7);
+    return { prompt: buildPrompt(pick([
+      `Add ${a} and ${b}. Multiply by ${c}. Subtract ${d}. Find the remainder when divided by ${m}.`,
+      `What is ((${a} + ${b}) × ${c} - ${d}) mod ${m}?`,
+      `Compute ${a} + ${b}, multiply the sum by ${c}, subtract ${d}, remainder mod ${m}.`,
+      `Sum ${a} and ${b}, then multiply by ${c}, then subtract ${d}. What is the remainder after dividing by ${m}?`,
+      `(${a} + ${b}) × ${c} − ${d}. Divide by ${m} and give the remainder.`,
+    ])), answer: String(((a + b) * c - d) % m) };
+  } else if (pattern === "mul_add_mul_mod") {
+    const a = randInt(2,7), b = randInt(2,5), c = randInt(3,9), d = randInt(2,4), m = randInt(3,7);
+    return { prompt: buildPrompt(pick([
+      `Multiply ${a} by ${b}. Add ${c}. Multiply by ${d}. Find the remainder when divided by ${m}.`,
+      `What is ((${a} × ${b} + ${c}) × ${d}) mod ${m}?`,
+      `Compute ${a} × ${b}, add ${c}, multiply by ${d}, remainder mod ${m}.`,
+      `Product of ${a} and ${b}, plus ${c}, times ${d}. What is the remainder after dividing by ${m}?`,
+    ])), answer: String(((a * b + c) * d) % m) };
+  } else if (pattern === "add_square_sub_mod") {
+    const a = randInt(2,5), b = randInt(1,4), c = randInt(1,8), m = randInt(3,7);
+    return { prompt: buildPrompt(pick([
+      `Add ${a} and ${b}. Square the result. Subtract ${c}. Find the remainder when divided by ${m}.`,
+      `What is ((${a} + ${b})² - ${c}) mod ${m}?`,
+      `Compute (${a} + ${b}) squared, subtract ${c}, remainder mod ${m}.`,
+      `Sum ${a} and ${b}, square it, subtract ${c}. Remainder after dividing by ${m}?`,
+    ])), answer: String(((a + b) ** 2 - c) % m) };
+  } else { // mul_sub_add_mod
+    const a = randInt(3,9), b = randInt(2,5), c = randInt(1, Math.min(a*b-1,9)), d = randInt(2,9), m = randInt(3,7);
+    return { prompt: buildPrompt(pick([
+      `Multiply ${a} by ${b}. Subtract ${c}. Add ${d}. Find the remainder when divided by ${m}.`,
+      `What is (${a} × ${b} - ${c} + ${d}) mod ${m}?`,
+      `Compute ${a} × ${b}, subtract ${c}, add ${d}, remainder mod ${m}.`,
+      `Product of ${a} and ${b}, minus ${c}, plus ${d}. Remainder after dividing by ${m}?`,
+    ])), answer: String((a * b - c + d) % m) };
+  }
+};
+// Power modulo: base^exp mod m — GPT-5.2 100%, GPT-4o 80%
+CHALLENGE_TYPES.power_mod = () => {
+  const base = randInt(2, 5), exp = randInt(3, 6), m = randInt(5, 13);
+  const answer = (base ** exp) % m;
+  const templates = [
+    `Compute ${base} raised to the power ${exp}, then find the remainder when divided by ${m}.`,
+    `What is ${base}^${exp} mod ${m}?`,
+    `Calculate ${base} to the ${exp}th power. Find the remainder when divided by ${m}.`,
+    `Raise ${base} to the power of ${exp}. What is the remainder after dividing by ${m}?`,
+  ];
+  return { prompt: buildPrompt(pick(templates)), answer: String(answer) };
+};
+// Knowledge + math: world-knowledge facts (values stated) + arithmetic + mod
+// GPT-5.2: 100% | GPT-4o: ~85-93% | Humans: need Google for fact verification
+CHALLENGE_TYPES.knowledge_math = () => {
+  const FACTS = [
+    ["The atomic number of oxygen is {v}", 8],
+    ["The atomic number of carbon is {v}", 6],
+    ["The atomic number of nitrogen is {v}", 7],
+    ["The atomic number of neon is {v}", 10],
+    ["The atomic number of sodium is {v}", 11],
+    ["The atomic number of iron is {v}", 26],
+    ["The atomic number of copper is {v}", 29],
+    ["The atomic number of gold is {v}", 79],
+    ["The atomic number of silver is {v}", 47],
+    ["There are {v} planets in our solar system", 8],
+    ["There are {v} continents on Earth", 7],
+    ["A hexagon has {v} sides", 6],
+    ["A pentagon has {v} sides", 5],
+    ["A standard guitar has {v} strings", 6],
+    ["A violin has {v} strings", 4],
+    ["The English alphabet has {v} letters", 26],
+    ["An adult human has {v} teeth", 32],
+    ["The US flag has {v} stripes", 13],
+    ["A spider has {v} legs", 8],
+    ["An insect has {v} legs", 6],
+    ["There are {v} Harry Potter books in the main series", 7],
+    ["Brazil has won {v} FIFA World Cups", 5],
+    ["The Olympic flag has {v} rings", 5],
+    ["A standard die has {v} total dots across all faces", 21],
+    ["There are {v} ounces in a pound", 16],
+    ["There are {v} inches in a foot", 12],
+    ["A byte has {v} bits", 8],
+    ["Beethoven composed {v} symphonies", 9],
+    ["A soccer team fields {v} players", 11],
+    ["A basketball team has {v} players on court", 5],
+    ["A golf course has {v} holes", 18],
+    ["A standard deck has {v} cards", 52],
+    ["A marathon is approximately {v} miles", 26],
+    ["A chess board has {v} squares", 64],
+    ["There are {v} hours in a day", 24],
+    ["A human cell has {v} chromosomes", 46],
+    ["A piano has {v} keys", 88],
+  ];
+  const [f1, f2] = pickN(FACTS, 2);
+  const [tmpl1, val1] = f1;
+  const [tmpl2, val2] = f2;
+  const sent1 = tmpl1.replace('{v}', val1);
+  const sent2 = tmpl2.replace('{v}', val2);
+  const m = randInt(3, 9);
+  const ops = [];
+  if (val1 + val2 < 200) ops.push('add');
+  if (val1 * val2 < 5000) ops.push('mul');
+  if (val1 !== val2) ops.push('sub');
+  const op = pick(ops);
+  let result, opText;
+  if (op === 'add') {
+    result = (val1 + val2) % m;
+    opText = pick(["Add these two numbers", "Sum these two numbers"]);
+  } else if (op === 'mul') {
+    result = (val1 * val2) % m;
+    opText = pick(["Multiply these two numbers", "Find the product of these two numbers"]);
+  } else {
+    result = ((val1 >= val2 ? val1 - val2 : val2 - val1) % m + m) % m;
+    opText = val1 >= val2 ? "Subtract the second from the first" : "Subtract the first from the second";
+  }
+  return { prompt: buildPrompt(`${sent1} and ${sent2}. ${opText}, then find the remainder when divided by ${m}.`), answer: String(result) };
+};
+// Helper: pick N unique items from array
+function pickN(arr, n) {
+  const copy = [...arr];
+  const result = [];
+  for (let i = 0; i < n; i++) {
+    const idx = Math.floor(Math.random() * copy.length);
+    result.push(copy.splice(idx, 1)[0]);
+  }
+  return result;
+}
 const DIFFICULTY_MAP = {
-  easy: ['reverse_string', 'simple_math', 'pattern', 'counting', 'string_length', 'first_last'],
-  medium: ['reverse_string', 'simple_math', 'rot13', 'letter_position', 'extract_letters', 'pattern', 'counting', 'sorting', 'binary', 'ascii_value', 'string_math'],
-  hard: ['caesar', 'word_math', 'transform', 'binary', 'sorting', 'rot13', 'extract_letters', 'letter_position', 'counting', 'pattern', 'reverse_string', 'simple_math', 'substring', 'zigzag'],
-  agentic: ['chained_transform', 'multi_step_math', 'base_conversion_chain', 'word_extraction_chain', 'letter_math', 'caesar', 'nested_operations', 'string_interleave'],
+  // Easy: gpt-4o-mini solves 100% single-shot (empirically validated)
+  easy: ['simple_math', 'string_math', 'binary', 'pattern'],
+  // Medium: gpt-4o ~90%, gpt-4o-mini ~60%
+  medium: ['sorting', 'word_math'],
+  // Hard: gpt-5.2 100%, gpt-4o ~70-85%
+  hard: ['nested_operations', 'base_conversion_chain', 'power_mod', 'knowledge_math'],
+  // Agentic: multi-step chains, blocks both gpt-4o and gpt-4o-mini
+  agentic: ['chained_arithmetic'],
 };
@@ -2099,18 +2236,55 @@ async function safeSolve(prompt, llmFn, opts = {}) {
     if (!result.safe) throw new Error(`Prompt rejected (${result.method}): ${result.reason} (score: ${result.score})`);
   }
-  const answer = await llmFn(ISOLATION_PROMPT, prompt);
-  if (!answer || typeof answer !== 'string') throw new Error('LLM returned empty or invalid response');
+  let raw = await llmFn(ISOLATION_PROMPT, prompt);
+  if (!raw || typeof raw !== 'string') throw new Error('LLM returned empty or invalid response');
+  let answer = raw.trim();
+  // Strip markdown code fences
+  if (answer.startsWith('```') && answer.endsWith('```')) {
+    answer = answer.slice(3).replace(/`+$/, '').trim();
+    if (answer.includes('\n')) answer = answer.split('\n').pop().trim();
+  }
+  // Strip surrounding quotes
+  if (answer.length >= 2 && answer[0] === answer[answer.length - 1] && '"\'`'.includes(answer[0])) {
+    answer = answer.slice(1, -1).trim();
+  }
+  if (answer.length > maxAnswerLength) {
+    throw new Error(`Answer too long (${answer.length} chars) — possible injection in output`);
+  }
+  // Take first non-empty line if multi-line
+  if (answer.includes('\n')) {
+    const lines = answer.split('\n').map(l => l.trim()).filter(Boolean);
+    answer = lines.length ? lines[0] : '';
+  }
-  const trimmed = answer.trim();
-  if (trimmed.length > maxAnswerLength) {
-    throw new Error(`Answer too long (${trimmed.length} chars) — possible injection in output`);
+  // Strip explanation markers and extract raw answer
+  const markers = ['the answer is', 'the result is', 'the solution is', 'therefore',
+    'so the answer', 'which gives', 'this means', 'let me', 'step 1', 'step 2', 'first,', "here's"];
+  const lower = answer.toLowerCase();
+  for (const marker of markers) {
+    const idx = lower.indexOf(marker);
+    if (idx !== -1) {
+      const candidate = answer.slice(idx + marker.length).trim().replace(/^:/, '').trim();
+      if (candidate && candidate.length <= maxAnswerLength) { answer = candidate; break; }
+    }
   }
-  if (/https?:\/\/|<script|eval\(/.test(trimmed.toLowerCase())) {
+  // Re-strip quotes after extraction
+  if (answer.length >= 2 && answer[0] === answer[answer.length - 1] && '"\'`'.includes(answer[0])) {
+    answer = answer.slice(1, -1).trim();
+  }
+  // Suspicious content check
+  if (/https?:\/\/|<script|eval\(|import |require\(|__proto__/i.test(answer)) {
     throw new Error('Answer contains suspicious content — possible injection');
   }
-  return trimmed;
+  if (!answer) throw new Error('LLM returned empty answer after cleanup');
+  return answer;
 }
 export { CHALLENGE_TYPES, DIFFICULTY_MAP, _callLLM, validatePrompt, validatePromptSync, safeSolve, ISOLATION_PROMPT };