agent-challenge 1.0.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/agentchallenge.js +186 -12
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-challenge",
3
- "version": "1.0.0",
3
+ "version": "1.3.0",
4
4
  "description": "Drop-in LLM authentication for any API endpoint. Reasoning puzzles that agents solve once, then pass through forever. Stateless HMAC tokens, no database.",
5
5
  "main": "src/agentchallenge.js",
6
6
  "type": "module",
package/src/agentchallenge.js CHANGED
@@ -1,5 +1,5 @@
1
1
  /**
2
- * agent-challenge v1.0.0 (JavaScript/Node.js port)
2
+ * agent-challenge v1.3.0 (JavaScript/Node.js port)
3
3
  *
4
4
  * LLM-solvable challenge-response authentication for AI agent APIs.
5
5
  * 12 static challenge types with fully randomized inputs.
@@ -1361,11 +1361,148 @@ CHALLENGE_TYPES.string_interleave = () => {
1361
1361
  }
1362
1362
  };
1363
1363
 
1364
/**
 * Chained arithmetic challenge: a short chain of integer operations followed
 * by a modulo. (Original author's note: GPT-5.2 100%, GPT-4o 30%.)
 *
 * One of four operation chains is chosen at random, small operands are drawn,
 * and one of several equivalent phrasings is used for the prompt.
 *
 * Relies on `pick`, `randInt` and `buildPrompt`, which are defined elsewhere in
 * this file. NOTE(review): `randInt(lo, hi)` is assumed to include both bounds
 * and `pick(list)` to return one element of `list` — confirm against their
 * definitions.
 *
 * @returns {{prompt: string, answer: string}} The challenge text and the
 *   expected answer as a decimal string in the range 0..m-1.
 */
CHALLENGE_TYPES.chained_arithmetic = () => {
  // Mathematical (non-negative) remainder. JavaScript's `%` keeps the sign of
  // the dividend, so ((2 + 2) * 2 - 9) % 3 evaluates to -1, whereas every
  // prompt below asks for "the remainder", i.e. 2. Normalising keeps the
  // stored answer consistent with what a correct solver will reply.
  const remainder = (value, modulus) => ((value % modulus) + modulus) % modulus;

  const pattern = pick(["add_mul_sub_mod", "mul_add_mul_mod", "add_square_sub_mod", "mul_sub_add_mod"]);

  if (pattern === "add_mul_sub_mod") {
    const a = randInt(2, 9);
    const b = randInt(2, 9);
    const c = randInt(2, 5);
    const d = randInt(1, 9);
    const m = randInt(3, 7);
    const prompt = buildPrompt(pick([
      `Add ${a} and ${b}. Multiply by ${c}. Subtract ${d}. Find the remainder when divided by ${m}.`,
      `What is ((${a} + ${b}) × ${c} - ${d}) mod ${m}?`,
      `Compute ${a} + ${b}, multiply the sum by ${c}, subtract ${d}, remainder mod ${m}.`,
      `Sum ${a} and ${b}, then multiply by ${c}, then subtract ${d}. What is the remainder after dividing by ${m}?`,
      `(${a} + ${b}) × ${c} − ${d}. Divide by ${m} and give the remainder.`,
    ]));
    // (a + b) * c - d can be negative here (smallest product 8, largest d 9).
    return { prompt, answer: String(remainder((a + b) * c - d, m)) };
  }

  if (pattern === "mul_add_mul_mod") {
    const a = randInt(2, 7);
    const b = randInt(2, 5);
    const c = randInt(3, 9);
    const d = randInt(2, 4);
    const m = randInt(3, 7);
    const prompt = buildPrompt(pick([
      `Multiply ${a} by ${b}. Add ${c}. Multiply by ${d}. Find the remainder when divided by ${m}.`,
      `What is ((${a} × ${b} + ${c}) × ${d}) mod ${m}?`,
      `Compute ${a} × ${b}, add ${c}, multiply by ${d}, remainder mod ${m}.`,
      `Product of ${a} and ${b}, plus ${c}, times ${d}. What is the remainder after dividing by ${m}?`,
    ]));
    return { prompt, answer: String(remainder((a * b + c) * d, m)) };
  }

  if (pattern === "add_square_sub_mod") {
    const a = randInt(2, 5);
    const b = randInt(1, 4);
    const c = randInt(1, 8);
    const m = randInt(3, 7);
    const prompt = buildPrompt(pick([
      `Add ${a} and ${b}. Square the result. Subtract ${c}. Find the remainder when divided by ${m}.`,
      `What is ((${a} + ${b})² - ${c}) mod ${m}?`,
      `Compute (${a} + ${b}) squared, subtract ${c}, remainder mod ${m}.`,
      `Sum ${a} and ${b}, square it, subtract ${c}. Remainder after dividing by ${m}?`,
    ]));
    return { prompt, answer: String(remainder((a + b) ** 2 - c, m)) };
  }

  // mul_sub_add_mod
  const a = randInt(3, 9);
  const b = randInt(2, 5);
  // c is capped below a * b so the intermediate difference stays positive.
  const c = randInt(1, Math.min(a * b - 1, 9));
  const d = randInt(2, 9);
  const m = randInt(3, 7);
  const prompt = buildPrompt(pick([
    `Multiply ${a} by ${b}. Subtract ${c}. Add ${d}. Find the remainder when divided by ${m}.`,
    `What is (${a} × ${b} - ${c} + ${d}) mod ${m}?`,
    `Compute ${a} × ${b}, subtract ${c}, add ${d}, remainder mod ${m}.`,
    `Product of ${a} and ${b}, minus ${c}, plus ${d}. Remainder after dividing by ${m}?`,
  ]));
  return { prompt, answer: String(remainder(a * b - c + d, m)) };
};
1403
+
1404
/**
 * Power-modulo challenge: base^exp mod m.
 * (Original author's note: GPT-5.2 100%, GPT-4o 80%.)
 *
 * Relies on `pick`, `randInt` and `buildPrompt`, which are defined elsewhere in
 * this file. NOTE(review): `randInt(lo, hi)` is assumed to include both bounds
 * — confirm against its definition.
 *
 * @returns {{prompt: string, answer: string}} The challenge text and the
 *   expected remainder as a decimal string.
 */
CHALLENGE_TYPES.power_mod = () => {
  // English ordinal ("3rd", "4th", "11th", ...). The previous template
  // hard-coded the "th" suffix and produced "3th power" whenever exp was 3.
  const ordinal = (value) => {
    const lastTwo = value % 100;
    if (lastTwo >= 11 && lastTwo <= 13) return `${value}th`;
    const suffixes = { 1: 'st', 2: 'nd', 3: 'rd' };
    return `${value}${suffixes[value % 10] || 'th'}`;
  };

  const base = randInt(2, 5);
  const exp = randInt(3, 6);
  const m = randInt(5, 13);
  // base and exp are small, so base ** exp stays far below 2^53 and the
  // remainder is exact.
  const answer = (base ** exp) % m;
  const templates = [
    `Compute ${base} raised to the power ${exp}, then find the remainder when divided by ${m}.`,
    `What is ${base}^${exp} mod ${m}?`,
    `Calculate ${base} to the ${ordinal(exp)} power. Find the remainder when divided by ${m}.`,
    `Raise ${base} to the power of ${exp}. What is the remainder after dividing by ${m}?`,
  ];
  return { prompt: buildPrompt(pick(templates)), answer: String(answer) };
};
1416
+
1417
/**
 * Knowledge + math challenge: two world-knowledge facts (with their values
 * stated in the prompt) combined by one arithmetic operation, then reduced
 * modulo a small number.
 * (Original author's note: GPT-5.2: 100% | GPT-4o: ~85-93% | Humans: need
 * Google for fact verification.)
 *
 * Relies on `pickN`, `pick`, `randInt` and `buildPrompt` from this file.
 *
 * @returns {{prompt: string, answer: string}} The challenge text and the
 *   expected remainder as a decimal string.
 */
CHALLENGE_TYPES.knowledge_math = () => {
  // [sentence template, value]; "{v}" is replaced by the value.
  const FACTS = [
    ["The atomic number of oxygen is {v}", 8],
    ["The atomic number of carbon is {v}", 6],
    ["The atomic number of nitrogen is {v}", 7],
    ["The atomic number of neon is {v}", 10],
    ["The atomic number of sodium is {v}", 11],
    ["The atomic number of iron is {v}", 26],
    ["The atomic number of copper is {v}", 29],
    ["The atomic number of gold is {v}", 79],
    ["The atomic number of silver is {v}", 47],
    ["There are {v} planets in our solar system", 8],
    ["There are {v} continents on Earth", 7],
    ["A hexagon has {v} sides", 6],
    ["A pentagon has {v} sides", 5],
    ["A standard guitar has {v} strings", 6],
    ["A violin has {v} strings", 4],
    ["The English alphabet has {v} letters", 26],
    ["An adult human has {v} teeth", 32],
    ["The US flag has {v} stripes", 13],
    ["A spider has {v} legs", 8],
    ["An insect has {v} legs", 6],
    ["There are {v} Harry Potter books in the main series", 7],
    ["Brazil has won {v} FIFA World Cups", 5],
    ["The Olympic flag has {v} rings", 5],
    ["A standard die has {v} total dots across all faces", 21],
    ["There are {v} ounces in a pound", 16],
    ["There are {v} inches in a foot", 12],
    ["A byte has {v} bits", 8],
    ["Beethoven composed {v} symphonies", 9],
    ["A soccer team fields {v} players", 11],
    ["A basketball team has {v} players on court", 5],
    ["A golf course has {v} holes", 18],
    ["A standard deck has {v} cards", 52],
    ["A marathon is approximately {v} miles", 26],
    ["A chess board has {v} squares", 64],
    ["There are {v} hours in a day", 24],
    ["A human cell has {v} chromosomes", 46],
    ["A piano has {v} keys", 88],
  ];

  const [[firstTemplate, firstValue], [secondTemplate, secondValue]] = pickN(FACTS, 2);
  const firstSentence = firstTemplate.replace('{v}', firstValue);
  const secondSentence = secondTemplate.replace('{v}', secondValue);
  const modulus = randInt(3, 9);

  // Only offer operations whose intermediate result stays small; subtraction
  // is skipped when both values are equal.
  const allowed = [];
  if (firstValue + secondValue < 200) allowed.push('add');
  if (firstValue * secondValue < 5000) allowed.push('mul');
  if (firstValue !== secondValue) allowed.push('sub');
  const operation = pick(allowed);

  let remainder;
  let instruction;
  switch (operation) {
    case 'add':
      remainder = (firstValue + secondValue) % modulus;
      instruction = pick(["Add these two numbers", "Sum these two numbers"]);
      break;
    case 'mul':
      remainder = (firstValue * secondValue) % modulus;
      instruction = pick(["Multiply these two numbers", "Find the product of these two numbers"]);
      break;
    default:
      // Always subtract the smaller value from the larger, so the difference
      // is non-negative and the instruction names the matching direction.
      remainder = Math.abs(firstValue - secondValue) % modulus;
      instruction = firstValue >= secondValue
        ? "Subtract the second from the first"
        : "Subtract the first from the second";
  }

  const text = `${firstSentence} and ${secondSentence}. ${instruction}, then find the remainder when divided by ${modulus}.`;
  return { prompt: buildPrompt(text), answer: String(remainder) };
};
1485
+
1486
/**
 * Pick up to `n` distinct items (by position) from `arr`, in random order.
 *
 * The input array is not modified. If `n` exceeds `arr.length`, every item is
 * returned exactly once; the previous implementation kept splicing from an
 * empty pool and padded the result with `undefined`.
 *
 * Uses `Math.random()`, which is not cryptographically secure.
 *
 * @param {Array} arr - Source items.
 * @param {number} n - Number of items requested.
 * @returns {Array} A new array with at most `arr.length` items.
 */
function pickN(arr, n) {
  const pool = [...arr];
  // Never draw more items than the pool holds.
  const count = Math.min(n, pool.length);
  const picked = [];
  for (let i = 0; i < count; i++) {
    const idx = Math.floor(Math.random() * pool.length);
    // splice removes the chosen item, so it cannot be drawn twice.
    picked.push(pool.splice(idx, 1)[0]);
  }
  return picked;
}
1496
+
1364
1497
  const DIFFICULTY_MAP = {
1365
- easy: ['reverse_string', 'simple_math', 'pattern', 'counting', 'string_length', 'first_last'],
1366
- medium: ['reverse_string', 'simple_math', 'rot13', 'letter_position', 'extract_letters', 'pattern', 'counting', 'sorting', 'binary', 'ascii_value', 'string_math'],
1367
- hard: ['caesar', 'word_math', 'transform', 'binary', 'sorting', 'rot13', 'extract_letters', 'letter_position', 'counting', 'pattern', 'reverse_string', 'simple_math', 'substring', 'zigzag'],
1368
- agentic: ['chained_transform', 'multi_step_math', 'base_conversion_chain', 'word_extraction_chain', 'letter_math', 'caesar', 'nested_operations', 'string_interleave'],
1498
+ // Easy: gpt-4o-mini solves 100% single-shot (empirically validated)
1499
+ easy: ['simple_math', 'string_math', 'binary', 'pattern'],
1500
+ // Medium: gpt-4o ~90%, gpt-4o-mini ~60%
1501
+ medium: ['sorting', 'word_math'],
1502
+ // Hard: gpt-5.2 100%, gpt-4o ~70-85%
1503
+ hard: ['nested_operations', 'base_conversion_chain', 'power_mod', 'knowledge_math'],
1504
+ // Agentic: multi-step chains, blocks both gpt-4o and gpt-4o-mini
1505
+ agentic: ['chained_arithmetic'],
1369
1506
  };
1370
1507
 
1371
1508
 
@@ -2099,18 +2236,55 @@ async function safeSolve(prompt, llmFn, opts = {}) {
2099
2236
  if (!result.safe) throw new Error(`Prompt rejected (${result.method}): ${result.reason} (score: ${result.score})`);
2100
2237
  }
2101
2238
 
2102
- const answer = await llmFn(ISOLATION_PROMPT, prompt);
2103
- if (!answer || typeof answer !== 'string') throw new Error('LLM returned empty or invalid response');
2239
+ let raw = await llmFn(ISOLATION_PROMPT, prompt);
2240
+ if (!raw || typeof raw !== 'string') throw new Error('LLM returned empty or invalid response');
2241
+
2242
+ let answer = raw.trim();
2243
+
2244
+ // Strip markdown code fences
2245
+ if (answer.startsWith('```') && answer.endsWith('```')) {
2246
+ answer = answer.slice(3).replace(/`+$/, '').trim();
2247
+ if (answer.includes('\n')) answer = answer.split('\n').pop().trim();
2248
+ }
2249
+ // Strip surrounding quotes
2250
+ if (answer.length >= 2 && answer[0] === answer[answer.length - 1] && '"\'`'.includes(answer[0])) {
2251
+ answer = answer.slice(1, -1).trim();
2252
+ }
2253
+
2254
+ if (answer.length > maxAnswerLength) {
2255
+ throw new Error(`Answer too long (${answer.length} chars) — possible injection in output`);
2256
+ }
2257
+
2258
+ // Take first non-empty line if multi-line
2259
+ if (answer.includes('\n')) {
2260
+ const lines = answer.split('\n').map(l => l.trim()).filter(Boolean);
2261
+ answer = lines.length ? lines[0] : '';
2262
+ }
2104
2263
 
2105
- const trimmed = answer.trim();
2106
- if (trimmed.length > maxAnswerLength) {
2107
- throw new Error(`Answer too long (${trimmed.length} chars) possible injection in output`);
2264
+ // Strip explanation markers and extract raw answer
2265
+ const markers = ['the answer is', 'the result is', 'the solution is', 'therefore',
2266
+ 'so the answer', 'which gives', 'this means', 'let me', 'step 1', 'step 2', 'first,', "here's"];
2267
+ const lower = answer.toLowerCase();
2268
+ for (const marker of markers) {
2269
+ const idx = lower.indexOf(marker);
2270
+ if (idx !== -1) {
2271
+ const candidate = answer.slice(idx + marker.length).trim().replace(/^:/, '').trim();
2272
+ if (candidate && candidate.length <= maxAnswerLength) { answer = candidate; break; }
2273
+ }
2108
2274
  }
2109
- if (/https?:\/\/|<script|eval\(/.test(trimmed.toLowerCase())) {
2275
+
2276
+ // Re-strip quotes after extraction
2277
+ if (answer.length >= 2 && answer[0] === answer[answer.length - 1] && '"\'`'.includes(answer[0])) {
2278
+ answer = answer.slice(1, -1).trim();
2279
+ }
2280
+
2281
+ // Suspicious content check
2282
+ if (/https?:\/\/|<script|eval\(|import |require\(|__proto__/i.test(answer)) {
2110
2283
  throw new Error('Answer contains suspicious content — possible injection');
2111
2284
  }
2112
2285
 
2113
- return trimmed;
2286
+ if (!answer) throw new Error('LLM returned empty answer after cleanup');
2287
+ return answer;
2114
2288
  }
2115
2289
 
2116
2290
  export { CHALLENGE_TYPES, DIFFICULTY_MAP, _callLLM, validatePrompt, validatePromptSync, safeSolve, ISOLATION_PROMPT };