agent-challenge 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/agentchallenge.js +119 -12
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-challenge",
|
|
3
|
-
"version": "1.2.0",
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"description": "Drop-in LLM authentication for any API endpoint. Reasoning puzzles that agents solve once, then pass through forever. Stateless HMAC tokens, no database.",
|
|
5
5
|
"main": "src/agentchallenge.js",
|
|
6
6
|
"type": "module",
|
package/src/agentchallenge.js
CHANGED
|
@@ -1361,17 +1361,44 @@ CHALLENGE_TYPES.string_interleave = () => {
|
|
|
1361
1361
|
}
|
|
1362
1362
|
};
|
|
1363
1363
|
|
|
1364
|
-
// Chained arithmetic:
|
|
1364
|
+
// Chained arithmetic: multi-step chains ending in a remainder — GPT-5.2 100%, GPT-4o 30%
/**
 * Builds a multi-step arithmetic challenge whose last step is "remainder mod m".
 *
 * One of four operation chains is picked at random, small operands are drawn
 * with randInt, and one of several equivalent phrasings is picked for the prompt.
 *
 * @returns {{prompt: string, answer: string}} The prompt text wrapped by
 *   buildPrompt and the expected remainder as a decimal string in [0, m).
 */
CHALLENGE_TYPES.chained_arithmetic = () => {
  // JavaScript's % keeps the sign of the dividend, so ((2 + 2) * 2 - 9) % 3 is -1.
  // The prompts ask for a remainder, which is non-negative, so fold the result
  // back into [0, modulus). For non-negative values this is a no-op.
  const remainder = (value, modulus) => ((value % modulus) + modulus) % modulus;

  const pattern = pick(["add_mul_sub_mod", "mul_add_mul_mod", "add_square_sub_mod", "mul_sub_add_mod"]);

  if (pattern === "add_mul_sub_mod") {
    const a = randInt(2, 9);
    const b = randInt(2, 9);
    const c = randInt(2, 5);
    const d = randInt(1, 9);
    const m = randInt(3, 7);
    // (a + b) * c can be as small as 8 while d can be 9, so the intermediate
    // value may be negative here; remainder() handles that case.
    return { prompt: buildPrompt(pick([
      `Add ${a} and ${b}. Multiply by ${c}. Subtract ${d}. Find the remainder when divided by ${m}.`,
      `What is ((${a} + ${b}) × ${c} - ${d}) mod ${m}?`,
      `Compute ${a} + ${b}, multiply the sum by ${c}, subtract ${d}, remainder mod ${m}.`,
      `Sum ${a} and ${b}, then multiply by ${c}, then subtract ${d}. What is the remainder after dividing by ${m}?`,
      `(${a} + ${b}) × ${c} − ${d}. Divide by ${m} and give the remainder.`,
    ])), answer: String(remainder((a + b) * c - d, m)) };
  } else if (pattern === "mul_add_mul_mod") {
    const a = randInt(2, 7);
    const b = randInt(2, 5);
    const c = randInt(3, 9);
    const d = randInt(2, 4);
    const m = randInt(3, 7);
    return { prompt: buildPrompt(pick([
      `Multiply ${a} by ${b}. Add ${c}. Multiply by ${d}. Find the remainder when divided by ${m}.`,
      `What is ((${a} × ${b} + ${c}) × ${d}) mod ${m}?`,
      `Compute ${a} × ${b}, add ${c}, multiply by ${d}, remainder mod ${m}.`,
      `Product of ${a} and ${b}, plus ${c}, times ${d}. What is the remainder after dividing by ${m}?`,
    ])), answer: String(remainder((a * b + c) * d, m)) };
  } else if (pattern === "add_square_sub_mod") {
    const a = randInt(2, 5);
    const b = randInt(1, 4);
    const c = randInt(1, 8);
    const m = randInt(3, 7);
    return { prompt: buildPrompt(pick([
      `Add ${a} and ${b}. Square the result. Subtract ${c}. Find the remainder when divided by ${m}.`,
      `What is ((${a} + ${b})² - ${c}) mod ${m}?`,
      `Compute (${a} + ${b}) squared, subtract ${c}, remainder mod ${m}.`,
      `Sum ${a} and ${b}, square it, subtract ${c}. Remainder after dividing by ${m}?`,
    ])), answer: String(remainder((a + b) ** 2 - c, m)) };
  } else { // mul_sub_add_mod
    const a = randInt(3, 9);
    const b = randInt(2, 5);
    // Cap c below a * b so the subtraction step itself never goes negative.
    const c = randInt(1, Math.min(a * b - 1, 9));
    const d = randInt(2, 9);
    const m = randInt(3, 7);
    return { prompt: buildPrompt(pick([
      `Multiply ${a} by ${b}. Subtract ${c}. Add ${d}. Find the remainder when divided by ${m}.`,
      `What is (${a} × ${b} - ${c} + ${d}) mod ${m}?`,
      `Compute ${a} × ${b}, subtract ${c}, add ${d}, remainder mod ${m}.`,
      `Product of ${a} and ${b}, minus ${c}, plus ${d}. Remainder after dividing by ${m}?`,
    ])), answer: String(remainder(a * b - c + d, m)) };
  }
};
|
|
1376
1403
|
|
|
1377
1404
|
// Power modulo: base^exp mod m — GPT-5.2 100%, GPT-4o 80%
|
|
@@ -1387,13 +1414,93 @@ CHALLENGE_TYPES.power_mod = () => {
|
|
|
1387
1414
|
return { prompt: buildPrompt(pick(templates)), answer: String(answer) };
|
|
1388
1415
|
};
|
|
1389
1416
|
|
|
1417
|
+
// Knowledge + math: world-knowledge facts (values stated) + arithmetic + mod
// GPT-5.2: 100% | GPT-4o: ~85-93% | Humans: need Google for fact verification
/**
 * Builds a challenge from two distinct fact sentences (each with its number
 * written out), one arithmetic operation on the two numbers, and a final
 * remainder step.
 *
 * @returns {{prompt: string, answer: string}} The prompt text wrapped by
 *   buildPrompt and the expected remainder as a decimal string.
 */
CHALLENGE_TYPES.knowledge_math = () => {
  // Each entry is [sentence template, value]; "{v}" marks where the value goes.
  const FACTS = [
    ["The atomic number of oxygen is {v}", 8],
    ["The atomic number of carbon is {v}", 6],
    ["The atomic number of nitrogen is {v}", 7],
    ["The atomic number of neon is {v}", 10],
    ["The atomic number of sodium is {v}", 11],
    ["The atomic number of iron is {v}", 26],
    ["The atomic number of copper is {v}", 29],
    ["The atomic number of gold is {v}", 79],
    ["The atomic number of silver is {v}", 47],
    ["There are {v} planets in our solar system", 8],
    ["There are {v} continents on Earth", 7],
    ["A hexagon has {v} sides", 6],
    ["A pentagon has {v} sides", 5],
    ["A standard guitar has {v} strings", 6],
    ["A violin has {v} strings", 4],
    ["The English alphabet has {v} letters", 26],
    ["An adult human has {v} teeth", 32],
    ["The US flag has {v} stripes", 13],
    ["A spider has {v} legs", 8],
    ["An insect has {v} legs", 6],
    ["There are {v} Harry Potter books in the main series", 7],
    ["Brazil has won {v} FIFA World Cups", 5],
    ["The Olympic flag has {v} rings", 5],
    ["A standard die has {v} total dots across all faces", 21],
    ["There are {v} ounces in a pound", 16],
    ["There are {v} inches in a foot", 12],
    ["A byte has {v} bits", 8],
    ["Beethoven composed {v} symphonies", 9],
    ["A soccer team fields {v} players", 11],
    ["A basketball team has {v} players on court", 5],
    ["A golf course has {v} holes", 18],
    ["A standard deck has {v} cards", 52],
    ["A marathon is approximately {v} miles", 26],
    ["A chess board has {v} squares", 64],
    ["There are {v} hours in a day", 24],
    ["A human cell has {v} chromosomes", 46],
    ["A piano has {v} keys", 88],
  ];

  const [firstFact, secondFact] = pickN(FACTS, 2);
  const [firstTemplate, firstValue] = firstFact;
  const [secondTemplate, secondValue] = secondFact;
  const firstSentence = firstTemplate.replace('{v}', firstValue);
  const secondSentence = secondTemplate.replace('{v}', secondValue);
  const modulus = randInt(3, 9);

  // Only offer operations whose intermediate result stays within the limits
  // below; subtraction is skipped when the two values are equal.
  const candidates = [];
  if (firstValue + secondValue < 200) {
    candidates.push('add');
  }
  if (firstValue * secondValue < 5000) {
    candidates.push('mul');
  }
  if (firstValue !== secondValue) {
    candidates.push('sub');
  }
  const chosen = pick(candidates);

  let remainder;
  let instruction;
  switch (chosen) {
    case 'add':
      remainder = (firstValue + secondValue) % modulus;
      instruction = pick(["Add these two numbers", "Sum these two numbers"]);
      break;
    case 'mul':
      remainder = (firstValue * secondValue) % modulus;
      instruction = pick(["Multiply these two numbers", "Find the product of these two numbers"]);
      break;
    default: {
      // Always subtract the smaller value from the larger one, and word the
      // instruction to match, so the difference is never negative.
      const firstIsLarger = firstValue >= secondValue;
      const difference = firstIsLarger ? firstValue - secondValue : secondValue - firstValue;
      remainder = difference % modulus;
      instruction = firstIsLarger ? "Subtract the second from the first" : "Subtract the first from the second";
    }
  }

  const text = `${firstSentence} and ${secondSentence}. ${instruction}, then find the remainder when divided by ${modulus}.`;
  return { prompt: buildPrompt(text), answer: String(remainder) };
};
|
|
1485
|
+
|
|
1486
|
+
/**
 * Helper: pick up to N items from distinct positions of an array, at random.
 *
 * The input array is not modified. If `n` is larger than the array length,
 * every element is returned once (in random order) instead of padding the
 * result with `undefined`.
 *
 * @param {Array} arr - Source items.
 * @param {number} n - How many items to draw.
 * @returns {Array} The drawn items; length is min(n, arr.length), or 0 when n <= 0.
 */
function pickN(arr, n) {
  const pool = [...arr];
  // Never draw more items than exist: splicing from an empty pool yields undefined.
  const count = Math.min(n, pool.length);
  const result = [];
  for (let i = 0; i < count; i++) {
    const idx = Math.floor(Math.random() * pool.length);
    // Remove the drawn item from the pool so it cannot be drawn again.
    result.push(pool.splice(idx, 1)[0]);
  }
  return result;
}
|
|
1496
|
+
|
|
1390
1497
|
// Challenge type names grouped by difficulty tier.
const DIFFICULTY_MAP = {
  // Easy: gpt-4o-mini solves 100% single-shot (empirically validated)
  easy: [
    'simple_math',
    'string_math',
    'binary',
    'pattern',
  ],
  // Medium: gpt-4o ~90%, gpt-4o-mini ~60%
  medium: [
    'sorting',
    'word_math',
  ],
  // Hard: gpt-5.2 100%, gpt-4o ~70-85%
  hard: [
    'nested_operations',
    'base_conversion_chain',
    'power_mod',
    'knowledge_math',
  ],
  // Agentic: multi-step chains, blocks both gpt-4o and gpt-4o-mini
  agentic: [
    'chained_arithmetic',
  ],
};
|