agenr 0.9.16 → 0.9.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/cli-main.js +112 -45
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.9.17 - 2026-02-27
|
|
4
|
+
|
|
5
|
+
### Changed
|
|
6
|
+
- Optimized LLM dedup in consolidate clustering: batch up to 10 pairs per API
|
|
7
|
+
call with 5 concurrent batches. Reduces a 2400-pair dedup queue from ~60min
|
|
8
|
+
(sequential, 1 call per pair) to ~2min.
|
|
9
|
+
|
|
3
10
|
## 0.9.16 - 2026-02-27
|
|
4
11
|
|
|
5
12
|
### Added
|
package/dist/cli-main.js
CHANGED
|
@@ -4254,15 +4254,26 @@ var DEFAULT_IDEMPOTENCY_DAYS = 7;
|
|
|
4254
4254
|
var DEFAULT_NEIGHBOR_LIMIT = 20;
|
|
4255
4255
|
var MAX_ACTIVE_EMBEDDED_ENTRIES2 = 2e4;
|
|
4256
4256
|
var MILLISECONDS_PER_DAY = 1e3 * 60 * 60 * 24;
|
|
4257
|
-
var
|
|
4257
|
+
var LLM_DEDUP_BATCH_TOOL_NAME = "batch_dedup_check";
|
|
4258
|
+
var LLM_DEDUP_BATCH_SIZE = 10;
|
|
4259
|
+
var LLM_DEDUP_CONCURRENCY = 5;
|
|
4258
4260
|
var LLM_DEDUP_TOOL_SCHEMA = Type.Object({
|
|
4259
4261
|
same: Type.Boolean(),
|
|
4260
4262
|
reason: Type.String()
|
|
4261
4263
|
});
|
|
4262
|
-
var
|
|
4263
|
-
|
|
4264
|
-
|
|
4265
|
-
|
|
4264
|
+
var LLM_DEDUP_BATCH_TOOL_SCHEMA = Type.Object({
|
|
4265
|
+
results: Type.Array(
|
|
4266
|
+
Type.Object({
|
|
4267
|
+
pair: Type.Number(),
|
|
4268
|
+
same: Type.Boolean(),
|
|
4269
|
+
reason: Type.Optional(Type.String())
|
|
4270
|
+
})
|
|
4271
|
+
)
|
|
4272
|
+
});
|
|
4273
|
+
var LLM_DEDUP_BATCH_TOOL = {
|
|
4274
|
+
name: LLM_DEDUP_BATCH_TOOL_NAME,
|
|
4275
|
+
description: "Return dedup results for all pairs",
|
|
4276
|
+
parameters: LLM_DEDUP_BATCH_TOOL_SCHEMA
|
|
4266
4277
|
};
|
|
4267
4278
|
function toNumber5(value) {
|
|
4268
4279
|
if (typeof value === "number") {
|
|
@@ -4303,70 +4314,98 @@ function normalizeSubject2(value) {
|
|
|
4303
4314
|
function pairKey(a, b) {
|
|
4304
4315
|
return a < b ? `${a}|${b}` : `${b}|${a}`;
|
|
4305
4316
|
}
|
|
4306
|
-
function
|
|
4317
|
+
function buildLlmDedupBatchContext(pairs) {
|
|
4307
4318
|
const systemPrompt = [
|
|
4308
4319
|
"You are a deduplication assistant for knowledge entries.",
|
|
4309
|
-
"
|
|
4310
|
-
"
|
|
4311
|
-
].join("\n");
|
|
4312
|
-
const userPrompt = [
|
|
4313
|
-
"Are these two knowledge entries expressing the same fact or genuinely distinct?",
|
|
4314
|
-
`Entry A: ${entryA.content}`,
|
|
4315
|
-
`Entry B: ${entryB.content}`,
|
|
4316
|
-
'Set "same" to true only when both entries represent the same knowledge.'
|
|
4320
|
+
"For each numbered pair, decide if they express the same knowledge.",
|
|
4321
|
+
"Call batch_dedup_check once with your results."
|
|
4317
4322
|
].join("\n");
|
|
4323
|
+
const pairBlocks = pairs.map(
|
|
4324
|
+
(pair, index) => [
|
|
4325
|
+
`Pair ${index + 1}:`,
|
|
4326
|
+
` Entry A: ${pair.entry.content}`,
|
|
4327
|
+
` Entry B: ${pair.candidate.content}`
|
|
4328
|
+
].join("\n")
|
|
4329
|
+
).join("\n\n");
|
|
4318
4330
|
return {
|
|
4319
4331
|
systemPrompt,
|
|
4320
4332
|
messages: [
|
|
4321
4333
|
{
|
|
4322
4334
|
role: "user",
|
|
4323
|
-
content:
|
|
4335
|
+
content: pairBlocks,
|
|
4324
4336
|
timestamp: Date.now()
|
|
4325
4337
|
}
|
|
4326
4338
|
],
|
|
4327
|
-
tools: [
|
|
4339
|
+
tools: [LLM_DEDUP_BATCH_TOOL]
|
|
4328
4340
|
};
|
|
4329
4341
|
}
|
|
4330
|
-
function
|
|
4342
|
+
function extractLlmDedupBatchFromToolCall(message) {
|
|
4331
4343
|
for (const block of message.content) {
|
|
4332
|
-
if (block.type !== "toolCall" || block.name !==
|
|
4344
|
+
if (block.type !== "toolCall" || block.name !== LLM_DEDUP_BATCH_TOOL_NAME) {
|
|
4333
4345
|
continue;
|
|
4334
4346
|
}
|
|
4335
4347
|
const args = block.arguments;
|
|
4336
|
-
if (!args ||
|
|
4348
|
+
if (!args || !Array.isArray(args.results)) {
|
|
4337
4349
|
continue;
|
|
4338
4350
|
}
|
|
4339
|
-
|
|
4340
|
-
|
|
4341
|
-
|
|
4342
|
-
|
|
4351
|
+
const parsed = args.results.filter(
|
|
4352
|
+
(result) => typeof result === "object" && result !== null && typeof result.pair === "number" && typeof result.same === "boolean"
|
|
4353
|
+
).map((result) => ({
|
|
4354
|
+
pair: result.pair,
|
|
4355
|
+
same: result.same,
|
|
4356
|
+
reason: typeof result.reason === "string" ? result.reason : void 0
|
|
4357
|
+
}));
|
|
4358
|
+
return parsed;
|
|
4343
4359
|
}
|
|
4344
4360
|
return null;
|
|
4345
4361
|
}
|
|
4346
|
-
async function
|
|
4362
|
+
async function llmDedupCheckBatch(llmClient, pairs) {
|
|
4363
|
+
if (pairs.length === 0) {
|
|
4364
|
+
return [];
|
|
4365
|
+
}
|
|
4366
|
+
const fallback = Array.from({ length: pairs.length }, () => false);
|
|
4347
4367
|
try {
|
|
4348
|
-
const timeoutMs =
|
|
4368
|
+
const timeoutMs = 3e4;
|
|
4369
|
+
let timer;
|
|
4349
4370
|
const response = await Promise.race([
|
|
4350
4371
|
runSimpleStream({
|
|
4351
4372
|
model: llmClient.resolvedModel.model,
|
|
4352
|
-
context:
|
|
4373
|
+
context: buildLlmDedupBatchContext(pairs),
|
|
4353
4374
|
options: {
|
|
4354
4375
|
apiKey: llmClient.credentials.apiKey
|
|
4355
4376
|
},
|
|
4356
4377
|
verbose: false
|
|
4357
4378
|
}),
|
|
4358
|
-
new Promise(
|
|
4359
|
-
|
|
4360
|
-
)
|
|
4361
|
-
]);
|
|
4379
|
+
new Promise((_, reject) => {
|
|
4380
|
+
timer = setTimeout(() => reject(new Error("llmDedupCheckBatch timed out")), timeoutMs);
|
|
4381
|
+
})
|
|
4382
|
+
]).finally(() => clearTimeout(timer));
|
|
4362
4383
|
if (response.stopReason === "error" || response.errorMessage) {
|
|
4363
|
-
return
|
|
4384
|
+
return fallback;
|
|
4385
|
+
}
|
|
4386
|
+
const parsed = extractLlmDedupBatchFromToolCall(response);
|
|
4387
|
+
if (!parsed) {
|
|
4388
|
+
return fallback;
|
|
4389
|
+
}
|
|
4390
|
+
const results = [...fallback];
|
|
4391
|
+
for (const result of parsed) {
|
|
4392
|
+
const pairIndex = Math.trunc(result.pair) - 1;
|
|
4393
|
+
if (pairIndex < 0 || pairIndex >= results.length) {
|
|
4394
|
+
continue;
|
|
4395
|
+
}
|
|
4396
|
+
results[pairIndex] = result.same;
|
|
4364
4397
|
}
|
|
4365
|
-
|
|
4366
|
-
return parsed?.same === true;
|
|
4398
|
+
return results;
|
|
4367
4399
|
} catch {
|
|
4368
|
-
return
|
|
4400
|
+
return fallback;
|
|
4401
|
+
}
|
|
4402
|
+
}
|
|
4403
|
+
function chunkPairs(items, size) {
|
|
4404
|
+
const chunks = [];
|
|
4405
|
+
for (let i = 0; i < items.length; i += size) {
|
|
4406
|
+
chunks.push(items.slice(i, i + size));
|
|
4369
4407
|
}
|
|
4408
|
+
return chunks;
|
|
4370
4409
|
}
|
|
4371
4410
|
function parseDaysSince(value, now) {
|
|
4372
4411
|
const parsed = new Date(value);
|
|
@@ -4476,6 +4515,7 @@ async function buildClusters(db, options = {}) {
|
|
|
4476
4515
|
const looseUnionPairs = /* @__PURE__ */ new Set();
|
|
4477
4516
|
const llmDedupQueue = [];
|
|
4478
4517
|
let llmDedupCalls = 0;
|
|
4518
|
+
let llmDedupCheckedPairs = 0;
|
|
4479
4519
|
let llmDedupMatches = 0;
|
|
4480
4520
|
for (const entry of candidates) {
|
|
4481
4521
|
unionFind.add(entry.id);
|
|
@@ -4507,18 +4547,45 @@ async function buildClusters(db, options = {}) {
|
|
|
4507
4547
|
llmDedupQueue.push({ entry, candidate, key });
|
|
4508
4548
|
}
|
|
4509
4549
|
}
|
|
4510
|
-
|
|
4511
|
-
|
|
4512
|
-
|
|
4513
|
-
|
|
4514
|
-
|
|
4515
|
-
|
|
4516
|
-
|
|
4517
|
-
|
|
4518
|
-
|
|
4519
|
-
|
|
4550
|
+
if (llmClient && llmDedupQueue.length > 0) {
|
|
4551
|
+
const startedAt = Date.now();
|
|
4552
|
+
const batches = chunkPairs(llmDedupQueue, LLM_DEDUP_BATCH_SIZE);
|
|
4553
|
+
for (let i = 0; i < batches.length; i += LLM_DEDUP_CONCURRENCY) {
|
|
4554
|
+
const batchGroup = batches.slice(i, i + LLM_DEDUP_CONCURRENCY);
|
|
4555
|
+
const settled = await Promise.allSettled(
|
|
4556
|
+
batchGroup.map(
|
|
4557
|
+
(batch) => llmDedupCheckBatch(
|
|
4558
|
+
llmClient,
|
|
4559
|
+
batch.map((pair) => ({
|
|
4560
|
+
entry: pair.entry,
|
|
4561
|
+
candidate: pair.candidate
|
|
4562
|
+
}))
|
|
4563
|
+
)
|
|
4564
|
+
)
|
|
4565
|
+
);
|
|
4566
|
+
llmDedupCalls += batchGroup.length;
|
|
4567
|
+
for (let batchIndex = 0; batchIndex < batchGroup.length; batchIndex += 1) {
|
|
4568
|
+
const batch = batchGroup[batchIndex];
|
|
4569
|
+
const result2 = settled[batchIndex];
|
|
4570
|
+
const matches = result2 && result2.status === "fulfilled" ? result2.value : Array.from({ length: batch.length }, () => false);
|
|
4571
|
+
llmDedupCheckedPairs += batch.length;
|
|
4572
|
+
for (let pairIndex = 0; pairIndex < batch.length; pairIndex += 1) {
|
|
4573
|
+
const pair = batch[pairIndex];
|
|
4574
|
+
if (matches[pairIndex] !== true) {
|
|
4575
|
+
continue;
|
|
4576
|
+
}
|
|
4577
|
+
llmDedupMatches += 1;
|
|
4578
|
+
looseUnionPairs.add(pair.key);
|
|
4579
|
+
unionFind.union(pair.entry.id, pair.candidate.id);
|
|
4580
|
+
}
|
|
4581
|
+
}
|
|
4582
|
+
const elapsedSeconds = Math.max(1, Math.floor((Date.now() - startedAt) / 1e3));
|
|
4583
|
+
const remainingPairs = Math.max(0, llmDedupQueue.length - llmDedupCheckedPairs);
|
|
4584
|
+
const estimatedRemainingSeconds = Math.round(elapsedSeconds / llmDedupCheckedPairs * remainingPairs);
|
|
4585
|
+
onLog(
|
|
4586
|
+
`[dedup] Checked ${llmDedupCheckedPairs}/${llmDedupQueue.length} pairs (${llmDedupMatches} matched) ~${estimatedRemainingSeconds}s remaining`
|
|
4587
|
+
);
|
|
4520
4588
|
}
|
|
4521
|
-
onLog(`[dedup] Checked ${llmDedupCalls}/${llmDedupQueue.length} pairs (${llmDedupMatches} matched)`);
|
|
4522
4589
|
}
|
|
4523
4590
|
const groups = /* @__PURE__ */ new Map();
|
|
4524
4591
|
for (const entry of candidates) {
|