agenr 0.9.60 → 0.9.61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/dist/cli-main.js +483 -4
- package/package.json +13 -8
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,17 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.9.61] - 2026-03-03
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- **Synthetic cold-start signals** (#417 Phase 3): `agenr ingest --bulk --synthetic` now runs a post-ingest cross-session mention analysis pass. Entries that appear across multiple session transcripts receive synthetic recall events so they start with meaningful recall profiles instead of flat zero-history signals.
|
|
8
|
+
- `--synthetic` and `--synthetic-dry-run` flags for `agenr ingest`.
|
|
9
|
+
- Quality score seeding from synthetic recall signals with four tiers (`0.6`, `0.65`, `0.7`, `0.8`) based on distinct session count and temporal spread. Seeding is guarded so only entries still at `quality_score = 0.5` are updated.
|
|
10
|
+
|
|
11
|
+
### Improvements
|
|
12
|
+
|
|
13
|
+
- Synthetic event generation writes flat `signal_value = 0.4` per event (frequency is carried by event count), caps ANN fan-out at top-5 neighbors per entry, and uses chunked transaction batches with `INSERT OR IGNORE` idempotency on the existing synthetic dedup index.
|
|
14
|
+
|
|
3
15
|
## 0.9.60 (2026-03-03)
|
|
4
16
|
|
|
5
17
|
### Features
|
package/dist/cli-main.js
CHANGED
|
@@ -81,6 +81,7 @@ import {
|
|
|
81
81
|
clearBulkIngestMeta,
|
|
82
82
|
closeDb,
|
|
83
83
|
composeEmbeddingText,
|
|
84
|
+
cosineSimilarity,
|
|
84
85
|
createLlmClient,
|
|
85
86
|
createLogger,
|
|
86
87
|
describeAuth,
|
|
@@ -93,6 +94,7 @@ import {
|
|
|
93
94
|
initSchema,
|
|
94
95
|
isCompleteConfig,
|
|
95
96
|
isRecord,
|
|
97
|
+
mapBufferToVector,
|
|
96
98
|
maskSecret,
|
|
97
99
|
mergeConfigPatch,
|
|
98
100
|
normalizeLabel,
|
|
@@ -122,6 +124,7 @@ import {
|
|
|
122
124
|
getTopCoRecallEdges,
|
|
123
125
|
parseDaysBetween,
|
|
124
126
|
toNumber,
|
|
127
|
+
toRowsAffected,
|
|
125
128
|
toStringValue
|
|
126
129
|
} from "./chunk-D7W3PO7U.js";
|
|
127
130
|
|
|
@@ -11355,6 +11358,415 @@ import fs24 from "fs/promises";
|
|
|
11355
11358
|
import path22 from "path";
|
|
11356
11359
|
import * as clack9 from "@clack/prompts";
|
|
11357
11360
|
|
|
11361
|
+
// src/db/recall/synthetic.ts
|
|
11362
|
+
import { randomUUID } from "crypto";
|
|
11363
|
+
var DEFAULT_SIMILARITY_THRESHOLD = 0.75;
|
|
11364
|
+
var DEFAULT_MIN_SESSION_COUNT = 2;
|
|
11365
|
+
var VECTOR_FAN_OUT = 5;
|
|
11366
|
+
var DEFAULT_MAX_EVENTS_PER_ENTRY = VECTOR_FAN_OUT;
|
|
11367
|
+
var DEFAULT_SIGNAL_VALUE = 0.4;
|
|
11368
|
+
var INSERT_BATCH_SIZE = 500;
|
|
11369
|
+
/**
 * Produce a de-duplicated list of trimmed, non-empty source file paths.
 *
 * @param {string[] | null | undefined} sourceFiles - Raw paths; may be missing or empty.
 * @returns {string[]} Unique trimmed paths in first-seen order; `[]` when nothing is supplied.
 */
function normalizeSourceFiles(sourceFiles) {
  if (!sourceFiles || sourceFiles.length === 0) {
    return [];
  }
  const unique = /* @__PURE__ */ new Set();
  for (const raw of sourceFiles) {
    const trimmed = raw.trim();
    // Blank entries carry no path information and are dropped.
    if (trimmed.length > 0) {
      unique.add(trimmed);
    }
  }
  return [...unique];
}
|
|
11379
|
+
/**
 * Clamp a similarity threshold into the closed range [0, 1].
 *
 * @param {unknown} value - Requested threshold.
 * @returns {number} `DEFAULT_SIMILARITY_THRESHOLD` when `value` is not a finite
 *   number; otherwise `value` limited to the range 0..1.
 */
function normalizeThreshold(value) {
  if (!Number.isFinite(value)) {
    return DEFAULT_SIMILARITY_THRESHOLD;
  }
  // Lower bound first, then upper bound: anything <= 0 becomes 0, anything >= 1 becomes 1.
  return Math.min(1, Math.max(0, Number(value)));
}
|
|
11392
|
+
/**
 * Coerce a value to a positive integer, or use a fallback.
 *
 * @param {unknown} value - Candidate number; fractional values are floored.
 * @param {number} fallback - Returned when `value` is not a finite number or floors below 1.
 * @returns {number} The floored value when it is at least 1, otherwise `fallback`.
 */
function normalizePositiveInt2(value, fallback) {
  // NaN compares false against everything, so non-finite input falls through to the fallback.
  const floored = Number.isFinite(value) ? Math.floor(Number(value)) : Number.NaN;
  return floored >= 1 ? floored : fallback;
}
|
|
11402
|
+
/**
 * Convert a value to trimmed text, using `null` for "no text".
 *
 * @param {unknown} value - Value passed through `toStringValue` before trimming.
 * @returns {string | null} The trimmed text, or `null` when nothing remains after trimming.
 */
function normalizeNullableText(value) {
  const trimmed = toStringValue(value).trim();
  if (trimmed.length === 0) {
    return null;
  }
  return trimmed;
}
|
|
11406
|
+
/**
 * Normalize a timestamp-like value to an ISO-8601 string.
 *
 * @param {unknown} value - Anything accepted by the `Date` constructor.
 * @param {string} fallbackIso - Returned when `value` is missing or cannot be parsed.
 * @returns {string} `Date#toISOString()` of the parsed value, or `fallbackIso`.
 */
function normalizeTimestamp(value, fallbackIso) {
  // `new Date(null)` is a *valid* date (the Unix epoch), so a missing timestamp
  // must be rejected explicitly or it would be reported as 1970-01-01.
  if (value === null || value === undefined) {
    return fallbackIso;
  }
  const parsed = new Date(value);
  return Number.isFinite(parsed.getTime()) ? parsed.toISOString() : fallbackIso;
}
|
|
11413
|
+
/**
 * Load the active entries that are candidates for synthetic recall analysis.
 *
 * Only rows with `retired = 0` and `superseded_by IS NULL` are selected. When
 * `sourceFiles` is non-empty the query is additionally restricted to those
 * `source_file` values; an empty list applies no restriction.
 *
 * NOTE(review): one bind parameter is emitted per source file — confirm this
 * stays within the database's bound-parameter limit for very large runs.
 *
 * @param {{ execute: Function }} db - Client exposing `execute({ sql, args })`.
 * @param {string[]} sourceFiles - Source file paths used to scope the query.
 * @returns {Promise<object[]>} Candidate records (`id`, `sourceFile`, `createdAt`,
 *   `subject`, `fingerprint`, `platform`, `project`, `embedding`); rows whose
 *   id converts to an empty string are dropped.
 */
async function loadEntryCandidates(db, sourceFiles) {
  let scopedClause = "";
  if (sourceFiles.length > 0) {
    const scopedPlaceholders = sourceFiles.map(() => "?").join(", ");
    scopedClause = `AND source_file IN (${scopedPlaceholders})`;
  }
  const { rows } = await db.execute({
    sql: `
      SELECT
        id,
        source_file,
        created_at,
        subject,
        norm_content_hash,
        platform,
        project,
        embedding
      FROM entries
      WHERE retired = 0
        AND superseded_by IS NULL
        ${scopedClause}
    `,
    args: sourceFiles
  });
  const candidates = [];
  for (const row of rows) {
    const sourceFile = toStringValue(row.source_file).trim();
    const candidate = {
      id: toStringValue(row.id),
      sourceFile,
      createdAt: toStringValue(row.created_at),
      subject: normalizeNullableText(row.subject),
      fingerprint: normalizeNullableText(row.norm_content_hash),
      platform: normalizeNullableText(row.platform),
      project: normalizeNullableText(row.project),
      // Entries without a stored embedding are kept, with a null vector.
      embedding: row.embedding ? mapBufferToVector(row.embedding) : null
    };
    if (candidate.id.length > 0) {
      candidates.push(candidate);
    }
  }
  return candidates;
}
|
|
11448
|
+
/**
 * Build an id-keyed lookup of session metadata for every active entry.
 *
 * Selects all rows with `retired = 0` and `superseded_by IS NULL`; no
 * source-file scoping is applied.
 *
 * @param {{ execute: Function }} db - Client exposing `execute({ sql })`.
 * @returns {Promise<Map<string, object>>} Map from entry id to
 *   `{ sourceFile, createdAt, platform, project }`; rows whose id converts to
 *   an empty string are skipped.
 */
async function loadEntryMetadata(db) {
  const { rows } = await db.execute({
    sql: `
      SELECT id, source_file, created_at, platform, project
      FROM entries
      WHERE retired = 0
        AND superseded_by IS NULL
    `
  });
  const metadataById = /* @__PURE__ */ new Map();
  for (const row of rows) {
    const entryId = toStringValue(row.id);
    if (entryId.length > 0) {
      metadataById.set(entryId, {
        sourceFile: toStringValue(row.source_file).trim(),
        createdAt: toStringValue(row.created_at),
        platform: normalizeNullableText(row.platform),
        project: normalizeNullableText(row.project)
      });
    }
  }
  return metadataById;
}
|
|
11472
|
+
/**
 * Create the default nearest-neighbor search used by the synthetic pass.
 *
 * The returned function looks up the query embedding for `entryId` in
 * `embeddingsByEntryId`, asks the `idx_entries_embedding` index for `k + 1`
 * rows (one extra so the entry itself can be discarded), re-scores every
 * remaining active row with `cosineSimilarity`, and keeps those at or above
 * `threshold`.
 *
 * @param {{ execute: Function }} db - Client exposing `execute({ sql, args })`.
 * @param {Map<string, number[]>} embeddingsByEntryId - Query embeddings keyed by entry id.
 * @returns {(entryId: string, k: number, threshold: number) => Promise<Array<{ id: string, score: number }>>}
 *   Search function returning at most `k` neighbors, highest score first; `[]`
 *   when the entry has no usable embedding.
 */
function createDefaultVectorSearch(db, embeddingsByEntryId) {
  const neighborSql = `
      SELECT e.id, e.embedding
      FROM vector_top_k('idx_entries_embedding', vector32(?), ?) AS v
      CROSS JOIN entries AS e ON e.rowid = v.id
      WHERE e.embedding IS NOT NULL
        AND e.retired = 0
        AND e.superseded_by IS NULL
    `;
  return async (entryId, k, threshold) => {
    const queryEmbedding = embeddingsByEntryId.get(entryId);
    if (!queryEmbedding || queryEmbedding.length === 0) {
      return [];
    }
    const { rows } = await db.execute({
      sql: neighborSql,
      args: [JSON.stringify(queryEmbedding), k + 1]
    });
    const scored = [];
    for (const row of rows) {
      const id = toStringValue(row.id);
      const isBlankOrSelf = id.length === 0 || id === entryId;
      if (isBlankOrSelf) {
        continue;
      }
      const candidateEmbedding = mapBufferToVector(row.embedding);
      if (candidateEmbedding.length === 0) {
        continue;
      }
      const score = cosineSimilarity(queryEmbedding, candidateEmbedding);
      if (score >= threshold) {
        scored.push({ id, score });
      }
    }
    scored.sort((a, b) => b.score - a.score);
    return scored.slice(0, k);
  };
}
|
|
11508
|
+
/**
 * Collapse an entry's neighbors into at most one mention per foreign session.
 *
 * A neighbor contributes only when its score is not below
 * `similarityThreshold`, it has metadata in `metadataById`, and its trimmed
 * source file is non-empty and differs from `entry.sourceFile`. When several
 * neighbors share a session, the one with the strictly highest score wins.
 *
 * @param {{ sourceFile: string }} entry - Entry being analyzed.
 * @param {Array<{ id: string, score: number }>} neighbors - Scored neighbor ids.
 * @param {Map<string, object>} metadataById - Metadata (`sourceFile`, `createdAt`, `platform`, `project`) by entry id.
 * @param {number} similarityThreshold - Minimum accepted score.
 * @param {string} fallbackIso - Timestamp used when a neighbor's `createdAt` cannot be parsed.
 * @returns {Array<{ sessionId: string, recalledAt: string, score: number, platform: *, project: * }>}
 *   One mention per session, in first-seen session order.
 */
function toSyntheticQualityMentions(entry, neighbors, metadataById, similarityThreshold, fallbackIso) {
  const bestBySession = /* @__PURE__ */ new Map();
  for (const { id, score } of neighbors) {
    if (score < similarityThreshold) {
      continue;
    }
    const metadata = metadataById.get(id);
    if (!metadata) {
      continue;
    }
    // The source file of the neighbor doubles as the session identifier.
    const sessionId = metadata.sourceFile.trim();
    if (sessionId.length === 0 || sessionId === entry.sourceFile) {
      continue;
    }
    const previous = bestBySession.get(sessionId);
    // Replace only on a strictly greater score; ties keep the earlier mention.
    if (previous && !(score > previous.score)) {
      continue;
    }
    bestBySession.set(sessionId, {
      sessionId,
      recalledAt: normalizeTimestamp(metadata.createdAt, fallbackIso),
      score,
      platform: metadata.platform,
      project: metadata.project
    });
  }
  return Array.from(bestBySession.values());
}
|
|
11536
|
+
/**
 * Persist synthetic recall events in chunked transactions.
 *
 * Events are written `INSERT_BATCH_SIZE` at a time, each chunk inside its own
 * `BEGIN IMMEDIATE` / `COMMIT` pair. Rows are inserted with `INSERT OR IGNORE`,
 * `source = 'synthetic'`, `signal_type = 'cross_session_mention'` and
 * `signal_value = DEFAULT_SIGNAL_VALUE`; `recalled_at` and `created_at` are both
 * taken from `event.recalledAt`. When a chunk fails, a rollback is attempted
 * (its own failure is ignored) and the original error is rethrown. After the
 * loop, a non-empty `events` list triggers `ANALYZE recall_events` and
 * `PRAGMA optimize`.
 *
 * @param {{ execute: Function }} db - Client exposing `execute(sqlOrStatement)`.
 * @param {object[]} events - Events with `entryId`, `fingerprint`, `subject`,
 *   `sessionId`, `platform`, `project`, `recalledAt`.
 * @returns {Promise<number>} Sum of the affected-row counts reported for the inserts.
 */
async function insertSyntheticEvents(db, events) {
  const insertSql = `
            INSERT OR IGNORE INTO recall_events (
              id,
              entry_id,
              content_fingerprint,
              subject_key,
              session_id,
              platform,
              project,
              source,
              signal_type,
              signal_value,
              recalled_at,
              created_at
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, 'synthetic', 'cross_session_mention', ?, ?, ?)
          `;
  let inserted = 0;
  let offset = 0;
  while (offset < events.length) {
    const batch = events.slice(offset, offset + INSERT_BATCH_SIZE);
    offset += INSERT_BATCH_SIZE;
    await db.execute("BEGIN IMMEDIATE");
    try {
      for (const event of batch) {
        const { rowsAffected } = await db.execute({
          sql: insertSql,
          args: [
            randomUUID(),
            event.entryId,
            event.fingerprint,
            event.subject,
            event.sessionId,
            event.platform,
            event.project,
            DEFAULT_SIGNAL_VALUE,
            event.recalledAt,
            event.recalledAt
          ]
        });
        inserted += Math.max(0, toRowsAffected(rowsAffected));
      }
      await db.execute("COMMIT");
    } catch (error) {
      try {
        await db.execute("ROLLBACK");
      } catch {
        // Best-effort rollback: the original failure below is what callers need to see.
      }
      throw error;
    }
  }
  if (events.length > 0) {
    await db.execute("ANALYZE recall_events");
    await db.execute("PRAGMA optimize");
  }
  return inserted;
}
|
|
11591
|
+
/**
 * Run the cross-session mention analysis and emit synthetic recall events.
 *
 * For each candidate entry that has an embedding, a fingerprint and a source
 * file, the nearest neighbors are fetched (`VECTOR_FAN_OUT` per entry) and
 * reduced to one mention per foreign session. Entries with at least
 * `minSessionCount` mentions qualify; their mentions are ordered by score
 * (descending, ties by `recalledAt` ascending) and capped at
 * `maxEventsPerEntry`. In dry-run mode nothing is written and the number of
 * events that would be inserted is reported instead.
 *
 * @param {object} options
 * @param {{ execute: Function }} options.db - Database client.
 * @param {string[]} [options.sourceFiles] - Source files scoping the candidate entries.
 * @param {number} [options.similarityThreshold] - Minimum neighbor score.
 * @param {number} [options.minSessionCount] - Minimum distinct sessions to qualify.
 * @param {number} [options.maxEventsPerEntry] - Cap on events emitted per entry.
 * @param {boolean} [options.dryRun] - Only `true` enables dry-run mode.
 * @param {boolean} [options.verbose] - Only `true` writes a summary line to stderr.
 * @param {Function} [options.vectorSearch] - Replacement for the default neighbor search.
 * @returns {Promise<{ entriesAnalyzed: number, entriesQualified: number, eventsEmitted: number, durationMs: number }>}
 */
async function generateSyntheticEvents(options) {
  const startedAt = Date.now();
  const elapsedMs = () => Math.max(0, Date.now() - startedAt);
  const hasEmbedding = (entry) => Boolean(entry.embedding) && entry.embedding.length > 0;

  const sourceFiles = normalizeSourceFiles(options.sourceFiles);
  const similarityThreshold = normalizeThreshold(options.similarityThreshold);
  const minSessionCount = normalizePositiveInt2(options.minSessionCount, DEFAULT_MIN_SESSION_COUNT);
  const maxEventsPerEntry = normalizePositiveInt2(options.maxEventsPerEntry, DEFAULT_MAX_EVENTS_PER_ENTRY);
  const dryRun = options.dryRun === true;

  const entries = await loadEntryCandidates(options.db, sourceFiles);
  const entriesAnalyzed = entries.length;
  if (entriesAnalyzed === 0) {
    return { entriesAnalyzed, entriesQualified: 0, eventsEmitted: 0, durationMs: elapsedMs() };
  }

  const metadataById = await loadEntryMetadata(options.db);
  const embeddingsByEntryId = /* @__PURE__ */ new Map();
  for (const entry of entries) {
    if (hasEmbedding(entry)) {
      embeddingsByEntryId.set(entry.id, entry.embedding);
    }
  }
  const vectorSearch = options.vectorSearch ?? createDefaultVectorSearch(options.db, embeddingsByEntryId);
  // Used when a neighbor's creation timestamp cannot be parsed.
  const fallbackIso = (/* @__PURE__ */ new Date()).toISOString();
  const byScoreThenTime = (left, right) => right.score - left.score || left.recalledAt.localeCompare(right.recalledAt);

  const eventsToInsert = [];
  let entriesQualified = 0;
  for (const entry of entries) {
    const eligible = hasEmbedding(entry) && Boolean(entry.fingerprint) && entry.fingerprint.length > 0 && entry.sourceFile.length > 0;
    if (!eligible) {
      continue;
    }
    const neighbors = await vectorSearch(entry.id, VECTOR_FAN_OUT, similarityThreshold);
    const mentions = toSyntheticQualityMentions(entry, neighbors, metadataById, similarityThreshold, fallbackIso);
    if (mentions.length < minSessionCount) {
      continue;
    }
    entriesQualified += 1;
    mentions.sort(byScoreThenTime);
    for (const mention of mentions.slice(0, maxEventsPerEntry)) {
      eventsToInsert.push({
        entryId: entry.id,
        fingerprint: entry.fingerprint,
        subject: entry.subject,
        sessionId: mention.sessionId,
        // Prefer the mentioning session's platform/project, else the entry's own.
        platform: mention.platform ?? entry.platform,
        project: mention.project ?? entry.project,
        recalledAt: mention.recalledAt
      });
    }
  }

  let eventsEmitted;
  if (dryRun) {
    eventsEmitted = eventsToInsert.length;
  } else {
    eventsEmitted = await insertSyntheticEvents(options.db, eventsToInsert);
  }
  if (options.verbose === true) {
    process.stderr.write(
      `[synthetic] analyzed ${entriesAnalyzed} entries, qualified ${entriesQualified}, ${dryRun ? "would emit" : "emitted"} ${eventsEmitted} events\n`
    );
  }
  return { entriesAnalyzed, entriesQualified, eventsEmitted, durationMs: elapsedMs() };
}
|
|
11662
|
+
|
|
11663
|
+
// src/quality/seed.ts
|
|
11664
|
+
/**
 * Trim, drop blanks from, and de-duplicate a list of source file paths.
 *
 * @param {string[] | null | undefined} sourceFiles - Raw paths; may be missing or empty.
 * @returns {string[]} Unique trimmed paths in first-seen order; `[]` when nothing is supplied.
 */
function normalizeSourceFiles2(sourceFiles) {
  const hasInput = Boolean(sourceFiles) && sourceFiles.length !== 0;
  if (!hasInput) {
    return [];
  }
  const trimmedPaths = sourceFiles.map((path) => path.trim());
  const nonBlankPaths = trimmedPaths.filter((path) => path.length > 0);
  return [...new Set(nonBlankPaths)];
}
|
|
11674
|
+
/**
 * Read a non-negative whole count from a database value.
 *
 * @param {unknown} value - Raw value, converted with `toNumber`.
 * @returns {number} The converted value floored to an integer, or 0 when it is
 *   not finite or not greater than zero.
 */
function parseCount(value) {
  const count = toNumber(value);
  const usable = Number.isFinite(count) && count > 0;
  return usable ? Math.floor(count) : 0;
}
|
|
11681
|
+
/**
 * Read a non-negative temporal spread (in days) from a database value.
 *
 * @param {unknown} value - Raw value, converted with `toNumber`.
 * @returns {number} The converted value (fractions kept), or 0 when it is not
 *   finite or not greater than zero.
 */
function parseSpreadDays(value) {
  const days = toNumber(value);
  const usable = Number.isFinite(days) && days > 0;
  return usable ? days : 0;
}
|
|
11688
|
+
/**
 * Map a candidate's recall profile onto a seed quality score.
 *
 * Tiers are checked from strictest to loosest; the first match wins. The
 * loosest tier only requires two sessions and has no spread requirement.
 *
 * @param {{ sessionCount: number, temporalSpreadDays: number }} candidate
 * @returns {number | null} 0.8, 0.7, 0.65 or 0.6 for the matching tier; `null`
 *   when the candidate has fewer than two sessions.
 */
function resolveSeedQuality(candidate) {
  // minSpreadDays === null means "no temporal spread requirement".
  const tiers = [
    { minSessions: 10, minSpreadDays: 30, score: 0.8 },
    { minSessions: 5, minSpreadDays: 14, score: 0.7 },
    { minSessions: 3, minSpreadDays: 7, score: 0.65 },
    { minSessions: 2, minSpreadDays: null, score: 0.6 }
  ];
  for (const tier of tiers) {
    if (!(candidate.sessionCount >= tier.minSessions)) {
      continue;
    }
    if (tier.minSpreadDays === null || candidate.temporalSpreadDays >= tier.minSpreadDays) {
      return tier.score;
    }
  }
  return null;
}
|
|
11703
|
+
/**
 * Aggregate synthetic recall events per active entry for quality seeding.
 *
 * For every entry with `retired = 0` and `superseded_by IS NULL` that has
 * synthetic recall events with a non-null session id, the query returns the
 * number of distinct sessions and the day span between the earliest and latest
 * `recalled_at`. A non-empty `sourceFiles` list restricts the entries to those
 * `source_file` values.
 *
 * NOTE(review): one bind parameter is emitted per source file — confirm this
 * stays within the database's bound-parameter limit for very large runs.
 *
 * @param {{ execute: Function }} db - Client exposing `execute({ sql, args })`.
 * @param {string[]} sourceFiles - Source file paths used to scope the query.
 * @returns {Promise<Array<{ entryId: string, sessionCount: number, temporalSpreadDays: number }>>}
 *   One candidate per entry; rows whose id converts to an empty string are skipped.
 */
async function loadSeedCandidates(db, sourceFiles) {
  let sourceScopeClause = "";
  if (sourceFiles.length > 0) {
    const placeholders = sourceFiles.map(() => "?").join(", ");
    sourceScopeClause = `AND e.source_file IN (${placeholders})`;
  }
  const { rows } = await db.execute({
    sql: `
      SELECT
        e.id AS entry_id,
        COUNT(DISTINCT re.session_id) AS session_count,
        COALESCE(julianday(MAX(re.recalled_at)) - julianday(MIN(re.recalled_at)), 0) AS temporal_spread_days
      FROM entries AS e
      INNER JOIN recall_events AS re ON re.entry_id = e.id
      WHERE e.retired = 0
        AND e.superseded_by IS NULL
        AND re.source = 'synthetic'
        AND re.session_id IS NOT NULL
        ${sourceScopeClause}
      GROUP BY e.id
    `,
    args: sourceFiles
  });
  const candidates = [];
  for (const row of rows) {
    const entryId = toStringValue(row.entry_id);
    if (entryId.length > 0) {
      candidates.push({
        entryId,
        sessionCount: parseCount(row.session_count),
        temporalSpreadDays: parseSpreadDays(row.temporal_spread_days)
      });
    }
  }
  return candidates;
}
|
|
11737
|
+
/**
 * Seed `quality_score` for entries from their synthetic recall profile.
 *
 * Candidates are loaded for the normalized source files, mapped to a tier
 * score, and written inside a single `BEGIN IMMEDIATE` / `COMMIT` transaction.
 * The update is guarded by `quality_score = 0.5`, so entries holding any other
 * score are left untouched. On failure a rollback is attempted (its own
 * failure is ignored) and the original error is rethrown.
 *
 * @param {{ execute: Function }} db - Client exposing `execute(sqlOrStatement)`.
 * @param {string[] | null | undefined} sourceFiles - Source files scoping the candidates.
 * @returns {Promise<{ updated: number }>} Sum of the affected-row counts reported for the updates.
 */
async function seedQualityScores(db, sourceFiles) {
  const candidates = await loadSeedCandidates(db, normalizeSourceFiles2(sourceFiles));
  const updateSql = `
          UPDATE entries
          SET quality_score = ?
          WHERE id = ?
            AND quality_score = 0.5
        `;
  let updated = 0;
  await db.execute("BEGIN IMMEDIATE");
  try {
    for (const candidate of candidates) {
      const qualityScore = resolveSeedQuality(candidate);
      if (qualityScore !== null) {
        const { rowsAffected } = await db.execute({
          sql: updateSql,
          args: [qualityScore, candidate.entryId]
        });
        updated += Math.max(0, toRowsAffected(rowsAffected));
      }
    }
    await db.execute("COMMIT");
  } catch (error) {
    try {
      await db.execute("ROLLBACK");
    } catch {
      // Best-effort rollback: the original failure below is what callers need to see.
    }
    throw error;
  }
  return { updated };
}
|
|
11769
|
+
|
|
11358
11770
|
// src/ingest/write-queue.ts
|
|
11359
11771
|
var CancelledError = class extends Error {
|
|
11360
11772
|
constructor(message = "Write queue item was cancelled.") {
|
|
@@ -11997,7 +12409,7 @@ async function resolveInputFiles(inputPaths, globPattern, expandInputFilesFn) {
|
|
|
11997
12409
|
|
|
11998
12410
|
// src/commands/ingest/helpers.ts
|
|
11999
12411
|
import path19 from "path";
|
|
12000
|
-
import { randomUUID } from "crypto";
|
|
12412
|
+
import { randomUUID as randomUUID2 } from "crypto";
|
|
12001
12413
|
function retryBackoffMs(attempt) {
|
|
12002
12414
|
if (attempt <= 1) return 1e4;
|
|
12003
12415
|
if (attempt === 2) return 3e4;
|
|
@@ -12149,7 +12561,7 @@ async function insertIngestLogForFile(db, params) {
|
|
|
12149
12561
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
12150
12562
|
`,
|
|
12151
12563
|
args: [
|
|
12152
|
-
|
|
12564
|
+
randomUUID2(),
|
|
12153
12565
|
params.filePath,
|
|
12154
12566
|
params.contentHash,
|
|
12155
12567
|
(/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -12818,7 +13230,9 @@ async function runIngestCommand(inputPaths, options, deps) {
|
|
|
12818
13230
|
sleepFn: deps?.sleepFn ?? sleep,
|
|
12819
13231
|
shouldShutdownFn: deps?.shouldShutdownFn ?? isShutdownRequested,
|
|
12820
13232
|
createWriteQueueFn: deps?.createWriteQueueFn ?? ((queueOptions) => new WriteQueue(queueOptions)),
|
|
12821
|
-
embedFn: deps?.embedFn ?? embed
|
|
13233
|
+
embedFn: deps?.embedFn ?? embed,
|
|
13234
|
+
generateSyntheticEventsFn: deps?.generateSyntheticEventsFn ?? generateSyntheticEvents,
|
|
13235
|
+
seedQualityScoresFn: deps?.seedQualityScoresFn ?? seedQualityScores
|
|
12822
13236
|
};
|
|
12823
13237
|
const clackOutput = { output: process.stderr };
|
|
12824
13238
|
clack9.intro(banner(), clackOutput);
|
|
@@ -12858,6 +13272,8 @@ async function runIngestCommand(inputPaths, options, deps) {
|
|
|
12858
13272
|
const dryRun = options.dryRun === true;
|
|
12859
13273
|
const bulkRequested = options.bulk === true;
|
|
12860
13274
|
const bulkMode = bulkRequested && !dryRun;
|
|
13275
|
+
const syntheticRequested = options.synthetic === true;
|
|
13276
|
+
const syntheticDryRun = options.syntheticDryRun === true;
|
|
12861
13277
|
const json = options.json === true;
|
|
12862
13278
|
const force = options.force === true;
|
|
12863
13279
|
const skipIngested = force ? false : options.skipIngested !== false;
|
|
@@ -12895,6 +13311,15 @@ async function runIngestCommand(inputPaths, options, deps) {
|
|
|
12895
13311
|
if (bulkRequested && dryRun) {
|
|
12896
13312
|
clack9.log.warn(formatWarn("[bulk] --bulk is ignored when --dry-run is enabled."), clackOutput);
|
|
12897
13313
|
}
|
|
13314
|
+
if (syntheticDryRun && !syntheticRequested) {
|
|
13315
|
+
clack9.log.warn(
|
|
13316
|
+
formatWarn("[synthetic] --synthetic-dry-run requires --synthetic; synthetic pass will be skipped."),
|
|
13317
|
+
clackOutput
|
|
13318
|
+
);
|
|
13319
|
+
}
|
|
13320
|
+
if (syntheticRequested && !bulkRequested) {
|
|
13321
|
+
clack9.log.warn(formatWarn("[synthetic] --synthetic requires --bulk; synthetic pass will be skipped."), clackOutput);
|
|
13322
|
+
}
|
|
12898
13323
|
const files = await resolveInputFiles(inputPaths, globPattern, resolvedDeps.expandInputFilesFn);
|
|
12899
13324
|
const targetsWithSizes = await Promise.all(
|
|
12900
13325
|
files.map(async (filePath) => {
|
|
@@ -13064,6 +13489,9 @@ async function runIngestCommand(inputPaths, options, deps) {
|
|
|
13064
13489
|
let firstPassFailedIndexSet = /* @__PURE__ */ new Set();
|
|
13065
13490
|
let bulkTeardownComplete = false;
|
|
13066
13491
|
let bulkVectorRebuildDurationSeconds = null;
|
|
13492
|
+
let syntheticSummaryLine = null;
|
|
13493
|
+
let qualitySeedLine = null;
|
|
13494
|
+
let syntheticResult = null;
|
|
13067
13495
|
let cleanupFailure = null;
|
|
13068
13496
|
let pipelineError = null;
|
|
13069
13497
|
const updateProgressFn = (completedCount, totalCount, verb) => {
|
|
@@ -13165,6 +13593,40 @@ async function runIngestCommand(inputPaths, options, deps) {
|
|
|
13165
13593
|
cleanupFailure = asError(error);
|
|
13166
13594
|
}
|
|
13167
13595
|
}
|
|
13596
|
+
const shouldRunSyntheticPass = bulkMode && syntheticRequested && !syntheticDryRun && state.totalEntriesStored > 0 && !stoppedForShutdown && !pipelineError && !cleanupFailure;
|
|
13597
|
+
const shouldRunSyntheticDryRun = bulkMode && syntheticRequested && syntheticDryRun && state.totalEntriesStored > 0 && !stoppedForShutdown && !pipelineError && !cleanupFailure;
|
|
13598
|
+
if (shouldRunSyntheticPass || shouldRunSyntheticDryRun) {
|
|
13599
|
+
const syntheticSourceFiles = Array.from(
|
|
13600
|
+
new Set(
|
|
13601
|
+
results.filter(
|
|
13602
|
+
(result) => Boolean(result) && !result.error && !result.skipped && result.entriesStored > 0
|
|
13603
|
+
).map((result) => result.file)
|
|
13604
|
+
)
|
|
13605
|
+
);
|
|
13606
|
+
try {
|
|
13607
|
+
let qualitySeeded = 0;
|
|
13608
|
+
if (!syntheticDryRun) {
|
|
13609
|
+
const seededBefore = await resolvedDeps.seedQualityScoresFn(db, syntheticSourceFiles);
|
|
13610
|
+
qualitySeeded += seededBefore.updated;
|
|
13611
|
+
}
|
|
13612
|
+
syntheticResult = await resolvedDeps.generateSyntheticEventsFn({
|
|
13613
|
+
db,
|
|
13614
|
+
sourceFiles: syntheticSourceFiles,
|
|
13615
|
+
dryRun: syntheticDryRun,
|
|
13616
|
+
verbose
|
|
13617
|
+
});
|
|
13618
|
+
if (!syntheticDryRun) {
|
|
13619
|
+
const seededAfter = await resolvedDeps.seedQualityScoresFn(db, syntheticSourceFiles);
|
|
13620
|
+
qualitySeeded += seededAfter.updated;
|
|
13621
|
+
}
|
|
13622
|
+
const syntheticVerb = syntheticDryRun ? "would emit" : "emitted";
|
|
13623
|
+
syntheticSummaryLine = `Synthetic recall events: analyzed ${syntheticResult.entriesAnalyzed} entries, ${syntheticVerb} ${syntheticResult.eventsEmitted} events for ${syntheticResult.entriesQualified} entries`;
|
|
13624
|
+
qualitySeedLine = qualitySeeded > 0 ? `Quality seeding: updated ${qualitySeeded} entries.` : null;
|
|
13625
|
+
} catch (error) {
|
|
13626
|
+
clack9.log.error(formatError(`[synthetic] post-ingest synthesis failed: ${toErrorMessage(error)}`), clackOutput);
|
|
13627
|
+
cleanupFailure = asError(error);
|
|
13628
|
+
}
|
|
13629
|
+
}
|
|
13168
13630
|
if (!dryRun) {
|
|
13169
13631
|
try {
|
|
13170
13632
|
await walCheckpoint(db);
|
|
@@ -13243,7 +13705,16 @@ async function runIngestCommand(inputPaths, options, deps) {
|
|
|
13243
13705
|
const bulkRebuildLine = bulkMode && bulkVectorRebuildDurationSeconds !== null ? `Bulk mode: FTS rebuild + vector index rebuilt in ${bulkVectorRebuildDurationSeconds.toFixed(1)}s.` : null;
|
|
13244
13706
|
const bulkDedupLine = bulkMode ? `Bulk dedup: ${bulkDedupSkippedHashMinhash} entries skipped (hash/MinHash).` : null;
|
|
13245
13707
|
clack9.note(
|
|
13246
|
-
[
|
|
13708
|
+
[
|
|
13709
|
+
doneLine,
|
|
13710
|
+
chunkFailureLine,
|
|
13711
|
+
bulkRebuildLine,
|
|
13712
|
+
bulkDedupLine,
|
|
13713
|
+
syntheticSummaryLine,
|
|
13714
|
+
qualitySeedLine,
|
|
13715
|
+
...retryLines,
|
|
13716
|
+
...failedFileLines
|
|
13717
|
+
].filter((line) => Boolean(line)).join("\n"),
|
|
13247
13718
|
"Ingest Complete",
|
|
13248
13719
|
clackOutput
|
|
13249
13720
|
);
|
|
@@ -13282,6 +13753,14 @@ function registerIngestCommand(program) {
|
|
|
13282
13753
|
"--bulk",
|
|
13283
13754
|
"Optimize large imports by disabling FTS/vector indexes during writes and rebuilding afterward",
|
|
13284
13755
|
false
|
|
13756
|
+
).option(
|
|
13757
|
+
"--synthetic",
|
|
13758
|
+
"Run synthetic recall event generation after bulk ingest (requires --bulk)",
|
|
13759
|
+
false
|
|
13760
|
+
).option(
|
|
13761
|
+
"--synthetic-dry-run",
|
|
13762
|
+
"Show what synthetic recall events would be generated without writing (requires --synthetic)",
|
|
13763
|
+
false
|
|
13285
13764
|
).option("--no-retry", "Disable auto-retry for failed files").option("--no-pre-fetch", "Disable elaborative encoding pre-fetch").option(
|
|
13286
13765
|
"--whole-file",
|
|
13287
13766
|
"Force whole-file extraction mode. Sends each file as a single LLM call. Auto-detected for large-context models; use this flag to force it for any model. Ignored in watch mode.",
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agenr",
|
|
3
|
-
"version": "0.9.60",
|
|
3
|
+
"version": "0.9.61",
|
|
4
4
|
"openclaw": {
|
|
5
5
|
"extensions": [
|
|
6
6
|
"dist/openclaw-plugin/index.js"
|
|
@@ -11,6 +11,13 @@
|
|
|
11
11
|
"bin": {
|
|
12
12
|
"agenr": "dist/cli.js"
|
|
13
13
|
},
|
|
14
|
+
"scripts": {
|
|
15
|
+
"build": "tsup src/cli.ts src/cli-main.ts src/openclaw-plugin/index.ts --format esm --dts",
|
|
16
|
+
"dev": "tsup src/cli.ts src/cli-main.ts --format esm --watch",
|
|
17
|
+
"test": "vitest run",
|
|
18
|
+
"test:watch": "vitest",
|
|
19
|
+
"typecheck": "tsc --noEmit"
|
|
20
|
+
},
|
|
14
21
|
"dependencies": {
|
|
15
22
|
"@clack/prompts": "^1.0.1",
|
|
16
23
|
"@libsql/client": "^0.17.0",
|
|
@@ -54,11 +61,9 @@
|
|
|
54
61
|
"README.md"
|
|
55
62
|
],
|
|
56
63
|
"author": "agenr-ai",
|
|
57
|
-
"
|
|
58
|
-
"
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
"test:watch": "vitest",
|
|
62
|
-
"typecheck": "tsc --noEmit"
|
|
64
|
+
"pnpm": {
|
|
65
|
+
"overrides": {
|
|
66
|
+
"fast-xml-parser": "^5.3.6"
|
|
67
|
+
}
|
|
63
68
|
}
|
|
64
|
-
}
|
|
69
|
+
}
|