scientify 1.12.1 → 1.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -1
- package/README.zh.md +3 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -5
- package/dist/index.js.map +1 -1
- package/dist/src/cli/research.d.ts +1 -1
- package/dist/src/cli/research.d.ts.map +1 -1
- package/dist/src/cli/research.js +123 -227
- package/dist/src/cli/research.js.map +1 -1
- package/dist/src/commands/metabolism-status.d.ts +2 -2
- package/dist/src/commands/metabolism-status.d.ts.map +1 -1
- package/dist/src/commands/metabolism-status.js +75 -72
- package/dist/src/commands/metabolism-status.js.map +1 -1
- package/dist/src/commands.d.ts.map +1 -1
- package/dist/src/commands.js +55 -0
- package/dist/src/commands.js.map +1 -1
- package/dist/src/hooks/research-mode.d.ts.map +1 -1
- package/dist/src/hooks/research-mode.js +54 -37
- package/dist/src/hooks/research-mode.js.map +1 -1
- package/dist/src/hooks/scientify-signature.d.ts.map +1 -1
- package/dist/src/hooks/scientify-signature.js +5 -2
- package/dist/src/hooks/scientify-signature.js.map +1 -1
- package/dist/src/knowledge-state/render.d.ts +1 -0
- package/dist/src/knowledge-state/render.d.ts.map +1 -1
- package/dist/src/knowledge-state/render.js +101 -33
- package/dist/src/knowledge-state/render.js.map +1 -1
- package/dist/src/knowledge-state/store.d.ts.map +1 -1
- package/dist/src/knowledge-state/store.js +206 -33
- package/dist/src/knowledge-state/store.js.map +1 -1
- package/dist/src/knowledge-state/types.d.ts +12 -0
- package/dist/src/knowledge-state/types.d.ts.map +1 -1
- package/dist/src/literature/subscription-state.d.ts.map +1 -1
- package/dist/src/literature/subscription-state.js +579 -7
- package/dist/src/literature/subscription-state.js.map +1 -1
- package/dist/src/research-subscriptions/constants.d.ts +1 -1
- package/dist/src/research-subscriptions/constants.js +1 -1
- package/dist/src/research-subscriptions/parse.d.ts.map +1 -1
- package/dist/src/research-subscriptions/parse.js +10 -0
- package/dist/src/research-subscriptions/parse.js.map +1 -1
- package/dist/src/research-subscriptions/prompt.d.ts +1 -1
- package/dist/src/research-subscriptions/prompt.d.ts.map +1 -1
- package/dist/src/research-subscriptions/prompt.js +142 -221
- package/dist/src/research-subscriptions/prompt.js.map +1 -1
- package/dist/src/research-subscriptions/types.d.ts +1 -0
- package/dist/src/research-subscriptions/types.d.ts.map +1 -1
- package/dist/src/templates/bootstrap.d.ts.map +1 -1
- package/dist/src/templates/bootstrap.js +19 -32
- package/dist/src/templates/bootstrap.js.map +1 -1
- package/dist/src/tools/scientify-cron.d.ts +4 -2
- package/dist/src/tools/scientify-cron.d.ts.map +1 -1
- package/dist/src/tools/scientify-cron.js +369 -17
- package/dist/src/tools/scientify-cron.js.map +1 -1
- package/dist/src/tools/scientify-literature-state.d.ts +8 -0
- package/dist/src/tools/scientify-literature-state.d.ts.map +1 -1
- package/dist/src/tools/scientify-literature-state.js +140 -71
- package/dist/src/tools/scientify-literature-state.js.map +1 -1
- package/openclaw.plugin.json +2 -4
- package/package.json +1 -1
- package/skills/research-subscription/SKILL.md +7 -0
|
@@ -9,6 +9,13 @@ const DEFAULT_SOURCES = ["openalex", "arxiv"];
|
|
|
9
9
|
// Memory / hint sizing limits.
const MAX_MEMORY_NOTES = 3 * 10;
const MAX_MEMORY_KEYS = 6 * 10;
const TOP_HINT_LIMIT = 8;

// Full-text bootstrap: timeout for the first fetch pass and for the single retry pass (ms).
const DEFAULT_FULLTEXT_FETCH_TIMEOUT_MS = 20 * 1000;
const RETRY_FULLTEXT_FETCH_TIMEOUT_MS = 35 * 1000;
// A fetched page whose extracted plain text is shorter than this is rejected as too short.
const MIN_FULLTEXT_TEXT_CHARS = 2 * 1000;
// Upper bound on the per-run full-text attempt budget in strict runs.
const MAX_STRICT_FULLTEXT_ATTEMPTS = 5;

// arXiv fallback seeding: query endpoint, results requested per query, and query cap.
const ARXIV_API_URL = "https://export.arxiv.org/api/query";
const STRICT_EMPTY_FALLBACK_MAX_RESULTS = 12;
const STRICT_EMPTY_FALLBACK_MAX_QUERIES = 4;
|
|
12
19
|
const FEEDBACK_SIGNAL_DELTA = {
|
|
13
20
|
read: 1,
|
|
14
21
|
skip: -1,
|
|
@@ -171,6 +178,201 @@ function derivePaperId(paper) {
|
|
|
171
178
|
const digest = createHash("sha1").update(fallback || JSON.stringify(paper)).digest("hex");
|
|
172
179
|
return `hash:${digest.slice(0, 20)}`;
|
|
173
180
|
}
|
|
181
|
+
/**
 * Canonicalizes a single token into a lowercase arXiv identifier.
 *
 * The token is passed through normalizeText and any leading "arxiv:" prefix
 * is removed. The remainder must be, in its entirety, either a modern id
 * (e.g. "2101.12345" or "2101.12345v2") or a legacy id
 * (e.g. "hep-th/9901001", optionally versioned).
 *
 * @param {string} token - Candidate identifier text.
 * @returns {string|undefined} Lowercased id (version suffix kept), or
 *   undefined when the token is empty or matches neither shape.
 */
function normalizeArxivToken(token) {
    const bare = normalizeText(token).replace(/^arxiv:/i, "");
    if (bare) {
        // Modern scheme first, then the legacy "archive/NNNNNNN" scheme.
        const shapes = [
            /^(\d{4}\.\d{4,5}(?:v\d+)?)$/i,
            /^([a-z\-]+(?:\.[a-z\-]+)?\/\d{7}(?:v\d+)?)$/i,
        ];
        for (const shape of shapes) {
            const hit = bare.match(shape);
            if (hit?.[1]) {
                return hit[1].toLowerCase();
            }
        }
    }
    return undefined;
}
|
|
193
|
+
/**
 * Drops a trailing version suffix ("v" followed by digits, case-insensitive)
 * from an arXiv identifier.
 *
 * @param {string} id - arXiv identifier, with or without a version suffix.
 * @returns {string} The identifier without its trailing version suffix.
 */
function stripArxivVersion(id) {
    const cut = id.search(/v\d+$/i);
    return cut < 0 ? id : id.slice(0, cut);
}
|
|
196
|
+
/**
 * Collects every arXiv identifier that can be derived from a paper record.
 *
 * Sources, in priority order: the paper's `id` (accepted only when it is an
 * arXiv id as a whole), then modern-style ids found anywhere in `url`/`title`,
 * then legacy-style ids found there. Each hit is followed by its version-less
 * form, and duplicates are removed while keeping first-seen order.
 *
 * @param {{id?: string, url?: string, title?: string}} paper
 * @returns {string[]} Ordered, de-duplicated candidate ids (possibly empty).
 */
function parseArxivIdCandidatesFromPaper(paper) {
    const found = [];
    const collect = (value) => {
        const normalized = value ? normalizeArxivToken(value) : undefined;
        if (normalized) {
            found.push(normalized);
        }
    };
    collect(paper.id);
    const haystack = [paper.url, paper.title].filter((item) => Boolean(item)).join(" ");
    const patterns = [
        /\b(\d{4}\.\d{4,5}(?:v\d+)?)\b/gi,
        /\b([a-z\-]+(?:\.[a-z\-]+)?\/\d{7}(?:v\d+)?)\b/gi,
    ];
    for (const pattern of patterns) {
        for (const hit of haystack.matchAll(pattern)) {
            collect(hit[1]);
        }
    }
    // A Set keeps first-insertion order, so each id is followed by its
    // version-less form unless either one was already seen.
    const ordered = new Set();
    for (const id of found) {
        ordered.add(id);
        ordered.add(stripArxivVersion(id));
    }
    return [...ordered];
}
|
|
228
|
+
/**
 * Converts an HTML document into rough plain text.
 *
 * Steps: drop script/style/noscript/svg/math elements with their content,
 * turn block-level tags into newlines, strip all remaining tags, decode a
 * small set of character entities, then normalize whitespace.
 *
 * Entities are decoded in a single pass so that an escaped entity such as
 * "&amp;lt;" becomes the literal text "&lt;" instead of being decoded twice.
 * Decoding runs after tag stripping, so decoded "<"/">" are kept as text.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} Trimmed plain text with collapsed blank lines and spaces.
 */
function htmlToPlainText(html) {
    const entityText = {
        nbsp: " ",
        amp: "&",
        lt: "<",
        gt: ">",
        quot: "\"",
        "#39": "'",
        apos: "'",
    };
    return html
        .replace(/<script[\s\S]*?<\/script>/gi, " ")
        .replace(/<style[\s\S]*?<\/style>/gi, " ")
        .replace(/<noscript[\s\S]*?<\/noscript>/gi, " ")
        .replace(/<svg[\s\S]*?<\/svg>/gi, " ")
        .replace(/<math[\s\S]*?<\/math>/gi, " ")
        .replace(/<\/?(?:p|div|section|article|h\d|li|ul|ol|br|tr|td|th|table|blockquote)[^>]*>/gi, "\n")
        .replace(/<[^>]+>/g, " ")
        .replace(/&(nbsp|amp|lt|gt|quot|#39|apos);/gi, (_match, name) => entityText[name.toLowerCase()])
        // Literal non-breaking spaces are treated like ordinary spaces.
        .replace(/\u00a0/g, " ")
        .replace(/\r/g, "")
        .replace(/[ \t]+\n/g, "\n")
        .replace(/\n{3,}/g, "\n\n")
        .replace(/[ \t]{2,}/g, " ")
        .trim();
}
|
|
249
|
+
/**
 * Tries to download the HTML rendering of an arXiv paper and reduce it to
 * plain text.
 *
 * For every valid id, two URLs are queued in order: arxiv.org/html and then
 * ar5iv.org/html. URLs are fetched one at a time, each with its own abort
 * timer; the first response that is OK and yields at least
 * MIN_FULLTEXT_TEXT_CHARS characters of plain text wins.
 *
 * @param {Iterable<string>} arxivIds - Candidate arXiv identifiers.
 * @param {number} timeoutMs - Per-request abort timeout in milliseconds.
 * @returns {Promise<{ok: true, sourceUrl: string, sourceTag: string, plainText: string}
 *   | {ok: false, reason: string}>} Success payload, or the ";"-joined list of
 *   per-URL failure reasons.
 */
async function fetchArxivFullTextByHtmlCandidates(arxivIds, timeoutMs) {
    const mirrors = [
        ["https://arxiv.org/html", "arxiv_html"],
        ["https://ar5iv.org/html", "ar5iv_html"],
    ];
    // url -> tag, insertion-ordered and free of duplicate URLs.
    const targets = new Map();
    for (const rawId of arxivIds) {
        const id = normalizeArxivToken(rawId);
        if (!id) {
            continue;
        }
        for (const [base, tag] of mirrors) {
            const url = `${base}/${id}`;
            if (!targets.has(url)) {
                targets.set(url, tag);
            }
        }
    }
    const problems = [];
    for (const [url, tag] of targets) {
        const aborter = new AbortController();
        const deadline = setTimeout(() => aborter.abort(), timeoutMs);
        try {
            const response = await fetch(url, {
                signal: aborter.signal,
                headers: {
                    "User-Agent": "scientify-fulltext-bootstrap/1.0",
                },
            });
            if (response.ok) {
                const text = htmlToPlainText(await response.text());
                if (text.length >= MIN_FULLTEXT_TEXT_CHARS) {
                    return {
                        ok: true,
                        sourceUrl: url,
                        sourceTag: tag,
                        plainText: text,
                    };
                }
                problems.push(`${tag}:content_too_short(${text.length})`);
            }
            else {
                problems.push(`${tag}:http_${response.status}`);
            }
        }
        catch (error) {
            // Network failures and aborts are recorded; the next URL is still tried.
            problems.push(`${tag}:${error instanceof Error ? error.name || error.message : "fetch_failed"}`);
        }
        finally {
            clearTimeout(deadline);
        }
    }
    return {
        ok: false,
        reason: problems.length > 0 ? problems.join(";") : "html_fulltext_unavailable",
    };
}
|
|
304
|
+
/**
 * Attempts to attach fetched full text to core papers that have not been
 * read in full yet.
 *
 * Papers already marked as full-text are passed through untouched. Papers
 * without a parseable arXiv id, or reached after the attempt budget is used
 * up, are marked as unread with an explanatory reason (an existing
 * readStatus / unreadReason on the paper is kept). Each attempted paper gets
 * one fetch pass with the default timeout and, on failure, one more pass
 * with the retry timeout.
 *
 * @param {{corePapers: object[], maxAttempts: number}} args
 * @returns {Promise<{corePapers: object[], attempted: number, completed: number, failures: string[]}>}
 *   Updated papers in input order, plus attempt/completion counters and
 *   "<firstId>:<reason>" entries for every failed fetch.
 */
async function backfillStrictCoreFullText(args) {
    // Shared shape for every "could not read full text" outcome.
    const markUnread = (paper, fallbackReason) => ({
        ...paper,
        fullTextRead: false,
        readStatus: paper.readStatus ?? "metadata",
        unreadReason: paper.unreadReason ?? fallbackReason,
    });
    const results = [];
    const failures = [];
    let attempted = 0;
    let completed = 0;
    for (const paper of args.corePapers) {
        const alreadyRead = paper.fullTextRead === true || paper.readStatus === "fulltext";
        if (alreadyRead) {
            results.push(paper);
            continue;
        }
        const arxivIds = parseArxivIdCandidatesFromPaper({
            id: paper.id,
            url: paper.url,
            title: paper.title,
        });
        if (arxivIds.length === 0) {
            results.push(markUnread(paper, "Automatic full-text bootstrap currently supports arXiv papers with parseable IDs only."));
            continue;
        }
        if (attempted >= args.maxAttempts) {
            results.push(markUnread(paper, "Full-text bootstrap attempt budget reached in this run."));
            continue;
        }
        attempted += 1;
        let fetched = await fetchArxivFullTextByHtmlCandidates(arxivIds, DEFAULT_FULLTEXT_FETCH_TIMEOUT_MS);
        if (!fetched.ok) {
            // Single retry with the longer timeout.
            fetched = await fetchArxivFullTextByHtmlCandidates(arxivIds, RETRY_FULLTEXT_FETCH_TIMEOUT_MS);
        }
        if (!fetched.ok) {
            failures.push(`${arxivIds[0]}:${fetched.reason}`);
            results.push(markUnread(paper, `Automatic full-text fetch failed: ${fetched.reason}`));
            continue;
        }
        completed += 1;
        const excerpt = fetched.plainText.slice(0, 360).replace(/\s+/g, " ").trim();
        const hasEvidence = paper.keyEvidenceSpans && paper.keyEvidenceSpans.length > 0;
        // Seed evidence from the opening excerpt only when the paper has none.
        const seededEvidence = !hasEvidence && excerpt.length > 0
            ? { keyEvidenceSpans: [excerpt] }
            : {};
        results.push({
            ...paper,
            fullTextRead: true,
            readStatus: "fulltext",
            fullTextSource: fetched.sourceTag,
            fullTextRef: fetched.sourceUrl,
            unreadReason: undefined,
            ...seededEvidence,
        });
    }
    return {
        corePapers: results,
        attempted,
        completed,
        failures,
    };
}
|
|
174
376
|
function sanitizeKeyword(raw) {
|
|
175
377
|
const normalized = normalizeText(raw).toLowerCase();
|
|
176
378
|
if (normalized.length < 2 || normalized.length > 48)
|
|
@@ -188,6 +390,216 @@ function tokenizeKeywords(raw) {
|
|
|
188
390
|
}
|
|
189
391
|
return [...seen];
|
|
190
392
|
}
|
|
393
|
+
/**
 * Decodes the five predefined XML character entities (&lt; &gt; &amp;
 * &quot; and the apostrophe, written as &#39; or &apos;).
 *
 * Decoding is done in one pass, so text that is itself an escaped entity
 * (for example "&amp;quot;") is decoded exactly once, to "&quot;".
 *
 * @param {string} raw - Text that may contain XML entities.
 * @returns {string} Text with the supported entities replaced by their characters.
 */
function decodeXmlEntities(raw) {
    const entityText = {
        lt: "<",
        gt: ">",
        amp: "&",
        quot: "\"",
        "#39": "'",
        apos: "'",
    };
    return raw.replace(/&(lt|gt|amp|quot|#39|apos);/g, (_match, name) => entityText[name]);
}
|
|
401
|
+
/**
 * Extracts the text content of the first `<tag>…</tag>` element in an XML
 * fragment (case-insensitive tag match, attributes allowed on the opening tag).
 *
 * Nested markup inside the element is replaced by spaces, entities are
 * decoded, and the result is passed through normalizeText.
 *
 * @param {string} raw - XML fragment to search.
 * @param {string} tag - Element name; inserted into a regular expression as-is.
 * @returns {string} Normalized text content, or "" when the element is
 *   missing or empty.
 */
function stripXmlTag(raw, tag) {
    const pattern = new RegExp(`<${tag}[^>]*>([\\s\\S]*?)<\\/${tag}>`, "i");
    const inner = raw.match(pattern)?.[1];
    if (!inner) {
        return "";
    }
    const withoutMarkup = inner.replace(/<[^>]+>/g, " ").trim();
    return normalizeText(decodeXmlEntities(withoutMarkup));
}
|
|
407
|
+
/**
 * Parses `<entry>` elements of an Atom feed into lightweight paper candidates.
 *
 * Entries without a title, or without an arXiv id derivable from the entry's
 * `<id>` / title, are dropped. Ids and URLs are emitted without a version suffix.
 *
 * @param {string} xml - Atom feed document.
 * @returns {Array<{id: string, title: string, summary: string, url: string, published?: string}>}
 */
function parseArxivAtomCandidates(xml) {
    const blocks = xml.match(/<entry>([\s\S]*?)<\/entry>/gi) ?? [];
    return blocks.flatMap((block) => {
        const [title, summary, idUrl, published] = ["title", "summary", "id", "published"]
            .map((tag) => stripXmlTag(block, tag));
        const [arxivId] = parseArxivIdCandidatesFromPaper({ id: idUrl, url: idUrl, title });
        if (!title || !arxivId) {
            return [];
        }
        const baseId = stripArxivVersion(arxivId);
        return [{
                id: `arxiv:${baseId}`,
                title,
                summary,
                url: `https://arxiv.org/abs/${baseId}`,
                ...(published ? { published } : {}),
            }];
    });
}
|
|
429
|
+
/**
 * Builds the list of search queries used to seed fallback papers for a topic.
 *
 * Candidates, in order: the normalized topic itself; all keyword tokens
 * (length >= 3, at most 8) joined by spaces when there are at least two; the
 * first four tokens when there are at least four. Queries that are empty or
 * equal ignoring case are dropped, and the list is capped at
 * STRICT_EMPTY_FALLBACK_MAX_QUERIES.
 *
 * @param {string} topic - Topic text.
 * @returns {string[]} De-duplicated queries in priority order.
 */
function buildStrictFallbackQueries(topic) {
    const base = normalizeText(topic);
    const keywords = tokenizeKeywords(base).filter((token) => token.length >= 3).slice(0, 8);
    const drafts = [
        base,
        ...(keywords.length >= 2 ? [keywords.join(" ")] : []),
        ...(keywords.length >= 4 ? [keywords.slice(0, 4).join(" ")] : []),
    ];
    // Keyed by the case-folded query; the first spelling seen is the one kept.
    const unique = new Map();
    for (const draft of drafts) {
        const key = normalizeText(draft).toLowerCase();
        if (key && !unique.has(key)) {
            unique.set(key, draft);
        }
    }
    return [...unique.values()].slice(0, STRICT_EMPTY_FALLBACK_MAX_QUERIES);
}
|
|
450
|
+
/**
 * Counts how many tokens occur in `text` as whole, space-delimited words.
 *
 * The text is normalized, lowercased and padded with a space on both sides so
 * that a token only matches when surrounded by spaces. Tokens shorter than two
 * characters are ignored; each token counts at most once.
 *
 * @param {Iterable<string>} tokens - Tokens to look for (compared as given).
 * @param {string} text - Text to search.
 * @returns {number} Number of tokens found.
 */
function countTokenOverlap(tokens, text) {
    const padded = ` ${normalizeText(text).toLowerCase()} `;
    return [...tokens]
        .filter((token) => token.length >= 2 && padded.includes(` ${token} `))
        .length;
}
|
|
461
|
+
/**
 * Scores a fallback candidate against the topic tokens.
 *
 * Score = 60 + 8 per token found in the title + 3 per token found in the
 * summary + a recency boost, rounded and clamped to the range [50, 99].
 *
 * The recency boost is 8 for a paper published now and decays linearly by one
 * point per 180 days of age, reaching 0 after roughly four years. The earlier
 * formula divided the (positive) age by a negative constant and clamped at 0,
 * so it always produced 0 for any paper published in the past.
 *
 * @param {Iterable<string>} topicTokens - Keyword tokens of the topic.
 * @param {{title: string, summary?: string, published?: string}} paper
 * @returns {number} Integer score between 50 and 99.
 */
function scoreFallbackCandidate(topicTokens, paper) {
    const RECENCY_BOOST_MAX = 8;
    const RECENCY_DECAY_STEP_MS = 1000 * 60 * 60 * 24 * 180;
    const titleOverlap = countTokenOverlap(topicTokens, paper.title);
    const abstractOverlap = countTokenOverlap(topicTokens, paper.summary ?? "");
    const publishedAt = paper.published ? Date.parse(paper.published) : NaN;
    let recencyBoost = 0;
    if (Number.isFinite(publishedAt)) {
        const ageSteps = (Date.now() - publishedAt) / RECENCY_DECAY_STEP_MS;
        // Future-dated entries are capped at the maximum boost.
        recencyBoost = Math.max(0, Math.min(RECENCY_BOOST_MAX, RECENCY_BOOST_MAX - ageSteps));
    }
    const rawScore = 60 + titleOverlap * 8 + abstractOverlap * 3 + recencyBoost;
    return Math.max(50, Math.min(99, Math.round(rawScore)));
}
|
|
471
|
+
/**
 * Runs one relevance-sorted search against the arXiv query endpoint and
 * parses the Atom response into paper candidates.
 *
 * Best-effort by design: a non-OK response, a network error, or the 15 second
 * abort timeout all result in an empty list rather than an exception.
 *
 * @param {string} query - Value sent as `search_query`.
 * @returns {Promise<object[]>} Parsed candidates, or [] on any failure.
 */
async function fetchArxivFallbackByQuery(query) {
    const search = new URLSearchParams();
    search.set("search_query", query);
    search.set("start", "0");
    search.set("max_results", String(STRICT_EMPTY_FALLBACK_MAX_RESULTS));
    search.set("sortBy", "relevance");
    search.set("sortOrder", "descending");
    const endpoint = `${ARXIV_API_URL}?${search.toString()}`;
    const aborter = new AbortController();
    const deadline = setTimeout(() => aborter.abort(), 15_000);
    try {
        const response = await fetch(endpoint, {
            signal: aborter.signal,
            headers: {
                "User-Agent": "scientify-empty-fallback/1.0",
            },
        });
        return response.ok ? parseArxivAtomCandidates(await response.text()) : [];
    }
    catch {
        // Deliberately swallowed: the fallback seed is optional.
        return [];
    }
    finally {
        clearTimeout(deadline);
    }
}
|
|
500
|
+
/**
 * Seeds fallback papers for a topic by querying arXiv.
 *
 * Queries run sequentially; results are merged by id (first occurrence wins),
 * scored against the topic tokens and sorted by descending score. Papers
 * whose id is not in `knownPaperIds` are preferred; if every result is
 * already known, the full ranked list is used instead. Between 1 and 10
 * papers are selected, bounded by `maxPapers`.
 *
 * @param {{topic: string, maxPapers: number, knownPaperIds: Set<string>}} args
 * @returns {Promise<{papers: object[], corePapers: object[], explorationTrace: object[], notes: string}>}
 *   Selected papers in push-record and core-paper shapes, one trace entry per
 *   query, and a summary note.
 */
async function strictCoreFallbackSeed(args) {
    const SEED_REASON = "auto_seeded_fallback_after_sparse_core_strict_run";
    const queries = buildStrictFallbackQueries(args.topic);
    const pool = new Map();
    const explorationTrace = [];
    for (const query of queries) {
        const rows = await fetchArxivFallbackByQuery(query);
        const count = rows.length;
        explorationTrace.push({
            query,
            reason: "strict_core_backfill_seed",
            source: "arxiv",
            candidates: count,
            filteredTo: count,
            resultCount: count,
        });
        rows.forEach((row) => {
            if (!pool.has(row.id)) {
                pool.set(row.id, row);
            }
        });
    }
    const topicTokens = tokenizeKeywords(args.topic);
    const ranked = Array.from(pool.values(), (row) => ({
        row,
        score: scoreFallbackCandidate(topicTokens, row),
    }));
    ranked.sort((left, right) => right.score - left.score);
    const fresh = ranked.filter((entry) => !args.knownPaperIds.has(entry.row.id));
    const pickFrom = fresh.length > 0 ? fresh : ranked;
    const limit = Math.max(1, Math.min(10, args.maxPapers));
    const selected = pickFrom.slice(0, limit);
    const papers = [];
    const corePapers = [];
    for (const { row, score } of selected) {
        papers.push({
            id: row.id,
            title: row.title,
            url: row.url,
            score,
            reason: SEED_REASON,
        });
        corePapers.push({
            id: row.id,
            title: row.title,
            url: row.url,
            source: "arxiv",
            ...(row.published ? { publishedAt: row.published } : {}),
            score,
            reason: SEED_REASON,
            ...(row.summary ? { summary: row.summary } : {}),
            fullTextRead: false,
            readStatus: "metadata",
            unreadReason: "Auto-seeded fallback candidate; full-text bootstrap pending.",
        });
    }
    return {
        papers,
        corePapers,
        explorationTrace,
        notes: `strict_core_backfill_seed selected=${selected.length} queries=${queries.length}`,
    };
}
|
|
556
|
+
/**
 * Merges paper push records that resolve to the same derived paper id.
 *
 * The first record for an id is kept as-is (gaining an `id` field when it had
 * none). A later duplicate collapses the entry to id/title/url/score/reason:
 * the existing values win where present, and the score becomes the larger of
 * the two when both are finite numbers.
 *
 * @param {Iterable<object>} records - Paper records, possibly with duplicates.
 * @returns {object[]} One record per derived id, in first-seen order.
 */
function dedupePaperRecords(records) {
    const isFiniteNumber = (value) => typeof value === "number" && Number.isFinite(value);
    const merged = new Map();
    for (const record of records) {
        const key = derivePaperId(record);
        const prior = merged.get(key);
        if (prior) {
            let score;
            if (!isFiniteNumber(prior.score)) {
                score = record.score;
            }
            else if (isFiniteNumber(record.score)) {
                score = Math.max(prior.score, record.score);
            }
            else {
                score = prior.score;
            }
            merged.set(key, {
                id: prior.id ?? record.id ?? key,
                title: prior.title ?? record.title,
                url: prior.url ?? record.url,
                score,
                reason: prior.reason ?? record.reason,
            });
        }
        else {
            merged.set(key, record.id ? { ...record } : { ...record, id: key });
        }
    }
    return [...merged.values()];
}
|
|
579
|
+
/**
 * Merges knowledge-state paper records that resolve to the same derived
 * paper id (computed from id, title and url only).
 *
 * The first record for an id is kept (gaining an `id` field when it had
 * none). For a later duplicate, fields of the newer record overlay the
 * existing one, except id, title, url, summary and unreadReason, where the
 * existing value wins whenever it is not null/undefined.
 *
 * @param {Iterable<object>} records - Knowledge paper records.
 * @returns {object[]} One record per derived id, in first-seen order.
 */
function dedupeKnowledgePapers(records) {
    const merged = new Map();
    for (const record of records) {
        const key = derivePaperId({ id: record.id, title: record.title, url: record.url });
        const prior = merged.get(key);
        if (prior) {
            merged.set(key, {
                ...prior,
                ...record,
                id: prior.id ?? record.id ?? key,
                title: prior.title ?? record.title,
                url: prior.url ?? record.url,
                summary: prior.summary ?? record.summary,
                unreadReason: prior.unreadReason ?? record.unreadReason,
            });
        }
        else {
            merged.set(key, record.id ? { ...record } : { ...record, id: key });
        }
    }
    return [...merged.values()];
}
|
|
191
603
|
function normalizeSource(raw) {
|
|
192
604
|
if (!raw)
|
|
193
605
|
return undefined;
|
|
@@ -502,8 +914,161 @@ export async function recordIncrementalPush(args) {
|
|
|
502
914
|
const topicState = getOrCreateTopicState(root, args.scope, args.topic, args.preferences);
|
|
503
915
|
const memory = ensureTopicMemoryState(topicState);
|
|
504
916
|
const now = Date.now();
|
|
917
|
+
const normalizedPapersFromKnowledgeState = (args.knowledgeState?.corePapers ?? [])
|
|
918
|
+
.filter((paper) => paper && typeof paper === "object")
|
|
919
|
+
.map((paper) => ({
|
|
920
|
+
...(paper.id ? { id: paper.id } : {}),
|
|
921
|
+
...(paper.title ? { title: paper.title } : {}),
|
|
922
|
+
...(paper.url ? { url: paper.url } : {}),
|
|
923
|
+
...(typeof paper.score === "number" && Number.isFinite(paper.score) ? { score: paper.score } : {}),
|
|
924
|
+
...(paper.reason ? { reason: paper.reason } : {}),
|
|
925
|
+
}));
|
|
926
|
+
let effectivePapers = args.papers.length > 0
|
|
927
|
+
? args.papers
|
|
928
|
+
: normalizedPapersFromKnowledgeState.length > 0
|
|
929
|
+
? normalizedPapersFromKnowledgeState
|
|
930
|
+
: [];
|
|
931
|
+
const incomingRunLog = args.knowledgeState?.runLog
|
|
932
|
+
? { ...args.knowledgeState.runLog }
|
|
933
|
+
: undefined;
|
|
934
|
+
const incomingRunProfile = incomingRunLog?.runProfile === "fast" || incomingRunLog?.runProfile === "strict"
|
|
935
|
+
? incomingRunLog.runProfile
|
|
936
|
+
: undefined;
|
|
937
|
+
let effectiveRunLog = incomingRunLog ? { ...incomingRunLog } : undefined;
|
|
938
|
+
if (incomingRunProfile === "strict" && effectiveRunLog) {
|
|
939
|
+
const requiredCoreRaw = typeof effectiveRunLog.requiredCorePapers === "number" && Number.isFinite(effectiveRunLog.requiredCorePapers)
|
|
940
|
+
? Math.floor(effectiveRunLog.requiredCorePapers)
|
|
941
|
+
: 0;
|
|
942
|
+
if (requiredCoreRaw > 0) {
|
|
943
|
+
effectiveRunLog.requiredCorePapers = Math.max(1, requiredCoreRaw);
|
|
944
|
+
}
|
|
945
|
+
else {
|
|
946
|
+
delete effectiveRunLog.requiredCorePapers;
|
|
947
|
+
}
|
|
948
|
+
if (typeof effectiveRunLog.requiredFullTextCoveragePct !== "number" ||
|
|
949
|
+
!Number.isFinite(effectiveRunLog.requiredFullTextCoveragePct) ||
|
|
950
|
+
effectiveRunLog.requiredFullTextCoveragePct < 80) {
|
|
951
|
+
effectiveRunLog.requiredFullTextCoveragePct = 80;
|
|
952
|
+
}
|
|
953
|
+
}
|
|
954
|
+
let effectiveKnowledgeState = args.knowledgeState || effectiveRunLog
|
|
955
|
+
? {
|
|
956
|
+
...(args.knowledgeState ?? {}),
|
|
957
|
+
...(effectiveRunLog ? { runLog: effectiveRunLog } : {}),
|
|
958
|
+
}
|
|
959
|
+
: undefined;
|
|
960
|
+
if (incomingRunProfile === "strict") {
|
|
961
|
+
const requiredCoreFloor = Math.max(1, Math.min(topicState.preferences.maxPapers, effectiveRunLog?.requiredCorePapers ?? Math.min(3, topicState.preferences.maxPapers)));
|
|
962
|
+
const existingCorePapers = effectiveKnowledgeState?.corePapers ?? [];
|
|
963
|
+
const strictSignalCount = Math.max(existingCorePapers.length, effectivePapers.length);
|
|
964
|
+
if (strictSignalCount < requiredCoreFloor) {
|
|
965
|
+
const knownIds = new Set(Object.keys(topicState.pushedPapers));
|
|
966
|
+
for (const paper of effectivePapers)
|
|
967
|
+
knownIds.add(derivePaperId(paper));
|
|
968
|
+
for (const paper of existingCorePapers) {
|
|
969
|
+
knownIds.add(derivePaperId({ id: paper.id, title: paper.title, url: paper.url }));
|
|
970
|
+
}
|
|
971
|
+
const fallback = await strictCoreFallbackSeed({
|
|
972
|
+
topic: topicState.topic,
|
|
973
|
+
maxPapers: requiredCoreFloor,
|
|
974
|
+
knownPaperIds: knownIds,
|
|
975
|
+
});
|
|
976
|
+
if (fallback.papers.length > 0) {
|
|
977
|
+
const existingIds = new Set(effectivePapers.map((paper) => derivePaperId(paper)));
|
|
978
|
+
let fallbackPapers = fallback.papers.filter((paper) => !existingIds.has(derivePaperId(paper)));
|
|
979
|
+
const needed = Math.max(0, requiredCoreFloor - strictSignalCount);
|
|
980
|
+
if (needed > 0) {
|
|
981
|
+
if (fallbackPapers.length === 0)
|
|
982
|
+
fallbackPapers = fallback.papers;
|
|
983
|
+
fallbackPapers = fallbackPapers.slice(0, needed);
|
|
984
|
+
}
|
|
985
|
+
const fallbackIds = new Set(fallbackPapers.map((paper) => derivePaperId(paper)));
|
|
986
|
+
const fallbackCore = fallback.corePapers.filter((paper) => fallbackIds.has(derivePaperId({ id: paper.id, title: paper.title, url: paper.url })));
|
|
987
|
+
effectivePapers = dedupePaperRecords([...effectivePapers, ...fallbackPapers]);
|
|
988
|
+
const mergedRunLog = {
|
|
989
|
+
...(effectiveRunLog ?? { runProfile: "strict" }),
|
|
990
|
+
notes: [
|
|
991
|
+
effectiveRunLog?.notes,
|
|
992
|
+
fallback.notes,
|
|
993
|
+
`strict_core_topup required=${requiredCoreFloor} before=${strictSignalCount} added=${fallbackPapers.length}`,
|
|
994
|
+
]
|
|
995
|
+
.filter((item) => Boolean(item && item.trim().length > 0))
|
|
996
|
+
.join(" || "),
|
|
997
|
+
};
|
|
998
|
+
effectiveRunLog = mergedRunLog;
|
|
999
|
+
effectiveKnowledgeState = {
|
|
1000
|
+
...(effectiveKnowledgeState ?? {}),
|
|
1001
|
+
corePapers: dedupeKnowledgePapers([...(effectiveKnowledgeState?.corePapers ?? []), ...fallbackCore]),
|
|
1002
|
+
explorationTrace: [
|
|
1003
|
+
...(effectiveKnowledgeState?.explorationTrace ?? []),
|
|
1004
|
+
...fallback.explorationTrace,
|
|
1005
|
+
],
|
|
1006
|
+
runLog: mergedRunLog,
|
|
1007
|
+
};
|
|
1008
|
+
}
|
|
1009
|
+
}
|
|
1010
|
+
}
|
|
1011
|
+
if (incomingRunProfile === "strict") {
|
|
1012
|
+
const strictCoreFromState = effectiveKnowledgeState?.corePapers ?? [];
|
|
1013
|
+
const strictCoreSeed = strictCoreFromState.length > 0
|
|
1014
|
+
? strictCoreFromState
|
|
1015
|
+
: effectivePapers.map((paper) => ({
|
|
1016
|
+
...(paper.id ? { id: paper.id } : {}),
|
|
1017
|
+
...(paper.title ? { title: paper.title } : {}),
|
|
1018
|
+
...(paper.url ? { url: paper.url } : {}),
|
|
1019
|
+
...(typeof paper.score === "number" && Number.isFinite(paper.score) ? { score: paper.score } : {}),
|
|
1020
|
+
...(paper.reason ? { reason: paper.reason } : {}),
|
|
1021
|
+
fullTextRead: false,
|
|
1022
|
+
readStatus: "metadata",
|
|
1023
|
+
unreadReason: "Full text not fetched yet; pending strict full-text bootstrap.",
|
|
1024
|
+
}));
|
|
1025
|
+
if (strictCoreSeed.length > 0) {
|
|
1026
|
+
const strictAttemptLimit = Math.max(1, Math.min(MAX_STRICT_FULLTEXT_ATTEMPTS, effectiveRunLog?.requiredCorePapers ?? strictCoreSeed.length));
|
|
1027
|
+
const backfilled = await backfillStrictCoreFullText({
|
|
1028
|
+
corePapers: strictCoreSeed,
|
|
1029
|
+
maxAttempts: strictAttemptLimit,
|
|
1030
|
+
});
|
|
1031
|
+
const strictRunLog = {
|
|
1032
|
+
...(effectiveRunLog ?? { runProfile: "strict" }),
|
|
1033
|
+
fullTextAttempted: backfilled.attempted,
|
|
1034
|
+
fullTextCompleted: backfilled.completed,
|
|
1035
|
+
notes: [
|
|
1036
|
+
effectiveRunLog?.notes,
|
|
1037
|
+
`strict_fulltext_bootstrap attempted=${backfilled.attempted} completed=${backfilled.completed}`,
|
|
1038
|
+
...(backfilled.failures.length > 0
|
|
1039
|
+
? [`strict_fulltext_failures=${backfilled.failures.slice(0, 8).join(" | ")}`]
|
|
1040
|
+
: []),
|
|
1041
|
+
]
|
|
1042
|
+
.filter((item) => Boolean(item && item.trim().length > 0))
|
|
1043
|
+
.join(" || "),
|
|
1044
|
+
};
|
|
1045
|
+
effectiveRunLog = strictRunLog;
|
|
1046
|
+
effectiveKnowledgeState = {
|
|
1047
|
+
...(effectiveKnowledgeState ?? {}),
|
|
1048
|
+
corePapers: backfilled.corePapers,
|
|
1049
|
+
runLog: strictRunLog,
|
|
1050
|
+
};
|
|
1051
|
+
}
|
|
1052
|
+
}
|
|
1053
|
+
const statusRaw = normalizeText(args.status ?? "").toLowerCase();
|
|
1054
|
+
const researchArtifactsCount = effectivePapers.length +
|
|
1055
|
+
(effectiveKnowledgeState?.explorationPapers?.length ?? 0) +
|
|
1056
|
+
(effectiveKnowledgeState?.knowledgeChanges?.length ?? 0) +
|
|
1057
|
+
(effectiveKnowledgeState?.knowledgeUpdates?.length ?? 0) +
|
|
1058
|
+
(effectiveKnowledgeState?.hypotheses?.length ?? 0) +
|
|
1059
|
+
(effectiveKnowledgeState?.explorationTrace?.length ?? 0);
|
|
1060
|
+
let normalizedStatus = statusRaw.length > 0 ? statusRaw : undefined;
|
|
1061
|
+
const coercedFromEmptyWithArtifacts = normalizedStatus === "empty" && researchArtifactsCount > 0;
|
|
1062
|
+
if (coercedFromEmptyWithArtifacts) {
|
|
1063
|
+
normalizedStatus = "degraded_quality";
|
|
1064
|
+
}
|
|
1065
|
+
const hasRunError = Boolean(effectiveKnowledgeState?.runLog?.error && normalizeText(effectiveKnowledgeState.runLog.error).length > 0);
|
|
1066
|
+
const requiresArtifacts = normalizedStatus === "ok" || normalizedStatus === "fallback_representative" || normalizedStatus === "degraded_quality";
|
|
1067
|
+
if (requiresArtifacts && researchArtifactsCount === 0 && !hasRunError) {
|
|
1068
|
+
throw new Error("record payload has no research artifacts. Use status=empty for no-result runs, or include run_log.error for failed runs.");
|
|
1069
|
+
}
|
|
505
1070
|
let recordedPapers = 0;
|
|
506
|
-
for (const rawPaper of
|
|
1071
|
+
for (const rawPaper of effectivePapers) {
|
|
507
1072
|
const id = derivePaperId(rawPaper);
|
|
508
1073
|
const existing = topicState.pushedPapers[id];
|
|
509
1074
|
if (existing) {
|
|
@@ -538,7 +1103,12 @@ export async function recordIncrementalPush(args) {
|
|
|
538
1103
|
}
|
|
539
1104
|
topicState.totalRuns += 1;
|
|
540
1105
|
topicState.lastRunAtMs = now;
|
|
541
|
-
topicState.lastStatus =
|
|
1106
|
+
topicState.lastStatus = normalizedStatus ?? (recordedPapers > 0 ? "ok" : "empty");
|
|
1107
|
+
const effectiveNote = coercedFromEmptyWithArtifacts
|
|
1108
|
+
? [args.note?.trim(), "status coerced: empty -> degraded_quality because research artifacts were present"]
|
|
1109
|
+
.filter((item) => Boolean(item && item.length > 0))
|
|
1110
|
+
.join(" | ")
|
|
1111
|
+
: args.note;
|
|
542
1112
|
const knowledgeCommitted = await commitKnowledgeRun({
|
|
543
1113
|
projectId: args.projectId ?? topicState.lastProjectId,
|
|
544
1114
|
scope: topicState.scope,
|
|
@@ -546,9 +1116,9 @@ export async function recordIncrementalPush(args) {
|
|
|
546
1116
|
topicKey: topicState.topicKey,
|
|
547
1117
|
status: topicState.lastStatus,
|
|
548
1118
|
runId: args.runId,
|
|
549
|
-
note:
|
|
550
|
-
papers:
|
|
551
|
-
knowledgeState:
|
|
1119
|
+
note: effectiveNote,
|
|
1120
|
+
papers: effectivePapers,
|
|
1121
|
+
knowledgeState: effectiveKnowledgeState,
|
|
552
1122
|
});
|
|
553
1123
|
topicState.lastStatus = knowledgeCommitted.summary.lastStatus ?? topicState.lastStatus;
|
|
554
1124
|
topicState.lastProjectId = knowledgeCommitted.projectId;
|
|
@@ -560,18 +1130,20 @@ export async function recordIncrementalPush(args) {
|
|
|
560
1130
|
topicKey: topicState.topicKey,
|
|
561
1131
|
status: topicState.lastStatus,
|
|
562
1132
|
runId: knowledgeCommitted.runId,
|
|
1133
|
+
run_id: knowledgeCommitted.runId,
|
|
1134
|
+
run_profile: effectiveKnowledgeState?.runLog?.runProfile ?? null,
|
|
563
1135
|
projectId: knowledgeCommitted.projectId,
|
|
564
1136
|
streamKey: knowledgeCommitted.streamKey,
|
|
565
1137
|
preferences: topicState.preferences,
|
|
566
1138
|
recordedPapers,
|
|
567
|
-
papers:
|
|
1139
|
+
papers: effectivePapers.map((paper) => ({
|
|
568
1140
|
id: derivePaperId(paper),
|
|
569
1141
|
title: paper.title?.trim(),
|
|
570
1142
|
url: paper.url?.trim(),
|
|
571
1143
|
...(typeof paper.score === "number" && Number.isFinite(paper.score) ? { score: paper.score } : {}),
|
|
572
1144
|
...(paper.reason ? { reason: paper.reason.trim() } : {}),
|
|
573
1145
|
})),
|
|
574
|
-
note:
|
|
1146
|
+
note: effectiveNote,
|
|
575
1147
|
knowledgeStateSummary: knowledgeCommitted.summary,
|
|
576
1148
|
});
|
|
577
1149
|
return {
|