scientify 1.13.0 → 1.13.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -396,6 +396,98 @@ function tokenizeKeywords(raw) {
396
396
  }
397
397
  return [...seen];
398
398
  }
399
/**
 * Derive short acronym-style aliases from a topic's keyword tokens.
 *
 * Tokens are lower-cased and only those shaped like an ASCII word (a letter
 * followed by letters, digits, `_` or `-`) are kept; the first six survivors
 * are considered. With fewer than three usable tokens no alias is produced.
 *
 * From the first three tokens two acronyms are built: the first two letters
 * of token one plus the initials of tokens two and three, and the plain
 * three-letter initialism. If the considered tokens include "low", "rank"
 * and a word starting with "adapt", the alias "lora" is added as well.
 *
 * @param {string[]} tokens - Keyword tokens for the topic.
 * @returns {string[]} Unique aliases (3 to 8 characters), in insertion order.
 */
function inferTopicAliases(tokens) {
  const wordShape = /^[a-z][a-z0-9_-]*$/;
  const words = [];
  tokens.forEach((token) => {
    const lowered = token.toLowerCase();
    if (wordShape.test(lowered)) {
      words.push(lowered);
    }
  });

  const leading = words.slice(0, 6);
  if (leading.length < 3) {
    return [];
  }

  const first = leading[0];
  const second = leading[1];
  const third = leading[2];

  const found = [];
  const remember = (alias) => {
    if (!found.includes(alias)) {
      found.push(alias);
    }
  };

  if (first.length >= 2 && second.length >= 1 && third.length >= 1) {
    remember(first.slice(0, 2) + second[0] + third[0]);
  }
  remember(first[0] + second[0] + third[0]);

  const mentionsLowRankAdaptation =
    leading.includes("low") &&
    leading.includes("rank") &&
    leading.some((word) => word.startsWith("adapt"));
  if (mentionsLowRankAdaptation) {
    remember("lora");
  }

  return found.filter((alias) => alias.length >= 3 && alias.length <= 8);
}
419
/**
 * Build the token list used to score candidate papers against a topic.
 *
 * The topic is tokenised with `tokenizeKeywords`, and aliases are inferred
 * from the raw tokens with `inferTopicAliases`. Tokens shorter than four
 * characters, and directive-style words such as "seminal" or "latest", are
 * dropped. If nothing survives that filter, the raw tokens are used instead.
 *
 * @param {string} topic - Free-form topic text.
 * @returns {string[]} Up to ten unique tokens: content tokens first, then aliases.
 */
function buildScoringTokens(topic) {
  const ignoredWords = new Set([
    "from",
    "with",
    "without",
    "first",
    "basics",
    "basic",
    "foundational",
    "foundation",
    "seminal",
    "classic",
    "avoid",
    "benchmark",
    "only",
    "prefer",
    "authoritative",
    "latest",
    "recent",
    "paper",
    "papers",
    "study",
    "works",
  ]);

  const keywords = tokenizeKeywords(topic);
  const acronyms = inferTopicAliases(keywords);

  const meaningful = [];
  for (const word of keywords) {
    if (word.length >= 4 && !ignoredWords.has(word)) {
      meaningful.push(word);
    }
  }

  // Fall back to every raw token rather than returning aliases alone.
  const primary = meaningful.length > 0 ? meaningful : keywords;
  return Array.from(new Set(primary.concat(acronyms))).slice(0, 10);
}
450
/**
 * Build the seed tokens used to compose fallback retrieval queries.
 *
 * Raw tokens from `tokenizeKeywords` are lower-cased. Tokens shorter than
 * three characters, and directive words (e.g. "seminal", "latest", "strict",
 * "fast"), are removed. Aliases inferred from the raw tokens by
 * `inferTopicAliases` are appended after the remaining tokens.
 *
 * @param {string} topic - Free-form topic text.
 * @returns {string[]} Up to ten unique seed tokens in first-seen order.
 */
function buildRetrievalSeedTokens(topic) {
  const skipWords = new Set([
    "from",
    "with",
    "without",
    "first",
    "basics",
    "basic",
    "foundational",
    "foundation",
    "seminal",
    "classic",
    "avoid",
    "benchmark",
    "only",
    "prefer",
    "authoritative",
    "latest",
    "recent",
    "paper",
    "papers",
    "study",
    "works",
    "strict",
    "fast",
  ]);

  const keywords = tokenizeKeywords(topic);
  const acronyms = inferTopicAliases(keywords);

  // A Set keeps first-seen order while removing duplicates.
  const seeds = new Set();
  keywords.forEach((keyword) => {
    const lowered = keyword.toLowerCase();
    if (lowered.length < 3 || skipWords.has(lowered)) {
      return;
    }
    seeds.add(lowered);
  });
  for (const alias of acronyms) {
    seeds.add(alias);
  }

  return Array.from(seeds).slice(0, 10);
}
483
// Requirement-hint patterns. Each *_HINT_RE is tested against free-form
// request text to detect a stated preference. They are case-insensitive so
// capitalised input ("Foundational", "Survey", "Benchmark") is recognised,
// matching the *_WORD_RE patterns below. None use the `g`/`y` flags, so
// `.test()` keeps no state between calls.

// English phrases, or the Chinese words for "basics", "groundwork"
// (U+5960 U+57FA), "classic" and "original".
const FOUNDATIONAL_HINT_RE = /\b(foundational|foundation|seminal|classic|groundwork|original paper|from basics|start from basics|first principles)\b|\u57fa\u7840|\u5960\u57fa|\u7ecf\u5178|\u539f\u59cb/iu;
// English phrases, or Chinese "recommend fewer" / "do not" followed later by "benchmark".
const AVOID_BENCHMARK_HINT_RE = /\b(avoid benchmark|benchmark-only|no benchmark|less benchmark|not benchmark only)\b|\u5c11\u63a8.*benchmark|\u4e0d\u8981.*benchmark/iu;
// English words, or the Chinese words for "survey" and "review".
const SURVEY_HINT_RE = /\b(survey|review|taxonomy|overview|tutorial)\b|\u7efc\u8ff0|\u8bc4\u8ff0/iu;
// English phrases, or the Chinese words for "authoritative" and "highly cited".
const AUTHORITY_HINT_RE = /\b(authoritative|high impact|top-tier|highly cited|landmark|canonical)\b|\u6743\u5a01|\u9ad8\u5f15\u7528/iu;
// English phrases, or the Chinese words for "latest" and "recent".
const RECENT_HINT_RE = /\b(latest|recent|state[- ]of[- ]the[- ]art|newest)\b|\u6700\u65b0|\u8fd1\u671f/iu;

// Paper-text classifiers, applied to a paper's title and summary.
const BENCHMARK_WORD_RE = /\b(benchmark|leaderboard|dataset|evaluation)\b/i;
const METHOD_WORD_RE = /\b(method|approach|adaptation|training|fine[- ]?tuning|optimization|algorithm|framework|model)\b/i;
const SURVEY_WORD_RE = /\b(survey|review|taxonomy|overview|tutorial)\b/i;
399
491
  function decodeXmlEntities(raw) {
400
492
  return raw
401
493
  .replace(/&lt;/g, "<")
@@ -447,8 +539,9 @@ function dedupeQueries(queries, limit) {
447
539
  return deduped;
448
540
  }
449
541
  function buildStrictFallbackQueries(topic) {
450
- const normalizedTopic = normalizeText(topic);
451
- const tokens = tokenizeKeywords(normalizedTopic).filter((token) => token.length >= 3).slice(0, 10);
542
+ const seedTokens = buildRetrievalSeedTokens(topic);
543
+ const normalizedTopic = seedTokens.length > 0 ? seedTokens.join(" ") : normalizeText(topic);
544
+ const tokens = seedTokens.length > 0 ? seedTokens : tokenizeKeywords(normalizedTopic).filter((token) => token.length >= 3).slice(0, 10);
452
545
  const queries = [normalizedTopic];
453
546
  if (tokens.length >= 2)
454
547
  queries.push(tokens.slice(0, 4).join(" "));
@@ -457,8 +550,9 @@ function buildStrictFallbackQueries(topic) {
457
550
  return dedupeQueries(queries, STRICT_EMPTY_FALLBACK_MAX_QUERIES);
458
551
  }
459
552
  function buildTieredFallbackQueries(topic) {
460
- const normalizedTopic = normalizeText(topic);
461
- const tokens = tokenizeKeywords(normalizedTopic).filter((token) => token.length >= 3).slice(0, 10);
553
+ const seedTokens = buildRetrievalSeedTokens(topic);
554
+ const normalizedTopic = seedTokens.length > 0 ? seedTokens.join(" ") : normalizeText(topic);
555
+ const tokens = seedTokens.length > 0 ? seedTokens : tokenizeKeywords(normalizedTopic).filter((token) => token.length >= 3).slice(0, 10);
462
556
  const tierA = buildStrictFallbackQueries(topic);
463
557
  const tierB = dedupeQueries([
464
558
  ...tokens.slice(0, 6).map((token) => `${token} adaptation`),
@@ -478,18 +572,69 @@ function buildTieredFallbackQueries(topic) {
478
572
  tierC,
479
573
  };
480
574
  }
575
/**
 * Detect retrieval preferences expressed in free-form request text.
 *
 * The text is passed through `normalizeText` and lower-cased, then tested
 * against each requirement-hint pattern.
 *
 * @param {string} raw - Concatenated topic and note text.
 * @returns {{
 *   foundationalFirst: boolean,
 *   avoidBenchmarkOnly: boolean,
 *   preferSurvey: boolean,
 *   preferAuthority: boolean,
 *   preferRecent: boolean,
 * }} One flag per recognised preference.
 */
function inferRequirementProfile(raw) {
  // The English alternatives in the hint patterns are all lower-case.
  // Lower-casing here makes "Foundational" or "Survey" match regardless of
  // the patterns' flags, as the other helpers do after normalizeText().
  const text = normalizeText(raw).toLowerCase();
  return {
    foundationalFirst: FOUNDATIONAL_HINT_RE.test(text),
    avoidBenchmarkOnly: AVOID_BENCHMARK_HINT_RE.test(text),
    preferSurvey: SURVEY_HINT_RE.test(text),
    preferAuthority: AUTHORITY_HINT_RE.test(text),
    preferRecent: RECENT_HINT_RE.test(text),
  };
}
585
/**
 * Best-effort publication year for a candidate paper.
 *
 * A parseable `paper.published` timestamp is preferred and its UTC year is
 * returned. Otherwise the year is read from an identifier containing
 * `:YYMM.` (for example "arxiv:2106.09685"), taken as 2000 + YY.
 *
 * @param {{ published?: string, id?: string }} paper - Candidate paper record.
 * @returns {number | undefined} Four-digit year, or `undefined` when neither
 *   source yields one.
 */
function inferCandidateYear(paper) {
  if (paper.published) {
    const timestamp = Date.parse(paper.published);
    if (Number.isFinite(timestamp)) {
      return new Date(timestamp).getUTCFullYear();
    }
  }

  // A record without a string id cannot be matched; report "unknown"
  // instead of throwing.
  if (typeof paper.id !== "string") {
    return undefined;
  }

  const match = /:(\d{2})(\d{2})\./.exec(paper.id);
  if (!match) {
    return undefined;
  }

  // The second pair of digits is a month; reject impossible values so an
  // unrelated four-digit number is not mistaken for a date prefix.
  const month = Number.parseInt(match[2], 10);
  if (month < 1 || month > 12) {
    return undefined;
  }

  return 2000 + Number.parseInt(match[1], 10);
}
599
/**
 * Report whether a paper reads as benchmark-only: its title and summary
 * mention a benchmark-type word but no method-type word.
 *
 * @param {{ title: string, summary?: string | null }} paper - Candidate paper.
 * @returns {boolean} `true` when BENCHMARK_WORD_RE matches and METHOD_WORD_RE does not.
 */
function isBenchmarkOnlyPaper(paper) {
  const title = paper.title;
  const abstract = paper.summary ?? "";
  const combined = "".concat(title, " ", abstract);
  if (!BENCHMARK_WORD_RE.test(combined)) {
    return false;
  }
  return !METHOD_WORD_RE.test(combined);
}
603
/**
 * Report whether a paper's title or summary contains a survey-type word
 * (as defined by SURVEY_WORD_RE).
 *
 * @param {{ title: string, summary?: string | null }} paper - Candidate paper.
 * @returns {boolean} `true` when the combined text matches SURVEY_WORD_RE.
 */
function isSurveyPaper(paper) {
  const { title, summary } = paper;
  const combined = "".concat(title, " ", summary ?? "");
  return SURVEY_WORD_RE.test(combined);
}
607
/**
 * Heuristic check for whether a paper counts as "foundational".
 *
 * A paper qualifies when either:
 *  - its year is a number at least two calendar years before the current
 *    UTC year, or
 *  - its normalised, lower-cased title contains (as a substring) any topic
 *    token of four or more characters.
 *
 * @param {{
 *   paper: { title: string },
 *   year: number | undefined,
 *   topicTokens: string[],
 * }} args - The paper, its inferred year, and the topic tokens.
 * @returns {boolean} `true` when either condition holds.
 */
function isFoundationalPaper(args) {
  const { paper, year, topicTokens } = args;

  // Age alone is enough, so skip the title work when it already decides.
  const currentYear = new Date().getUTCFullYear();
  if (typeof year === "number" && year <= currentYear - 2) {
    return true;
  }

  const title = normalizeText(paper.title).toLowerCase();
  // The title is lower-cased, so compare against lower-cased tokens too;
  // otherwise a mixed-case token could never match.
  return topicTokens.some(
    (token) => token.length >= 4 && title.includes(token.toLowerCase()),
  );
}
481
615
  function countTokenOverlap(tokens, text) {
482
- const hay = ` ${normalizeText(text).toLowerCase()} `;
616
+ const hay = ` ${normalizeText(text)
617
+ .toLowerCase()
618
+ .replace(/[_-]+/g, " ")
619
+ .replace(/[^\p{L}\p{N}\s]+/gu, " ")
620
+ .replace(/\s+/g, " ")} `;
483
621
  let score = 0;
484
622
  for (const token of tokens) {
485
623
  if (token.length < 2)
486
624
  continue;
487
- if (hay.includes(` ${token} `))
625
+ const normalizedToken = token
626
+ .toLowerCase()
627
+ .replace(/[_-]+/g, " ")
628
+ .replace(/[^\p{L}\p{N}\s]+/gu, " ")
629
+ .trim();
630
+ if (!normalizedToken)
631
+ continue;
632
+ if (hay.includes(` ${normalizedToken} `))
488
633
  score += 1;
489
634
  }
490
635
  return score;
491
636
  }
492
- function scoreFallbackCandidate(topicTokens, paper, tier) {
637
+ function scoreFallbackCandidate(topicTokens, paper, tier, requirements) {
493
638
  const titleOverlap = countTokenOverlap(topicTokens, paper.title);
494
639
  const abstractOverlap = countTokenOverlap(topicTokens, paper.summary ?? "");
495
640
  const publishedAt = paper.published ? Date.parse(paper.published) : NaN;
@@ -497,7 +642,30 @@ function scoreFallbackCandidate(topicTokens, paper, tier) {
497
642
  ? Math.max(0, Math.min(8, (Date.now() - publishedAt) / (1000 * 60 * 60 * 24 * -180)))
498
643
  : 0;
499
644
  const tierBoost = tier === "tierA" ? 8 : tier === "tierB" ? 4 : 1;
500
- const rawScore = 60 + tierBoost + titleOverlap * 8 + abstractOverlap * 3 + recencyBoost;
645
+ const year = inferCandidateYear(paper);
646
+ const isBenchmarkOnly = isBenchmarkOnlyPaper(paper);
647
+ const isSurvey = isSurveyPaper(paper);
648
+ const isFoundational = isFoundationalPaper({ paper, year, topicTokens });
649
+ const nowYear = new Date().getUTCFullYear();
650
+ const recencyPenalty = typeof year === "number" && year >= nowYear ? 4 : 0;
651
+ let rawScore = 60 + tierBoost + titleOverlap * 8 + abstractOverlap * 3 + recencyBoost - recencyPenalty;
652
+ if (requirements.foundationalFirst) {
653
+ rawScore += isFoundational ? 10 : -4;
654
+ }
655
+ if (requirements.preferSurvey) {
656
+ rawScore += isSurvey ? 8 : 0;
657
+ }
658
+ if (requirements.preferAuthority) {
659
+ rawScore += isSurvey ? 3 : 0;
660
+ if (isFoundational)
661
+ rawScore += 2;
662
+ }
663
+ if (requirements.preferRecent && typeof year === "number" && year >= nowYear - 1) {
664
+ rawScore += 4;
665
+ }
666
+ if (requirements.avoidBenchmarkOnly && isBenchmarkOnly) {
667
+ rawScore -= 15;
668
+ }
501
669
  return Math.max(50, Math.min(99, Math.round(rawScore)));
502
670
  }
503
671
  async function fetchArxivFallbackByQuery(query) {
@@ -557,17 +725,49 @@ async function strictCoreFallbackSeed(args) {
557
725
  }
558
726
  }
559
727
  const topicTokens = tokenizeKeywords(args.topic);
728
+ const scoringTokens = buildScoringTokens(args.topic);
560
729
  const ranked = [...byId.values()]
561
- .map(({ row, tier }) => ({
562
- row,
563
- tier,
564
- score: scoreFallbackCandidate(topicTokens, row, tier),
565
- }))
730
+ .map(({ row, tier }) => {
731
+ const year = inferCandidateYear(row);
732
+ const isSurvey = isSurveyPaper(row);
733
+ const isBenchmarkOnly = isBenchmarkOnlyPaper(row);
734
+ const isFoundational = isFoundationalPaper({ paper: row, year, topicTokens });
735
+ const relevance = countTokenOverlap(scoringTokens, `${row.title} ${row.summary ?? ""}`);
736
+ return {
737
+ row,
738
+ tier,
739
+ year,
740
+ isSurvey,
741
+ isBenchmarkOnly,
742
+ isFoundational,
743
+ relevance,
744
+ score: scoreFallbackCandidate(scoringTokens.length > 0 ? scoringTokens : topicTokens, row, tier, args.requirements),
745
+ };
746
+ })
566
747
  .sort((a, b) => b.score - a.score);
567
748
  const unseen = ranked.filter((item) => !args.knownPaperIds.has(item.row.id));
568
- const effectivePool = unseen.length > 0 ? unseen : ranked;
749
+ const poolBeforeRelevance = unseen.length > 0 ? unseen : ranked;
750
+ const minRelevance = scoringTokens.length >= 2 ? 2 : 1;
569
751
  const candidatePool = Math.max(1, Math.min(40, Math.floor(args.candidatePool ?? Math.max(DEFAULT_STRICT_CANDIDATE_POOL, args.maxPapers * 4))));
570
752
  const minCoreFloor = Math.max(1, Math.min(args.maxPapers, args.minCoreFloor ?? DEFAULT_STRICT_MIN_CORE_FLOOR));
753
+ const effectivePoolByRelevance = poolBeforeRelevance.filter((item) => item.relevance >= minRelevance);
754
+ const focusTokens = scoringTokens.filter((token) => token.length >= 5);
755
+ const weakRelevanceWithFocusPool = poolBeforeRelevance.filter((item) => {
756
+ if (item.relevance < 1)
757
+ return false;
758
+ if (focusTokens.length === 0)
759
+ return true;
760
+ const focusHit = countTokenOverlap(focusTokens, `${item.row.title} ${item.row.summary ?? ""}`);
761
+ return focusHit >= 1;
762
+ });
763
+ const weakRelevancePool = weakRelevanceWithFocusPool.length > 0
764
+ ? weakRelevanceWithFocusPool
765
+ : poolBeforeRelevance.filter((item) => item.relevance >= 1);
766
+ const effectivePool = effectivePoolByRelevance.length >= minCoreFloor
767
+ ? effectivePoolByRelevance
768
+ : weakRelevancePool.length > 0
769
+ ? weakRelevancePool
770
+ : poolBeforeRelevance;
571
771
  const targetCount = Math.max(minCoreFloor, Math.min(args.maxPapers, candidatePool));
572
772
  const tierTargets = {
573
773
  tierA: Math.max(1, Math.round(targetCount * TIER_A_RATIO)),
@@ -597,6 +797,40 @@ async function strictCoreFallbackSeed(args) {
597
797
  tierStats[item.tier].selected += 1;
598
798
  }
599
799
  }
800
+ const ensureAtLeast = (predicate, need) => {
801
+ while (selected.filter(predicate).length < need) {
802
+ const candidate = effectivePool.find((item) => !selectedIds.has(item.row.id) && predicate(item));
803
+ if (!candidate)
804
+ break;
805
+ const replaceIndex = selected.findIndex((item) => !predicate(item));
806
+ if (replaceIndex < 0)
807
+ break;
808
+ selectedIds.delete(selected[replaceIndex].row.id);
809
+ selected[replaceIndex] = candidate;
810
+ selectedIds.add(candidate.row.id);
811
+ }
812
+ };
813
+ if (args.requirements.foundationalFirst) {
814
+ ensureAtLeast((item) => item.isFoundational, Math.min(2, targetCount));
815
+ }
816
+ if (args.requirements.preferSurvey) {
817
+ ensureAtLeast((item) => item.isSurvey, 1);
818
+ }
819
+ if (args.requirements.avoidBenchmarkOnly) {
820
+ for (let i = 0; i < selected.length; i += 1) {
821
+ if (!selected[i].isBenchmarkOnly)
822
+ continue;
823
+ const replacement = effectivePool.find((item) => !selectedIds.has(item.row.id) && !item.isBenchmarkOnly);
824
+ if (!replacement)
825
+ break;
826
+ selectedIds.delete(selected[i].row.id);
827
+ selected[i] = replacement;
828
+ selectedIds.add(replacement.row.id);
829
+ }
830
+ }
831
+ tierStats.tierA.selected = selected.filter((item) => item.tier === "tierA").length;
832
+ tierStats.tierB.selected = selected.filter((item) => item.tier === "tierB").length;
833
+ tierStats.tierC.selected = selected.filter((item) => item.tier === "tierC").length;
600
834
  const papers = selected.map(({ row, score }) => ({
601
835
  id: row.id,
602
836
  title: row.title,
@@ -621,7 +855,7 @@ async function strictCoreFallbackSeed(args) {
621
855
  papers,
622
856
  corePapers,
623
857
  explorationTrace: traces,
624
- notes: `strict_core_backfill_seed selected=${selected.length} pool=${candidatePool} floor=${minCoreFloor}`,
858
+ notes: `strict_core_backfill_seed selected=${selected.length} pool=${candidatePool} floor=${minCoreFloor} relevance_floor=${minRelevance} req_foundational=${args.requirements.foundationalFirst} req_avoid_benchmark=${args.requirements.avoidBenchmarkOnly} req_survey=${args.requirements.preferSurvey}`,
625
859
  recallTierStats: tierStats,
626
860
  };
627
861
  }
@@ -1254,6 +1488,14 @@ export async function recordIncrementalPush(args) {
1254
1488
  ...(effectiveRunLog ? { runLog: effectiveRunLog } : {}),
1255
1489
  }
1256
1490
  : undefined;
1491
+ const requirementProfile = inferRequirementProfile([
1492
+ topicState.topic,
1493
+ args.note,
1494
+ effectiveRunLog?.notes,
1495
+ effectiveKnowledgeState?.runLog?.notes,
1496
+ ]
1497
+ .filter((item) => Boolean(item && item.trim().length > 0))
1498
+ .join(" "));
1257
1499
  if (incomingRunProfile === "strict") {
1258
1500
  const strictMinCoreFloor = Math.max(1, Math.min(topicState.preferences.maxPapers, DEFAULT_STRICT_MIN_CORE_FLOOR));
1259
1501
  const requiredCoreFloor = Math.max(1, Math.min(topicState.preferences.maxPapers, effectiveRunLog?.requiredCorePapers ?? strictMinCoreFloor));
@@ -1273,6 +1515,7 @@ export async function recordIncrementalPush(args) {
1273
1515
  candidatePool: strictCandidatePool,
1274
1516
  minCoreFloor: requiredCoreFloor,
1275
1517
  knownPaperIds: knownIds,
1518
+ requirements: requirementProfile,
1276
1519
  });
1277
1520
  if (fallback.papers.length > 0) {
1278
1521
  const existingIds = new Set(effectivePapers.map((paper) => derivePaperId(paper)));