npm - sweet-search - Versions diffs - 2.5.2 → 2.5.3 - Mend

sweet-search 2.5.2 → 2.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (155) hide show

package/core/cli.js +24 -3
package/core/graph/graph-expansion.js +215 -36
package/core/graph/graph-extractor.js +196 -11
package/core/graph/graph-search.js +395 -92
package/core/graph/hcgs-generator.js +2 -1
package/core/graph/index.js +2 -0
package/core/graph/repo-map.js +28 -6
package/core/graph/structural-answer-cues.js +168 -0
package/core/graph/structural-callsite-hints.js +40 -0
package/core/graph/structural-context-format.js +40 -0
package/core/graph/structural-context.js +450 -0
package/core/graph/structural-forward-push.js +156 -0
package/core/graph/structural-header-context.js +19 -0
package/core/graph/structural-importance.js +148 -0
package/core/graph/structural-pagerank.js +197 -0
package/core/graph/summary-manager.js +13 -9
package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
package/core/incremental-indexing/application/file-watcher.mjs +197 -0
package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
package/core/incremental-indexing/application/operator-cli.mjs +554 -0
package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
package/core/incremental-indexing/application/reconciler.mjs +477 -0
package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
package/core/indexing/admission-policy.js +139 -0
package/core/indexing/artifact-builder.js +29 -12
package/core/indexing/ast-chunker.js +107 -30
package/core/indexing/dedup/exemplar-selector.js +19 -1
package/core/indexing/gitignore-filter.js +223 -0
package/core/indexing/incremental-tracker.js +99 -30
package/core/indexing/index-codebase-v21.js +6 -5
package/core/indexing/index-maintainer.mjs +698 -6
package/core/indexing/indexer-ann.js +99 -15
package/core/indexing/indexer-build.js +158 -45
package/core/indexing/indexer-empty-baseline.js +80 -0
package/core/indexing/indexer-manifest.js +66 -0
package/core/indexing/indexer-phases.js +56 -23
package/core/indexing/indexer-sparse-gram.js +54 -13
package/core/indexing/indexer-utils.js +26 -208
package/core/indexing/indexing-file-policy.js +32 -7
package/core/indexing/maintainer-launcher.mjs +137 -0
package/core/indexing/merkle-tracker.js +251 -244
package/core/indexing/model-pool.js +46 -5
package/core/infrastructure/code-graph-repository.js +758 -6
package/core/infrastructure/code-graph-visibility.js +157 -0
package/core/infrastructure/codebase-repository.js +100 -13
package/core/infrastructure/config/search.js +1 -1
package/core/infrastructure/db-utils.js +118 -0
package/core/infrastructure/dedup-hashing.js +10 -13
package/core/infrastructure/hardware-capability.js +17 -7
package/core/infrastructure/index.js +8 -2
package/core/infrastructure/language-patterns/maps.js +4 -1
package/core/infrastructure/language-patterns/registry-core.js +56 -17
package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
package/core/infrastructure/language-patterns.js +69 -0
package/core/infrastructure/model-registry.js +20 -0
package/core/infrastructure/native-inference.js +7 -12
package/core/infrastructure/native-resolver.js +52 -37
package/core/infrastructure/native-sparse-gram.js +261 -20
package/core/infrastructure/native-tokenizer.js +6 -15
package/core/infrastructure/simd-distance.js +10 -16
package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
package/core/infrastructure/structural-alias-resolver.js +122 -0
package/core/infrastructure/structural-candidate-ranker.js +34 -0
package/core/infrastructure/structural-context-repository.js +472 -0
package/core/infrastructure/structural-context-utils.js +51 -0
package/core/infrastructure/structural-graph-signals.js +121 -0
package/core/infrastructure/structural-qualified-resolution.js +15 -0
package/core/infrastructure/structural-source-definitions.js +100 -0
package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
package/core/infrastructure/tree-sitter-provider.js +811 -37
package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
package/core/query/query-router.js +55 -5
package/core/ranking/file-kind-ranking.js +2192 -15
package/core/ranking/late-interaction-index.js +87 -12
package/core/search/cli-decoration.js +290 -0
package/core/search/context-expander.js +988 -78
package/core/search/index.js +1 -0
package/core/search/output-policy.js +275 -0
package/core/search/search-anchor.js +499 -0
package/core/search/search-boost.js +93 -1
package/core/search/search-cli.js +61 -204
package/core/search/search-hybrid.js +250 -10
package/core/search/search-pattern-chunks.js +57 -8
package/core/search/search-pattern-planner.js +68 -9
package/core/search/search-pattern-prefilter.js +30 -10
package/core/search/search-pattern-ripgrep.js +40 -4
package/core/search/search-pattern-sparse-overlay.js +256 -0
package/core/search/search-pattern.js +117 -29
package/core/search/search-postprocess.js +479 -5
package/core/search/search-read-semantic.js +260 -23
package/core/search/search-read.js +82 -64
package/core/search/search-reader-pin.js +71 -0
package/core/search/search-rrf.js +279 -0
package/core/search/search-semantic.js +110 -5
package/core/search/search-server.js +130 -57
package/core/search/search-trace.js +107 -0
package/core/search/server-identity.js +93 -0
package/core/search/session-daemon-prewarm.mjs +33 -10
package/core/search/sweet-search.js +399 -7
package/core/skills/sweet-index/SKILL.md +8 -6
package/core/vector-store/binary-hnsw-index.js +194 -30
package/core/vector-store/float-vector-store.js +96 -6
package/core/vector-store/hnsw-index.js +220 -49
package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
package/eval/agent-read-workflows/bin/ss-find +15 -0
package/eval/agent-read-workflows/bin/ss-grep +12 -0
package/eval/agent-read-workflows/bin/ss-read +14 -0
package/eval/agent-read-workflows/bin/ss-search +18 -0
package/eval/agent-read-workflows/bin/ss-semantic +12 -0
package/eval/agent-read-workflows/bin/ss-trace +11 -0
package/mcp/read-tool.js +109 -0
package/mcp/server.js +55 -15
package/mcp/tool-handlers.js +14 -124
package/mcp/trace-tool.js +81 -0
package/package.json +25 -10
package/scripts/hooks/intercept-read.mjs +55 -0
package/scripts/hooks/remind-tools.mjs +40 -0
package/scripts/init.js +698 -54
package/scripts/inject-agent-instructions.js +431 -0
package/scripts/install-prompt-reminders.js +188 -0
package/scripts/install-tool-enforcement.js +220 -0
package/scripts/smoke-test.js +12 -9
package/scripts/uninstall.js +276 -18
package/scripts/write-claude-rules.js +110 -0

package/core/search/search-anchor.js ADDED Viewed

@@ -0,0 +1,499 @@
+/**
+ * Identifier-Anchored Retrieval (IAR).
+ *
+ * Aider / Cursor / Cody / Greptile all couple dense retrieval with an
+ * exact-name symbol lookup so abstract natural-language queries that
+ * happen to mention a real entity name can land on that entity even
+ * when the encoder ranks something tangentially-similar higher.
+ *
+ * This module:
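+ *   1. Extracts identifier-shaped tokens from the query: ≥3 chars with an
+ *      underscore, a digit, a lower→upper transition, or two uppercase
+ *      letters (snake_case, camelCase, PascalCase, kCamel). Plain Titlecase
+ *      words also qualify when `allowPlainTitlecase` is on, which
+ *      `injectAnchorCandidates` enables unless told otherwise. No stopword
+ *      list is applied; the shape test is what rejects plain lowercase words.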
+ *   2. Looks them up case-insensitively against the entities graph
+ *      (any kind: function, method, struct, type, class, etc.).
+ *   3. Maps each matched entity to the cAST/LI chunk that covers it.
+ *   4. Injects those chunks into the candidate set with a baseline
+ *      lexical-anchor score, deduped against existing fused results.
+ *
+ * The downstream pipeline (entity-kind preference, name precision,
+ * doc/test demotion, MMR) then ranks the augmented candidate set
+ * using its existing rules. IAR is purely additive — it can only
+ * surface entities that genuinely exist in the index.
+ *
+ * Disable via `ablations: new Set(['no-anchor-injection'])`.
+ */
+import { extractNameHints } from '../ranking/file-kind-ranking.js';
+/**
+ * Extract IDENTIFIER-shaped anchor names from a query.
+ *
+ * Strictly tighter than `extractNameHints` (which is permissive enough
+ * for ranking tiebreakers — it treats any 3+ char non-keyword as a
+ * hint). For IAR we need to AVOID firing on plain English words like
+ * "request", "config", "default" that happen to share lowercase
+ * spelling with real entities, because that drags those entities
+ * ahead of the user's actual target.
+ *
+ * Required shape — at least one of (as tested by `isStrongIdentifierToken`):
+ *   - contains an underscore (snake_case_func, ALL_CAPS_CONST)
+ *   - contains a digit
+ *   - has a lowercase letter immediately followed by an uppercase one
+ *     (camelCase, multi-word PascalCase, kPrefix style)
+ *   - has two or more uppercase letters anywhere (acronyms)
+ * A single capitalised word ("Downloads") has none of these and is only
+ * accepted when `opts.allowPlainTitlecase` is strictly `true`.
+ *
+ * That matches how programmers actually NAME entities. A query token
+ * like "FastifyInstance", "kSchemaParams", "BindBody", "calculate_path"
+ * fires; "request", "lifecycle", "config", "default" doesn't. The
+ * downstream lookup is case-insensitive, so this filter doesn't lose
+ * anything except the ambiguous English-word path.
+ *
+ * Token length floor stays at 3 to drop noise like "is", "to", "by".
+ *
+ * @param {string} query - Raw query text; any falsy value is treated as ''.
+ * @param {{ allowPlainTitlecase?: boolean }} [opts]
+ * @returns {Set<string>} Accepted tokens in their original casing, deduped.
+ */
+export function extractStrictAnchorNames(query, opts = {}) {
+  const tokens = String(query || '').match(/[A-Za-z_][A-Za-z0-9_]+/g) || [];
+  const hints = new Set();
+  const allowPlainTitlecase = opts.allowPlainTitlecase === true;
+  for (const token of tokens) {
+    if (token.length < 3) continue;
+    // Require strong identifier shape: internal uppercase, acronym,
+    // underscore, or digit. Plain sentence Titlecase ("Downloads") is
+    // ambiguous, so it only passes when allowPlainTitlecase was requested.
+    if (!isStrongIdentifierToken(token) && !(allowPlainTitlecase && isPlainTitlecase(token))) continue;
+    hints.add(token);
+  }
+  return hints;
+}
+const DEFAULT_PER_QUERY_ENTITY_LIMIT = 16; // entity budget per query when opts.entityLimit is not supplied
+const ANCHOR_BASELINE_SCORE = 0.50;        // floor for an injected chunk
+const ANCHOR_PER_HINT_BONUS = 0.10;        // per matched anchor name
+const ANCHOR_MAX_SCORE = 0.85;             // ceiling — never beat a strong fused top-1
+const EXISTING_BOOST = 0.05;               // additive boost when the chunk is already fused
+// Entity types that count as "the user named THIS THING by writing its name"
+// — used by the existing-boost score-floor and new-injection gates below.
+// See block comment at the gate site for rationale. Function/method/component
+// entities are NOT here: the dense ranker handles those well, and IAR floor +
+// post-fusion definition-match boost stack to ~1.0 scores that bulldoze the
+// more specific function the user actually wants on prototype/property-of-X
+// style queries.
+const CLASS_LIKE_ENTITY_TYPES = new Set([
+  'class', 'module', 'interface', 'trait',
+  'struct', 'record', 'enum', 'namespace',
+]);
+/**
+ * Uniqueness ceiling for anchor names: hints whose lowercase form matches
+ * MORE entities than this threshold are dropped before injection. KPR/SPAR
+ * pattern (arXiv 2507.03922, 2110.06918): entity-aware injection helps in
+ * proportion to rarity.
+ *
+ * **Default: 0 (gate DISABLED).** On the current 60-probe dev/held-out split
+ * (40/20, seed=42, stratified by repo) the gate at ceil=8 transfers
+ * asymmetrically — dev gains 2 PASS / loses 0, held-out gains 0 PASS / loses
+ * 1 (S3-Q3 fastify). One probe (S3-Q3) had a brittle pre-fix PASS that
+ * relied on IAR flooding + MMR diversity penalty rather than dense-ranking
+ * signal. The principle is sound but the eval set is too small (60 queries)
+ * to ship a non-zero default per the BEIR-grade methodology in CLAUDE.md
+ * §Benchmark Methodology — held-out regressions are non-negotiable.
+ *
+ * Opt in via `SWEET_SEARCH_IAR_UNIQUENESS_CEIL=N`. Aligned with the existing
+ * ref-count homonym ceiling (file-kind-ranking.js, env
+ * SWEET_SEARCH_REF_BOOST_QUERY_HOMONYM_DISABLE, default 12); experiments
+ * suggest 8 for IAR. Set higher for less aggressive gating, 0 to disable.
+ *
+ * Resolution order used by `readUniquenessCeil(opts)`:
+ *   1. a finite `opts.uniquenessCeil` is returned as-is (no range check);
+ *   2. otherwise the env var is parsed as a base-10 integer;
+ *   3. an unset, empty, non-numeric or negative env value yields the default.
+ */
+// Default 0 = gate disabled. Held-out 60-probe eval (2026-05-07) showed no
+// ceil value transfers: corpus stats lock dev/held-out probes together (the
+// same hint Fastify=46 that helps dev S3-Q7+S4-Q2 hurts held-out S3-Q3).
+// Re-evaluate when a >200-query post-cutoff (FreshStack-style) eval lands.
+const DEFAULT_UNIQUENESS_CEIL = 0;
+function readUniquenessCeil(opts) {
+  if (opts && Number.isFinite(opts.uniquenessCeil)) {
+    return opts.uniquenessCeil; // explicit per-call override beats the environment
+  }
+  const raw = process.env.SWEET_SEARCH_IAR_UNIQUENESS_CEIL;
+  if (raw == null || raw === '') return DEFAULT_UNIQUENESS_CEIL;
+  const n = Number.parseInt(raw, 10);
+  if (!Number.isFinite(n) || n < 0) return DEFAULT_UNIQUENESS_CEIL;
+  return n; // 0 means "no gate"
+}
+/**
+ * Find the LI chunk that covers a given (filePath, startLine, endLine)
+ * region. Linear scan over the LI document Map — typical projects have
+ * a few hundred to a few thousand chunks; this runs in microseconds.
+ *
+ * Prefers the SMALLEST containing chunk when several overlap (canonical
+ * symbol-aligned chunk vs an enclosing parent chunk).
+ *
+ * When no chunk fully contains the region, falls back to the smallest
+ * chunk in the same file that starts on `entity.startLine` and whose
+ * `metadata.name` equals `entity.name` case-insensitively. Documents
+ * without metadata, from another file, or missing start/end lines are
+ * skipped.
+ *
+ * @param {object} liIndex - LateInteractionIndex instance with .documents Map
+ * @param {{ filePath: string, startLine: number, endLine: number, name?: string }} entity
+ *   `name` is only needed for the header-chunk fallback.
+ * @returns {{ id: string, metadata: object, content?: string, text?: string }|null}
+ *   The Map key as `id` merged with the document's own fields, or null
+ *   when either argument is missing or nothing qualifies.
+ */
+function findChunkForEntity(liIndex, entity) {
+  if (!liIndex || !entity) return null;
+  let best = null;
+  let bestSize = Infinity;
+  // Header-chunk fallback: used when no chunk fully contains the entity
+  // (large classes/modules whose body the cAST chunker split into multiple
+  // sub-chunks). The header chunk emitted by parseFileToChunks for an
+  // oversized boundary starts at the entity's declaration line and carries
+  // the declaration name + opening body — exactly the canonical anchor we
+  // want for an identifier-anchored injection. Without this fallback, IAR
+  // silently no-ops on every entity larger than the chunk budget (e.g.
+  // sinatra Base 1100 lines, fastify Server, etc.) — entity exists in the
+  // graph but no chunk strictly contains it.
+  let headerBest = null;
+  let headerBestSize = Infinity;
+  const entityNameLc = String(entity.name || '').toLowerCase();
+  for (const [id, doc] of liIndex.documents) {
+    const m = doc?.metadata;
+    if (!m || m.file !== entity.filePath) continue;
+    const cs = m.startLine, ce = m.endLine;
+    if (cs == null || ce == null) continue;
+    if (cs <= entity.startLine && ce >= entity.endLine) {
+      const size = ce - cs;
+      if (size < bestSize) {
+        best = { id, ...doc };
+        bestSize = size;
+      }
+    } else if (
+      // Strict fallback gate: chunk starts at the SAME line as the entity
+      // declaration AND its symbol name matches the entity name (case-
+      // insensitive). The line equality protects against picking up a
+      // method chunk that happens to live inside the entity's range and
+      // share part of the name; the name-equality protects against picking
+      // up an adjacent declaration that just happened to start at the
+      // same line on a multi-line statement.
+      entityNameLc
+      && cs === entity.startLine
+      && m.name
+      && String(m.name).toLowerCase() === entityNameLc
+    ) {
+      const size = ce - cs;
+      if (size < headerBestSize) {
+        headerBest = { id, ...doc };
+        headerBestSize = size;
+      }
+    }
+  }
+  return best || headerBest;
+}
+function chunkKey(r) { // Dedup key "file|startLine|endLine" for a result or chunk.
+  const m = r.metadata || {};
+  const file = m.file || r.file; // metadata fields take precedence over top-level ones
+  const sl = m.startLine ?? r.startLine;
+  const el = m.endLine ?? r.endLine;
+  return `${file}|${sl}|${el}`;
+}
+function scoreForAnchor(entity, hintsLower) { // Anchor score: baseline plus a bonus per related hint, capped at ANCHOR_MAX_SCORE.
+  const nameLc = String(entity.name || '').toLowerCase();
+  let matched = 0;
+  for (const h of hintsLower) { // hintsLower is expected to be lowercased already by the caller
+    if (nameLc === h || nameLc.includes(h) || h.includes(nameLc)) matched++; // equal, or either string contains the other
+  }
+  return Math.min(ANCHOR_MAX_SCORE, ANCHOR_BASELINE_SCORE + ANCHOR_PER_HINT_BONUS * matched);
+}
+function isPlainTitlecase(token) { // True for one uppercase letter followed only by lowercase letters or digits, e.g. "Downloads".
+  return /^[A-Z][a-z0-9]+$/.test(token);
+}
+function isStrongIdentifierToken(token) { // True for an underscore, a lower→upper transition, two or more uppercase letters, or any digit.
+  return token.includes('_') || /[a-z][A-Z]/.test(token) || /[A-Z].*[A-Z]/.test(token) || /\d/.test(token);
+}
+function entityMatchesAnchorHint(entity, hints) { // True when entity.name relates to at least one hint; strictness depends on the hint's shape.
+  const name = String(entity?.name || '');
+  if (!name) return false; // unnamed entities never match
+  const nameLower = name.toLowerCase();
+  for (const hint of hints) {
+    if (isStrongIdentifierToken(hint)) { // strong identifier: case-insensitive equality or containment either way
+      const hintLower = hint.toLowerCase();
+      if (nameLower === hintLower || nameLower.includes(hintLower) || hintLower.includes(nameLower)) {
+        return true;
+      }
+      continue;
+    }
+    if (isPlainTitlecase(hint)) { // plain Titlecase: case-SENSITIVE, so "Helpers" does not match a lowercase "helpers"
+      if (name === hint || name.includes(hint) || hint.includes(name)) return true;
+      continue;
+    }
+    const hintLower = hint.toLowerCase(); // any other shape: same case-insensitive rule as strong identifiers
+    if (nameLower === hintLower || nameLower.includes(hintLower) || hintLower.includes(nameLower)) {
+      return true;
+    }
+  }
+  return false;
+}
+/**
+ * Inject anchor candidates into a fused result list.
+ *
+ * The returned `results` array is a shallow copy of `fused`, but result
+ * objects that already cover a matched entity are updated IN PLACE
+ * (`score`, `_anchorBoosted`, `_anchorEntity`), so the caller's objects
+ * change too. On every early-exit path `fused` itself is returned as-is.
+ *
+ * @param {Array} fused - Result list after CC/RRF fusion
+ * @param {string} query - The user's query
+ * @param {object} opts
+ * @param {object} opts.codeGraphRepo - CodeGraphRepository; must expose
+ *   `findEntitiesByAnyName`, plus `countEntitiesByAnyName` for the
+ *   uniqueness gate to take effect
+ * @param {object} opts.lateInteractionIndex - LateInteractionIndex
+ * @param {Set<string>|string[]} [opts.ablations] - containing
+ *   'no-anchor-injection' turns this function into a no-op
+ * @param {number} [opts.entityLimit] - entity budget, default 16
+ * @param {boolean} [opts.allowPlainTitlecase] - pass `false` to ignore plain
+ *   Titlecase query words; any other value keeps them as hints
+ * @param {number} [opts.uniquenessCeil] - overrides the env-configured ceiling
+ * @returns {{ results: Array, stats: { hintCount: number, entitiesFound: number,
+ *             newCandidates: number, existingBoosted: number } }}
+ *   Once hints have been extracted, `stats` also carries `droppedCommon`
+ *   and `uniquenessCeil`; the full (non-early-exit) path adds `hintsKept`.
+ */
+export function injectAnchorCandidates(fused, query, opts = {}) {
+  const ablations = opts.ablations;
+  if (ablations && (ablations instanceof Set ? ablations.has('no-anchor-injection') : Array.isArray(ablations) && ablations.includes('no-anchor-injection'))) {
+    return { results: fused, stats: { hintCount: 0, entitiesFound: 0, newCandidates: 0, existingBoosted: 0 } };
+  }
+  const repo = opts.codeGraphRepo;
+  const liIndex = opts.lateInteractionIndex;
+  if (!repo || !liIndex || typeof repo.findEntitiesByAnyName !== 'function') {
+    return { results: fused, stats: { hintCount: 0, entitiesFound: 0, newCandidates: 0, existingBoosted: 0 } };
+  }
+  const allHints = [...extractStrictAnchorNames(query || '', {
+    allowPlainTitlecase: opts.allowPlainTitlecase !== false,
+  })];
+  if (allHints.length === 0) {
+    return { results: fused, stats: { hintCount: 0, entitiesFound: 0, newCandidates: 0, existingBoosted: 0 } };
+  }
+  // Uniqueness gate: drop any hint whose lowercase form matches more
+  // entities than the ceiling. IDF-gated injection pattern (KPR arXiv
+  // 2507.03922, SPAR arXiv 2110.06918, "Match Your Words" arXiv 2112.05662).
+  // Rare identifiers benefit from anchor injection; common identifiers
+  // ("Get", "Fastify", "Set") flood the candidate set with mostly-irrelevant
+  // entities — even the canonical pick is unreliable when 50 entities share
+  // the bare name. Cleaner to skip the hint entirely than to inject a
+  // possibly-wrong "canonical" entity. Mirrors the existing ref-count homonym
+  // gate (file-kind-ranking.js, env SWEET_SEARCH_REF_BOOST_QUERY_HOMONYM_DISABLE,
+  // default 12). The gate ships DISABLED (DEFAULT_UNIQUENESS_CEIL = 0); 8 is
+  // the experimentally suggested ceiling for callers that opt in.
+  //
+  // Opt in via opts.uniquenessCeil or env SWEET_SEARCH_IAR_UNIQUENESS_CEIL=N.
+  const ceil = readUniquenessCeil(opts);
+  let hints = allHints;
+  let droppedCommon = [];
+  if (ceil > 0 && typeof repo.countEntitiesByAnyName === 'function') {
+    let countMap = null;
+    try {
+      countMap = repo.countEntitiesByAnyName(allHints);
+    } catch {
+      countMap = null;
+    }
+    if (countMap) {
+      const kept = [];
+      for (const h of allHints) {
+        const c = countMap.get(h.toLowerCase()) || 0;
+        if (c === 0 || c <= ceil) {
+          kept.push(h);
+        } else {
+          droppedCommon.push({ hint: h, count: c });
+        }
+      }
+      hints = kept;
+    }
+  }
+  if (hints.length === 0) {
+    return {
+      results: fused,
+      stats: {
+        hintCount: allHints.length,
+        entitiesFound: 0,
+        newCandidates: 0,
+        existingBoosted: 0,
+        droppedCommon,
+        uniquenessCeil: ceil,
+      },
+    };
+  }
+  const hintsLower = hints.map(s => s.toLowerCase());
+  let entities = [];
+  try {
+    const totalLimit = opts.entityLimit ?? DEFAULT_PER_QUERY_ENTITY_LIMIT;
+    if (hints.length > 1) {
+      // Per-hint quota: each hint gets up to ceil(totalLimit / hints.length)
+      // entities, deduped by id. Without this, a common hint (e.g. "Sinatra"
+      // matching 50+ small `module Sinatra` wrappers) saturates the budget
+      // via the `ORDER BY (end_line - start_line) ASC` tie-breaker — the
+      // smallest-entity-first ordering crowds out rarer co-hints
+      // ("IndifferentHash", "ExtendedRack", "TemplateCache") that are
+      // typically what the user is actually asking about. KPR/SPAR's
+      // IDF-gated injection (arXiv 2507.03922) handles this by ratioing
+      // anchor weight to rarity; here we instead enforce diversity at the
+      // candidate set level so per-hint specificity bias surfaces later
+      // in scoreForAnchor.
+      const perHint = Math.max(1, Math.ceil(totalLimit / hints.length));
+      const seen = new Set();
+      const entityKey = (e) => e?.id != null
+        ? `id:${e.id}`
+        : `${e?.filePath || ''}|${e?.startLine ?? ''}|${e?.endLine ?? ''}|${e?.name || ''}`;
+      for (const h of hints) {
+        // Pull a wider candidate window per hint (3x quota) so the in-JS
+        // re-ranking below can prefer case-exact matches over case-folded
+        // homonyms. Without this, a Pascal-case hint like "Helpers" gets
+        // out-prioritized by 5 tiny case-folded `def helpers` methods
+        // (2 lines each) that beat the canonical `module Helpers`
+        // (436 lines) under the SQL's `(end_line - start_line) ASC`
+        // tie-break. The case-sensitive preference reflects the user's
+        // own capitalization choice — they wrote "Helpers" because they
+        // mean the class/module, not a generic helper method.
+        const wider = repo.findEntitiesByAnyName([h], { limit: perHint * 3 }) || [];
+        // Stable resort: exact-case matches first, then keep the upstream
+        // size order (stable sort preserves the SQL `ORDER BY size ASC`).
+        wider.sort((a, b) => {
+          const aExact = a.name === h ? 0 : 1;
+          const bExact = b.name === h ? 0 : 1;
+          return aExact - bExact;
+        });
+        let added = 0;
+        for (const e of wider) {
+          const key = entityKey(e);
+          if (seen.has(key)) continue;
+          entities.push(e);
+          seen.add(key);
+          added++;
+          if (added >= perHint || entities.length >= totalLimit) break;
+        }
+        if (entities.length >= totalLimit) break;
+      }
+    } else {
+      entities = repo.findEntitiesByAnyName(hints, { limit: totalLimit }) || [];
+    }
+  } catch {
+    return {
+      results: fused,
+      stats: {
+        hintCount: allHints.length,
+        entitiesFound: 0,
+        newCandidates: 0,
+        existingBoosted: 0,
+        droppedCommon,
+        uniquenessCeil: ceil,
+      },
+    };
+  }
+  if (entities.length === 0) {
+    return {
+      results: fused,
+      stats: {
+        hintCount: allHints.length,
+        entitiesFound: 0,
+        newCandidates: 0,
+        existingBoosted: 0,
+        droppedCommon,
+        uniquenessCeil: ceil,
+      },
+    };
+  }
+  // Index existing fused results by chunk key for dedup and existing-boost.
+  const fusedByKey = new Map();
+  for (const r of fused) fusedByKey.set(chunkKey(r), r);
+  let newCandidates = 0;
+  let existingBoosted = 0;
+  const out = fused.slice();    // shallow copy — we'll append injections
+  const seenInjected = new Set();
+  for (const entity of entities) {
+    if (!entityMatchesAnchorHint(entity, hints)) continue;
+    const chunk = findChunkForEntity(liIndex, entity);
+    if (!chunk) continue;
+    const key = chunkKey({ metadata: chunk.metadata });
+    if (seenInjected.has(key)) continue;
+    seenInjected.add(key);
+    const anchorScore = scoreForAnchor(entity, hintsLower);
+    // Class-anchor score-floor gate (rationale below).
+    //
+    // Score-floor (existing-boost path) and new-injection both fire at
+    // full anchor baseline (0.50-0.85) ONLY when the matched entity is a
+    // class-like type — class, module, interface, trait, struct, record,
+    // enum, namespace.
+    //
+    // Without this gate, a confidently-matched class entity
+    // ("IndifferentHash" / "ExtendedRack" / "TemplateCache") that the
+    // dense ranker placed low in the fused list stays low and gets
+    // crowded out by short-file mega-envelopes on class-targeted queries.
+    //
+    // Restricting to class-like types prevents over-promoting a literal
+    // entity over more specific derived functions on prototype-style
+    // queries — "how does Fastify decorate the Reply prototype": Reply
+    // is a function-typed entity, the user wants `decorateReply`;
+    // flooring/injecting Reply blocks decorateReply from top-1.
+    //
+    // Heuristic: when the user types a class/module/interface/trait
+    // name, they almost always mean the type itself; when they type a
+    // function/method name, they may mean callers, callees, or related
+    // operations — and the dense ranker generally surfaces those better
+    // than a name-only anchor can. Marking `_anchorBoosted` on every
+    // matched entity (including non-class) preserves downstream
+    // demotion signal alignment.
+    const isClassLike = entity?.type && CLASS_LIKE_ENTITY_TYPES.has(entity.type);
+    const existing = fusedByKey.get(key);
+    if (existing) {
+      if (isClassLike) {
+        existing.score = Math.max((existing.score || 0) + EXISTING_BOOST, anchorScore);
+      } else {
+        existing.score = (existing.score || 0) + EXISTING_BOOST;
+      }
+      existing._anchorBoosted = true;
+      existing._anchorEntity = entity.name;
+      existingBoosted++;
+      continue;
+    }
+    // New-injection path: skip when entity is not class-like. The dense
+    // ranker is the authority on function/method retrieval for non-
+    // class queries; injecting a function/method chunk at 0.60 with
+    // post-fusion definition-match amplification routinely scores
+    // 1.0+ and bulldozes the legitimately-correct function the user
+    // was after.
+    if (!isClassLike) continue;
+    // Inject as a fresh candidate. Carry the LI chunk's metadata so the
+    // downstream packager has the correct file/range/type.
+    out.push({
+      id: chunk.id,
+      file: chunk.metadata?.file,
+      startLine: chunk.metadata?.startLine,
+      endLine: chunk.metadata?.endLine,
+      name: chunk.metadata?.name || entity.name,
+      type: chunk.metadata?.type || entity.type,
+      content: chunk.content || chunk.text || '',
+      metadata: { ...(chunk.metadata || {}) },
+      score: anchorScore,
+      searchPath: 'anchor',
+      _anchorInjected: true,
+      _anchorEntity: entity.name,
+      _anchorEntityType: entity.type,
+    });
+    newCandidates++;
+  }
+  // Re-sort by score so the augmented list is consistent for downstream
+  // top-k truncation.
+  out.sort((a, b) => (b.score || 0) - (a.score || 0));
+  return {
+    results: out,
+    stats: {
+      hintCount: allHints.length,
+      hintsKept: hints.length,
+      entitiesFound: entities.length,
+      newCandidates,
+      existingBoosted,
+      droppedCommon,
+      uniquenessCeil: ceil,
+    },
+  };
+}

package/core/search/search-boost.js CHANGED Viewed

@@ -10,6 +10,13 @@
 import { SYMBOL_KIND_WEIGHTS, DEFINITION_TYPES } from '../infrastructure/constants.js';
+const IDENTIFIER_AGREEMENT_STOPWORDS = new Set([ // stemmed terms that splitIdentifierTerms discards before agreement scoring
+  'and', 'are', 'can', 'does', 'for', 'from', 'get', 'has', 'have',
+  'how', 'into', 'new', 'not', 'other', 'return', 'returns', 'set',
+  'should', 'that', 'the', 'this', 'true', 'use', 'used', 'using',
+  'was', 'were', 'what', 'when', 'where', 'which', 'with', 'you',
+]);
 // =============================================================================
 // BOOST_POLICY (static property on SweetSearch)
 // =============================================================================
@@ -109,12 +116,14 @@ export function getBoostIntent(routerMode, routerConfidence) {
  * NOTE: References SweetSearch.BOOST_POLICY — we import BOOST_POLICY locally
  * and reference it directly since the static property is wired separately.
  */
-export function applyPostFusionBoosts(fusedResults, query, routerMode, routerConfidence) {
+export function applyPostFusionBoosts(fusedResults, query, routerMode, routerConfidence, options = {}) {
   const boostIntent = this.getBoostIntent(routerMode, routerConfidence);
   const policy = BOOST_POLICY[boostIntent] || BOOST_POLICY.general;
   const queryLower = query.toLowerCase().trim();
   const queryTokens = this.extractQueryTokens(query);
+  const agentFormats = new Set(['agent', 'agent_preview', 'agent_full', 'agent_full_xl']);
+  const allowIdentifierAgreement = !agentFormats.has(options.format);
   return fusedResults.map(result => {
     let totalBoost = 1.0;
@@ -140,6 +149,16 @@ export function applyPostFusionBoosts(fusedResults, query, routerMode, routerCon
       }
     }
+    // 2.5 Identifier agreement: prefer symbols/files whose meaningful
+    // identifier words are named by the natural-language query.
+    const idBoost = allowIdentifierAgreement
+      ? this.computeIdentifierAgreementBoost?.(result, query)
+      : 1.0;
+    if (idBoost > 1.0) {
+      totalBoost *= idBoost;
+      boostDetails.push(`id:${idBoost.toFixed(2)}`);
+    }
     // 3. Symbol Kind Hierarchy (always mild)
     if (policy.kindHierarchy) {
       const kindWeight = SYMBOL_KIND_WEIGHTS[result.type] || 0.5;
@@ -175,6 +194,78 @@ export function applyPostFusionBoosts(fusedResults, query, routerMode, routerCon
   }).sort((a, b) => b.score - a.score);
 }
+function envFloat(name, fallback, min = 0, max = 1) { // Read a float from process.env[name], or return fallback.
+  const raw = process.env[name];
+  if (raw == null || raw === '') return fallback; // unset or empty
+  const parsed = Number.parseFloat(raw);
+  return Number.isFinite(parsed) && parsed >= min && parsed <= max ? parsed : fallback; // unparsable or outside [min, max] also falls back
+}
+function splitIdentifierTerms(value) { // Break an identifier, file name or free text into lowercase stemmed terms.
+  return String(value || '')
+    .replace(/_[0-9a-f]{8}(?=\.[^.]+$|$)/gi, '') // drop a trailing _xxxxxxxx 8-hex-digit suffix (before the extension or at the end)
+    .replace(/\.[^.]+$/, '') // drop the final .ext — NOTE(review): on free-text input this removes everything after the last "."; confirm intended for queries
+    .replace(/([a-z0-9])([A-Z])/g, '$1 $2') // separate camelCase and digit→Upper boundaries
+    .toLowerCase()
+    .split(/[^a-z0-9]+/)
+    .map(stemIdentifierTerm)
+    .filter(term => term.length >= 3 && !IDENTIFIER_AGREEMENT_STOPWORDS.has(term)); // length and stopword checks apply to the stemmed form
+}
+function stemIdentifierTerm(term) { // Crude suffix stemmer used to compare query words with identifier words.
+  if (term.endsWith('ies') && term.length > 4) return `${term.slice(0, -3)}y`; // "queries" → "query"
+  for (const suffix of ['ing', 'ers', 'ied', 'ed', 'es', 's']) { // first matching suffix wins, longer ones are tried first
+    if (term.endsWith(suffix) && term.length > suffix.length + 3) { // only strip when more than 3 chars remain
+      return term.slice(0, -suffix.length);
+    }
+  }
+  return term;
+}
+/**
+ * Boost candidates whose symbol/file identifier terms agree with query terms.
+ *
+ * Corpus-agnostic by construction: only the candidate's name and the
+ * basename of its file path are consulted, so nothing is matched from
+ * chunk content, comments or benchmark labels.
+ *
+ * A query term counts as a hit when it equals a candidate term, or — for
+ * query terms of 5+ chars — when either term contains the other. Agreement
+ * is hits / min(queryTerms, max(2, candidateTerms)), clamped to 1.
+ *
+ * The weight comes from env SWEET_SEARCH_IDENTIFIER_AGREEMENT_BOOST
+ * (default 0.40, accepted range 0..1); 0 disables the boost.
+ *
+ * @param {object} result - Candidate; reads `name`/`metadata.name` and
+ *   `file`/`path`/`metadata.file`.
+ * @param {string} query - The user's query text.
+ * @returns {number} Multiplier between 1.0 (no agreement) and 1 + weight.
+ */
+export function computeIdentifierAgreementBoost(result, query) {
+  const weight = envFloat('SWEET_SEARCH_IDENTIFIER_AGREEMENT_BOOST', 0.40, 0, 1);
+  if (weight === 0) return 1.0;
+  const queryTerms = new Set(splitIdentifierTerms(query));
+  if (queryTerms.size === 0) return 1.0;
+  const fileName = (result.file || result.path || result.metadata?.file || '')
+    .split('/')
+    .pop() || '';
+  const candidateTerms = new Set([
+    ...splitIdentifierTerms(result.name || result.metadata?.name || ''),
+    ...splitIdentifierTerms(fileName),
+  ]);
+  if (candidateTerms.size === 0) return 1.0;
+  let hits = 0;
+  for (const queryTerm of queryTerms) {
+    if (candidateTerms.has(queryTerm)) {
+      hits++;
+      continue;
+    }
+    if (queryTerm.length >= 5) {
+      for (const candidateTerm of candidateTerms) {
+        if (candidateTerm.includes(queryTerm) || queryTerm.includes(candidateTerm)) {
+          hits++;
+          break;
+        }
+      }
+    }
+  }
+  if (hits === 0) return 1.0;
+  const agreement = hits / Math.min(queryTerms.size, Math.max(2, candidateTerms.size));
+  return 1.0 + weight * Math.min(1, agreement);
+}
 /**
  * Compute definition boost (PHASE_1_FIXES helper)
  */
@@ -190,6 +281,7 @@ export function computeDefinitionBoost(result, queryLower, queryTokens) {
   const exactNameMatch = queryTokens.some(token => resultNameLower === token);
   if (filenameMatchesQuery && isDefinitionType) return 2.0;
+  if (filenameMatchesQuery) return 1.3;
   if (exactNameMatch && isDefinitionType) return 1.5;
   if (isDefinitionType) return 1.2;
   return 1.0;