sweet-search 2.5.2 → 2.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
package/mcp/server.js CHANGED
@@ -11,6 +11,7 @@ import { z } from 'zod';
11
11
  import { existsSync, statSync, readFileSync } from 'node:fs';
12
12
  import path from 'node:path';
13
13
  import { fileURLToPath } from 'node:url';
14
+ import { launchMaintainer } from '../core/indexing/maintainer-launcher.mjs';
14
15
 
15
16
  import {
16
17
  SearchOutputSchema,
@@ -18,16 +19,19 @@ import {
18
19
  HealthOutputSchema,
19
20
  RepoMapOutputSchema,
20
21
  VocabPrewarmOutputSchema,
21
- ReadOutputSchema,
22
- ReadSemanticOutputSchema,
23
22
  handleSearch,
24
23
  handleIndex,
25
24
  checkHealth,
26
25
  handleRepoMap,
27
26
  handleVocabPrewarm,
27
+ } from './tool-handlers.js';
28
+ import { TraceOutputSchema, handleTrace } from './trace-tool.js';
29
+ import {
30
+ ReadOutputSchema,
31
+ ReadSemanticOutputSchema,
28
32
  handleRead,
29
33
  handleReadSemantic,
30
- } from './tool-handlers.js';
34
+ } from './read-tool.js';
31
35
 
32
36
  const __filename = fileURLToPath(import.meta.url);
33
37
  const __dirname = path.dirname(__filename);
@@ -101,9 +105,10 @@ async function getConfig() {
101
105
  const coreDir = path.join(__dirname, '..', 'core');
102
106
 
103
107
  const searchDeps = { getSearcher };
108
+ const traceDeps = { PROJECT_ROOT };
104
109
  const indexDeps = { PROJECT_ROOT, coreDir };
105
110
  const healthDeps = { getConfig, PROJECT_ROOT };
106
- const repoMapDeps = { coreDir };
111
+ const repoMapDeps = { coreDir, PROJECT_ROOT };
107
112
  const vocabDeps = { coreDir };
108
113
 
109
114
  // ---------------------------------------------------------------------------
@@ -126,7 +131,7 @@ const server = new McpServer({
126
131
  // ---------------------------------------------------------------------------
127
132
 
128
133
  server.registerTool('search', {
129
- description: 'Search the codebase using hybrid semantic/lexical/structural search. Use format="agent" with a regex for ColGrep pattern search that returns self-contained code blocks eliminates follow-up file reads.',
134
+ description: 'Hybrid code search (semantic + lexical + structural). USE INSTEAD OF native Grep for code-discovery tasks. Returns ranked, auto-expanded, self-contained code blocks by default (`format="agent"`) no follow-up Read needed. Pass `regex` for ColGrep pattern search (regex anchor + semantic re-rank), `structural=true` for callers/callees/impact, or omit for hybrid auto-routing. Pass `format="benchmark"` only for retrieval-quality measurement, not agent consumption.',
130
135
  inputSchema: {
131
136
  query: z.string().min(1).max(1000).describe('Search query (1-1000 chars)'),
132
137
  k: z.number().int().min(1).max(200).default(10).describe('Number of results (1-200)'),
@@ -136,10 +141,10 @@ server.registerTool('search', {
136
141
  .describe('Force structural graph search mode (callers, callees, implementations)'),
137
142
  regex: z.string().max(4096).optional()
138
143
  .describe('Regex pattern for ColGrep pattern search (implies mode=pattern)'),
139
- format: z.enum(['benchmark', 'agent', 'agent_preview', 'agent_full']).default('benchmark').optional()
140
- .describe('Output format. "agent"/"agent_preview" returns bounded code blocks (4K budget). "agent_full" returns expanded code for top-3 (8K budget).'),
141
- tokenBudget: z.number().int().min(500).max(16000).default(4000).optional()
142
- .describe('Agent mode: total token budget for all results (default: 4000)'),
144
+ format: z.enum(['benchmark', 'agent', 'agent_preview', 'agent_full']).default('agent').optional()
145
+ .describe('Output format. Default "agent" returns ranked, self-contained code blocks for agent consumption. Use "benchmark" only for retrieval-quality measurement.'),
146
+ tokenBudget: z.number().int().min(500).max(16000).optional()
147
+ .describe('Agent mode: optional token budget override. Omit to let the tool pick.'),
143
148
  },
144
149
  outputSchema: SearchOutputSchema,
145
150
  annotations: {
@@ -150,8 +155,31 @@ server.registerTool('search', {
150
155
  },
151
156
  }, async (args) => handleSearch(args, searchDeps));
152
157
 
158
+ server.registerTool('trace', {
159
+ description: 'Trace callers, callees, and transitive impact paths for one specific symbol — returns a single structural-context package adapted to the token budget. USE WHEN the question is "who calls X", "what does X depend on", or "what would break if I changed X". For general code discovery use `search` instead; for navigation around an unfamiliar repo use `repo-map` first.',
160
+ inputSchema: {
161
+ symbol: z.string().min(1).max(256)
162
+ .describe('Symbol/entity name to trace, e.g. processOrder or EmployeeService.processOrder'),
163
+ file: z.string().max(1000).optional()
164
+ .describe('Optional indexed file path to disambiguate duplicate symbol names'),
165
+ query: z.string().max(1000).optional()
166
+ .describe('Optional natural-language hint used only to rank structural context'),
167
+ maxDepth: z.number().int().min(1).max(4).default(3).optional()
168
+ .describe('Maximum transitive impact depth (default: 3, capped at 4)'),
169
+ tokenBudget: z.number().int().min(1000).max(16000).optional()
170
+ .describe('Optional token budget. Omit for adaptive 4k/8k/12k selection.'),
171
+ },
172
+ outputSchema: TraceOutputSchema,
173
+ annotations: {
174
+ readOnlyHint: true,
175
+ destructiveHint: false,
176
+ idempotentHint: true,
177
+ openWorldHint: false,
178
+ },
179
+ }, async (args) => handleTrace(args, traceDeps));
180
+
153
181
  server.registerTool('index', {
154
- description: 'Index or re-index the codebase',
182
+ description: 'Index or re-index the codebase. USE BEFORE first search if the project has not been indexed yet, or after large source changes (`mode="full"`). The Claude Code SessionStart hook installed by `sweet-search init` keeps the incremental index fresh during normal sessions, so manual re-indexing is rarely needed.',
155
183
  inputSchema: {
156
184
  mode: z.enum(['incremental', 'full']).default('incremental')
157
185
  .describe('Indexing mode'),
@@ -166,7 +194,7 @@ server.registerTool('index', {
166
194
  }, async (args) => handleIndex(args, indexDeps));
167
195
 
168
196
  server.registerTool('health', {
169
- description: 'Check health status of all search subsystems',
197
+ description: 'Check health of every sweet-search subsystem (index, embedding model, late-interaction reranker, structural graph, daemon). USE WHEN searches return empty unexpectedly, results look stale, or latency is unusual — diagnoses missing index, model load failures, daemon issues. Read-only, fast.',
170
198
  outputSchema: HealthOutputSchema,
171
199
  annotations: {
172
200
  readOnlyHint: true,
@@ -189,7 +217,7 @@ server.registerTool('health', {
189
217
  });
190
218
 
191
219
  server.registerTool('repo-map', {
192
- description: 'Generate a PageRank-scored repository map showing the most important symbols in the codebase, fitted to a token budget. Useful for giving LLMs a compressed structural overview.',
220
+ description: 'Compressed structural overview of the codebase as a PageRank-scored symbol list, fitted to a token budget. USE FIRST when exploring an unfamiliar repo to orient yourself, or to brief a delegated agent before handing off a task. Not for targeted lookups use `search` for that. Pass `focusFiles` / `focusEntities` to bias the map toward the area you care about.',
193
221
  inputSchema: {
194
222
  tokenBudget: z.number().int().min(100).max(100000).default(1024)
195
223
  .describe('Maximum token budget for the output (default: 1024)'),
@@ -208,7 +236,7 @@ server.registerTool('repo-map', {
208
236
  }, async (args) => handleRepoMap(args, repoMapDeps));
209
237
 
210
238
  server.registerTool('vocab-prewarm', {
211
- description: 'Mine the codebase for search vocabulary and warm all search modes (lexical, semantic, hybrid) with project-specific terms',
239
+ description: 'Pre-warm sweet-search caches by mining the codebase for project-specific vocabulary across lexical / semantic / hybrid modes. USE ONCE after a fresh index to make the first batch of searches faster; generally not needed for one-off queries because the daemon-prewarm hook handles cold-start warmup automatically.',
212
240
  inputSchema: {
213
241
  depth: z.enum(['light', 'medium', 'deep']).default('medium').describe('Mining depth'),
214
242
  modes: z.array(z.enum(['lexical', 'semantic', 'hybrid'])).default(['lexical', 'semantic', 'hybrid']).describe('Search modes to warm'),
@@ -229,7 +257,7 @@ server.registerTool('vocab-prewarm', {
229
257
  }, async (args) => handleVocabPrewarm(args, vocabDeps));
230
258
 
231
259
  server.registerTool('read', {
232
- description: 'Read one or more files for exact code understanding. Replaces the default Read tool for most code-reading workflows. Uses the filesystem as ground truth, supports line ranges and batching, and attaches symbol-aware chunk metadata when the file is indexed.',
260
+ description: 'Read 1-20 files (with optional line ranges) for exact code understanding. USE INSTEAD OF the native Read tool for code-discovery reads batches multiple files in one call, attaches symbol-aware chunk metadata when the file is indexed, and returns the exact bytes from disk. Native Read remains fine for files you are about to Edit (Edit needs a prior Read of that exact file).',
233
261
  inputSchema: {
234
262
  files: z.array(z.object({
235
263
  path: z.string().describe('File path relative to project root (or absolute)'),
@@ -244,7 +272,7 @@ server.registerTool('read', {
244
272
  }, async (args) => handleRead(args, { PROJECT_ROOT }));
245
273
 
246
274
  server.registerTool('read-semantic', {
247
- description: 'Read only the spans of a file relevant to a query. Selects spans via hybrid retrieval (lexical + symbol + ColBERT-style late-interaction MaxSim) with RRF fusion and LI re-rank, then re-reads exact lines from disk. Returns 1-N small spans instead of the full file. Falls back to a plain read if the file is not indexed.',
275
+ description: 'Read only the spans of a file relevant to a question. USE WHEN you know the file but the relevant span is unclear — selects spans via hybrid retrieval (lexical + symbol + ColBERT MaxSim, RRF-fused and LI-reranked), then re-reads exact lines from disk. Returns 1-N small spans instead of the full file. Avoid running this on multiple files unless the task is explicitly multi-file — call `search` with the question instead. Falls back to a plain read when the file is not indexed.',
248
276
  inputSchema: {
249
277
  file: z.string().describe('File path (project-relative or absolute)'),
250
278
  query: z.string().min(1).max(500).describe('What you want to understand about this file'),
@@ -368,6 +396,18 @@ async function main() {
368
396
  const transport = new StdioServerTransport();
369
397
  await server.connect(transport);
370
398
  console.error(`[sweet-search-mcp] Server started (project: ${PROJECT_ROOT})`);
399
+
400
+ // MCP is opt-in (only runs when the user configures it). When it IS running,
401
+ // reuse the SAME shared launcher so the default-on maintainer starts here too
402
+ // — but MCP is never REQUIRED for incremental indexing (the warm search-server
403
+ // path is the durable guarantee). stdout is the MCP protocol channel, so the
404
+ // launcher's stdout-clean contract is load-bearing; never let it break MCP.
405
+ try {
406
+ const result = launchMaintainer({ cwd: PROJECT_ROOT });
407
+ if (result.spawned) console.error(`[sweet-search-mcp] incremental maintainer started (pid ${result.pid})`);
408
+ } catch (err) {
409
+ console.error(`[sweet-search-mcp] maintainer launch (non-fatal): ${err?.message || err}`);
410
+ }
371
411
  }
372
412
 
373
413
  main().catch((err) => {
package/mcp/tool-handlers.js CHANGED
@@ -88,65 +88,6 @@ export const VocabPrewarmOutputSchema = z.object({
88
88
  dryRun: z.boolean().optional(),
89
89
  });
90
90
 
91
- const ReadFileResultSchema = z.object({
92
- file: z.string(),
93
- absolutePath: z.string().optional(),
94
- ok: z.boolean(),
95
- exact: z.boolean().optional(),
96
- indexed: z.boolean().optional(),
97
- language: z.string().nullable().optional(),
98
- totalLines: z.number().int().optional(),
99
- bytes: z.number().int().optional(),
100
- mtimeMs: z.number().optional(),
101
- range: z.object({
102
- startLine: z.number().int(),
103
- endLine: z.number().int(),
104
- }).nullable().optional(),
105
- text: z.string().optional(),
106
- chunks: z.array(z.object({
107
- id: z.string(),
108
- symbol: z.string().nullable().optional(),
109
- type: z.string().nullable().optional(),
110
- startLine: z.number().int().nullable().optional(),
111
- endLine: z.number().int().nullable().optional(),
112
- signature: z.string().nullable().optional(),
113
- })).optional(),
114
- error: z.string().optional(),
115
- timings: z.object({ totalMs: z.number() }).optional(),
116
- });
117
-
118
- export const ReadOutputSchema = z.object({
119
- files: z.array(ReadFileResultSchema),
120
- totalMs: z.number(),
121
- });
122
-
123
- const ReadSemanticSpanSchema = z.object({
124
- startLine: z.number().int(),
125
- endLine: z.number().int(),
126
- score: z.number(),
127
- symbols: z.array(z.string()).optional(),
128
- types: z.array(z.string()).optional(),
129
- chunkIds: z.array(z.string()).optional(),
130
- text: z.string(),
131
- truncated: z.boolean().optional(),
132
- });
133
-
134
- export const ReadSemanticOutputSchema = z.object({
135
- file: z.string(),
136
- query: z.string(),
137
- ok: z.boolean(),
138
- indexed: z.boolean(),
139
- fellBack: z.boolean(),
140
- reason: z.string().optional(),
141
- language: z.string().nullable().optional(),
142
- totalLines: z.number().int().optional(),
143
- spans: z.array(ReadSemanticSpanSchema),
144
- charsReturned: z.number().int().optional(),
145
- approxTokensReturned: z.number().int().optional(),
146
- signals: z.record(z.string(), z.any()).optional(),
147
- timings: z.record(z.string(), z.number()).optional(),
148
- });
149
-
150
91
  // ---------------------------------------------------------------------------
151
92
  // Internal state for health DB cache (module-scoped, not exported)
152
93
  // ---------------------------------------------------------------------------
@@ -419,19 +360,25 @@ export async function checkHealth({ getConfig, PROJECT_ROOT }) {
419
360
 
420
361
  /**
421
362
  * @param {{ tokenBudget: number, focusFiles?: string[], focusEntities?: string[] }} args
422
- * @param {{ coreDir: string }} deps
363
+ * @param {{ coreDir: string, PROJECT_ROOT?: string }} deps
423
364
  */
424
- export async function handleRepoMap({ tokenBudget, focusFiles, focusEntities }, { coreDir }) {
365
+ export async function handleRepoMap({ tokenBudget, focusFiles, focusEntities }, { coreDir, PROJECT_ROOT }) {
425
366
  try {
426
- const { generateRepoMap } = await import(
427
- path.join(coreDir, 'graph', 'index.js')
428
- );
429
-
430
- const result = generateRepoMap({
367
+ const [{ generateRepoMap }, { withPinnedRead }] = await Promise.all([
368
+ import(path.join(coreDir, 'graph', 'index.js')),
369
+ import(path.join(coreDir, 'search', 'search-reader-pin.js')),
370
+ ]);
371
+
372
+ const result = await withPinnedRead({
373
+ projectRoot: PROJECT_ROOT,
374
+ meta: { tool: 'repo-map' },
375
+ }, (manifestEpoch, pin) => generateRepoMap({
431
376
  tokenBudget,
432
377
  focusFiles,
433
378
  focusEntities,
434
- });
379
+ manifest: pin?.manifest,
380
+ manifestEpoch: manifestEpoch ?? undefined,
381
+ }));
435
382
 
436
383
  const summary = `Repo map: ${result.entityCount}/${result.totalEntities} entities across ${result.fileCount} files (${result.pageRankTimeMs}ms)`;
437
384
  const text = `${summary}\n\n${result.text}`;
@@ -528,60 +475,3 @@ export async function handleVocabPrewarm({ depth, modes, top, incremental, dryRu
528
475
  };
529
476
  }
530
477
  }
531
-
532
- // ---------------------------------------------------------------------------
533
- // read — filesystem-grounded reader
534
- // ---------------------------------------------------------------------------
535
-
536
- /**
537
- * @param {{ files: Array<{path: string, startLine?: number, endLine?: number}>, includeMetadata?: boolean }} args
538
- * @param {{ PROJECT_ROOT: string }} deps
539
- */
540
- export async function handleRead(args, deps) {
541
- try {
542
- const { readFiles, formatReadResults } = await import('../core/search/index.js');
543
- const result = await readFiles(args.files || [], {
544
- projectRoot: deps.PROJECT_ROOT,
545
- includeMetadata: args.includeMetadata !== false,
546
- });
547
- return {
548
- content: [{ type: 'text', text: formatReadResults(result, 'agent') }],
549
- structuredContent: result,
550
- };
551
- } catch (err) {
552
- const msg = (err.message || 'read failed').split('\n')[0];
553
- return { content: [{ type: 'text', text: `read error: ${msg}` }], isError: true };
554
- }
555
- }
556
-
557
- // ---------------------------------------------------------------------------
558
- // read-semantic — hybrid span selection + filesystem-grounded re-read
559
- // ---------------------------------------------------------------------------
560
-
561
- /**
562
- * @param {{ file: string, query: string, topK?: number, threshold?: number, contextLines?: number, maxChars?: number, maxTokens?: number, verbose?: boolean }} args
563
- * @param {{ PROJECT_ROOT: string }} deps
564
- */
565
- export async function handleReadSemantic(args, deps) {
566
- try {
567
- const { readSemantic, formatReadSemanticResult } = await import('../core/search/index.js');
568
- const result = await readSemantic({
569
- path: args.file,
570
- query: args.query,
571
- topK: args.topK,
572
- threshold: args.threshold,
573
- contextLines: args.contextLines,
574
- maxChars: args.maxChars,
575
- maxTokens: args.maxTokens,
576
- projectRoot: deps.PROJECT_ROOT,
577
- verbose: args.verbose,
578
- });
579
- return {
580
- content: [{ type: 'text', text: formatReadSemanticResult(result, 'agent') }],
581
- structuredContent: result,
582
- };
583
- } catch (err) {
584
- const msg = (err.message || 'read-semantic failed').split('\n')[0];
585
- return { content: [{ type: 'text', text: `read-semantic error: ${msg}` }], isError: true };
586
- }
587
- }
package/mcp/trace-tool.js ADDED
@@ -0,0 +1,81 @@
1
+ import { z } from 'zod';
2
+
3
+ const TraceEntitySchema = z.object({
4
+ id: z.union([z.string(), z.number()]).optional(),
5
+ name: z.string(),
6
+ type: z.string(),
7
+ filePath: z.string().nullable().optional(),
8
+ file: z.string().nullable().optional(),
9
+ startLine: z.number().int().nullable().optional(),
10
+ endLine: z.number().int().nullable().optional(),
11
+ contextLine: z.number().int().nullable().optional(),
12
+ relationship: z.string().nullable().optional(),
13
+ depth: z.number().int().optional(),
14
+ importance: z.number().optional(),
15
+ presentation: z.enum(['full', 'preview', 'summary']).optional(),
16
+ summary: z.string().optional(),
17
+ code: z.string().nullable().optional(),
18
+ codeTokens: z.number().int().optional(),
19
+ });
20
+
21
+ export const TraceOutputSchema = z.object({
22
+ format: z.literal('structural_context'),
23
+ tool: z.literal('trace'),
24
+ symbol: z.string(),
25
+ target: z.any().nullable(),
26
+ disambiguation: z.array(z.any()),
27
+ budgetTier: z.string(),
28
+ budgetReason: z.string(),
29
+ tokenBudget: z.number().int(),
30
+ tokensUsed: z.number().int(),
31
+ maxDepth: z.number().int(),
32
+ answerCues: z.object({
33
+ targetTerms: z.array(z.string()),
34
+ keySymbols: z.array(z.string()).optional(),
35
+ branchTerms: z.array(z.string()).optional(),
36
+ branchSnippets: z.array(z.string()).optional(),
37
+ citationFocus: z.string().nullable().optional(),
38
+ relatedDefinitions: z.array(z.string()).optional(),
39
+ topCallers: z.array(z.string()),
40
+ topCallees: z.array(z.string()),
41
+ criticalPaths: z.array(z.string()),
42
+ }).optional(),
43
+ stats: z.record(z.string(), z.any()),
44
+ sections: z.object({
45
+ callers: z.object({ total: z.number().int(), shown: z.number().int(), items: z.array(TraceEntitySchema) }),
46
+ callees: z.object({ total: z.number().int(), shown: z.number().int(), items: z.array(TraceEntitySchema) }),
47
+ impact: z.object({ total: z.number().int(), shown: z.number().int(), paths: z.array(z.any()) }),
48
+ }),
49
+ });
50
+
51
+ /**
52
+ * @param {{ symbol: string, file?: string, query?: string, tokenBudget?: number, maxDepth?: number }} args
53
+ * @param {{ PROJECT_ROOT: string }} deps
54
+ */
55
+ export async function handleTrace({ symbol, file, query, tokenBudget, maxDepth }, { PROJECT_ROOT }) {
56
+ try {
57
+ const { traceSymbol, formatStructuralContext } = await import('../core/search/search-trace.js');
58
+ const result = traceSymbol(symbol, {
59
+ projectRoot: PROJECT_ROOT,
60
+ filePath: file,
61
+ queryHint: query,
62
+ tokenBudget,
63
+ maxDepth,
64
+ });
65
+ return {
66
+ content: [{ type: 'text', text: formatStructuralContext(result) }],
67
+ structuredContent: result,
68
+ isError: !result.target,
69
+ };
70
+ } catch (err) {
71
+ const safeMessage = (err.message || 'Trace failed')
72
+ .split('\n')[0]
73
+ .replace(/\/[^\s:]+/g, '<path>')
74
+ .replace(/[A-Z]:\\[^\s:]+/gi, '<path>')
75
+ .replace(/\\\\[^\s:]+/g, '<path>');
76
+ return {
77
+ content: [{ type: 'text', text: `Trace error: ${safeMessage}` }],
78
+ isError: true,
79
+ };
80
+ }
81
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "sweet-search",
3
- "version": "2.5.2",
3
+ "version": "2.5.4",
4
4
  "description": "Sweet Search - SOTA Hybrid Code Search Engine with WASM CatBoost Query Router, Semantic/Lexical/Structural Search, and Multilingual Support",
5
5
  "type": "module",
6
6
  "main": "core/search/sweet-search.js",
@@ -40,6 +40,7 @@
40
40
  "files": [
41
41
  "core/*.js",
42
42
  "core/infrastructure/",
43
+ "core/incremental-indexing/",
43
44
  "core/embedding/",
44
45
  "core/indexing/",
45
46
  "core/search/",
@@ -55,9 +56,22 @@
55
56
  "scripts/uninstall.js",
56
57
  "scripts/verify-runtime.js",
57
58
  "scripts/smoke-test.js",
59
+ "scripts/inject-agent-instructions.js",
60
+ "scripts/write-claude-rules.js",
61
+ "scripts/install-prompt-reminders.js",
62
+ "scripts/install-tool-enforcement.js",
63
+ "scripts/hooks/",
58
64
  "core/training/query-router/features/",
59
65
  "core/training/query-router/output/v45_router_d4.js",
60
66
  "core/training/query-router/output/v46_router_d4.js",
67
+ "core/prompt-optimization/data/p7-final/",
68
+ "eval/agent-read-workflows/bin/ss-search",
69
+ "eval/agent-read-workflows/bin/ss-find",
70
+ "eval/agent-read-workflows/bin/ss-grep",
71
+ "eval/agent-read-workflows/bin/ss-semantic",
72
+ "eval/agent-read-workflows/bin/ss-trace",
73
+ "eval/agent-read-workflows/bin/ss-read",
74
+ "eval/agent-read-workflows/bin/_ss-helpers.mjs",
61
75
  "crates/wasm-router/pkg/",
62
76
  "LICENSE",
63
77
  "NOTICE"
@@ -89,8 +103,6 @@
89
103
  "test:watch": "vitest",
90
104
  "test:coverage": "vitest run --coverage",
91
105
  "test:bench": "vitest bench",
92
- "eval": "node evaluation/run-evaluation.js",
93
- "eval:by-lang": "node evaluation/run-evaluation.js --by-language",
94
106
  "eval:bench": "node eval/run_all.js",
95
107
  "eval:bench:check": "node eval/run_all.js --regression-check",
96
108
  "eval:bench:baseline": "node eval/run_all.js --save-baseline",
@@ -101,7 +113,9 @@
101
113
  "eval:multirepo:test": "node eval/scripts/multirepo-bench.js --split=test",
102
114
  "bench:read-workflows": "node eval/read-workflows/run-bench.js",
103
115
  "bench:agent-read-workflows": "node eval/agent-read-workflows/run-bench.js",
116
+ "bench:structural-context": "node eval/structural-context/run-bench.js",
104
117
  "eval:fetch-repos": "node eval/scripts/fetch-benchmark-repos.js",
118
+ "eval:prompt": "node scripts/eval-prompt-evolution.mjs",
105
119
  "features": "node core/training/query-router/features/extractor.js",
106
120
  "features:benchmark": "node core/training/query-router/features/extractor.js --benchmark",
107
121
  "features:names": "node core/training/query-router/features/extractor.js --names",
@@ -119,9 +133,11 @@
119
133
  "dependencies": {
120
134
  "@babel/helper-validator-identifier": "^7.28.5",
121
135
  "@modelcontextprotocol/sdk": "^1.26.0",
136
+ "@node-rs/xxhash": "^1.7.6",
122
137
  "better-sqlite3": "^11.7.0",
123
138
  "fast-glob": "^3.3.3",
124
139
  "franc-min": "^6.2.0",
140
+ "minimatch": "^10.1.1",
125
141
  "onnxruntime-node": "^1.24.3",
126
142
  "p-limit": "^6.2.0",
127
143
  "sharp": "^0.34.5",
@@ -136,18 +152,17 @@
136
152
  "@vitest/coverage-v8": "^4.0.16",
137
153
  "eslint": "^9.39.4",
138
154
  "fast-check": "^4.5.3",
139
- "minimatch": "^10.1.1",
140
155
  "p-map": "^7.0.4",
141
156
  "typescript": "^5.9.3",
142
157
  "vitest": "^4.0.16"
143
158
  },
144
159
  "optionalDependencies": {
145
- "@sweet-search/native-darwin-arm64": "2.5.2",
146
- "@sweet-search/native-darwin-x64": "2.5.2",
147
- "@sweet-search/native-linux-arm64-gnu": "2.5.2",
148
- "@sweet-search/native-linux-arm64-gnu-cuda": "2.5.2",
149
- "@sweet-search/native-linux-x64-gnu": "2.5.2",
150
- "@sweet-search/native-linux-x64-gnu-cuda": "2.5.2"
160
+ "@sweet-search/native-darwin-arm64": "2.5.4",
161
+ "@sweet-search/native-darwin-x64": "2.5.4",
162
+ "@sweet-search/native-linux-arm64-gnu": "2.5.4",
163
+ "@sweet-search/native-linux-arm64-gnu-cuda": "2.5.4",
164
+ "@sweet-search/native-linux-x64-gnu": "2.5.4",
165
+ "@sweet-search/native-linux-x64-gnu-cuda": "2.5.4"
151
166
  },
152
167
  "engines": {
153
168
  "node": ">=18.0.0"
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * sweet-search PreToolUse hook for `Read`.
4
+ *
5
+ * Plan reference: §4D. Hints — never blocks. Edit workflows legitimately
6
+ * require `Read` before `Edit`, so a hard deny would break tooling. The
7
+ * hint nudges the agent toward `ss-read` (exact range) or `ss-semantic`
8
+ * (when the relevant span is unclear) for code-understanding reads.
9
+ *
10
+ * Per Claude Code hook contract for PreToolUse:
11
+ * - To surface text into the model's context (so the agent sees the hint
12
+ * and may adjust): stdout JSON with hookSpecificOutput.additionalContext.
13
+ * - Plain stderr reaches the user only — NOT the model. (We did this in
14
+ * the first cut and the hint never landed where it mattered.)
15
+ * - Exit 0 + permissionDecision='allow' → tool runs AND context is
16
+ * injected. Exit 2 would deny.
17
+ *
18
+ * Reference: https://code.claude.com/docs/en/hooks.md (PreToolUse output).
19
+ *
20
+ * The hint is universal (doesn't depend on which file is being Read), so
21
+ * we drain stdin without parsing it. Always exits 0 — the Read continues.
22
+ */
23
+
24
// Advisory text injected into the model's context on every `Read`; it is the
// same for every file, so it is assembled once at load time.
const HINT = [
  '[sweet-search] Tip: prefer `ss-read <file> <start> <end>` for exact ranges, ',
  'or `ss-semantic <file> "<question>" --max-tokens 800` when the relevant ',
  'span is unclear. Native `Read` is best for files you already know precisely ',
  '(e.g. before `Edit`). See AGENTS.md / CLAUDE.md for the full tool-routing tree.',
].join('');
30
+
31
// Set once the decision has been queued, so a later stdin event cannot write
// a second JSON document while the first write is still flushing.
let decisionEmitted = false;

/**
 * Write the PreToolUse decision (always `allow`, with HINT as additional
 * context) to stdout as a single JSON document, then exit with status 0.
 *
 * Safe to call more than once; only the first call has any effect. Arguments
 * are ignored, so it can be registered directly as an event listener.
 */
function emitDecision() {
  if (decisionEmitted) {
    return;
  }
  decisionEmitted = true;

  const payload = {
    hookSpecificOutput: {
      hookEventName: 'PreToolUse',
      permissionDecision: 'allow',
      additionalContext: HINT,
    },
  };

  // process.exit() does not wait for pending stdout writes, so exit from the
  // write callback to make sure the JSON reaches the reader first.
  process.stdout.write(JSON.stringify(payload), () => {
    process.exit(0);
  });
}
42
+
43
// The tool invocation arrives as JSON on stdin. Its content is irrelevant
// here (the hint is the same for every Read), so the chunks are only consumed
// to let the stream reach 'end'; the decision is emitted once it closes or
// errors.
const { stdin } = process;
let discardedInput = '';
stdin.on('data', (chunk) => {
  discardedInput += chunk;
});
for (const eventName of ['end', 'error']) {
  stdin.on(eventName, emitDecision);
}

// Standalone/debug run: when stdin is the terminal (`isTTY` is truthy; it is
// undefined for a piped stdin) there is no payload to wait for, so emit now.
if (stdin.isTTY) {
  emitDecision();
}
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * sweet-search UserPromptSubmit reminder hook.
4
+ *
5
+ * Claude Code surfaces this hook's stdout as additional context on every
6
+ * user prompt. The reminder keeps sweet-search tool selection fresh in
7
+ * the agent's working memory — the largest measured failure mode (May
8
+ * 2026) is the agent drifting back to native `Grep` / `Read` even after
9
+ * sweet-search has provided a `sufficient=YES` pack.
10
+ *
11
+ * Plan reference: §4C. Token cost is intentionally minimal because this
12
+ * fires every prompt; the wins come from avoided re-search loops, not
13
+ * from longer guidance.
14
+ *
15
+ * Installed by `sweet-search init` into `.claude/hooks/sweet-search-remind-tools.mjs`
16
+ * with a `.claude/settings.json` `hooks.UserPromptSubmit` entry that
17
+ * sweet-search owns by filename match. `sweet-search uninstall` removes
18
+ * both the file and the settings entry.
19
+ */
20
+
21
// Terse nudge listing the ss-* tools (the shipped M++ policy surface). The
// full decision tree lives in CLAUDE.md/AGENTS.md; this text only exists to
// stop the agent drifting back to raw grep/read between prompts. The literal
// ends with a newline after the closing tag.
const REMINDER = `<sweet-search-reminder>
Use the index-backed ss-* tools for code search/navigation, not raw grep/find/cat:
- Exact symbol/literal/error string: ss-grep "<regex>" -k 5 (trust the top hit)
- Known symbol, NL underperforms: ss-find "<query>" --regex "\\b<symbol>\\b" -k 5
- Concept/behavior (no exact symbol): ss-search "<query>"
- Callers/callees/impact of a symbol: ss-trace <symbol>
- Known file, unclear span: ss-semantic <file> "<query>"
- Known file + line range: ss-read <file> <start> <end>
STOP the instant your evidence answers the query — one confirmed file+symbol is enough;
a second call costs more than it saves. Multi-file flow questions get one follow-up.
</sweet-search-reminder>
`;
38
+
39
// Emit the reminder and let the process end on its own once stdout has
// flushed. Calling process.exit() right after the write can drop the output
// when stdout is written asynchronously; nothing else in this script keeps
// the event loop alive, so setting the exit code is sufficient.
process.stdout.write(REMINDER);
process.exitCode = 0;