seer-mcp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (371) hide show
  1. package/.vscode/settings.json +3 -0
  2. package/LICENSE +176 -0
  3. package/README.md +272 -0
  4. package/README_dev.md +199 -0
  5. package/dist/bundle/ci.d.ts +47 -0
  6. package/dist/bundle/ci.d.ts.map +1 -0
  7. package/dist/bundle/ci.js +113 -0
  8. package/dist/bundle/ci.js.map +1 -0
  9. package/dist/bundle/contract.d.ts +111 -0
  10. package/dist/bundle/contract.d.ts.map +1 -0
  11. package/dist/bundle/contract.js +352 -0
  12. package/dist/bundle/contract.js.map +1 -0
  13. package/dist/bundle/export.d.ts +36 -0
  14. package/dist/bundle/export.d.ts.map +1 -0
  15. package/dist/bundle/export.js +152 -0
  16. package/dist/bundle/export.js.map +1 -0
  17. package/dist/bundle/external.d.ts +66 -0
  18. package/dist/bundle/external.d.ts.map +1 -0
  19. package/dist/bundle/external.js +238 -0
  20. package/dist/bundle/external.js.map +1 -0
  21. package/dist/bundle/format.d.ts +94 -0
  22. package/dist/bundle/format.d.ts.map +1 -0
  23. package/dist/bundle/format.js +42 -0
  24. package/dist/bundle/format.js.map +1 -0
  25. package/dist/bundle/import.d.ts +49 -0
  26. package/dist/bundle/import.d.ts.map +1 -0
  27. package/dist/bundle/import.js +116 -0
  28. package/dist/bundle/import.js.map +1 -0
  29. package/dist/cli/index.d.ts +3 -0
  30. package/dist/cli/index.d.ts.map +1 -0
  31. package/dist/cli/index.js +1402 -0
  32. package/dist/cli/index.js.map +1 -0
  33. package/dist/cli/init.d.ts +48 -0
  34. package/dist/cli/init.d.ts.map +1 -0
  35. package/dist/cli/init.js +284 -0
  36. package/dist/cli/init.js.map +1 -0
  37. package/dist/db/schema.d.ts +3 -0
  38. package/dist/db/schema.d.ts.map +1 -0
  39. package/dist/db/schema.js +616 -0
  40. package/dist/db/schema.js.map +1 -0
  41. package/dist/db/store.d.ts +1011 -0
  42. package/dist/db/store.d.ts.map +1 -0
  43. package/dist/db/store.js +3888 -0
  44. package/dist/db/store.js.map +1 -0
  45. package/dist/graph/pagerank.d.ts +9 -0
  46. package/dist/graph/pagerank.d.ts.map +1 -0
  47. package/dist/graph/pagerank.js +47 -0
  48. package/dist/graph/pagerank.js.map +1 -0
  49. package/dist/indexer/architecture.d.ts +72 -0
  50. package/dist/indexer/architecture.d.ts.map +1 -0
  51. package/dist/indexer/architecture.js +112 -0
  52. package/dist/indexer/architecture.js.map +1 -0
  53. package/dist/indexer/behavior.d.ts +75 -0
  54. package/dist/indexer/behavior.d.ts.map +1 -0
  55. package/dist/indexer/behavior.js +395 -0
  56. package/dist/indexer/behavior.js.map +1 -0
  57. package/dist/indexer/boundaries.d.ts +60 -0
  58. package/dist/indexer/boundaries.d.ts.map +1 -0
  59. package/dist/indexer/boundaries.js +366 -0
  60. package/dist/indexer/boundaries.js.map +1 -0
  61. package/dist/indexer/churn.d.ts +15 -0
  62. package/dist/indexer/churn.d.ts.map +1 -0
  63. package/dist/indexer/churn.js +49 -0
  64. package/dist/indexer/churn.js.map +1 -0
  65. package/dist/indexer/classify.d.ts +9 -0
  66. package/dist/indexer/classify.d.ts.map +1 -0
  67. package/dist/indexer/classify.js +90 -0
  68. package/dist/indexer/classify.js.map +1 -0
  69. package/dist/indexer/context.d.ts +176 -0
  70. package/dist/indexer/context.d.ts.map +1 -0
  71. package/dist/indexer/context.js +193 -0
  72. package/dist/indexer/context.js.map +1 -0
  73. package/dist/indexer/continuity.d.ts +67 -0
  74. package/dist/indexer/continuity.d.ts.map +1 -0
  75. package/dist/indexer/continuity.js +288 -0
  76. package/dist/indexer/continuity.js.map +1 -0
  77. package/dist/indexer/detectchanges.d.ts +32 -0
  78. package/dist/indexer/detectchanges.d.ts.map +1 -0
  79. package/dist/indexer/detectchanges.js +74 -0
  80. package/dist/indexer/detectchanges.js.map +1 -0
  81. package/dist/indexer/discovery.d.ts +37 -0
  82. package/dist/indexer/discovery.d.ts.map +1 -0
  83. package/dist/indexer/discovery.js +136 -0
  84. package/dist/indexer/discovery.js.map +1 -0
  85. package/dist/indexer/externaldeps.d.ts +18 -0
  86. package/dist/indexer/externaldeps.d.ts.map +1 -0
  87. package/dist/indexer/externaldeps.js +288 -0
  88. package/dist/indexer/externaldeps.js.map +1 -0
  89. package/dist/indexer/freshness.d.ts +48 -0
  90. package/dist/indexer/freshness.d.ts.map +1 -0
  91. package/dist/indexer/freshness.js +128 -0
  92. package/dist/indexer/freshness.js.map +1 -0
  93. package/dist/indexer/git.d.ts +144 -0
  94. package/dist/indexer/git.d.ts.map +1 -0
  95. package/dist/indexer/git.js +444 -0
  96. package/dist/indexer/git.js.map +1 -0
  97. package/dist/indexer/index.d.ts +145 -0
  98. package/dist/indexer/index.d.ts.map +1 -0
  99. package/dist/indexer/index.js +930 -0
  100. package/dist/indexer/index.js.map +1 -0
  101. package/dist/indexer/modules.d.ts +62 -0
  102. package/dist/indexer/modules.d.ts.map +1 -0
  103. package/dist/indexer/modules.js +293 -0
  104. package/dist/indexer/modules.js.map +1 -0
  105. package/dist/indexer/preflight.d.ts +154 -0
  106. package/dist/indexer/preflight.d.ts.map +1 -0
  107. package/dist/indexer/preflight.js +399 -0
  108. package/dist/indexer/preflight.js.map +1 -0
  109. package/dist/indexer/protoScanner.d.ts +34 -0
  110. package/dist/indexer/protoScanner.d.ts.map +1 -0
  111. package/dist/indexer/protoScanner.js +133 -0
  112. package/dist/indexer/protoScanner.js.map +1 -0
  113. package/dist/indexer/risk.d.ts +115 -0
  114. package/dist/indexer/risk.d.ts.map +1 -0
  115. package/dist/indexer/risk.js +194 -0
  116. package/dist/indexer/risk.js.map +1 -0
  117. package/dist/indexer/serviceHostScanner.d.ts +25 -0
  118. package/dist/indexer/serviceHostScanner.d.ts.map +1 -0
  119. package/dist/indexer/serviceHostScanner.js +95 -0
  120. package/dist/indexer/serviceHostScanner.js.map +1 -0
  121. package/dist/indexer/serviceLinks.d.ts +105 -0
  122. package/dist/indexer/serviceLinks.d.ts.map +1 -0
  123. package/dist/indexer/serviceLinks.js +509 -0
  124. package/dist/indexer/serviceLinks.js.map +1 -0
  125. package/dist/indexer/shapehash.d.ts +98 -0
  126. package/dist/indexer/shapehash.d.ts.map +1 -0
  127. package/dist/indexer/shapehash.js +354 -0
  128. package/dist/indexer/shapehash.js.map +1 -0
  129. package/dist/indexer/skeleton.d.ts +15 -0
  130. package/dist/indexer/skeleton.d.ts.map +1 -0
  131. package/dist/indexer/skeleton.js +136 -0
  132. package/dist/indexer/skeleton.js.map +1 -0
  133. package/dist/indexer/symbolhistory.d.ts +41 -0
  134. package/dist/indexer/symbolhistory.d.ts.map +1 -0
  135. package/dist/indexer/symbolhistory.js +124 -0
  136. package/dist/indexer/symbolhistory.js.map +1 -0
  137. package/dist/indexer/watcher.d.ts +68 -0
  138. package/dist/indexer/watcher.d.ts.map +1 -0
  139. package/dist/indexer/watcher.js +179 -0
  140. package/dist/indexer/watcher.js.map +1 -0
  141. package/dist/mcp/server.d.ts +80 -0
  142. package/dist/mcp/server.d.ts.map +1 -0
  143. package/dist/mcp/server.js +1610 -0
  144. package/dist/mcp/server.js.map +1 -0
  145. package/dist/parser/index.d.ts +8 -0
  146. package/dist/parser/index.d.ts.map +1 -0
  147. package/dist/parser/index.js +33 -0
  148. package/dist/parser/index.js.map +1 -0
  149. package/dist/parser/languages/cpp.d.ts +3 -0
  150. package/dist/parser/languages/cpp.d.ts.map +1 -0
  151. package/dist/parser/languages/cpp.js +350 -0
  152. package/dist/parser/languages/cpp.js.map +1 -0
  153. package/dist/parser/languages/csharp.d.ts +3 -0
  154. package/dist/parser/languages/csharp.d.ts.map +1 -0
  155. package/dist/parser/languages/csharp.js +239 -0
  156. package/dist/parser/languages/csharp.js.map +1 -0
  157. package/dist/parser/languages/go.d.ts +3 -0
  158. package/dist/parser/languages/go.d.ts.map +1 -0
  159. package/dist/parser/languages/go.js +259 -0
  160. package/dist/parser/languages/go.js.map +1 -0
  161. package/dist/parser/languages/java.d.ts +3 -0
  162. package/dist/parser/languages/java.d.ts.map +1 -0
  163. package/dist/parser/languages/java.js +391 -0
  164. package/dist/parser/languages/java.js.map +1 -0
  165. package/dist/parser/languages/python.d.ts +3 -0
  166. package/dist/parser/languages/python.d.ts.map +1 -0
  167. package/dist/parser/languages/python.js +396 -0
  168. package/dist/parser/languages/python.js.map +1 -0
  169. package/dist/parser/languages/rust.d.ts +3 -0
  170. package/dist/parser/languages/rust.d.ts.map +1 -0
  171. package/dist/parser/languages/rust.js +159 -0
  172. package/dist/parser/languages/rust.js.map +1 -0
  173. package/dist/parser/languages/typescript.d.ts +3 -0
  174. package/dist/parser/languages/typescript.d.ts.map +1 -0
  175. package/dist/parser/languages/typescript.js +1442 -0
  176. package/dist/parser/languages/typescript.js.map +1 -0
  177. package/dist/parser/parserContext.d.ts +77 -0
  178. package/dist/parser/parserContext.d.ts.map +1 -0
  179. package/dist/parser/parserContext.js +354 -0
  180. package/dist/parser/parserContext.js.map +1 -0
  181. package/dist/parser/walker.d.ts +81 -0
  182. package/dist/parser/walker.d.ts.map +1 -0
  183. package/dist/parser/walker.js +217 -0
  184. package/dist/parser/walker.js.map +1 -0
  185. package/dist/parser/worker.d.ts +66 -0
  186. package/dist/parser/worker.d.ts.map +1 -0
  187. package/dist/parser/worker.js +129 -0
  188. package/dist/parser/worker.js.map +1 -0
  189. package/dist/parser/workerpool.d.ts +107 -0
  190. package/dist/parser/workerpool.d.ts.map +1 -0
  191. package/dist/parser/workerpool.js +383 -0
  192. package/dist/parser/workerpool.js.map +1 -0
  193. package/dist/scip/format.d.ts +87 -0
  194. package/dist/scip/format.d.ts.map +1 -0
  195. package/dist/scip/format.js +31 -0
  196. package/dist/scip/format.js.map +1 -0
  197. package/dist/scip/import.d.ts +37 -0
  198. package/dist/scip/import.d.ts.map +1 -0
  199. package/dist/scip/import.js +180 -0
  200. package/dist/scip/import.js.map +1 -0
  201. package/dist/types.d.ts +392 -0
  202. package/dist/types.d.ts.map +1 -0
  203. package/dist/types.js +4 -0
  204. package/dist/types.js.map +1 -0
  205. package/docs/architecture.md +105 -0
  206. package/docs/benchmarks/methodology.md +134 -0
  207. package/docs/benchmarks/raw-results.md +71 -0
  208. package/docs/benchmarks.md +74 -0
  209. package/docs/cli.md +148 -0
  210. package/docs/examples/behavior-tests.md +70 -0
  211. package/docs/examples/change-history.md +85 -0
  212. package/docs/examples/pre-edit-context.md +81 -0
  213. package/docs/examples/service-links.md +88 -0
  214. package/docs/examples.md +80 -0
  215. package/docs/faq.md +70 -0
  216. package/docs/internals.md +104 -0
  217. package/docs/languages.md +70 -0
  218. package/docs/limits.md +52 -0
  219. package/docs/mcp.md +199 -0
  220. package/docs/quickstart.md +119 -0
  221. package/docs/testing.md +123 -0
  222. package/docs/tools.md +115 -0
  223. package/package.json +52 -0
  224. package/research-codebase.md +578 -0
  225. package/seer-cli-docs.md +326 -0
  226. package/seer-master-guide.md +246 -0
  227. package/src/bundle/ci.ts +141 -0
  228. package/src/bundle/contract.ts +387 -0
  229. package/src/bundle/export.ts +175 -0
  230. package/src/bundle/external.ts +285 -0
  231. package/src/bundle/format.ts +92 -0
  232. package/src/bundle/import.ts +157 -0
  233. package/src/cli/index.ts +1249 -0
  234. package/src/cli/init.ts +389 -0
  235. package/src/db/schema.ts +614 -0
  236. package/src/db/store.ts +4306 -0
  237. package/src/graph/pagerank.ts +53 -0
  238. package/src/indexer/architecture.ts +148 -0
  239. package/src/indexer/behavior.ts +466 -0
  240. package/src/indexer/boundaries.ts +374 -0
  241. package/src/indexer/churn.ts +58 -0
  242. package/src/indexer/classify.ts +96 -0
  243. package/src/indexer/context.ts +340 -0
  244. package/src/indexer/continuity.ts +322 -0
  245. package/src/indexer/detectchanges.ts +94 -0
  246. package/src/indexer/discovery.ts +176 -0
  247. package/src/indexer/externaldeps.ts +243 -0
  248. package/src/indexer/freshness.ts +166 -0
  249. package/src/indexer/git.ts +453 -0
  250. package/src/indexer/index.ts +1092 -0
  251. package/src/indexer/modules.ts +358 -0
  252. package/src/indexer/preflight.ts +548 -0
  253. package/src/indexer/protoScanner.ts +147 -0
  254. package/src/indexer/risk.ts +304 -0
  255. package/src/indexer/serviceHostScanner.ts +92 -0
  256. package/src/indexer/serviceLinks.ts +543 -0
  257. package/src/indexer/shapehash.ts +370 -0
  258. package/src/indexer/skeleton.ts +169 -0
  259. package/src/indexer/symbolhistory.ts +172 -0
  260. package/src/indexer/watcher.ts +206 -0
  261. package/src/mcp/server.ts +1659 -0
  262. package/src/parser/index.ts +37 -0
  263. package/src/parser/languages/cpp.ts +361 -0
  264. package/src/parser/languages/csharp.ts +235 -0
  265. package/src/parser/languages/go.ts +259 -0
  266. package/src/parser/languages/java.ts +382 -0
  267. package/src/parser/languages/python.ts +370 -0
  268. package/src/parser/languages/rust.ts +164 -0
  269. package/src/parser/languages/typescript.ts +1435 -0
  270. package/src/parser/parserContext.ts +392 -0
  271. package/src/parser/walker.ts +306 -0
  272. package/src/parser/worker.ts +181 -0
  273. package/src/parser/workerpool.ts +448 -0
  274. package/src/scip/format.ts +83 -0
  275. package/src/scip/import.ts +216 -0
  276. package/src/types.ts +457 -0
  277. package/tests/benchmark-service-links.ts +244 -0
  278. package/tests/bug-regressions.ts +626 -0
  279. package/tests/filters.ts +264 -0
  280. package/tests/fixtures/Counter.tsx +38 -0
  281. package/tests/fixtures/caller.ts +7 -0
  282. package/tests/fixtures/collisions.ts +23 -0
  283. package/tests/fixtures/local_helper.ts +5 -0
  284. package/tests/fixtures/overloads.java +17 -0
  285. package/tests/fixtures/remote_helper.ts +4 -0
  286. package/tests/fixtures/sample.c +15 -0
  287. package/tests/fixtures/sample.cpp +47 -0
  288. package/tests/fixtures/sample.cs +62 -0
  289. package/tests/fixtures/sample.go +68 -0
  290. package/tests/fixtures/sample.h +30 -0
  291. package/tests/fixtures/sample.java +85 -0
  292. package/tests/fixtures/sample.py +46 -0
  293. package/tests/fixtures/sample.rs +78 -0
  294. package/tests/fixtures/sample.ts +76 -0
  295. package/tests/fixtures-service/HttpClients.cs +30 -0
  296. package/tests/fixtures-service/HttpClients.java +24 -0
  297. package/tests/fixtures-service/billing.ts +15 -0
  298. package/tests/fixtures-service/docker-compose.yml +15 -0
  299. package/tests/fixtures-service/gateway.ts +10 -0
  300. package/tests/fixtures-service/get_user.ts +11 -0
  301. package/tests/fixtures-service/graphql_client.ts +63 -0
  302. package/tests/fixtures-service/graphql_server.ts +30 -0
  303. package/tests/fixtures-service/grpc_client.go +30 -0
  304. package/tests/fixtures-service/http_clients.go +23 -0
  305. package/tests/fixtures-service/http_clients.py +38 -0
  306. package/tests/fixtures-service/http_clients.ts +49 -0
  307. package/tests/fixtures-service/k8s/payment-service.yaml +22 -0
  308. package/tests/fixtures-service/k8s_calls.ts +20 -0
  309. package/tests/fixtures-service/messaging.ts +87 -0
  310. package/tests/fixtures-service/trpc_client.ts +39 -0
  311. package/tests/fixtures-service/trpc_server.ts +39 -0
  312. package/tests/fixtures-service/user_service.proto +33 -0
  313. package/tests/fixtures-trackcd/Cargo.toml +11 -0
  314. package/tests/fixtures-trackcd/SpringController.java +36 -0
  315. package/tests/fixtures-trackcd/auth_service.ts +19 -0
  316. package/tests/fixtures-trackcd/complex_module.py +50 -0
  317. package/tests/fixtures-trackcd/express_app.js +30 -0
  318. package/tests/fixtures-trackcd/fastapi_app.py +49 -0
  319. package/tests/fixtures-trackcd/fastify_object_routes.js +32 -0
  320. package/tests/fixtures-trackcd/go.mod +8 -0
  321. package/tests/fixtures-trackcd/package.json +15 -0
  322. package/tests/fixtures-trackcd/requirements.txt +4 -0
  323. package/tests/fixtures-trackcd/tests/auth_service.test.ts +13 -0
  324. package/tests/fixtures-tracke/auth/AuthService.ts +23 -0
  325. package/tests/fixtures-tracke/auth/crypto.ts +7 -0
  326. package/tests/fixtures-tracke/billing/Billing.ts +20 -0
  327. package/tests/fixtures-tracke/billing/Invoice.ts +10 -0
  328. package/tests/fixtures-tracke/billing/server.ts +17 -0
  329. package/tests/fixtures-tracke/package.json +7 -0
  330. package/tests/fixtures-tracke/tests/auth.test.ts +23 -0
  331. package/tests/fixtures-tracke/tests/billing.test.ts +14 -0
  332. package/tests/fixtures-trackf/package.json +5 -0
  333. package/tests/fixtures-trackf/src/auth.ts +26 -0
  334. package/tests/fixtures-trackf/src/handlers.ts +35 -0
  335. package/tests/fixtures-tracki/billing/routes.ts +12 -0
  336. package/tests/fixtures-tracki/gateway/client.ts +13 -0
  337. package/tests/git-features.ts +267 -0
  338. package/tests/init.ts +141 -0
  339. package/tests/mcp-jit.ts +130 -0
  340. package/tests/mcp-smoke.ts +191 -0
  341. package/tests/mcp-trackcd.ts +169 -0
  342. package/tests/mcp-tracke.ts +229 -0
  343. package/tests/mcp-trackf.ts +330 -0
  344. package/tests/mcp-trackg.ts +219 -0
  345. package/tests/mcp-tracki.ts +174 -0
  346. package/tests/mcp-watcher.ts +126 -0
  347. package/tests/optspec.ts +194 -0
  348. package/tests/parallel-index.ts +333 -0
  349. package/tests/parallel-read.ts +125 -0
  350. package/tests/parallel-recovery.ts +241 -0
  351. package/tests/perf-callers.ts +145 -0
  352. package/tests/query-parity.ts +184 -0
  353. package/tests/query-perf.ts +55 -0
  354. package/tests/scale-parallel-parity.ts +225 -0
  355. package/tests/scale-test.ts +523 -0
  356. package/tests/smoke.ts +396 -0
  357. package/tests/trackcd.ts +325 -0
  358. package/tests/tracke-collisions.ts +255 -0
  359. package/tests/tracke.ts +314 -0
  360. package/tests/trackf-bugs.ts +406 -0
  361. package/tests/trackf.ts +390 -0
  362. package/tests/trackg.ts +1372 -0
  363. package/tests/tracki-boundaries.ts +202 -0
  364. package/tests/tracki-continuity.ts +253 -0
  365. package/tests/tracki-contract-diff.ts +249 -0
  366. package/tests/tracki-external-bundles.ts +341 -0
  367. package/tests/tracki-preflight.ts +251 -0
  368. package/tests/verify-roles.ts +51 -0
  369. package/tests/worker-parity.ts +286 -0
  370. package/tests/worker-pool.ts +262 -0
  371. package/tsconfig.json +20 -0
@@ -0,0 +1,370 @@
1
+ import fs from 'fs';
2
+ import { Store } from '../db/store.js';
3
+
4
+ /**
5
+ * Track-F structural SimHash duplicate detection.
6
+ *
7
+ * For each function/method/constructor symbol we compute a 64-bit SimHash
8
+ * over its STRUCTURAL token stream — identifier names are folded into their
9
+ * "kind" (NAME / NUMBER / STRING / KEYWORD / OP) so two functions that do
10
+ * the same shape with different variable names still match. This is the
11
+ * classic Charikar SimHash construction, sized so two near-duplicates
12
+ * differ in only a small Hamming distance.
13
+ *
14
+ * Why structural and not exact-tree? Exact-tree hashes (Merkle over the AST)
15
+ * find verbatim copies; that's a small fraction of real-world duplication. A
16
+ * SimHash over tokens with a sliding 3-gram window catches:
17
+ * - genuine copy-paste with renames
18
+ * - near-duplicate boilerplate (CRUD handlers, parser branches)
19
+ * - structural twins across files / languages with similar syntactic shape
20
+ *
21
+ * The trade-off is exact-equality false positives (two unrelated 3-line
22
+ * helpers can hash close). We mitigate by:
23
+ * 1. Requiring LOC >= MIN_LOC (default 4) to avoid trivial pairs.
24
+ * 2. Computing the hash only over function/method/constructor symbols.
25
+ * 3. Returning Hamming distance with every pair so the caller can filter.
26
+ *
27
+ * SCIP-merged symbols keep the tree-sitter hash; SCIP-only symbols never get
28
+ * a hash because we don't see their bodies.
29
+ */
30
+
31
/** Default minimum symbol size (lines of code) for hashing and duplicate search. */
const MIN_LOC_DEFAULT = 4;

/** Number of consecutive tokens in each sliding-window gram. */
const NGRAM_SIZE = 3;

/** Width of the SimHash fingerprint, in bits. */
const HASH_BITS = 64;
34
+
35
/** Category assigned to every token the shape tokenizer emits. */
type TokenKind = 'NAME' | 'NUMBER' | 'STRING' | 'KEYWORD' | 'OP';

/** One structural token: its category plus, where it matters, its spelling. */
interface ShapeToken {
  /** Category of the token. */
  kind: TokenKind;
  /**
   * Lexeme of the token when its exact spelling is structurally significant
   * (operators and punctuation, so `{` ≠ `}`). Names, numbers and strings
   * carry an empty string here so that they fold together — structure
   * first, content second.
   */
  text: string;
}
47
+
48
/**
 * A tiny language-agnostic tokenizer. It is not a full lexer — the goal is
 * "structurally meaningful tokens that survive renames". A char-class scan
 * over the source body produces:
 *   - identifier runs      → NAME   (lexeme dropped)
 *   - numeric literals     → NUMBER (lexeme dropped)
 *   - quoted strings       → STRING (lexeme dropped; skipped to the closing quote)
 *   - anything else        → OP     (single character, lexeme kept)
 * Whitespace and comments (`//`, `#`, and block comments) produce no tokens.
 *
 * Keywords are not recognised: they come out as NAME tokens. That is
 * intentional — a Python `if` and a JS `if` play the same structural role,
 * and keywords are still told apart from plain identifiers by the operators
 * that surround them ( `if (` vs `def foo(` ).
 *
 * Numeric literals are scanned as a whole, so `255`, `0xFF`, `1.5e-3` and
 * `10n` all become exactly one NUMBER. Non-ASCII characters are treated as
 * identifier characters so that identifiers written in other scripts fold to
 * NAME like any ASCII identifier instead of leaking their spelling as OPs.
 *
 * @param source Source text to scan (typically one function body).
 * @returns The token stream in source order; empty for empty input.
 */
export function tokenize(source: string): ShapeToken[] {
  const tokens: ShapeToken[] = [];
  const n = source.length;
  let i = 0;
  while (i < n) {
    const c = source.charCodeAt(i);
    // Whitespace: space, tab, LF, CR.
    if (c === 32 || c === 9 || c === 10 || c === 13) { i++; continue; }
    // Line comment (`//` or `#`) — skip to EOL.
    // NOTE(review): `#` is always taken as a comment start, so the rest of a
    // line after e.g. a JS `#private` name or a C `#include` is dropped too.
    if ((c === 47 && source.charCodeAt(i + 1) === 47) || c === 35) {
      while (i < n && source.charCodeAt(i) !== 10) i++;
      continue;
    }
    // Block comment — skip through the closing `*` `/` pair (or to EOF when
    // the comment is unterminated).
    if (c === 47 && source.charCodeAt(i + 1) === 42) {
      i += 2;
      while (i < n && !(source.charCodeAt(i) === 42 && source.charCodeAt(i + 1) === 47)) i++;
      i += 2;
      continue;
    }
    // String — single, double, or backtick quoted; folded to a single STRING.
    if (c === 34 || c === 39 || c === 96) {
      const quote = c;
      i++;
      while (i < n) {
        const cc = source.charCodeAt(i);
        if (cc === 92) { i += 2; continue; } // escape: skip next char too
        if (cc === quote) { i++; break; }
        i++;
      }
      tokens.push({ kind: 'STRING', text: '' });
      continue;
    }
    // Identifier — folded to NAME regardless of spelling.
    if (isIdStart(c)) {
      let j = i + 1;
      while (j < n && isIdContinue(source.charCodeAt(j))) j++;
      tokens.push({ kind: 'NAME', text: '' });
      i = j;
      continue;
    }
    // Number — the whole literal (radix prefix, exponent, suffix) is one token.
    if (isDigit(c)) {
      i = scanNumberEnd(source, i);
      tokens.push({ kind: 'NUMBER', text: '' });
      continue;
    }
    // Operator / punctuation — single char. We keep the lexeme so '{' ≠ '}'.
    tokens.push({ kind: 'OP', text: source[i] });
    i++;
  }
  return tokens;
}

/** True for the ASCII digits `0`-`9`. */
function isDigit(c: number): boolean {
  return c >= 48 && c <= 57;
}

/** True for the ASCII letters `A`-`Z` and `a`-`z`. */
function isAsciiLetter(c: number): boolean {
  return (c >= 65 && c <= 90) || (c >= 97 && c <= 122);
}

/**
 * Return the index just past the numeric literal that starts at `start`
 * (which must point at a digit).
 *
 * Accepted after the first digit: digits, `_`, `.`, letters that do not
 * directly follow a `.` (so `0..n` and `t.0.field` leave the name alone),
 * and a `+`/`-` sign that directly follows an `e`/`E` which is the first
 * letter of the literal and is itself followed by a digit (`1e-5`, but not
 * the subtraction in `0x1E-3`).
 */
function scanNumberEnd(source: string, start: number): number {
  const n = source.length;
  let j = start + 1;
  let lettersSeen = 0;
  while (j < n) {
    const cc = source.charCodeAt(j);
    const prev = source.charCodeAt(j - 1);
    if (isDigit(cc) || cc === 46 || cc === 95) { j++; continue; }
    if (isAsciiLetter(cc) && prev !== 46) { lettersSeen++; j++; continue; }
    const isSign = cc === 43 || cc === 45;
    const afterExponent = lettersSeen === 1 && (prev === 101 || prev === 69);
    if (isSign && afterExponent && isDigit(source.charCodeAt(j + 1))) { j++; continue; }
    break;
  }
  return j;
}

/**
 * True when `c` can start an identifier: an ASCII letter, `_`, `$`, or any
 * non-ASCII code unit (identifiers in other scripts).
 */
function isIdStart(c: number): boolean {
  return isAsciiLetter(c) || c === 95 || c === 36 || c >= 128;
}

/** True when `c` can continue an identifier: an identifier start or a digit. */
function isIdContinue(c: number): boolean {
  return isIdStart(c) || isDigit(c);
}
129
+
130
/**
 * Compute the 64-bit structural SimHash of a function body.
 *
 * Steps (Charikar SimHash):
 *   1. Tokenize the body; every token becomes a label — its kind, or
 *      `OP:<lexeme>` for operators so their spelling still counts.
 *   2. Slide a window of NGRAM_SIZE labels over the stream.
 *   3. Hash each window (labels joined with `|`) with 64-bit FNV-1a.
 *   4. Every bit position receives a +1 vote when the window hash has that
 *      bit set and a -1 vote otherwise.
 *   5. Bit i of the result is 1 iff the votes for position i sum above zero.
 *
 * @param body Source text of the symbol.
 * @param minTokens Bodies with fewer tokens than this are not hashed.
 * @returns The fingerprint as a non-negative bigint, or null when the body
 *          has fewer than `minTokens` tokens.
 */
export function computeShapeHash(body: string, minTokens = 8): bigint | null {
  const labels = tokenize(body).map(t => (t.kind === 'OP' ? `OP:${t.text}` : t.kind));
  if (labels.length < minTokens) return null;

  const votes = new Int32Array(HASH_BITS);
  for (let end = NGRAM_SIZE; end <= labels.length; end++) {
    // `rest` is consumed from the least significant bit upwards.
    let rest = fnv64(labels.slice(end - NGRAM_SIZE, end).join('|'));
    for (let bit = 0; bit < HASH_BITS; bit++) {
      votes[bit] += (rest & 1n) === 1n ? 1 : -1;
      rest >>= 1n;
    }
  }

  // Assemble from the most significant bit down so vote[i] lands on bit i.
  let fingerprint = 0n;
  for (let bit = HASH_BITS - 1; bit >= 0; bit--) {
    fingerprint = (fingerprint << 1n) | (votes[bit] > 0 ? 1n : 0n);
  }
  return fingerprint;
}
162
+
163
/**
 * FNV-1a 64-bit hash. Stable, deterministic, no dependencies.
 *
 * Each UTF-16 code unit contributes its low byte and, when non-zero, its
 * high byte as a second FNV round. Strings made only of code units up to
 * 0xFF therefore hash exactly as a byte-wise FNV-1a would, while characters
 * that differ only in their high byte no longer collide.
 *
 * @param s Text to hash.
 * @returns The hash as a non-negative bigint below 2^64.
 */
function fnv64(s: string): bigint {
  const PRIME = 0x100000001b3n;
  const MASK = 0xFFFFFFFFFFFFFFFFn;
  let h = 0xcbf29ce484222325n;
  for (let i = 0; i < s.length; i++) {
    const unit = s.charCodeAt(i);
    h = ((h ^ BigInt(unit & 0xff)) * PRIME) & MASK;
    const high = unit >> 8;
    // Only non-Latin-1 code units carry a second byte worth mixing in.
    if (high !== 0) h = ((h ^ BigInt(high)) * PRIME) & MASK;
  }
  return h;
}
174
+
175
/**
 * Hamming distance between two 64-bit fingerprints.
 *
 * Either argument may be given in its unsigned form or in the signed
 * two's-complement form used for storage: the XOR is reduced to its low 64
 * bits before counting. Without that reduction a negative XOR would never
 * reach zero in the bit-clearing loop below.
 *
 * @param a First fingerprint.
 * @param b Second fingerprint.
 * @returns Number of differing bit positions, from 0 to 64.
 */
export function hammingDistance(a: bigint, b: bigint): number {
  let diff = BigInt.asUintN(64, a ^ b);
  let count = 0;
  while (diff !== 0n) {
    diff &= diff - 1n; // clear the lowest set bit
    count++;
  }
  return count;
}
185
+
186
/** Summary returned by one `buildShapeHashes` pass. */
export interface BuildShapeHashResult {
  /** Symbols whose shape hash was computed and written. */
  symbolsHashed: number;
  /** Symbols left without a hash (file unreadable, or body too small to hash). */
  symbolsSkipped: number;
  /** Wall-clock duration of the pass, in milliseconds. */
  elapsedMs: number;
}
191
+
192
/**
 * Compute and store shape hashes for function-like symbols in the DB.
 *
 * Selects every `function` / `method` / `constructor` symbol that is not a
 * declaration and has `loc >= minLoc`, ordered by file path so each file is
 * read and split into lines once; every symbol body is then sliced out of
 * that cached line array by its recorded line range.
 *
 * Unless `force` is set, only symbols whose `shape_hash` is still NULL are
 * selected, so repeated passes do not redo finished work. Symbols whose
 * file cannot be read, or whose body is too small to hash, are counted as
 * skipped and keep a NULL hash.
 *
 * @param store Index store; must report schema v7, otherwise nothing is done.
 * @param options.minLoc Minimum symbol LOC (defaults to MIN_LOC_DEFAULT).
 * @param options.force Re-hash symbols that already have a hash.
 * @param options.log Optional sink for progress / diagnostic messages.
 * @returns Counts of hashed and skipped symbols plus the elapsed time.
 */
export function buildShapeHashes(
  store: Store,
  options: { minLoc?: number; force?: boolean; log?: (m: string) => void } = {},
): BuildShapeHashResult {
  const start = Date.now();
  const minLoc = options.minLoc ?? MIN_LOC_DEFAULT;
  const log = options.log ?? (() => { /* */ });
  if (!store.hasV7()) {
    log('shape hashes require schema v7; skipping');
    return { symbolsHashed: 0, symbolsSkipped: 0, elapsedMs: Date.now() - start };
  }

  // Pull every function/method/constructor symbol with loc >= minLoc that
  // doesn't already have a shape_hash (or all of them when forced).
  const where = options.force ? '' : 'AND s.shape_hash IS NULL';
  const rows = store.rawDb().prepare(`
    SELECT s.id, s.line_start AS lineStart, s.line_end AS lineEnd, s.loc, f.path AS filePath
    FROM symbols s JOIN files f ON f.id = s.file_id
    WHERE s.kind IN ('function','method','constructor')
    AND s.symbol_role <> 'declaration'
    AND s.loc >= ?
    ${where}
    ORDER BY f.path
  `).all(minLoc) as Array<Record<string, unknown>>;

  let symbolsHashed = 0;
  let symbolsSkipped = 0;
  let lastFile = '';
  // Lines of the file currently being processed; null when it was unreadable.
  let lastLines: string[] | null = null;
  // Prepare the update once and reuse it for every row.
  const setHash = store.rawDb().prepare(
    'UPDATE symbols SET shape_hash = ? WHERE id = ?',
  );
  // Map the unsigned 64-bit hash onto the signed 64-bit range for storage.
  const toSigned = (u: bigint): bigint => {
    const MAX = 0x7FFFFFFFFFFFFFFFn;
    return u > MAX ? u - 0x10000000000000000n : u;
  };

  for (const r of rows) {
    const filePath = String(r.filePath);
    if (filePath !== lastFile) {
      // Rows arrive grouped by path, so each file is read and split once.
      lastFile = filePath;
      try {
        lastLines = fs.readFileSync(filePath, 'utf-8').split(/\r?\n/);
      } catch {
        lastLines = null;
        log(`shape hashes: cannot read ${filePath}; skipping its symbols`);
      }
    }
    if (lastLines == null) { symbolsSkipped++; continue; }
    const body = sliceLines(lastLines, Number(r.lineStart), Number(r.lineEnd));
    const hash = computeShapeHash(body);
    if (hash == null) { symbolsSkipped++; continue; }
    setHash.run(toSigned(hash), Number(r.id));
    symbolsHashed++;
  }
  log(`Hashed ${symbolsHashed} symbols (${symbolsSkipped} skipped)`);
  return { symbolsHashed, symbolsSkipped, elapsedMs: Date.now() - start };
}

/**
 * Join the lines `startLine`..`endLine` (0-indexed, inclusive end) of an
 * already-split file back into one string. Out-of-range bounds are clamped
 * by `Array.prototype.slice`; trailing-newline details do not matter because
 * the tokenizer ignores whitespace.
 */
function sliceLines(lines: readonly string[], startLine: number, endLine: number): string {
  return lines.slice(startLine, endLine + 1).join('\n');
}
268
+
269
/** A group of symbols whose shape hashes are transitively close to each other. */
export interface DuplicateCluster {
  /** Shape hash of the cluster's anchor symbol. */
  fingerprint: bigint;
  /** Cluster members, nearest to the anchor first. */
  symbols: Array<{
    id: number;
    name: string;
    qualifiedName: string | null;
    kind: string;
    file: string;
    lineStart: number;
    lineEnd: number;
    loc: number | null;
    /** Hamming distance between this symbol's hash and the anchor's. */
    hammingFromAnchor: number;
  }>;
}
277
+
278
/** Tuning knobs for `findDuplicates`; every field is optional. */
export interface FindDuplicatesOptions {
  /** Largest Hamming distance at which two symbols still count as duplicates. */
  maxDistance?: number;
  /** Smallest LOC a symbol must have to be considered. */
  minLoc?: number;
  /** Whether symbols from test files take part. */
  includeTests?: boolean;
  /** Upper bound on the number of clusters returned. */
  maxClusters?: number;
}
288
+
289
/**
 * Find clusters of structurally near-duplicate symbols.
 *
 * Every pair of candidates whose shape hashes differ in at most
 * `maxDistance` bits is linked, and the links are closed transitively with
 * union-find, so a cluster may contain two members that are further apart
 * than `maxDistance` when a chain of close neighbours connects them.
 *
 * The comparison is pairwise, i.e. quadratic in the number of candidates
 * (at most 50 000 are requested from the store); larger code bases would
 * need bucketing on a hash prefix first, which is not done here.
 *
 * @param store Index store supplying the hashed symbols.
 * @param options See {@link FindDuplicatesOptions}. Defaults: `maxDistance`
 *        6, `minLoc` MIN_LOC_DEFAULT, `includeTests` false, `maxClusters` 200.
 * @returns Clusters of two or more symbols, largest first (ties broken by
 *          lowest fingerprint), truncated to the `maxClusters` largest.
 *          Members of each cluster are ordered by distance from its anchor.
 */
export function findDuplicates(
  store: Store, options: FindDuplicatesOptions = {},
): DuplicateCluster[] {
  const maxDistance = options.maxDistance ?? 6;
  const minLoc = options.minLoc ?? MIN_LOC_DEFAULT;
  const includeTests = options.includeTests ?? false;
  const maxClusters = options.maxClusters ?? 200;

  const candidates = store.listSymbolsWithShapeHash({
    minLoc, includeTests, limit: 50000,
  });
  if (candidates.length < 2) return [];

  // Union-find over candidate indices.
  const parent = new Int32Array(candidates.length);
  for (let i = 0; i < parent.length; i++) parent[i] = i;
  function find(x: number): number {
    while (parent[x] !== x) {
      parent[x] = parent[parent[x]]; // path compression
      x = parent[x];
    }
    return x;
  }
  function union(a: number, b: number): void {
    const ra = find(a), rb = find(b);
    if (ra !== rb) parent[ra] = rb;
  }

  // Pairwise comparison — quadratic in the candidate count.
  for (let i = 0; i < candidates.length; i++) {
    const a = candidates[i];
    for (let j = i + 1; j < candidates.length; j++) {
      const b = candidates[j];
      // Skip pairs from the same symbol (same id). Two rows can share the
      // same id when one is a tree-sitter / scip-merge overlap.
      if (a.id === b.id) continue;
      const d = hammingDistance(a.shapeHash, b.shapeHash);
      if (d <= maxDistance) union(i, j);
    }
  }

  // Group candidate indices by their union-find root.
  const clusters = new Map<number, number[]>();
  for (let i = 0; i < candidates.length; i++) {
    const r = find(i);
    let bucket = clusters.get(r);
    if (!bucket) { bucket = []; clusters.set(r, bucket); }
    bucket.push(i);
  }

  const out: DuplicateCluster[] = [];
  for (const indices of clusters.values()) {
    if (indices.length < 2) continue; // singletons are not duplicates
    // The first member encountered serves as the cluster's anchor.
    const anchor = candidates[indices[0]];
    out.push({
      fingerprint: anchor.shapeHash,
      symbols: indices.map(i => {
        const s = candidates[i];
        return {
          id: s.id, name: s.name, qualifiedName: s.qualifiedName, kind: s.kind,
          file: s.filePath, lineStart: s.lineStart, lineEnd: s.lineEnd, loc: s.loc,
          hammingFromAnchor: hammingDistance(anchor.shapeHash, s.shapeHash),
        };
      }).sort((x, y) => x.hammingFromAnchor - y.hammingFromAnchor),
    });
  }

  // Largest clusters first; ties broken by lowest fingerprint for stability.
  out.sort((a, b) => {
    const bySize = b.symbols.length - a.symbols.length;
    if (bySize !== 0) return bySize;
    if (a.fingerprint === b.fingerprint) return 0;
    return a.fingerprint < b.fingerprint ? -1 : 1;
  });
  // Apply the cap only after sorting so it keeps the largest clusters rather
  // than whichever happened to be grouped first.
  return out.slice(0, Math.max(0, maxClusters));
}
@@ -0,0 +1,169 @@
1
+ /**
2
+ * Deterministic skeleton renderer (AI-agent optimization §3).
3
+ *
4
+ * Renders a file as a *structural skeleton*: every symbol's header (signature)
5
+ * is kept, bodies are collapsed to a fold marker carrying the exact collapsed
6
+ * line count. This is deterministic SOURCE ELISION — not summarization — so it
7
+ * stays inside Seer's zero-AI / reproducible contract: the same DB + same file
8
+ * bytes always render byte-identical output.
9
+ *
10
+ * Inputs are entirely owned by Seer already: per-symbol line ranges and kinds
11
+ * from the index, plus the file bytes on disk (read only for the header lines
12
+ * and an optional focused body). Nesting is reconstructed from line-range
13
+ * containment, so it works for every language without per-grammar logic.
14
+ */
15
+ import fs from 'fs';
16
+ import type { Store } from '../db/store.js';
17
+ import type { SymbolRow } from '../types.js';
18
+
19
/**
 * Outcome of a skeleton render.
 *
 * On failure only `ok: false` and `reason` are populated; on success every
 * other field is filled in and `reason` is absent.
 */
export interface SkeletonResult {
  /** `true` when a skeleton was produced, `false` when `reason` explains why not. */
  ok: boolean;
  /** Path of the matched indexed file (the path that was read from disk). */
  file?: string;
  /** Relative path of the matched file, as stored in the index. */
  relPath?: string;
  /** Language recorded for the matched file. */
  language?: string;
  /** Number of symbol rows that were rendered. */
  symbolCount?: number;
  /** The requested focus symbol, or `null` when none was requested. */
  focus?: string | null;
  /** The rendered skeleton, lines joined with `\n`. */
  skeleton?: string;
  /** Human-readable failure description (only set when `ok` is `false`). */
  reason?: string;
}
29
+
30
/**
 * Symbol kinds that carry a body worth folding. A leaf symbol of any other
 * kind is rendered as a bare header with no fold marker.
 */
const BODY_KINDS = new Set<string>([
  'function',
  'method',
  'constructor',
]);
32
+
33
/** One entry of the containment forest: a symbol plus the symbols nested inside it. */
interface Node {
  /** The indexed symbol this node represents. */
  row: SymbolRow;
  /** Symbols parented under `row`, in the order they were attached. */
  children: Node[];
}
37
+
38
/**
 * Canonicalise a path for comparison: every backslash becomes `/` and the
 * result is lowercased, so comparisons are separator- and case-insensitive.
 */
function norm(p: string): string {
  const forwardSlashed = p.split('\\').join('/');
  return forwardSlashed.toLowerCase();
}
41
+
42
/**
 * Resolve `file` to an indexed file row.
 *
 * Two passes over `store.listFiles()`, both comparing `norm`-alised paths:
 *   1. exact equality with the row's `path` or `relPath`;
 *   2. otherwise, the row's `path` or `relPath` ends with `/` + the target, so
 *      a trailing fragment only matches on a `/` boundary (mirrors
 *      getDefinition's rule).
 * The first row satisfying a pass wins. Returns `null` when nothing matches.
 */
function matchFile(
  store: Store,
  file: string,
): { id: number; path: string; relPath: string; language: string } | null {
  const wanted = norm(file);
  const indexed = store.listFiles();

  // Pass 1: exact match on either path form.
  for (const entry of indexed) {
    if (norm(entry.path) === wanted || norm(entry.relPath) === wanted) return entry;
  }

  // Pass 2: suffix match anchored on a path separator.
  const suffix = wanted.startsWith('/') ? wanted : '/' + wanted;
  for (const entry of indexed) {
    if (norm(entry.path).endsWith(suffix) || norm(entry.relPath).endsWith(suffix)) {
      return entry;
    }
  }
  return null;
}
58
+
59
/**
 * Arrange symbol rows into a containment forest using only their line ranges.
 *
 * Rows are visited by ascending `lineStart`, then descending `lineEnd`, then
 * ascending `id`, so a container is always seen before its members. A stack of
 * currently "open" nodes is unwound until its top encloses the incoming row;
 * that top becomes the parent (the tightest encloser), or the row becomes a
 * root when the stack empties. The input array is not mutated.
 */
function buildForest(rows: SymbolRow[]): Node[] {
  const ordered = rows.slice();
  ordered.sort((a, b) => {
    const byStart = a.lineStart - b.lineStart;
    if (byStart) return byStart;
    const byEnd = b.lineEnd - a.lineEnd;
    if (byEnd) return byEnd;
    return a.id - b.id;
  });

  // True when `outer`'s line range fully covers `inner`'s (and they are distinct rows).
  const encloses = (outer: SymbolRow, inner: SymbolRow): boolean =>
    outer.lineStart <= inner.lineStart &&
    inner.lineEnd <= outer.lineEnd &&
    outer !== inner;

  const roots: Node[] = [];
  const open: Node[] = [];
  for (const row of ordered) {
    // Discard open nodes that do not contain this row.
    let top = open[open.length - 1];
    while (top !== undefined && !encloses(top.row, row)) {
      open.pop();
      top = open[open.length - 1];
    }
    const node: Node = { row, children: [] };
    if (top === undefined) {
      roots.push(node);
    } else {
      top.children.push(node);
    }
    open.push(node);
  }
  return roots;
}
84
+
85
/**
 * Choose the one-line header shown for a symbol.
 *
 * The stored signature wins when it is non-empty after trimming; otherwise the
 * source line at `row.lineStart` is used (empty string if that line does not
 * exist). In both cases a trailing `{` and surrounding whitespace are removed.
 */
function headerFor(row: SymbolRow, lines: string[]): string {
  const dropBlockOpener = (text: string): string =>
    text.replace(/\s*\{\s*$/, '').trim();

  const stored = row.signature?.trim();
  if (stored) return dropBlockOpener(stored);

  // lineStart is 1-based; the lines array is 0-based.
  const firstLine = lines[row.lineStart - 1] ?? '';
  return dropBlockOpener(firstLine.trim());
}
93
+
94
/**
 * Append the skeleton lines for `node` (and, recursively, its members) to `out`.
 *
 * Three shapes are produced, checked in this order:
 *   - focus: the symbol whose `name` or `qualifiedName` equals `focus` is
 *     emitted with a focus marker followed by its source lines verbatim; its
 *     children are not rendered separately;
 *   - container: a node with children gets its header, then each child one
 *     level deeper;
 *   - leaf: a header, plus a fold marker carrying the line count when the kind
 *     is in BODY_KINDS and at least one line lies strictly between
 *     `lineStart` and `lineEnd`.
 */
function render(
  node: Node,
  lines: string[],
  depth: number,
  focus: string | null,
  out: string[],
): void {
  const { row, children } = node;
  const pad = ' '.repeat(depth);
  const title = `${pad}${headerFor(row, lines)} [L${row.lineStart}-${row.lineEnd}]`;

  const focused =
    focus != null && (focus === row.name || focus === row.qualifiedName);
  if (focused) {
    // Expanded on request: echo the real source slice, lineStart..lineEnd inclusive.
    out.push(`${title} ◀ focus`);
    lines.slice(row.lineStart - 1, row.lineEnd).forEach(text => {
      out.push(`${pad} ${text}`);
    });
    return;
  }

  if (children.length > 0) {
    // Container: header first, members nested beneath it.
    out.push(title);
    children.forEach(child => {
      render(child, lines, depth + 1, focus, out);
    });
    return;
  }

  // Leaf: lines strictly between the first and last line of the symbol.
  const folded = row.lineEnd - row.lineStart - 1;
  const collapsible = folded > 0 && BODY_KINDS.has(row.kind);
  out.push(collapsible ? `${title} { … ${folded} lines … }` : title);
}
131
+
132
/**
 * Render the structural skeleton of an indexed file.
 *
 * @param store  Index store used to resolve the file and list its symbols.
 * @param file   Path or trailing path fragment identifying the file.
 * @param opts   `focusSymbol`: name or qualified name of a symbol to expand
 *               verbatim instead of folding.
 * @returns `{ ok: false, reason }` when no indexed file matches or the file
 *          cannot be read; otherwise `ok: true` with the rendered skeleton and
 *          the matched file's metadata.
 */
export function buildSkeleton(
  store: Store,
  file: string,
  opts: { focusSymbol?: string } = {},
): SkeletonResult {
  const hit = matchFile(store, file);
  if (hit == null) {
    return { ok: false, reason: `no indexed file matching "${file}"` };
  }

  let text: string;
  try {
    text = fs.readFileSync(hit.path, 'utf8');
  } catch (err) {
    return { ok: false, reason: `cannot read ${hit.path}: ${(err as Error).message}` };
  }
  // Strip a UTF-8 BOM so line 1 matches the indexer's view.
  const withoutBom = text.charCodeAt(0) === 0xfeff ? text.slice(1) : text;
  const lines = withoutBom.split(/\r?\n/);

  // Second argument is presumably a row limit; confirm against Store.listSymbolsInFile.
  const rows = store.listSymbolsInFile(hit.path, 5000);
  const focus = opts.focusSymbol ?? null;
  const forest = buildForest(rows);

  // Title line, then a rule sized to the path but clamped to 20..60 characters.
  const ruleWidth = Math.min(60, Math.max(20, hit.relPath.length + 16));
  const out: string[] = [
    `${hit.relPath} (${hit.language}, ${rows.length} symbols)`,
    '─'.repeat(ruleWidth),
  ];
  forest.forEach(root => {
    render(root, lines, 0, focus, out);
  });
  if (rows.length === 0) out.push('(no symbols indexed in this file)');

  return {
    ok: true,
    file: hit.path,
    relPath: hit.relPath,
    language: hit.language,
    symbolCount: rows.length,
    focus,
    skeleton: out.join('\n'),
  };
}