seer-mcp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (371) hide show
  1. package/.vscode/settings.json +3 -0
  2. package/LICENSE +176 -0
  3. package/README.md +272 -0
  4. package/README_dev.md +199 -0
  5. package/dist/bundle/ci.d.ts +47 -0
  6. package/dist/bundle/ci.d.ts.map +1 -0
  7. package/dist/bundle/ci.js +113 -0
  8. package/dist/bundle/ci.js.map +1 -0
  9. package/dist/bundle/contract.d.ts +111 -0
  10. package/dist/bundle/contract.d.ts.map +1 -0
  11. package/dist/bundle/contract.js +352 -0
  12. package/dist/bundle/contract.js.map +1 -0
  13. package/dist/bundle/export.d.ts +36 -0
  14. package/dist/bundle/export.d.ts.map +1 -0
  15. package/dist/bundle/export.js +152 -0
  16. package/dist/bundle/export.js.map +1 -0
  17. package/dist/bundle/external.d.ts +66 -0
  18. package/dist/bundle/external.d.ts.map +1 -0
  19. package/dist/bundle/external.js +238 -0
  20. package/dist/bundle/external.js.map +1 -0
  21. package/dist/bundle/format.d.ts +94 -0
  22. package/dist/bundle/format.d.ts.map +1 -0
  23. package/dist/bundle/format.js +42 -0
  24. package/dist/bundle/format.js.map +1 -0
  25. package/dist/bundle/import.d.ts +49 -0
  26. package/dist/bundle/import.d.ts.map +1 -0
  27. package/dist/bundle/import.js +116 -0
  28. package/dist/bundle/import.js.map +1 -0
  29. package/dist/cli/index.d.ts +3 -0
  30. package/dist/cli/index.d.ts.map +1 -0
  31. package/dist/cli/index.js +1402 -0
  32. package/dist/cli/index.js.map +1 -0
  33. package/dist/cli/init.d.ts +48 -0
  34. package/dist/cli/init.d.ts.map +1 -0
  35. package/dist/cli/init.js +284 -0
  36. package/dist/cli/init.js.map +1 -0
  37. package/dist/db/schema.d.ts +3 -0
  38. package/dist/db/schema.d.ts.map +1 -0
  39. package/dist/db/schema.js +616 -0
  40. package/dist/db/schema.js.map +1 -0
  41. package/dist/db/store.d.ts +1011 -0
  42. package/dist/db/store.d.ts.map +1 -0
  43. package/dist/db/store.js +3888 -0
  44. package/dist/db/store.js.map +1 -0
  45. package/dist/graph/pagerank.d.ts +9 -0
  46. package/dist/graph/pagerank.d.ts.map +1 -0
  47. package/dist/graph/pagerank.js +47 -0
  48. package/dist/graph/pagerank.js.map +1 -0
  49. package/dist/indexer/architecture.d.ts +72 -0
  50. package/dist/indexer/architecture.d.ts.map +1 -0
  51. package/dist/indexer/architecture.js +112 -0
  52. package/dist/indexer/architecture.js.map +1 -0
  53. package/dist/indexer/behavior.d.ts +75 -0
  54. package/dist/indexer/behavior.d.ts.map +1 -0
  55. package/dist/indexer/behavior.js +395 -0
  56. package/dist/indexer/behavior.js.map +1 -0
  57. package/dist/indexer/boundaries.d.ts +60 -0
  58. package/dist/indexer/boundaries.d.ts.map +1 -0
  59. package/dist/indexer/boundaries.js +366 -0
  60. package/dist/indexer/boundaries.js.map +1 -0
  61. package/dist/indexer/churn.d.ts +15 -0
  62. package/dist/indexer/churn.d.ts.map +1 -0
  63. package/dist/indexer/churn.js +49 -0
  64. package/dist/indexer/churn.js.map +1 -0
  65. package/dist/indexer/classify.d.ts +9 -0
  66. package/dist/indexer/classify.d.ts.map +1 -0
  67. package/dist/indexer/classify.js +90 -0
  68. package/dist/indexer/classify.js.map +1 -0
  69. package/dist/indexer/context.d.ts +176 -0
  70. package/dist/indexer/context.d.ts.map +1 -0
  71. package/dist/indexer/context.js +193 -0
  72. package/dist/indexer/context.js.map +1 -0
  73. package/dist/indexer/continuity.d.ts +67 -0
  74. package/dist/indexer/continuity.d.ts.map +1 -0
  75. package/dist/indexer/continuity.js +288 -0
  76. package/dist/indexer/continuity.js.map +1 -0
  77. package/dist/indexer/detectchanges.d.ts +32 -0
  78. package/dist/indexer/detectchanges.d.ts.map +1 -0
  79. package/dist/indexer/detectchanges.js +74 -0
  80. package/dist/indexer/detectchanges.js.map +1 -0
  81. package/dist/indexer/discovery.d.ts +37 -0
  82. package/dist/indexer/discovery.d.ts.map +1 -0
  83. package/dist/indexer/discovery.js +136 -0
  84. package/dist/indexer/discovery.js.map +1 -0
  85. package/dist/indexer/externaldeps.d.ts +18 -0
  86. package/dist/indexer/externaldeps.d.ts.map +1 -0
  87. package/dist/indexer/externaldeps.js +288 -0
  88. package/dist/indexer/externaldeps.js.map +1 -0
  89. package/dist/indexer/freshness.d.ts +48 -0
  90. package/dist/indexer/freshness.d.ts.map +1 -0
  91. package/dist/indexer/freshness.js +128 -0
  92. package/dist/indexer/freshness.js.map +1 -0
  93. package/dist/indexer/git.d.ts +144 -0
  94. package/dist/indexer/git.d.ts.map +1 -0
  95. package/dist/indexer/git.js +444 -0
  96. package/dist/indexer/git.js.map +1 -0
  97. package/dist/indexer/index.d.ts +145 -0
  98. package/dist/indexer/index.d.ts.map +1 -0
  99. package/dist/indexer/index.js +930 -0
  100. package/dist/indexer/index.js.map +1 -0
  101. package/dist/indexer/modules.d.ts +62 -0
  102. package/dist/indexer/modules.d.ts.map +1 -0
  103. package/dist/indexer/modules.js +293 -0
  104. package/dist/indexer/modules.js.map +1 -0
  105. package/dist/indexer/preflight.d.ts +154 -0
  106. package/dist/indexer/preflight.d.ts.map +1 -0
  107. package/dist/indexer/preflight.js +399 -0
  108. package/dist/indexer/preflight.js.map +1 -0
  109. package/dist/indexer/protoScanner.d.ts +34 -0
  110. package/dist/indexer/protoScanner.d.ts.map +1 -0
  111. package/dist/indexer/protoScanner.js +133 -0
  112. package/dist/indexer/protoScanner.js.map +1 -0
  113. package/dist/indexer/risk.d.ts +115 -0
  114. package/dist/indexer/risk.d.ts.map +1 -0
  115. package/dist/indexer/risk.js +194 -0
  116. package/dist/indexer/risk.js.map +1 -0
  117. package/dist/indexer/serviceHostScanner.d.ts +25 -0
  118. package/dist/indexer/serviceHostScanner.d.ts.map +1 -0
  119. package/dist/indexer/serviceHostScanner.js +95 -0
  120. package/dist/indexer/serviceHostScanner.js.map +1 -0
  121. package/dist/indexer/serviceLinks.d.ts +105 -0
  122. package/dist/indexer/serviceLinks.d.ts.map +1 -0
  123. package/dist/indexer/serviceLinks.js +509 -0
  124. package/dist/indexer/serviceLinks.js.map +1 -0
  125. package/dist/indexer/shapehash.d.ts +98 -0
  126. package/dist/indexer/shapehash.d.ts.map +1 -0
  127. package/dist/indexer/shapehash.js +354 -0
  128. package/dist/indexer/shapehash.js.map +1 -0
  129. package/dist/indexer/skeleton.d.ts +15 -0
  130. package/dist/indexer/skeleton.d.ts.map +1 -0
  131. package/dist/indexer/skeleton.js +136 -0
  132. package/dist/indexer/skeleton.js.map +1 -0
  133. package/dist/indexer/symbolhistory.d.ts +41 -0
  134. package/dist/indexer/symbolhistory.d.ts.map +1 -0
  135. package/dist/indexer/symbolhistory.js +124 -0
  136. package/dist/indexer/symbolhistory.js.map +1 -0
  137. package/dist/indexer/watcher.d.ts +68 -0
  138. package/dist/indexer/watcher.d.ts.map +1 -0
  139. package/dist/indexer/watcher.js +179 -0
  140. package/dist/indexer/watcher.js.map +1 -0
  141. package/dist/mcp/server.d.ts +80 -0
  142. package/dist/mcp/server.d.ts.map +1 -0
  143. package/dist/mcp/server.js +1610 -0
  144. package/dist/mcp/server.js.map +1 -0
  145. package/dist/parser/index.d.ts +8 -0
  146. package/dist/parser/index.d.ts.map +1 -0
  147. package/dist/parser/index.js +33 -0
  148. package/dist/parser/index.js.map +1 -0
  149. package/dist/parser/languages/cpp.d.ts +3 -0
  150. package/dist/parser/languages/cpp.d.ts.map +1 -0
  151. package/dist/parser/languages/cpp.js +350 -0
  152. package/dist/parser/languages/cpp.js.map +1 -0
  153. package/dist/parser/languages/csharp.d.ts +3 -0
  154. package/dist/parser/languages/csharp.d.ts.map +1 -0
  155. package/dist/parser/languages/csharp.js +239 -0
  156. package/dist/parser/languages/csharp.js.map +1 -0
  157. package/dist/parser/languages/go.d.ts +3 -0
  158. package/dist/parser/languages/go.d.ts.map +1 -0
  159. package/dist/parser/languages/go.js +259 -0
  160. package/dist/parser/languages/go.js.map +1 -0
  161. package/dist/parser/languages/java.d.ts +3 -0
  162. package/dist/parser/languages/java.d.ts.map +1 -0
  163. package/dist/parser/languages/java.js +391 -0
  164. package/dist/parser/languages/java.js.map +1 -0
  165. package/dist/parser/languages/python.d.ts +3 -0
  166. package/dist/parser/languages/python.d.ts.map +1 -0
  167. package/dist/parser/languages/python.js +396 -0
  168. package/dist/parser/languages/python.js.map +1 -0
  169. package/dist/parser/languages/rust.d.ts +3 -0
  170. package/dist/parser/languages/rust.d.ts.map +1 -0
  171. package/dist/parser/languages/rust.js +159 -0
  172. package/dist/parser/languages/rust.js.map +1 -0
  173. package/dist/parser/languages/typescript.d.ts +3 -0
  174. package/dist/parser/languages/typescript.d.ts.map +1 -0
  175. package/dist/parser/languages/typescript.js +1442 -0
  176. package/dist/parser/languages/typescript.js.map +1 -0
  177. package/dist/parser/parserContext.d.ts +77 -0
  178. package/dist/parser/parserContext.d.ts.map +1 -0
  179. package/dist/parser/parserContext.js +354 -0
  180. package/dist/parser/parserContext.js.map +1 -0
  181. package/dist/parser/walker.d.ts +81 -0
  182. package/dist/parser/walker.d.ts.map +1 -0
  183. package/dist/parser/walker.js +217 -0
  184. package/dist/parser/walker.js.map +1 -0
  185. package/dist/parser/worker.d.ts +66 -0
  186. package/dist/parser/worker.d.ts.map +1 -0
  187. package/dist/parser/worker.js +129 -0
  188. package/dist/parser/worker.js.map +1 -0
  189. package/dist/parser/workerpool.d.ts +107 -0
  190. package/dist/parser/workerpool.d.ts.map +1 -0
  191. package/dist/parser/workerpool.js +383 -0
  192. package/dist/parser/workerpool.js.map +1 -0
  193. package/dist/scip/format.d.ts +87 -0
  194. package/dist/scip/format.d.ts.map +1 -0
  195. package/dist/scip/format.js +31 -0
  196. package/dist/scip/format.js.map +1 -0
  197. package/dist/scip/import.d.ts +37 -0
  198. package/dist/scip/import.d.ts.map +1 -0
  199. package/dist/scip/import.js +180 -0
  200. package/dist/scip/import.js.map +1 -0
  201. package/dist/types.d.ts +392 -0
  202. package/dist/types.d.ts.map +1 -0
  203. package/dist/types.js +4 -0
  204. package/dist/types.js.map +1 -0
  205. package/docs/architecture.md +105 -0
  206. package/docs/benchmarks/methodology.md +134 -0
  207. package/docs/benchmarks/raw-results.md +71 -0
  208. package/docs/benchmarks.md +74 -0
  209. package/docs/cli.md +148 -0
  210. package/docs/examples/behavior-tests.md +70 -0
  211. package/docs/examples/change-history.md +85 -0
  212. package/docs/examples/pre-edit-context.md +81 -0
  213. package/docs/examples/service-links.md +88 -0
  214. package/docs/examples.md +80 -0
  215. package/docs/faq.md +70 -0
  216. package/docs/internals.md +104 -0
  217. package/docs/languages.md +70 -0
  218. package/docs/limits.md +52 -0
  219. package/docs/mcp.md +199 -0
  220. package/docs/quickstart.md +119 -0
  221. package/docs/testing.md +123 -0
  222. package/docs/tools.md +115 -0
  223. package/package.json +52 -0
  224. package/research-codebase.md +578 -0
  225. package/seer-cli-docs.md +326 -0
  226. package/seer-master-guide.md +246 -0
  227. package/src/bundle/ci.ts +141 -0
  228. package/src/bundle/contract.ts +387 -0
  229. package/src/bundle/export.ts +175 -0
  230. package/src/bundle/external.ts +285 -0
  231. package/src/bundle/format.ts +92 -0
  232. package/src/bundle/import.ts +157 -0
  233. package/src/cli/index.ts +1249 -0
  234. package/src/cli/init.ts +389 -0
  235. package/src/db/schema.ts +614 -0
  236. package/src/db/store.ts +4306 -0
  237. package/src/graph/pagerank.ts +53 -0
  238. package/src/indexer/architecture.ts +148 -0
  239. package/src/indexer/behavior.ts +466 -0
  240. package/src/indexer/boundaries.ts +374 -0
  241. package/src/indexer/churn.ts +58 -0
  242. package/src/indexer/classify.ts +96 -0
  243. package/src/indexer/context.ts +340 -0
  244. package/src/indexer/continuity.ts +322 -0
  245. package/src/indexer/detectchanges.ts +94 -0
  246. package/src/indexer/discovery.ts +176 -0
  247. package/src/indexer/externaldeps.ts +243 -0
  248. package/src/indexer/freshness.ts +166 -0
  249. package/src/indexer/git.ts +453 -0
  250. package/src/indexer/index.ts +1092 -0
  251. package/src/indexer/modules.ts +358 -0
  252. package/src/indexer/preflight.ts +548 -0
  253. package/src/indexer/protoScanner.ts +147 -0
  254. package/src/indexer/risk.ts +304 -0
  255. package/src/indexer/serviceHostScanner.ts +92 -0
  256. package/src/indexer/serviceLinks.ts +543 -0
  257. package/src/indexer/shapehash.ts +370 -0
  258. package/src/indexer/skeleton.ts +169 -0
  259. package/src/indexer/symbolhistory.ts +172 -0
  260. package/src/indexer/watcher.ts +206 -0
  261. package/src/mcp/server.ts +1659 -0
  262. package/src/parser/index.ts +37 -0
  263. package/src/parser/languages/cpp.ts +361 -0
  264. package/src/parser/languages/csharp.ts +235 -0
  265. package/src/parser/languages/go.ts +259 -0
  266. package/src/parser/languages/java.ts +382 -0
  267. package/src/parser/languages/python.ts +370 -0
  268. package/src/parser/languages/rust.ts +164 -0
  269. package/src/parser/languages/typescript.ts +1435 -0
  270. package/src/parser/parserContext.ts +392 -0
  271. package/src/parser/walker.ts +306 -0
  272. package/src/parser/worker.ts +181 -0
  273. package/src/parser/workerpool.ts +448 -0
  274. package/src/scip/format.ts +83 -0
  275. package/src/scip/import.ts +216 -0
  276. package/src/types.ts +457 -0
  277. package/tests/benchmark-service-links.ts +244 -0
  278. package/tests/bug-regressions.ts +626 -0
  279. package/tests/filters.ts +264 -0
  280. package/tests/fixtures/Counter.tsx +38 -0
  281. package/tests/fixtures/caller.ts +7 -0
  282. package/tests/fixtures/collisions.ts +23 -0
  283. package/tests/fixtures/local_helper.ts +5 -0
  284. package/tests/fixtures/overloads.java +17 -0
  285. package/tests/fixtures/remote_helper.ts +4 -0
  286. package/tests/fixtures/sample.c +15 -0
  287. package/tests/fixtures/sample.cpp +47 -0
  288. package/tests/fixtures/sample.cs +62 -0
  289. package/tests/fixtures/sample.go +68 -0
  290. package/tests/fixtures/sample.h +30 -0
  291. package/tests/fixtures/sample.java +85 -0
  292. package/tests/fixtures/sample.py +46 -0
  293. package/tests/fixtures/sample.rs +78 -0
  294. package/tests/fixtures/sample.ts +76 -0
  295. package/tests/fixtures-service/HttpClients.cs +30 -0
  296. package/tests/fixtures-service/HttpClients.java +24 -0
  297. package/tests/fixtures-service/billing.ts +15 -0
  298. package/tests/fixtures-service/docker-compose.yml +15 -0
  299. package/tests/fixtures-service/gateway.ts +10 -0
  300. package/tests/fixtures-service/get_user.ts +11 -0
  301. package/tests/fixtures-service/graphql_client.ts +63 -0
  302. package/tests/fixtures-service/graphql_server.ts +30 -0
  303. package/tests/fixtures-service/grpc_client.go +30 -0
  304. package/tests/fixtures-service/http_clients.go +23 -0
  305. package/tests/fixtures-service/http_clients.py +38 -0
  306. package/tests/fixtures-service/http_clients.ts +49 -0
  307. package/tests/fixtures-service/k8s/payment-service.yaml +22 -0
  308. package/tests/fixtures-service/k8s_calls.ts +20 -0
  309. package/tests/fixtures-service/messaging.ts +87 -0
  310. package/tests/fixtures-service/trpc_client.ts +39 -0
  311. package/tests/fixtures-service/trpc_server.ts +39 -0
  312. package/tests/fixtures-service/user_service.proto +33 -0
  313. package/tests/fixtures-trackcd/Cargo.toml +11 -0
  314. package/tests/fixtures-trackcd/SpringController.java +36 -0
  315. package/tests/fixtures-trackcd/auth_service.ts +19 -0
  316. package/tests/fixtures-trackcd/complex_module.py +50 -0
  317. package/tests/fixtures-trackcd/express_app.js +30 -0
  318. package/tests/fixtures-trackcd/fastapi_app.py +49 -0
  319. package/tests/fixtures-trackcd/fastify_object_routes.js +32 -0
  320. package/tests/fixtures-trackcd/go.mod +8 -0
  321. package/tests/fixtures-trackcd/package.json +15 -0
  322. package/tests/fixtures-trackcd/requirements.txt +4 -0
  323. package/tests/fixtures-trackcd/tests/auth_service.test.ts +13 -0
  324. package/tests/fixtures-tracke/auth/AuthService.ts +23 -0
  325. package/tests/fixtures-tracke/auth/crypto.ts +7 -0
  326. package/tests/fixtures-tracke/billing/Billing.ts +20 -0
  327. package/tests/fixtures-tracke/billing/Invoice.ts +10 -0
  328. package/tests/fixtures-tracke/billing/server.ts +17 -0
  329. package/tests/fixtures-tracke/package.json +7 -0
  330. package/tests/fixtures-tracke/tests/auth.test.ts +23 -0
  331. package/tests/fixtures-tracke/tests/billing.test.ts +14 -0
  332. package/tests/fixtures-trackf/package.json +5 -0
  333. package/tests/fixtures-trackf/src/auth.ts +26 -0
  334. package/tests/fixtures-trackf/src/handlers.ts +35 -0
  335. package/tests/fixtures-tracki/billing/routes.ts +12 -0
  336. package/tests/fixtures-tracki/gateway/client.ts +13 -0
  337. package/tests/git-features.ts +267 -0
  338. package/tests/init.ts +141 -0
  339. package/tests/mcp-jit.ts +130 -0
  340. package/tests/mcp-smoke.ts +191 -0
  341. package/tests/mcp-trackcd.ts +169 -0
  342. package/tests/mcp-tracke.ts +229 -0
  343. package/tests/mcp-trackf.ts +330 -0
  344. package/tests/mcp-trackg.ts +219 -0
  345. package/tests/mcp-tracki.ts +174 -0
  346. package/tests/mcp-watcher.ts +126 -0
  347. package/tests/optspec.ts +194 -0
  348. package/tests/parallel-index.ts +333 -0
  349. package/tests/parallel-read.ts +125 -0
  350. package/tests/parallel-recovery.ts +241 -0
  351. package/tests/perf-callers.ts +145 -0
  352. package/tests/query-parity.ts +184 -0
  353. package/tests/query-perf.ts +55 -0
  354. package/tests/scale-parallel-parity.ts +225 -0
  355. package/tests/scale-test.ts +523 -0
  356. package/tests/smoke.ts +396 -0
  357. package/tests/trackcd.ts +325 -0
  358. package/tests/tracke-collisions.ts +255 -0
  359. package/tests/tracke.ts +314 -0
  360. package/tests/trackf-bugs.ts +406 -0
  361. package/tests/trackf.ts +390 -0
  362. package/tests/trackg.ts +1372 -0
  363. package/tests/tracki-boundaries.ts +202 -0
  364. package/tests/tracki-continuity.ts +253 -0
  365. package/tests/tracki-contract-diff.ts +249 -0
  366. package/tests/tracki-external-bundles.ts +341 -0
  367. package/tests/tracki-preflight.ts +251 -0
  368. package/tests/verify-roles.ts +51 -0
  369. package/tests/worker-parity.ts +286 -0
  370. package/tests/worker-pool.ts +262 -0
  371. package/tsconfig.json +20 -0
@@ -0,0 +1,1092 @@
1
+ import fs from 'fs';
2
+ import crypto from 'crypto';
3
+ import path from 'path';
4
+ import { discoverFiles, DiscoveredFile, DiscoveryMode } from './discovery.js';
5
+ import { parseFile, detectLanguage, wasmResetCount } from '../parser/index.js';
6
+ import { WorkerPool, WorkItem as PoolWorkItem, PoolResult } from '../parser/workerpool.js';
7
+ import { computePageRank } from '../graph/pagerank.js';
8
+ import { Store } from '../db/store.js';
9
+ import { classifyFile } from './classify.js';
10
+ import { buildModules } from './modules.js';
11
+ import { buildBoundaries } from './boundaries.js';
12
+ import { buildShapeHashes } from './shapehash.js';
13
+ import { buildContinuity } from './continuity.js';
14
+ import { normalizeHttpTarget, resolveServiceLinks } from './serviceLinks.js';
15
+ import { scanProtoFiles } from './protoScanner.js';
16
+ import { scanServiceHosts } from './serviceHostScanner.js';
17
+ import type { Language, FileExtraction } from '../types.js';
18
+
19
/**
 * Tuning knobs accepted by `Indexer.indexDirectory()`. Every field is
 * optional; omitted fields fall back to the defaults described per field.
 */
export interface IndexOptions {
  /** Emit detailed progress output. Takes precedence over `quiet`. */
  verbose?: boolean;
  reset?: boolean;
  /**
   * Silence progress and post-processing output on stdout. Ignored when
   * `verbose` is also set. Intended for callers such as the scale-test
   * runner that print their own one-line summary per codebase.
   */
  quiet?: boolean;
  /**
   * Size cap, in bytes, above which a file is skipped. The default of 0
   * means "no cap": indexing everything is preferred over silently leaving
   * holes in the graph because a hand-written file happens to be large.
   * Per the original design notes, each parse is bounded by
   * `setTimeoutMicros(10s)` and a failed parse triggers an automatic
   * WASM-runtime reset, so a pathological file ends up missing from the
   * index rather than crashing the indexer.
   *
   * A positive cap is only a speed optimization for repositories known to
   * contain huge generated boilerplate (Vulkan headers, protobuf output,
   * etc.). Such files usually sit under `thirdparty/` or `Generated/`,
   * which are already skipped by default.
   */
  maxFileBytes?: number;
  /**
   * Upper limit on simultaneous file reads started by the async
   * prefetcher (default 8). Every in-flight read keeps a file descriptor
   * open and the file's bytes in memory until the parser takes them. This
   * limit guards against descriptor exhaustion; `ioPrefetchBytes` is the
   * actual memory safety net.
   */
  ioConcurrency?: number;
  /**
   * Ceiling on the total bytes the prefetch buffer may hold at once
   * (default 64 MiB). A single file larger than the ceiling is still read,
   * because at least one in-flight read is always permitted so progress
   * cannot stall. The ceiling only limits how many *additional* prefetches
   * may start while bytes are outstanding.
   */
  ioPrefetchBytes?: number;
  /**
   * Opt in to discovering and indexing vendored directories (`vendor/`,
   * `vendored/`, `thirdparty/`, …). Normally they are dropped during
   * discovery and, if any slip through, tagged `role='vendor'` so they do
   * not dominate ranking. Enable this when you explicitly want to query
   * into vendored code.
   */
  includeVendor?: boolean;
  /**
   * Opt in to discovering and indexing generated files (`*.pb.*`,
   * `*.generated.*`, `*.gen.*`, …). Disabled by default for the same
   * reason as `includeVendor`.
   */
  includeGenerated?: boolean;
  /**
   * How aggressively discovery prunes the tree:
   *  - `'full'`: index everything parseable (implies vendor + generated);
   *  - `'standard'` (default): leave vendor/generated out unless opted in;
   *  - `'fast'`: `'standard'` plus skipping docs/examples/static content.
   * The rationale is documented on `DiscoveryMode`.
   */
  mode?: DiscoveryMode;
  /**
   * Parse in a pool of worker_threads rather than inline. Each worker has
   * its own WASM heap, so heavy parsing spreads across CPU cores. DB writes
   * remain on the main thread and keep the serial path's insertion order,
   * which keeps symbol IDs deterministic.
   *
   * Default: enabled for normal and large workspaces; tiny workspaces stay
   * serial because worker startup/teardown costs more than it saves.
   * `SEER_PARALLEL_PARSE=0` or `parallel:false` forces the serial fallback;
   * `parallel:true` forces workers even below the tiny-repo threshold (the
   * parity tests rely on this). Scale-parity runs verified the worker path
   * against representative large repos with row-identical DB output.
   */
  parallel?: boolean;
  /**
   * Number of worker threads used when `parallel` is active. Documented
   * default: `min(8, max(1, availableParallelism()-1))`.
   */
  jobs?: number;
}
98
+
99
/** Default per-file size cap in bytes; 0 disables the cap (completeness first). */
const DEFAULT_MAX_FILE_BYTES = 0;
/** Default number of concurrent prefetch reads; sits comfortably within typical file-handle budgets. */
const DEFAULT_IO_CONCURRENCY = 8;
/** Default prefetch-buffer budget: 64 MiB. */
const DEFAULT_IO_PREFETCH_BYTES = 64 * 1024 ** 2;
/** Below this many files the indexer defaults to serial parsing unless workers are explicitly forced. */
const PARALLEL_AUTO_MIN_FILES = 100;
103
+
104
// Suffix patterns for files that are almost always machine-generated
// boilerplate: Unreal Header Tool emits *.generated.h, protobuf emits
// *.pb.h / *.pb.cc, and so on. Filtering them per file keeps the discovery
// glob simple. Note the protobuf patterns are case-sensitive while the
// Unreal-style ones are not.
const SKIP_FILENAME_PATTERNS = [
  /\.generated\.h$/i,
  /\.gen\.cpp$/i,
  /\.gen\.h$/i,
  /\.pb\.cc$/,
  /\.pb\.h$/,
];

/**
 * Reports whether a path names a generated-boilerplate file that should not
 * be indexed.
 *
 * @param relativePath Path of the candidate file; only its suffix matters.
 * @returns `true` when the path matches any entry of `SKIP_FILENAME_PATTERNS`.
 */
function shouldSkipFilename(relativePath: string): boolean {
  for (const pattern of SKIP_FILENAME_PATTERNS) {
    if (pattern.test(relativePath)) {
      return true;
    }
  }
  return false;
}
118
+
119
/** Summary counters and flags describing one indexing run. */
export interface IndexResult {
  filesDiscovered: number;
  filesIndexed: number;
  /**
   * Count of files whose content hash equalled the hash already stored in
   * the index. Their symbols, edges and imports were kept and no reparse
   * happened. This is the largest saving on a re-index, since parse cost
   * for unchanged files drops to roughly zero.
   */
  filesReusedFromCache: number;
  filesSkipped: number;
  filesSkippedTooLarge: number;
  filesParseError: number;
  wasmResets: number;
  symbols: number;
  edges: number;
  resolvedEdges: number;
  resolvedImports: number;
  edgeResolution: {
    sameFile: number;
    imported: number;
    global: number;
  };
  /**
   * Whether the post-pass recomputed PageRank during this run. It is
   * `false` when the resolved edge graph did not change (all files came
   * from cache, nothing was pruned, and no new edge or import resolutions
   * occurred); the stored PageRank values are then still valid and are left
   * untouched. This is the "Lazy PageRank" optimization; its predicate sits
   * at the bottom of `indexDirectory()`.
   */
  pagerankRecomputed: boolean;
  /** How many routes had their handler symbol id filled in during this run. */
  routesResolved?: number;
  /** How many config-key rows had their enclosing symbol id filled in during this run. */
  configKeysResolved?: number;
  /** How many test edges were synthesized during this run. */
  testEdgesAdded?: number;
  /** External dependency rows present in the DB once this run finished. */
  externalDependencies?: number;
  /** Module count in the clustering after this run (0 when clustering was not built). */
  modules?: number;
  /** `true` when module clustering was recomputed during this run. */
  modulesRecomputed?: boolean;
  /** New shape hashes computed during this run (Track-F SimHash pass). */
  shapeHashesAdded?: number;
  /** v8 Track-G: number of service_links rows the resolver produced this run. */
  serviceLinks?: number;
  /** v8 Track-G: service_link counts keyed by match_kind. */
  serviceLinksByKind?: Record<string, number>;
  elapsedMs: number;
}
170
+
171
+ // ── Async prefetch types ────────────────────────────────────────────────────────
172
+
173
/**
 * One entry of the indexing work queue: a discovered file together with the
 * language it will be parsed as. The queue is built by filtering the
 * DiscoveredFile list a single time up front. Files with no detected
 * language, or whose name matches a SKIP_FILENAME_PATTERN, are counted as
 * plain `skipped` and never reach the prefetcher, so no I/O budget is spent
 * on them.
 */
interface WorkItem {
  /** The discovered file to read and parse. */
  file: DiscoveredFile;
  /** Language detected for `file`. */
  language: Language;
}
183
+
184
/**
 * Outcome of one prefetch task, discriminated by `kind`:
 *  - `'ok'`: content was read and is ready to process;
 *  - `'too-large'`: the file was skipped because of its size;
 *  - `'io-error'`: the file was skipped because reading it failed.
 */
type PrefetchResult =
  | {
      kind: 'ok';
      item: WorkItem;
      content: string;
      size: number;
    }
  | {
      kind: 'too-large';
      item: WorkItem;
      size: number;
    }
  | {
      kind: 'io-error';
      item: WorkItem;
    };
189
+
190
// ── Byte-aware semaphore ────────────────────────────────────────────────────────

/**
 * Semaphore that counts bytes of file content held in the prefetch buffer.
 *
 * Admission rule: a request is admitted when either
 *  (a) the new total stays within `capacity`, or
 *  (b) nothing is currently held, so a single oversize request can never
 *      deadlock and at least one read is always allowed in flight.
 *
 * Ordering is strictly FIFO. `release()` only ever wakes the head waiter,
 * and `acquire()` refuses to take the fast path while anyone is queued.
 * Without that second check, a stream of small requests that happen to fit
 * could keep jumping ahead of a large request already waiting and starve it
 * indefinitely.
 *
 * Queueing behind existing waiters cannot stall: `release()` drains the
 * queue until the head no longer fits, which requires `bytes > 0`. A
 * non-empty queue therefore implies an outstanding holder whose
 * `release()` will re-run admission.
 */
class ByteSemaphore {
  /** Bytes currently admitted and not yet released. */
  private bytes = 0;
  /** Pending requests, oldest first. */
  private readonly waiters: Array<{ bytes: number; resolve: () => void }> = [];

  /** @param capacity Maximum number of bytes that may be held at once. */
  constructor(private readonly capacity: number) {}

  /**
   * Reserves `requested` bytes, waiting if the budget is exhausted or if
   * earlier requests are still queued.
   *
   * @param requested Number of bytes to reserve.
   * @returns A promise that resolves once the bytes have been reserved.
   */
  async acquire(requested: number): Promise<void> {
    // Fast path only when nobody is waiting — otherwise we would barge past
    // the queue and break the FIFO guarantee.
    if (this.waiters.length === 0 && this.admits(requested)) {
      this.bytes += requested;
      return;
    }
    await new Promise<void>(resolve => {
      this.waiters.push({ bytes: requested, resolve });
    });
    // `release()` has already added `requested` to `this.bytes` on our
    // behalf before resolving, so there is nothing left to account for here.
  }

  /**
   * Returns previously reserved bytes and admits as many queued requests,
   * in arrival order, as now fit.
   *
   * @param returned Number of bytes being given back.
   */
  release(returned: number): void {
    this.bytes -= returned;
    if (this.bytes < 0) this.bytes = 0; // defensive: never go negative
    while (this.waiters.length > 0) {
      const next = this.waiters[0];
      // Stop at the first waiter that does not fit; skipping it would let
      // later, smaller requests overtake it.
      if (!this.admits(next.bytes)) break;
      this.bytes += next.bytes;
      this.waiters.shift();
      next.resolve();
    }
  }

  /** True when a request of `requested` bytes may be admitted right now. */
  private admits(requested: number): boolean {
    return this.bytes === 0 || this.bytes + requested <= this.capacity;
  }
}
234
+
235
+ export class Indexer {
236
+ constructor(private store: Store) {}
237
+
238
+ async indexDirectory(
239
+ repoRoot: string,
240
+ options: IndexOptions = {},
241
+ ): Promise<IndexResult> {
242
+ const start = Date.now();
243
+ const absRoot = path.resolve(repoRoot);
244
+
245
+ const quiet = options.quiet && !options.verbose;
246
+
247
+ if (options.verbose) {
248
+ process.stdout.write(`\nDiscovering files in ${absRoot}...\n`);
249
+ }
250
+
251
+ const files = await discoverFiles(absRoot, {
252
+ includeVendor: options.includeVendor,
253
+ includeGenerated: options.includeGenerated,
254
+ mode: options.mode,
255
+ });
256
+ const total = files.length;
257
+ const maxFileBytes = options.maxFileBytes ?? DEFAULT_MAX_FILE_BYTES;
258
+ const ioConcurrency = Math.max(1, options.ioConcurrency ?? DEFAULT_IO_CONCURRENCY);
259
+ const ioPrefetchBytes = Math.max(1, options.ioPrefetchBytes ?? DEFAULT_IO_PREFETCH_BYTES);
260
+
261
+ // Track every file_id we touch this run so we can prune ones left over
262
+ // from a previous run (e.g. files now hidden by a new ignore rule).
263
+ const touchedFileIds = new Set<number>();
264
+ let indexed = 0;
265
+ let reusedFromCache = 0;
266
+ let skipped = 0;
267
+ let skippedTooLarge = 0;
268
+ let parseErrors = 0;
269
+ let workerWasmResets = 0;
270
+
271
+ // ── Pre-filter into a work queue ────────────────────────────────────────────
272
+ // Pure CPU work (string ops on the path). Cheap to do all at once so the
273
+ // async prefetcher's index space matches up cleanly with progress counters.
274
+ const work: WorkItem[] = [];
275
+ for (const file of files) {
276
+ const language = detectLanguage(file.absolutePath) as Language | null;
277
+ if (!language || shouldSkipFilename(file.relativePath)) {
278
+ skipped++;
279
+ continue;
280
+ }
281
+ work.push({ file, language });
282
+ }
283
+
284
+ // Existing pre-v8 DBs can have all source hashes cached but no
285
+ // service_calls rows yet. Force one full parse pass so Track-G evidence is
286
+ // backfilled, then mark completion in finishIndex().
287
+ const forceServiceCallBackfill = this.store.needsServiceCallBackfill();
288
+
289
+ // ── Batched transactions ──────────────────────────────────────────────────
290
+ // The Phase-1 design wrapped each file's inserts in its own SQLite
291
+ // transaction. That works but every commit fsyncs the WAL, which adds
292
+ // O(milliseconds) of overhead per file. For 40k+ file repos the commit
293
+ // overhead dominates the per-file budget. Batching N files per transaction
294
+ // amortizes the fsync cost N-fold.
295
+ //
296
+ // Trade-off: a fatal error mid-batch rolls back at most BATCH_SIZE files'
297
+ // worth of inserts (which we'd just re-do on the next run). We never lose
298
+ // user data — only re-do work — so a moderately large batch is safe.
299
+ //
300
+ // The hash-skip path participates in the batch too (its UPDATE indexed_at
301
+ // and DELETE-old-symbols statements were previously running as
302
+ // autocommitted singletons — now they share a transaction with the file's
303
+ // inserts).
304
+ const BATCH_SIZE = 200;
305
+ let batchOpen = false;
306
+ const openBatch = (): void => {
307
+ if (!batchOpen) {
308
+ this.store.begin();
309
+ batchOpen = true;
310
+ }
311
+ };
312
+ const closeBatch = (): void => {
313
+ if (batchOpen) {
314
+ this.store.commit();
315
+ batchOpen = false;
316
+ }
317
+ };
318
+ const rollbackBatch = (): void => {
319
+ if (batchOpen) {
320
+ try { this.store.rollback(); } catch { /* best effort */ }
321
+ batchOpen = false;
322
+ }
323
+ };
324
+
325
+ if (!options.verbose && !quiet) {
326
+ writeProgress(0, total, '');
327
+ }
328
+
329
+ // ── Parallel-parsing branch (worker pool) ───────────────────────────────────
330
+ //
331
+ // When enabled, each file's read + hash + parse runs in a worker_threads
332
+ // worker (its own WASM heap). The pool delivers results to the callback
333
+ // STRICTLY in input order, so symbol-id insertion order — and therefore
334
+ // every cross-run-stable scale-test invariant — is identical to the
335
+ // serial path. DB writes still run single-writer on the main thread.
336
+ //
337
+ // Result-kind contract (matches the serial branch's semantics exactly):
338
+ // parsed → upsertFileWithCache → touchedFileIds.add → insert all
339
+ // parse-error → upsertFileWithCache → touchedFileIds.add → no inserts
340
+ // cached → upsertFileWithCache → touchedFileIds.add → no inserts
341
+ // (worker confirmed hash === expectedHash; upsert sees
342
+ // the same hash and returns unchanged=true)
343
+ // too-large → counter only; file row NOT touched → pruned
344
+ // io-error → counter only; file row NOT touched → pruned
345
+ //
346
+ // The cached/parse-error upsert calls are CRITICAL: without them
347
+ // `touchedFileIds` would not contain those file ids and
348
+ // `pruneFilesNotIn(touchedFileIds)` below would delete every unchanged
349
+ // cached file from the DB.
350
+ // Auto-enabled for normal/large workspaces. Tiny workspaces stay serial by
351
+ // default to avoid worker startup/churn; force workers with `parallel: true`
352
+ // or `SEER_PARALLEL_PARSE=1`. Opt out with `parallel: false` or
353
+ // `SEER_PARALLEL_PARSE=0`.
354
+ const envParallel: boolean | undefined =
355
+ typeof process !== 'undefined' && process.env != null
356
+ ? (process.env.SEER_PARALLEL_PARSE === '0' ? false
357
+ : process.env.SEER_PARALLEL_PARSE === '1' ? true
358
+ : undefined)
359
+ : undefined;
360
+ const parallelRequested = options.parallel ?? envParallel ?? true;
361
+ const parallelForced = options.parallel === true || envParallel === true;
362
+ const parallelEnabled =
363
+ parallelRequested && (parallelForced || work.length >= PARALLEL_AUTO_MIN_FILES);
364
+
365
+ if (parallelEnabled && work.length > 0) {
366
+ // Snapshot known DB hashes so workers can skip parsing on cache hits.
367
+ const cacheMap = new Map<string, string>();
368
+ for (const f of this.store.listFiles()) cacheMap.set(f.path, f.hash);
369
+
370
+ const poolItems: PoolWorkItem[] = work.map(w => ({
371
+ abs: w.file.absolutePath,
372
+ lang: w.language,
373
+ expectedHash: forceServiceCallBackfill ? null : cacheMap.get(w.file.absolutePath) ?? null,
374
+ maxFileBytes,
375
+ }));
376
+
377
+ const pool = new WorkerPool({ jobs: options.jobs });
378
+ try {
379
+ await pool.ready();
380
+ let processed = 0;
381
+ await pool.dispatch(poolItems, (seq, result) => {
382
+ processed++;
383
+ const w = work[seq];
384
+ const rel = w.file.relativePath;
385
+
386
+ // Counters-only branches (file row stays untouched → pruned).
387
+ if (result.kind === 'too-large') {
388
+ skippedTooLarge++;
389
+ if (options.verbose) {
390
+ process.stdout.write(` ⤬ ${rel} (${(result.size / 1024).toFixed(0)} KiB > ${(maxFileBytes / 1024).toFixed(0)} KiB cap)\n`);
391
+ } else if (!quiet) writeProgress(processed, total, rel);
392
+ if (processed % BATCH_SIZE === 0) closeBatch();
393
+ return;
394
+ }
395
+ if (result.kind === 'io-error') {
396
+ skipped++;
397
+ if (options.verbose) process.stdout.write(` ⚠ ${rel} (read error: ${result.error})\n`);
398
+ else if (!quiet) writeProgress(processed, total, rel);
399
+ if (processed % BATCH_SIZE === 0) closeBatch();
400
+ return;
401
+ }
402
+
403
+ // parsed / parse-error / cached all read the file successfully —
404
+ // we have hash + lines. Always upsert so touchedFileIds is updated.
405
+ const hash = result.hash;
406
+ const lines = result.lines;
407
+ openBatch();
408
+ const classification = classifyFile(rel);
409
+ const upserted = forceServiceCallBackfill
410
+ ? { fileId: this.store.upsertFile(w.file.absolutePath, rel, w.language, hash, lines, classification), unchanged: false }
411
+ : this.store.upsertFileWithCache(
412
+ w.file.absolutePath, rel, w.language, hash, lines, classification,
413
+ );
414
+ const { fileId, unchanged } = upserted;
415
+ touchedFileIds.add(fileId);
416
+
417
+ // Cache hit (worker's hash matched the DB's stored hash). Prior
418
+ // symbols/edges/imports/routes/configKeys stay as-is. Note: an
419
+ // explicit `cached` result always falls into this branch; a `parsed`
420
+ // result whose hash happens to match an in-flight DB update would
421
+ // also land here defensively (we never re-insert when unchanged).
422
+ if (unchanged) {
423
+ reusedFromCache++;
424
+ if (options.verbose) process.stdout.write(` = ${rel} (cached)\n`);
425
+ else if (!quiet) writeProgress(processed, total, rel);
426
+ if (processed % BATCH_SIZE === 0) closeBatch();
427
+ return;
428
+ }
429
+
430
+ // Only `parsed` carries an extraction. `cached` lands in the
431
+ // unchanged-branch above; `parse-error` and any defensive fall-
432
+ // through here get treated as a parse error (file row exists,
433
+ // no symbols/edges emitted).
434
+ if (result.kind !== 'parsed') {
435
+ parseErrors++;
436
+ if (options.verbose) process.stdout.write(` ⚠ ${rel} (parse error)\n`);
437
+ else if (!quiet) writeProgress(processed, total, rel);
438
+ if (processed % BATCH_SIZE === 0) closeBatch();
439
+ return;
440
+ }
441
+
442
+ // parsed: insert all symbols, edges, imports, routes, configKeys.
443
+ const extraction: FileExtraction = result.extraction;
444
+ const symbolIdMap = new Map<string, number>();
445
+ for (const def of extraction.definitions) {
446
+ const symId = this.store.insertSymbol(fileId, def);
447
+ const qname = def.qualifiedName ?? def.name;
448
+ if (!symbolIdMap.has(qname)) symbolIdMap.set(qname, symId);
449
+ }
450
+ for (const ref of extraction.references) {
451
+ const fromId = ref.callerName ? symbolIdMap.get(ref.callerName) : undefined;
452
+ if (fromId !== undefined) {
453
+ this.store.insertEdge(fromId, ref.calleeName, ref.kind, ref.line);
454
+ }
455
+ }
456
+ for (const mod of extraction.importedModules) {
457
+ this.store.insertFileImport(fileId, mod);
458
+ }
459
+ if (extraction.routes) {
460
+ for (const r of extraction.routes) {
461
+ this.store.insertRoute(
462
+ fileId, r.method, r.path, r.framework,
463
+ r.handlerName ?? null, r.line,
464
+ {
465
+ protocol: r.protocol ?? 'http',
466
+ operation: r.operation ?? null,
467
+ topic: r.topic ?? null,
468
+ queue: r.queue ?? null,
469
+ exchange: r.exchange ?? null,
470
+ service: r.service ?? null,
471
+ broker: r.broker ?? null,
472
+ metadataJson: r.metadataJson ?? null,
473
+ },
474
+ );
475
+ }
476
+ }
477
+ if (extraction.configKeys) {
478
+ for (const c of extraction.configKeys) {
479
+ const enclosingId = c.callerName ? symbolIdMap.get(c.callerName) ?? null : null;
480
+ this.store.insertConfigKey(c.key, c.source, fileId, enclosingId, c.line);
481
+ }
482
+ }
483
+ if (extraction.serviceCalls) {
484
+ for (const sc of extraction.serviceCalls) {
485
+ const enclosingId = sc.callerName ? symbolIdMap.get(sc.callerName) ?? null : null;
486
+ // Only run HTTP-shaped normalization when the call is HTTP.
487
+ const norm = sc.protocol === 'http'
488
+ ? normalizeHttpTarget(sc.rawTarget)
489
+ : { path: undefined, hostHint: undefined };
490
+ this.store.insertServiceCall({
491
+ fileId,
492
+ symbolId: enclosingId,
493
+ protocol: sc.protocol,
494
+ method: sc.method ?? null,
495
+ rawTarget: sc.rawTarget,
496
+ normalizedPath: sc.normalizedPath ?? norm.path ?? null,
497
+ hostHint: sc.hostHint ?? norm.hostHint ?? null,
498
+ envKey: sc.envKey ?? null,
499
+ framework: sc.framework,
500
+ line: sc.line,
501
+ confidence: sc.confidence,
502
+ operation: sc.operation ?? null,
503
+ topic: sc.topic ?? null,
504
+ queue: sc.queue ?? null,
505
+ exchange: sc.exchange ?? null,
506
+ service: sc.service ?? null,
507
+ broker: sc.broker ?? null,
508
+ metadataJson: sc.metadataJson ?? null,
509
+ });
510
+ }
511
+ }
512
+
513
+ if (processed % BATCH_SIZE === 0) closeBatch();
514
+ indexed++;
515
+
516
+ if (options.verbose) {
517
+ process.stdout.write(` ✓ ${rel} (${extraction.definitions.length} symbols, ${extraction.references.length} refs)\n`);
518
+ } else if (!quiet) {
519
+ writeProgress(processed, total, rel);
520
+ }
521
+ });
522
+ workerWasmResets = pool.wasmResetCount();
523
+ closeBatch();
524
+ } catch (err) {
525
+ rollbackBatch();
526
+ await pool.terminate().catch(() => { /* */ });
527
+ throw err;
528
+ }
529
+ await pool.shutdown();
530
+
531
+ if (!options.verbose && !quiet) process.stdout.write('\n');
532
+ // Skip the serial prefetcher block below.
533
+ return await this.finishIndex(
534
+ absRoot, start, total, indexed, reusedFromCache, skipped,
535
+ skippedTooLarge, parseErrors, touchedFileIds,
536
+ { verbose: options.verbose, quiet: !!quiet, workerWasmResets },
537
+ );
538
+ }
539
+
540
+ // ── Bounded async prefetcher (serial branch) ────────────────────────────────
541
+ //
542
+ // Producer side: a fixed sliding window of up to `ioConcurrency` in-flight
543
+ // `prefetchOne()` calls, each one bounded by `byteSem` so cumulative
544
+ // buffered content never exceeds `ioPrefetchBytes`.
545
+ //
546
+ // Consumer side: the main loop awaits prefetched results IN ORDER (so
547
+ // batching/progress/determinism match the old sync loop exactly), parses
548
+ // serially (single shared WASM module — see parser/index.ts), and writes
549
+ // serially (single SQLite connection).
550
+ //
551
+ // The byte budget is released only AFTER parse completes, because parse
552
+ // reads from the in-memory string. That means while a slow file parses we
553
+ // hold its budget — which is intentional: backpressures the prefetcher
554
+ // exactly when the parser falls behind.
555
+ const byteSem = new ByteSemaphore(ioPrefetchBytes);
556
+
557
+ const prefetchOne = async (idx: number): Promise<PrefetchResult> => {
558
+ const item = work[idx];
559
+ const abs = item.file.absolutePath;
560
+
561
+ // Only stat when there's an actual size cap to enforce. Saves one
562
+ // syscall per file in the (default) `maxFileBytes === 0` mode.
563
+ if (maxFileBytes > 0) {
564
+ let size: number;
565
+ try {
566
+ size = (await fs.promises.stat(abs)).size;
567
+ } catch {
568
+ return { kind: 'io-error', item };
569
+ }
570
+ if (size > maxFileBytes) {
571
+ return { kind: 'too-large', item, size };
572
+ }
573
+ await byteSem.acquire(Math.max(size, 1));
574
+ let content: string;
575
+ try {
576
+ content = await fs.promises.readFile(abs, 'utf8');
577
+ } catch {
578
+ byteSem.release(Math.max(size, 1));
579
+ return { kind: 'io-error', item };
580
+ }
581
+ // Update the held budget if the on-disk size disagreed with the
582
+ // decoded string length (Buffer length, after UTF-8 → UTF-16). We
583
+ // re-anchor to the actual content length so future releases match.
584
+ const actual = Buffer.byteLength(content, 'utf8');
585
+ if (actual !== size) {
586
+ byteSem.release(Math.max(size, 1));
587
+ await byteSem.acquire(Math.max(actual, 1));
588
+ }
589
+ return { kind: 'ok', item, content, size: actual };
590
+ }
591
+
592
+ // No size cap → skip the stat entirely. We don't know the size up
593
+ // front, so reserve a conservative slot (1 byte), read, then re-acquire
594
+ // the true size. This keeps a single huge file from blocking us before
595
+ // we even know it's huge.
596
+ await byteSem.acquire(1);
597
+ let content: string;
598
+ try {
599
+ content = await fs.promises.readFile(abs, 'utf8');
600
+ } catch {
601
+ byteSem.release(1);
602
+ return { kind: 'io-error', item };
603
+ }
604
+ const size = Buffer.byteLength(content, 'utf8');
605
+ // Re-anchor budget to actual size.
606
+ byteSem.release(1);
607
+ await byteSem.acquire(Math.max(size, 1));
608
+ return { kind: 'ok', item, content, size };
609
+ };
610
+
611
+ // Sliding window of in-flight prefetches, indexed by their position in `work`.
612
+ // We `slots.shift()` after awaiting so the array stays small (≤ ioConcurrency).
613
+ const slots: Array<Promise<PrefetchResult>> = [];
614
+ let nextToLaunch = 0;
615
+ const launchUpTo = (window: number): void => {
616
+ while (slots.length < window && nextToLaunch < work.length) {
617
+ slots.push(prefetchOne(nextToLaunch));
618
+ nextToLaunch++;
619
+ }
620
+ };
621
+
622
+ // Prime the pipeline.
623
+ launchUpTo(ioConcurrency);
624
+
625
+ let processed = 0;
626
+
627
+ try {
628
+ while (slots.length > 0) {
629
+ const prefetched = await slots.shift()!;
630
+ processed++;
631
+
632
+ // Whatever happens to this file, the moment we're done with the
633
+ // string we MUST release its byte budget so the next prefetch can
634
+ // start. We accumulate the released amount and release in `finally`
635
+ // at the end of each iteration.
636
+ const heldBytes = prefetched.kind === 'ok' ? Math.max(prefetched.size, 1) :
637
+ prefetched.kind === 'too-large' ? 0 : // never acquired
638
+ 0; // io-error already released
639
+ try {
640
+ if (prefetched.kind === 'too-large') {
641
+ skippedTooLarge++;
642
+ if (options.verbose) {
643
+ process.stdout.write(
644
+ ` ⤬ ${prefetched.item.file.relativePath} (${(prefetched.size / 1024).toFixed(0)} KiB > ${(maxFileBytes / 1024).toFixed(0)} KiB cap)\n`,
645
+ );
646
+ } else if (!quiet) {
647
+ writeProgress(processed, total, prefetched.item.file.relativePath);
648
+ }
649
+ if (processed % BATCH_SIZE === 0) closeBatch();
650
+ continue;
651
+ }
652
+
653
+ if (prefetched.kind === 'io-error') {
654
+ skipped++;
655
+ if (options.verbose) {
656
+ process.stdout.write(` ⚠ ${prefetched.item.file.relativePath} (read error)\n`);
657
+ } else if (!quiet) {
658
+ writeProgress(processed, total, prefetched.item.file.relativePath);
659
+ }
660
+ if (processed % BATCH_SIZE === 0) closeBatch();
661
+ continue;
662
+ }
663
+
664
+ const { item, content } = prefetched;
665
+ const { file, language } = item;
666
+ const hash = sha256(content);
667
+ const lines = content.split('\n').length;
668
+
669
+ openBatch();
670
+ const classification = classifyFile(file.relativePath);
671
+ const upserted = forceServiceCallBackfill
672
+ ? { fileId: this.store.upsertFile(
673
+ file.absolutePath,
674
+ file.relativePath,
675
+ language,
676
+ hash,
677
+ lines,
678
+ classification,
679
+ ), unchanged: false }
680
+ : this.store.upsertFileWithCache(
681
+ file.absolutePath,
682
+ file.relativePath,
683
+ language,
684
+ hash,
685
+ lines,
686
+ classification,
687
+ );
688
+ const { fileId, unchanged } = upserted;
689
+ touchedFileIds.add(fileId);
690
+
691
+ // Hash-based cache hit: same content as last run → keep symbols, edges,
692
+ // and file_imports as-is. Edge to_ids that point to symbols that have
693
+ // since been deleted got NULLed by the FK cascade, so resolveEdges()
694
+ // below will still re-link them.
695
+ if (unchanged) {
696
+ reusedFromCache++;
697
+ if (options.verbose) {
698
+ process.stdout.write(` = ${file.relativePath} (cached)\n`);
699
+ } else if (!quiet) {
700
+ writeProgress(processed, total, file.relativePath);
701
+ }
702
+ if (processed % BATCH_SIZE === 0) closeBatch();
703
+ continue;
704
+ }
705
+
706
+ const extraction = await parseFile(content, file.absolutePath, language);
707
+ if (!extraction) {
708
+ parseErrors++;
709
+ if (options.verbose) {
710
+ process.stdout.write(` ⚠ ${file.relativePath} (parse error)\n`);
711
+ } else if (!quiet) {
712
+ writeProgress(processed, total, file.relativePath);
713
+ }
714
+ if (processed % BATCH_SIZE === 0) closeBatch();
715
+ continue;
716
+ }
717
+
718
+ const symbolIdMap = new Map<string, number>(); // qualifiedName → id
719
+ for (const def of extraction.definitions) {
720
+ const symId = this.store.insertSymbol(fileId, def);
721
+ const qname = def.qualifiedName ?? def.name;
722
+ if (!symbolIdMap.has(qname)) symbolIdMap.set(qname, symId);
723
+ }
724
+ for (const ref of extraction.references) {
725
+ const fromId = ref.callerName ? symbolIdMap.get(ref.callerName) : undefined;
726
+ if (fromId !== undefined) {
727
+ this.store.insertEdge(fromId, ref.calleeName, ref.kind, ref.line);
728
+ }
729
+ }
730
+ for (const mod of extraction.importedModules) {
731
+ this.store.insertFileImport(fileId, mod);
732
+ }
733
+ if (extraction.routes) {
734
+ for (const r of extraction.routes) {
735
+ this.store.insertRoute(
736
+ fileId, r.method, r.path, r.framework,
737
+ r.handlerName ?? null, r.line,
738
+ {
739
+ protocol: r.protocol ?? 'http',
740
+ operation: r.operation ?? null,
741
+ topic: r.topic ?? null,
742
+ queue: r.queue ?? null,
743
+ exchange: r.exchange ?? null,
744
+ service: r.service ?? null,
745
+ broker: r.broker ?? null,
746
+ metadataJson: r.metadataJson ?? null,
747
+ },
748
+ );
749
+ }
750
+ }
751
+ if (extraction.configKeys) {
752
+ for (const c of extraction.configKeys) {
753
+ const enclosingId = c.callerName ? symbolIdMap.get(c.callerName) ?? null : null;
754
+ this.store.insertConfigKey(c.key, c.source, fileId, enclosingId, c.line);
755
+ }
756
+ }
757
+ if (extraction.serviceCalls) {
758
+ for (const sc of extraction.serviceCalls) {
759
+ const enclosingId = sc.callerName ? symbolIdMap.get(sc.callerName) ?? null : null;
760
+ // Only run HTTP-shaped normalization when the call is HTTP.
761
+ const norm = sc.protocol === 'http'
762
+ ? normalizeHttpTarget(sc.rawTarget)
763
+ : { path: undefined, hostHint: undefined };
764
+ this.store.insertServiceCall({
765
+ fileId,
766
+ symbolId: enclosingId,
767
+ protocol: sc.protocol,
768
+ method: sc.method ?? null,
769
+ rawTarget: sc.rawTarget,
770
+ normalizedPath: sc.normalizedPath ?? norm.path ?? null,
771
+ hostHint: sc.hostHint ?? norm.hostHint ?? null,
772
+ envKey: sc.envKey ?? null,
773
+ framework: sc.framework,
774
+ line: sc.line,
775
+ confidence: sc.confidence,
776
+ operation: sc.operation ?? null,
777
+ topic: sc.topic ?? null,
778
+ queue: sc.queue ?? null,
779
+ exchange: sc.exchange ?? null,
780
+ service: sc.service ?? null,
781
+ broker: sc.broker ?? null,
782
+ metadataJson: sc.metadataJson ?? null,
783
+ });
784
+ }
785
+ }
786
+
787
+ if (processed % BATCH_SIZE === 0) closeBatch();
788
+
789
+ indexed++;
790
+
791
+ if (options.verbose) {
792
+ const symCount = extraction.definitions.length;
793
+ const refCount = extraction.references.length;
794
+ process.stdout.write(
795
+ ` ✓ ${file.relativePath} (${symCount} symbols, ${refCount} refs)\n`,
796
+ );
797
+ } else if (!quiet) {
798
+ writeProgress(processed, total, file.relativePath);
799
+ }
800
+ } finally {
801
+ // Return this file's bytes to the prefetcher budget BEFORE launching
802
+ // the next slot, so the launch decision sees an up-to-date balance.
803
+ if (heldBytes > 0) byteSem.release(heldBytes);
804
+ // Refill the sliding window now that one slot drained.
805
+ launchUpTo(ioConcurrency);
806
+ }
807
+ }
808
+
809
+ // Close the last partial batch before kicking off post-processing
810
+ // (resolveImports / resolveEdges / pruneFilesNotIn all start their own
811
+ // transactions and would crash if one is already open).
812
+ closeBatch();
813
+ } catch (err) {
814
+ // Don't leave a transaction dangling — post-processing's BEGIN would
815
+ // throw and mask the original error.
816
+ rollbackBatch();
817
+ // Drain any in-flight prefetches so their byte budget is returned and
818
+ // open FDs / promises don't leak as unhandled rejections.
819
+ while (slots.length > 0) {
820
+ try {
821
+ const p = await slots.shift()!;
822
+ if (p.kind === 'ok') byteSem.release(Math.max(p.size, 1));
823
+ } catch { /* swallow */ }
824
+ }
825
+ throw err;
826
+ }
827
+
828
+ if (!options.verbose && !quiet) process.stdout.write('\n');
829
+
830
+ return await this.finishIndex(
831
+ absRoot, start, total, indexed, reusedFromCache, skipped,
832
+ skippedTooLarge, parseErrors, touchedFileIds,
833
+ { verbose: options.verbose, quiet: !!quiet },
834
+ );
835
+ }
836
+
837
/**
 * Post-parse pipeline shared by the serial and parallel branches.
 *
 * Runs, in order: the .proto scan, the stale-file prune, import and
 * call-edge resolution, route / config-key linking and test-edge synthesis,
 * the service-host scan, service-link resolution, external-dependency
 * extraction, and then the lazily skipped passes (PageRank, module
 * clustering, boundary detection, shape hashes, continuity). Finally it
 * assembles the `IndexResult`.
 *
 * Every pass wrapped in try/catch below is best-effort: a failure is
 * reported only when `opts.verbose` is set and never aborts the run.
 *
 * @param absRoot          Root directory handed to the proto, service-host,
 *                         external-dependency and boundary scanners.
 * @param start            Timestamp (same clock as `Date.now()`) taken when
 *                         the run began; used to compute `elapsedMs`.
 * @param total            Reported as `filesDiscovered`.
 * @param indexed          Files parsed and inserted this run.
 * @param reusedFromCache  Files whose stored hash matched and were kept as-is.
 * @param skipped          Files skipped (unsupported, filtered, or unreadable).
 * @param skippedTooLarge  Files skipped because they exceeded the size cap.
 * @param parseErrors      Files that were read but yielded no extraction.
 * @param touchedFileIds   Ids of file rows seen this run. MUTATED: proto file
 *                         ids are added before it is passed to the prune.
 * @param opts             `verbose` enables failure diagnostics, `quiet`
 *                         suppresses progress lines, `workerWasmResets` is
 *                         added to the main-thread WASM reset count.
 * @returns The assembled `IndexResult` for this run.
 */
private async finishIndex(
  absRoot: string,
  start: number,
  total: number,
  indexed: number,
  reusedFromCache: number,
  skipped: number,
  skippedTooLarge: number,
  parseErrors: number,
  touchedFileIds: Set<number>,
  opts: { verbose?: boolean; quiet: boolean; workerWasmResets?: number },
): Promise<IndexResult> {
  const { verbose, quiet } = opts;

  // v9 Track-H: scan .proto files for gRPC service definitions BEFORE the
  // stale-file prune and service-link resolver run. .proto files are not
  // part of normal tree-sitter discovery, so they must be added to
  // touchedFileIds here; otherwise cached re-indexes would prune and
  // recreate proto rows every time.
  try {
    const protoScan = await scanProtoFiles(absRoot, this.store);
    for (const fileId of protoScan.fileIds) touchedFileIds.add(fileId);
  } catch (err) {
    if (verbose) process.stdout.write(` ⚠ proto scanner failed: ${err}\n`);
  }

  // Drop files that existed in a prior run but didn't show up this time
  // (e.g. user added a new ignore rule, or files were removed from disk).
  // FK cascades remove their symbols, edges, and file_imports too.
  const prunedFiles = this.store.pruneFilesNotIn(touchedFileIds);
  if (prunedFiles > 0 && !quiet) {
    process.stdout.write(` Pruned ${prunedFiles.toLocaleString()} stale file(s) from prior run\n`);
  }

  // Post-processing passes
  if (!quiet) process.stdout.write(' Resolving imports...\n');
  const resolvedImports = this.store.resolveImports();

  if (!quiet) process.stdout.write(' Resolving call edges...\n');
  const resolution = this.store.resolveEdges();

  // Track-C: link routes to handlers, config_keys to enclosing symbol,
  // synthesize tests edges from test-file → non-test-file calls.
  const routesResolved = this.store.resolveRouteHandlers();
  const configKeysResolved = this.store.resolveConfigKeySymbols();
  const testEdgesAdded = this.store.synthesizeTestEdges();

  // v9 Track-H: scan k8s manifests + Docker Compose for service hostnames.
  // Passed to the resolver as evidence — host_hint hits get a confidence
  // boost and may be classified as `service_host` link matches.
  let hostMap: import('./serviceHostScanner.js').ServiceHostMap | undefined;
  try {
    hostMap = await scanServiceHosts(absRoot);
  } catch (err) {
    if (verbose) process.stdout.write(` ⚠ service-host scanner failed: ${err}\n`);
  }

  // Track-G: deterministic service-link resolution. Runs every time, since
  // any change in service_calls OR routes can shift link membership. The
  // resolver itself wipes service_links before rebuilding so it's
  // idempotent. The backfill marker is only written when resolution
  // succeeded, so a failed run retries the backfill next time.
  let serviceLinks = 0;
  let serviceLinksByKind: Record<string, number> = {};
  try {
    const sr = resolveServiceLinks(this.store, { hostMap });
    serviceLinks = sr.linksInserted;
    serviceLinksByKind = sr.byKind as Record<string, number>;
    this.store.markServiceCallsBackfilled();
  } catch (err) {
    if (verbose) process.stdout.write(` ⚠ service-link resolution failed: ${err}\n`);
  }

  // External dependency extraction from manifests/lockfiles. This is cheap
  // and idempotent — clear and re-insert every full pass so deletions are
  // reflected. We pass absRoot so the extractor finds package.json /
  // Cargo.toml / etc. at the repo root and walks down for monorepos.
  try {
    const { extractExternalDependencies } = await import('./externaldeps.js');
    await extractExternalDependencies(absRoot, this.store);
  } catch (err) {
    if (verbose) {
      process.stdout.write(` ⚠ external dep extraction failed: ${err}\n`);
    }
  }

  // ── Lazy PageRank ───────────────────────────────────────────────────────────
  // PageRank values are a pure function of the resolved edge graph. If nothing
  // in that graph changed this run, every previously-stored rank is still
  // correct and we can skip the O(iterations × edges) recomputation.
  //
  // "Nothing changed" requires ALL of the following:
  //   - no file was newly indexed (no new symbols/edges/imports inserted)
  //   - no file hit a parse error this run. Such a file was re-upserted
  //     without any symbols being inserted; its previous symbols are
  //     presumably dropped by the upsert (TODO confirm against the store),
  //     so the graph may have shrunk even though `indexed` did not move.
  //     A parse-error file is served from the hash cache on later runs, so
  //     this only fires on the run where the content actually changed.
  //   - no stale file was pruned (would have cascaded FK deletes,
  //     potentially NULLing inbound edge `to_id`s)
  //   - resolveEdges() promoted zero NULL `to_id`s to a real id
  //   - resolveImports() promoted zero NULL `resolved_file_id`s
  //
  // If any of those is nonzero, the symbol set OR the resolved-edge graph
  // could have shifted, so we recompute. This is the same correctness
  // contract that the scale-test's "top-symbol id stability" check enforces
  // — drift there means the predicate below missed a case.
  //
  // Why this is safe even on first run: when the DB is fresh, `indexed > 0`
  // (everything is new), so the predicate fires and PageRank is computed.
  const graphChanged =
    indexed > 0 ||
    parseErrors > 0 ||
    prunedFiles > 0 ||
    resolution.sameFile + resolution.imported + resolution.global > 0 ||
    resolvedImports > 0;

  let pagerankRecomputed = false;
  if (graphChanged) {
    if (!quiet) process.stdout.write(' Computing PageRank...\n');
    const symbolIds = this.store.getAllSymbolIds();
    const edges = this.store.getAllEdges();
    const ranks = computePageRank(symbolIds, edges);
    this.store.updatePageRanks(ranks);
    pagerankRecomputed = true;
  } else if (!quiet) {
    process.stdout.write(' Skipping PageRank (graph unchanged)\n');
  }

  // ── Lazy module clustering ──────────────────────────────────────────────
  // Same skip predicate as PageRank: the cluster is a function of the file
  // graph + symbol PageRank, both of which stay stable when nothing changed.
  // Always build when modules table is empty so the first opt-in to v6 runs
  // it once, even when the index itself was a no-op.
  let modulesRecomputed = false;
  if (graphChanged || !this.store.hasModulesData()) {
    if (!quiet) process.stdout.write(' Clustering modules...\n');
    try {
      buildModules(this.store);
      modulesRecomputed = true;
    } catch (err) {
      if (verbose) process.stdout.write(` ⚠ module clustering failed: ${err}\n`);
    }
  } else if (!quiet) {
    process.stdout.write(' Skipping module clustering (graph unchanged)\n');
  }

  // ── v10 boundary detection ──────────────────────────────────────────────
  // Always run when graphChanged (new files / pruned files / new edges)
  // OR when the boundaries table is empty (first opt-in after migrating
  // an existing DB to v10). The emptiness probe sits inside the try so a
  // store without the boundaries table degrades to a skipped pass.
  try {
    if (graphChanged || !this.store.hasBoundariesData()) {
      if (!quiet) process.stdout.write(' Detecting boundaries...\n');
      const r = buildBoundaries(absRoot, this.store);
      this.store.replaceBoundaries(r.boundaries, r.edges);
    }
  } catch (err) {
    if (verbose) process.stdout.write(` ⚠ boundary detection failed: ${err}\n`);
  }

  // ── Lazy shape-hash pass (Track-F structural SimHash) ──────────────────
  // Re-indexed files delete their old symbols (no shape_hash on the new
  // rows yet) so graphChanged covers normal updates. We ALSO run when any
  // eligible symbol is missing a hash even on a cached/no-op run — this is
  // the case after a pre-v7 → v7 migration where every existing file is
  // "cached" (content hash unchanged) but the new shape_hash column starts
  // NULL on every row. Without this second predicate the backfill would
  // never run and `seer_duplicates` would silently return nothing.
  let shapeHashesAdded = 0;
  const needsHashBackfill = this.store.hasMissingShapeHashes();
  if (graphChanged || needsHashBackfill) {
    if (!quiet) {
      process.stdout.write(graphChanged
        ? ' Computing shape hashes...\n'
        : ' Backfilling shape hashes...\n');
    }
    try {
      const r = buildShapeHashes(this.store);
      shapeHashesAdded = r.symbolsHashed;
    } catch (err) {
      if (verbose) process.stdout.write(` ⚠ shape-hash pass failed: ${err}\n`);
    }
  } else if (!quiet) {
    process.stdout.write(' Skipping shape hashes (graph unchanged, no backfill needed)\n');
  }

  // v10 — rename/move continuity heuristics. Runs only when the graph
  // changed AND the store reports v10 support; a backfill-only shape-hash
  // pass does not trigger it.
  // NOTE(review): an earlier comment here said includeAllSymbols=true was
  // reserved for the explicit `seer continuity` CLI and that the default
  // pass only covers symbols with shallow history, yet this call passes
  // `true`. Behaviour is left as-is — confirm which of the two is intended.
  if (graphChanged && this.store.hasV10()) {
    try {
      buildContinuity(this.store, { includeAllSymbols: true });
    } catch (err) {
      if (verbose) process.stdout.write(` ⚠ continuity pass failed: ${err}\n`);
    }
  }

  const stats = this.store.getStats();
  const elapsedMs = Date.now() - start;

  return {
    filesDiscovered: total,
    filesIndexed: indexed,
    filesReusedFromCache: reusedFromCache,
    filesSkipped: skipped,
    filesSkippedTooLarge: skippedTooLarge,
    filesParseError: parseErrors,
    // Main-thread resets plus whatever the worker pool reported (parallel branch).
    wasmResets: wasmResetCount() + (opts.workerWasmResets ?? 0),
    symbols: stats.symbols,
    edges: stats.edges,
    // stats.resolvedEdges is the running DB total; resolution.{sameFile,
    // imported, global} below reports only the *delta* — what this run
    // newly resolved (mostly nonzero on first run, near-zero on a cached
    // re-run where everything was already resolved).
    resolvedEdges: stats.resolvedEdges,
    resolvedImports,
    edgeResolution: {
      sameFile: resolution.sameFile,
      imported: resolution.imported,
      global: resolution.global,
    },
    pagerankRecomputed,
    routesResolved,
    configKeysResolved,
    testEdgesAdded,
    externalDependencies: stats.externalDependencies,
    modules: stats.modules,
    modulesRecomputed,
    shapeHashesAdded,
    serviceLinks,
    serviceLinksByKind,
    elapsedMs,
  };
}
1075
+ }
1076
+
1077
+ // ── Utilities ──────────────────────────────────────────────────────────────────
1078
+
1079
/**
 * Short content fingerprint: the first 16 hex characters (64 bits) of the
 * SHA-256 digest of `content`, hashed as UTF-8.
 *
 * @param content Text to fingerprint.
 * @returns A 16-character lowercase hex string.
 */
function sha256(content: string): string {
  const hasher = crypto.createHash('sha256');
  hasher.update(content, 'utf8');
  const fullHex = hasher.digest('hex');
  // Only the leading 64 bits are kept; callers store and compare this prefix.
  return fullHex.slice(0, 16);
}
1082
+
1083
/**
 * Redraws a one-line progress bar on stdout, using a leading carriage return
 * so each call overwrites the previous one. Does nothing when stdout is not
 * a TTY.
 *
 * @param current Number of items processed so far (printed verbatim).
 * @param total   Total number of items (printed verbatim); a non-positive
 *                total renders as 0 %.
 * @param label   Trailing text, e.g. the current file path. Labels longer
 *                than 35 characters keep their last 34 behind an ellipsis;
 *                shorter ones are padded to 35 so stale characters from the
 *                previous draw are overwritten.
 */
function writeProgress(current: number, total: number, label: string): void {
  if (!process.stdout.isTTY) return;
  const width = 28;
  const ratio = total > 0 ? current / total : 0;
  // Clamp to [0, 1] (NaN → 0). Without this, `current > total` or a negative
  // `current` makes one of the repeat() counts negative, and
  // String.prototype.repeat throws RangeError — a cosmetic progress bar must
  // never be able to abort an indexing run.
  const pct = ratio > 0 ? Math.min(ratio, 1) : 0;
  const filled = Math.round(pct * width);
  const bar = '█'.repeat(filled) + '░'.repeat(width - filled);
  const pctStr = Math.round(pct * 100).toString().padStart(3);
  const short = label.length > 35 ? '…' + label.slice(-34) : label.padEnd(35);
  process.stdout.write(`\r [${bar}] ${pctStr}% (${current}/${total}) ${short}`);
}