codedeep-mcp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +177 -0
  3. package/dist/config.js +223 -0
  4. package/dist/git/analyzer.js +177 -0
  5. package/dist/git/git-service.js +568 -0
  6. package/dist/git/head-watcher.js +113 -0
  7. package/dist/git/runner.js +204 -0
  8. package/dist/index.js +138 -0
  9. package/dist/indexer/code-index.js +1801 -0
  10. package/dist/indexer/complexity.js +633 -0
  11. package/dist/indexer/extractor.js +354 -0
  12. package/dist/indexer/languages/cpp.js +934 -0
  13. package/dist/indexer/languages/csharp.js +854 -0
  14. package/dist/indexer/languages/dart.js +777 -0
  15. package/dist/indexer/languages/go.js +665 -0
  16. package/dist/indexer/languages/java.js +507 -0
  17. package/dist/indexer/languages/kotlin.js +709 -0
  18. package/dist/indexer/languages/objc.js +397 -0
  19. package/dist/indexer/languages/php.js +771 -0
  20. package/dist/indexer/languages/python.js +455 -0
  21. package/dist/indexer/languages/ruby.js +697 -0
  22. package/dist/indexer/languages/rust.js +754 -0
  23. package/dist/indexer/languages/swift.js +691 -0
  24. package/dist/indexer/languages/typescript.js +485 -0
  25. package/dist/indexer/parser.js +175 -0
  26. package/dist/indexer/pipeline.js +342 -0
  27. package/dist/indexer/scanner.js +279 -0
  28. package/dist/indexer/watcher.js +353 -0
  29. package/dist/logger.js +16 -0
  30. package/dist/server.js +170 -0
  31. package/dist/tools/common.js +207 -0
  32. package/dist/tools/find-references.js +224 -0
  33. package/dist/tools/find-symbol.js +94 -0
  34. package/dist/tools/get-context.js +370 -0
  35. package/dist/tools/impact.js +218 -0
  36. package/dist/tools/overview.js +482 -0
  37. package/dist/tools/search-structure.js +303 -0
  38. package/dist/types.js +61 -0
  39. package/grammars/tree-sitter-c.wasm +0 -0
  40. package/grammars/tree-sitter-c_sharp.wasm +0 -0
  41. package/grammars/tree-sitter-cpp.wasm +0 -0
  42. package/grammars/tree-sitter-dart.wasm +0 -0
  43. package/grammars/tree-sitter-go.wasm +0 -0
  44. package/grammars/tree-sitter-java.wasm +0 -0
  45. package/grammars/tree-sitter-javascript.wasm +0 -0
  46. package/grammars/tree-sitter-kotlin.wasm +0 -0
  47. package/grammars/tree-sitter-objc.wasm +0 -0
  48. package/grammars/tree-sitter-php.wasm +0 -0
  49. package/grammars/tree-sitter-python.wasm +0 -0
  50. package/grammars/tree-sitter-ruby.wasm +0 -0
  51. package/grammars/tree-sitter-rust.wasm +0 -0
  52. package/grammars/tree-sitter-swift.wasm +0 -0
  53. package/grammars/tree-sitter-tsx.wasm +0 -0
  54. package/grammars/tree-sitter-typescript.wasm +0 -0
  55. package/package.json +67 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Danh Hung
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,177 @@
1
+ # codedeep-mcp
2
+
3
+ [![CI](https://github.com/planexhq/codedeep-mcp/actions/workflows/ci.yml/badge.svg)](https://github.com/planexhq/codedeep-mcp/actions/workflows/ci.yml)
4
+ [![npm version](https://img.shields.io/npm/v/codedeep-mcp.svg)](https://www.npmjs.com/package/codedeep-mcp)
5
+ [![MCP spec](https://img.shields.io/badge/MCP_spec-2025--11--25-0a7ea4.svg)](https://modelcontextprotocol.io/specification/2025-11-25)
6
+ [![license: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](./LICENSE)
7
+ [![node](https://img.shields.io/badge/node-%E2%89%A520-brightgreen.svg)](https://nodejs.org)
8
+
9
+ An MCP server that gives AI coding agents structural understanding of codebases.
10
+
11
+ **One tool call replaces 5-10 Grep-Read cycles.**
12
+
13
+ codedeep-mcp parses your code with [tree-sitter](https://tree-sitter.github.io/tree-sitter/), builds a symbol index, and exposes 6 tools over the [Model Context Protocol](https://modelcontextprotocol.io/) that answer structural questions directly: find symbols, trace callers, assess blast radius, search by structure.
14
+
15
+ ## Why
16
+
17
+ AI coding agents explore codebases with text tools (grep, file reads). This works but is expensive:
18
+
19
+ - "Find all callers of X" requires 5+ grep-read cycles and returns false positives
20
+ - "What breaks if I change this?" requires exhaustive manual search
21
+ - Grep can't tell `user` the variable from `User` the class from `user()` the function
22
+
23
+ codedeep-mcp solves this by parsing code into symbols and relationships, then answering structural questions in a single call.
24
+
25
+ ## Tools
26
+
27
+ | Tool | Purpose | Example |
28
+ |------|---------|---------|
29
+ | `overview` | Orient in an unfamiliar codebase | Language breakdown, entry points, structure |
30
+ | `find_symbol` | AST-aware symbol lookup | Find function by name — matches definitions, not text |
31
+ | `get_context` | Full context for a symbol | Body + callers/callees + imports + co-change & complexity |
32
+ | `find_references` | Cross-file usage search | Who calls this function, and from where? |
33
+ | `impact` | Depth-N blast radius | Transitive upstream callers, grouped by hop |
34
+ | `search_structure` | Keyword and structural search | Find by name/signature (all languages), or AST pattern (TS/JS) |
35
+
36
+ ## Quick Start
37
+
38
+ ### Claude Code
39
+
40
+ Add to `~/.claude/settings.json`:
41
+
42
+ ```json
43
+ {
44
+ "mcpServers": {
45
+ "codedeep-mcp": {
46
+ "command": "npx",
47
+ "args": ["codedeep-mcp"]
48
+ }
49
+ }
50
+ }
51
+ ```
52
+
53
+ ### Cursor / Windsurf / Other MCP Clients
54
+
55
+ Any MCP client that supports stdio transport works. Configure it to run `npx codedeep-mcp`.
56
+
57
+ > **Note:** `npx codedeep-mcp` is a stdio server — it won't produce visible
58
+ > output when run directly. It communicates via JSON-RPC with the MCP client.
59
+
60
+ ## How It Works
61
+
62
+ ```
63
+ Your Code ──> tree-sitter (parse) ──> In-Memory Index ──> MCP Tools
64
+
65
+ Git (optional)
66
+ LSP (planned)
67
+ ```
68
+
69
+ **Structural index (always, instant):**
70
+ tree-sitter parses every file into an AST. Symbols, call relationships,
71
+ and imports are extracted and indexed in memory — with per-language call
72
+ resolution tuned for precision (an explicit 0-wrong-kind-edge goal), not
73
+ just text matching. Works on any repo with zero configuration.
74
+
75
+ **Complexity metrics (all 14 languages):**
76
+ Per-symbol cyclomatic and cognitive complexity, computed at index time and
77
+ pinned for behavioral comparability to McCabe / the Cognitive Complexity
78
+ whitepaper / open-source analyzers (SonarJS, sonar-java, gocyclo+gocognit,
79
+ rust-code-analysis, …). Shown on `find_symbol` / `get_context`.
80
+
81
+ **Git enrichment (when in a git repo):**
82
+ Commit frequency identifies hotspot files; co-change analysis reveals
83
+ behavioral coupling (files that change together); and a risk score
84
+ (churn × coupling × complexity) ranks the most change-prone, tangled hubs.
85
+
86
+ **Planned — LSP semantic tier:**
87
+ LSP integration (tsserver, pyright, gopls, …) for compiler-precise
88
+ cross-file references and type info is designed but **not yet shipped** —
89
+ cross-file edges today are AST name-matches.
90
+
91
+ ## Example
92
+
93
+ ````
94
+ > find_symbol({ name: "authenticate" })
95
+
96
+ src/auth/middleware.ts:42-67 | function | exported
97
+ async function authenticate(req: Request, res: Response, next: NextFunction): Promise<void>
98
+ Validates the JWT token and attaches user to request
99
+ References: ~5
100
+ Fan-out: 2
101
+ Complexity: cyc 3 / cog 1 [structural]
102
+
103
+ > get_context({ file: "src/auth/middleware.ts", symbol: "authenticate" })
104
+
105
+ src/auth/middleware.ts:42-67 | function | exported
106
+ async function authenticate(req: Request, res: Response, next: NextFunction): Promise<void>
107
+ Validates the JWT token and attaches user to request
108
+
109
+ ### Body
110
+ ```typescript
111
+ async function authenticate(req: Request, res: Response, next: NextFunction): Promise<void> {
112
+ const token = extractToken(req);
113
+ const payload = verify(token);
114
+ req.user = payload as User;
115
+ next();
116
+ }
117
+ ```
118
+
119
+ ### Callers
120
+ - src/routes/api.ts:67 — handleRequest() [structural]
121
+ - src/routes/webhook.ts:23 — verifyWebhook() [structural]
122
+
123
+ (get_context also emits ### Callees and ### Coupling sections here, omitted for brevity)
124
+
125
+ ### Imports
126
+ - jsonwebtoken: verify, decode
127
+ - ./types: User, AuthToken
128
+
129
+ ### Co-change Partners (2 behavioral)
130
+ - src/auth/types.ts 78% confidence (9 shared commits)
131
+ - tests/auth.test.ts 64% confidence (7 shared commits)
132
+ ````
133
+
134
+ ## Supported Languages
135
+
136
+ **14 languages**, each with tree-sitter symbol/reference extraction **and**
137
+ cyclomatic + cognitive complexity:
138
+
139
+ TypeScript / JS · Python · Java · Go · Rust · Swift · Kotlin · Dart · C# ·
140
+ PHP · Ruby · C++ · C · Objective-C
141
+
142
+ A planned LSP tier (see *How It Works*) will add compiler-precise cross-file
143
+ resolution per language.
144
+
145
+ ## Configuration
146
+
147
+ Optional `.codedeep/config.json` in your project root:
148
+
149
+ ```jsonc
150
+ {
151
+ "exclude": ["vendor/**", "generated/**"],
152
+ "languages": ["typescript", "python"],
153
+ "maxFiles": 100000,
154
+ "maxFileSize": 1048576,
155
+ "watch": true,
156
+ "gitEnabled": true,
157
+ "gitWindow": 180
158
+ }
159
+ ```
160
+
161
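+ All fields are optional. Works with no config file. An optional `cacheDir` string (or the `CODEDEEP_CACHE_DIR` environment variable, which takes precedence) relocates the index cache; it must not resolve to the project root itself.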
162
+
163
+ Add `.codedeep/` to your `.gitignore` — the index cache is stored there.
164
+
165
+ Environment variables: `CODEDEEP_CACHE_DIR`, `CODEDEEP_EXCLUDE`, `CODEDEEP_GIT`, `CODEDEEP_GIT_WINDOW`, `CODEDEEP_WATCH`, `CODEDEEP_DEBUG`.
166
+
167
+ ## Development
168
+
169
+ ```bash
170
+ npm install
171
+ npm run build
172
+ npm test
173
+ ```
174
+
175
+ ## License
176
+
177
+ MIT — see [LICENSE](./LICENSE).
package/dist/config.js ADDED
@@ -0,0 +1,223 @@
1
+ import { createHash } from 'node:crypto';
2
+ import { constants as fsConstants, readFileSync } from 'node:fs';
3
+ import { access, mkdir } from 'node:fs/promises';
4
+ import { homedir } from 'node:os';
5
+ import { isAbsolute, join, relative, resolve, sep } from 'node:path';
6
+ import { toPosix } from './indexer/scanner.js';
7
+ import { errMsg, log } from './logger.js';
// Built-in exclude entries. loadConfig places these first, then appends the
// config-file, environment and cache-dir excludes before de-duplicating.
const DEFAULT_EXCLUDES = [
    'node_modules', '.git', '.codedeep', '__pycache__', '.venv',
    'dist', 'build', 'vendor', '.next', '.nuxt', 'target', '__generated__',
    '*.min.js', '*.bundle.js',
];
// Languages enabled when the config file does not supply a `languages` array.
const DEFAULT_LANGUAGES = [
    'typescript',
    'tsx',
    'javascript',
    'python',
    'java',
    'go',
    'rust',
    'swift',
    'kotlin',
    'dart',
    'csharp',
    'php',
    'ruby',
    'cpp',
    'c',
    'objc',
];
// Fallbacks for `maxFiles`, `maxFileSize` and `gitWindow` respectively.
const DEFAULT_MAX_FILES = 100_000;
const DEFAULT_MAX_FILE_SIZE = 1024 * 1024;
const DEFAULT_GIT_WINDOW = 180;
// Loads `<root>/.codedeep/config.json` and returns the parsed object.
// Never throws: a missing file silently yields {}, while an unreadable file,
// invalid JSON, or a non-object top-level value logs a warning and yields {}.
function readFileConfig(root) {
    const configPath = join(root, '.codedeep', 'config.json');
    let text;
    try {
        text = readFileSync(configPath, 'utf8');
    }
    catch (readErr) {
        // Only a missing file stays silent; any other read failure is reported.
        if (readErr.code !== 'ENOENT') {
            log.warn(`config: failed to read ${configPath}: ${readErr.message}; using defaults`);
        }
        return {};
    }
    let data;
    try {
        data = JSON.parse(text);
    }
    catch (parseErr) {
        log.warn(`config: failed to parse ${configPath}: ${parseErr.message}; using defaults`);
        return {};
    }
    // Valid JSON can still be null, an array, or a primitive — none are usable.
    const isObject = data !== null && typeof data === 'object' && !Array.isArray(data);
    if (isObject) {
        return data;
    }
    log.warn(`config: ${configPath} is not a JSON object; using defaults`);
    return {};
}
// Returns `value` itself when it is an array containing only strings;
// otherwise undefined.
function asStringArray(value) {
    const allStrings = Array.isArray(value) && value.every((item) => typeof item === 'string');
    return allStrings ? value : undefined;
}
// Returns the floor of `value` when it is a finite number >= 0; otherwise
// undefined. Zero is accepted.
function asNonNegativeInt(value) {
    const usable = typeof value === 'number' && Number.isFinite(value) && value >= 0;
    return usable ? Math.floor(value) : undefined;
}
// Returns the floor of `value` when it is a finite number >= 1; otherwise
// undefined. Used for the git window: a 0-day window is meaningless (empty
// analysis marked fresh), unlike maxFiles/maxFileSize where 0 is valid.
function asPositiveInt(value) {
    const usable = typeof value === 'number' && Number.isFinite(value) && value >= 1;
    return usable ? Math.floor(value) : undefined;
}
// Returns `value` trimmed when it is a string with non-whitespace content;
// otherwise undefined.
function asNonBlankString(value) {
    if (typeof value === 'string') {
        const stripped = value.trim();
        if (stripped !== '') {
            return stripped;
        }
    }
    return undefined;
}
// Passes through a primitive boolean; every other value becomes undefined.
function asBoolean(value) {
    if (value === true || value === false) {
        return value;
    }
    return undefined;
}
// Reads environment variable `name` as a boolean flag.
// Matching is case-insensitive and ignores surrounding whitespace:
// "0"/"false" -> false, "1"/"true" -> true. Unset or blank -> undefined.
// Any other value logs a warning and also yields undefined.
function parseEnvBool(name) {
    const value = process.env[name]?.trim().toLowerCase();
    switch (value) {
        case undefined:
        case '':
            return undefined;
        case '0':
        case 'false':
            return false;
        case '1':
        case 'true':
            return true;
        default:
            log.warn(`config: ${name}=${value} not recognized; expected 0/1/true/false`);
            return undefined;
    }
}
// Reads CODEDEEP_GIT_WINDOW as a day count.
// Unset or blank -> undefined without a warning. Otherwise the trimmed text is
// converted with Number() and validated by asPositiveInt; a rejected value
// logs a warning and yields undefined.
function parseEnvGitWindow() {
    const text = process.env.CODEDEEP_GIT_WINDOW?.trim();
    if (!text) {
        return undefined;
    }
    const days = asPositiveInt(Number(text));
    if (days !== undefined) {
        return days;
    }
    log.warn(`config: CODEDEEP_GIT_WINDOW=${text} not recognized; expected a positive integer (days)`);
    return undefined;
}
// Splits CODEDEEP_EXCLUDE on commas into trimmed, non-empty entries.
// Returns [] when the variable is unset or empty.
function parseEnvExclude() {
    const raw = process.env.CODEDEEP_EXCLUDE;
    if (!raw) {
        return [];
    }
    const entries = [];
    for (const piece of raw.split(',')) {
        const entry = piece.trim();
        if (entry) {
            entries.push(entry);
        }
    }
    return entries;
}
// Produces the exclude entries that keep the scanner out of a cacheDir
// located inside the project root. Without them, persist() bumps
// cache/index.json's mtime, the next indexChanged() sees the divergence,
// re-indexes the cache and writes it again — a self-feeding loop.
// Two entries are returned: `<rel>` so walk()'s dir-prune triggers, and
// `<rel>/**` so the file-level matchExclude in scanner.ts and
// indexer.indexFile catches children of multi-segment paths that picomatch
// wouldn't auto-expand.
// Returns [] when cacheDir is the root itself, lies outside the root, or
// has no relative form (relative() produced an absolute path).
function computeCacheDirExcludes(root, cacheDir) {
    const rel = relative(root, cacheDir);
    const escapesRoot = rel === '..' || rel.startsWith(`..${sep}`);
    if (rel === '' || escapesRoot || isAbsolute(rel)) {
        return [];
    }
    const pattern = toPosix(rel);
    return [pattern, `${pattern}/**`];
}
// Builds the frozen runtime configuration for `projectRoot` (default: cwd).
// Sources: built-in defaults, `.codedeep/config.json`, environment variables.
// For cacheDir, watch, gitEnabled and gitWindow the environment wins over the
// file, which wins over the default. Exclude lists from all sources are
// concatenated, trimmed and de-duplicated in first-seen order.
// Throws when an explicitly configured cacheDir resolves to the project root.
export function loadConfig(projectRoot = process.cwd()) {
    const root = resolve(projectRoot);
    const fromFile = readFileConfig(root);
    const userExcludes = asStringArray(fromFile.exclude) ?? [];
    const userLanguages = asStringArray(fromFile.languages);
    const userMaxFiles = asNonNegativeInt(fromFile.maxFiles);
    const userMaxFileSize = asNonNegativeInt(fromFile.maxFileSize);
    const cacheDirFromFile = asNonBlankString(fromFile.cacheDir);
    const cacheDirFromEnv = asNonBlankString(process.env.CODEDEEP_CACHE_DIR);
    const envExcludes = parseEnvExclude();
    const explicitCacheDir = cacheDirFromEnv ?? cacheDirFromFile;
    const cacheDir = resolve(root, explicitCacheDir ?? join(root, '.codedeep', 'cache'));
    // cacheDir === root produces no excludes, so <root>/index.json is admitted
    // as an unknown source and re-indexed on every save (loop). Other invalid
    // inputs degrade to defaults; this one corrupts the index, so fail loud.
    // The default path is structurally non-root, so only explicit input is
    // guarded.
    if (explicitCacheDir !== undefined && relative(root, cacheDir) === '') {
        throw new Error(`cacheDir resolves to the project root (${cacheDir}); ` +
            `set CODEDEEP_CACHE_DIR or .codedeep/config.json "cacheDir" to a subdirectory or external path`);
    }
    // A Set keeps first-seen order while dropping duplicates.
    const excludeSet = new Set();
    const excludeSources = [
        DEFAULT_EXCLUDES,
        userExcludes,
        envExcludes,
        computeCacheDirExcludes(root, cacheDir),
    ];
    for (const source of excludeSources) {
        for (const entry of source) {
            const pattern = entry.trim();
            if (pattern) {
                excludeSet.add(pattern);
            }
        }
    }
    return Object.freeze({
        projectRoot: root,
        exclude: Object.freeze([...excludeSet]),
        languages: Object.freeze(userLanguages ?? [...DEFAULT_LANGUAGES]),
        maxFiles: userMaxFiles ?? DEFAULT_MAX_FILES,
        maxFileSize: userMaxFileSize ?? DEFAULT_MAX_FILE_SIZE,
        cacheDir,
        watch: parseEnvBool('CODEDEEP_WATCH') ?? asBoolean(fromFile.watch) ?? true,
        gitEnabled: parseEnvBool('CODEDEEP_GIT') ?? asBoolean(fromFile.gitEnabled) ?? true,
        gitWindow: parseEnvGitWindow() ?? asPositiveInt(fromFile.gitWindow) ?? DEFAULT_GIT_WINDOW,
    });
}
// Absolute path of the in-project cache location: `<projectRoot>/.codedeep/cache`.
export function defaultCacheDir(projectRoot) {
    const segments = ['.codedeep', 'cache'];
    return resolve(projectRoot, ...segments);
}
// Per-project cache location under the user's home directory:
// `~/.cache/codedeep/<first 16 hex chars of sha1(projectRoot)>`.
export function fallbackCacheDir(projectRoot) {
    const hasher = createHash('sha1');
    hasher.update(projectRoot);
    const shortDigest = hasher.digest('hex').substring(0, 16);
    return join(homedir(), '.cache', 'codedeep', shortDigest);
}
// Ensures the configured cacheDir exists and is writable, returning the
// directory to use. When the path equals the project default and is not
// usable (read-only repo, EROFS mount, or a `.codedeep`-is-a-file FS
// conflict), it falls back to ~/.cache/codedeep/<sha1(projectRoot)>/ after
// logging a warning. Explicit user overrides fail loudly so a broken
// CODEDEEP_CACHE_DIR / cacheDir surfaces instead of being ignored.
// Rejects with the original error when no fallback applies, or with a
// wrapped error (carrying `code` and `cause`) when the fallback is unusable.
export async function resolveCacheDir(config) {
    const { cacheDir, projectRoot } = config;
    const usingDefault = cacheDir === defaultCacheDir(projectRoot);
    // mkdir({recursive:true}) is idempotent, so a pre-existing directory can
    // slip through with restrictive permissions. Creating files inside a
    // directory requires both W and X per POSIX, so W alone would admit modes
    // like 0o200 / 0o600 where open(O_CREAT) still fails.
    const writeAndSearch = fsConstants.W_OK | fsConstants.X_OK;
    try {
        await mkdir(cacheDir, { recursive: true });
        await access(cacheDir, writeAndSearch);
        return cacheDir;
    }
    catch (err) {
        const code = err?.code;
        // ENOTDIR/EEXIST cover default-path FS conflicts (e.g. `.codedeep` is
        // a regular file).
        const recoverable = new Set(['EACCES', 'EROFS', 'EPERM', 'ENOTDIR', 'EEXIST']);
        if (!usingDefault || !recoverable.has(code)) {
            throw err;
        }
        const fallback = fallbackCacheDir(projectRoot);
        log.warn(`config: ${cacheDir} not usable (${code}); falling back to ${fallback}`);
        try {
            await mkdir(fallback, { recursive: true });
            await access(fallback, writeAndSearch);
            return fallback;
        }
        catch (fallbackErr) {
            const wrapped = new Error(`Cache fallback ${fallback} is also not writable: ${errMsg(fallbackErr)}. Set CODEDEEP_CACHE_DIR to a writable directory.`);
            wrapped.code = fallbackErr?.code;
            wrapped.cause = fallbackErr;
            throw wrapped;
        }
    }
}
package/dist/git/analyzer.js ADDED
@@ -0,0 +1,177 @@
1
+ // The bulk git-log pass: one parse of `git log --name-only` output builds
2
+ // BOTH per-file commit counts (hotspots / commitFrequency) and the
3
+ // co-change pair matrix. Pure functions, no I/O — the GitService owns the
4
+ // subprocess; tests feed canned stdout strings.
5
+ //
6
+ // Output format contract (verified against real git):
7
+ // --pretty=format:%x00%ct --name-only
8
+ // emits, per commit, a NUL byte, the committer epoch-seconds, a newline,
9
+ // then one path per line (blank-line separated from the next record):
10
+ // \0<epoch>\n<path>\n<path>\n\n\0<epoch>\n<path>\n
11
+ // NUL can never appear in %ct output or in a path line, so splitting
12
+ // stdout on NUL yields exactly one chunk per commit. We deliberately do
13
+ // NOT print %H or %s: this pass needs only boundaries and timestamps,
14
+ // and omitting the subject removes the entire weird-subject parsing
15
+ // class. core.quotepath=false (prepended by GitRunner) keeps non-ASCII
16
+ // paths literal; a pathological newline-containing filename just becomes
17
+ // a non-matching line that the membership filters discard.
18
+ import { posix } from 'node:path';
19
+ import { log } from '../logger.js';
// Delegated to git via --max-count; also asserted parse-side so a huge
// repo can't blow the pair map regardless of what git returns.
export const GIT_COMMIT_CAP = 10_000;
// Commits touching more than this many files (vendored-dep updates, mass
// renames, formatting sweeps) are skipped ENTIRELY — both for pairs and
// for counts. Using one filtered stream for numerators AND denominators
// keeps confidence <= 1 as an invariant.
export const MAX_FILES_PER_COMMIT = 30;
// A pair must share at least this many commits to register as coupling.
export const MIN_SHARED_COMMITS = 3;
// Per-file partner lists are truncated to this many strongest partners
// to bound persisted cache size.
export const COCHANGES_PER_FILE_CAP = 20;
// Tools render top 10; the extra headroom serves the search boost and
// survives files dropping out of the index between analyses.
export const HOTSPOTS_KEPT = 50;
// Bounds transient memory for the pair accumulation (worst case ~40 MB).
// git log is newest-first, so when the cap hits, the most recent (most
// relevant) pairs are already in the map; we stop inserting NEW keys but
// keep incrementing existing ones.
const PAIR_MAP_CAP = 250_000;
// Record separator emitted by `%x00`; reused as the delimiter inside pair
// keys because it cannot occur in a path line.
const NUL = '\u0000';
// A record header must be exactly the `%ct` epoch-seconds integer. Checked
// textually because Number('') is 0, which would pass a finiteness test and
// let a header-less chunk through as a commit at the epoch.
const EPOCH_SECONDS = /^-?\d+$/;
// Builds the argument list (everything after `git`) for the bulk log pass.
//   windowDays - how many days before `now` the --since cutoff lies
//   now        - reference time in epoch milliseconds (default: Date.now())
// Returns an array of strings whose output format is what analyzeLog parses.
export function buildLogArgs(windowDays, now = Date.now()) {
    const since = new Date(now - windowDays * 86_400_000).toISOString();
    return [
        'log',
        '--no-merges',
        // Rename detection is heuristic and git-version-dependent; with it
        // disabled a rename is a plain delete+add, so the old path simply
        // stops accruing and the new path starts fresh. Deterministic.
        '--no-renames',
        '--name-only',
        `--max-count=${GIT_COMMIT_CAP}`,
        `--since=${since}`,
        '--pretty=format:%x00%ct',
    ];
}
// Parses the output of the command built by buildLogArgs into per-file
// commit counts, co-change partner lists and a hotspot ranking.
//   stdout     - raw `git log` output: NUL, epoch-seconds, newline, then one
//                path per line, per commit
//   isIndexed  - predicate (path) => truthy when the path is in the index
//   pathPrefix - repo-relative location of the project root when it is a
//                SUBDIRECTORY of the git toplevel (monorepo packages); ''
//                (the default) when the project root IS the toplevel. A
//                missing trailing '/' is tolerated.
// Paths under the prefix are stripped to index-relative form. Paths OUTSIDE
// it are rewritten project-relative too ('../'-prefixed via posix.relative):
// index keys never start with '..', so an outside file such as the toplevel
// package.json can never collide with the package's own package.json key.
// Outside paths only ever serve as confidence denominators and partners.
// Returns { counts, cochanges, hotspots, commitCount }:
//   counts      - Map path -> number of kept commits touching it
//   cochanges   - Map indexed path -> partner records, strongest first,
//                 capped at COCHANGES_PER_FILE_CAP
//   hotspots    - up to HOTSPOTS_KEPT indexed paths, most-committed first
//   commitCount - number of commits kept after filtering
// Never throws on malformed input; unparseable records are dropped.
export function analyzeLog(stdout, isIndexed, pathPrefix = '') {
    const prefix = normalizePrefix(pathPrefix);
    // Each path is looked up several times across the phases below; resolve
    // it through the caller's predicate only once.
    const indexedCache = new Map();
    const indexed = (path) => {
        let flag = indexedCache.get(path);
        if (flag === undefined) {
            flag = Boolean(isIndexed(path));
            indexedCache.set(path, flag);
        }
        return flag;
    };
    const counts = new Map();
    const pairs = new Map();
    let commitCount = 0;
    let pairCapWarned = false;
    for (const chunk of stdout.split(NUL)) {
        if (commitCount >= GIT_COMMIT_CAP)
            break;
        if (chunk.length === 0)
            continue; // leading separator before the first record
        const parsed = parseRecord(chunk, prefix);
        if (parsed === null)
            continue; // garbled record — drop, never throw
        const { timestampMs, files } = parsed;
        if (files.size === 0)
            continue; // empty commit
        if (files.size > MAX_FILES_PER_COMMIT)
            continue;
        commitCount++;
        for (const path of files) {
            counts.set(path, (counts.get(path) ?? 0) + 1);
        }
        const sorted = [...files].sort();
        // Hoisted out of the O(k²) pair loop. The arrow wrapper keeps map()'s
        // extra (index, array) arguments away from the predicate.
        const indexedFlags = sorted.map((path) => indexed(path));
        for (let i = 0; i < sorted.length; i++) {
            for (let j = i + 1; j < sorted.length; j++) {
                // A pair with no indexed side can never be reported.
                if (!indexedFlags[i] && !indexedFlags[j])
                    continue;
                const key = `${sorted[i]}${NUL}${sorted[j]}`;
                const existing = pairs.get(key);
                if (existing) {
                    existing.shared++;
                }
                else if (pairs.size < PAIR_MAP_CAP) {
                    // Newest-first log order: first sighting IS the most recent.
                    pairs.set(key, { shared: 1, lastSeen: timestampMs });
                }
                else if (!pairCapWarned) {
                    pairCapWarned = true;
                    log.debug(`git: co-change pair map hit ${PAIR_MAP_CAP} entries; older pairs ignored`);
                }
            }
        }
    }
    const cochanges = new Map();
    for (const [key, accum] of pairs) {
        if (accum.shared < MIN_SHARED_COMMITS)
            continue;
        const split = key.indexOf(NUL);
        const fileA = key.slice(0, split);
        const fileB = key.slice(split + 1);
        const commitsA = counts.get(fileA);
        const commitsB = counts.get(fileB);
        if (!commitsA || !commitsB)
            continue; // defensive; both sides were counted
        const record = {
            fileA,
            fileB,
            sharedCommits: accum.shared,
            confidenceAB: accum.shared / commitsA,
            confidenceBA: accum.shared / commitsB,
            lastSeen: accum.lastSeen,
        };
        if (indexed(fileA))
            pushTo(cochanges, fileA, record);
        if (indexed(fileB))
            pushTo(cochanges, fileB, record);
    }
    for (const [path, list] of cochanges) {
        // Strongest coupling first; ties broken by partner path for determinism.
        list.sort((a, b) => b.sharedCommits - a.sharedCommits ||
            comparePaths(partnerOf(a, path), partnerOf(b, path)));
        if (list.length > COCHANGES_PER_FILE_CAP) {
            cochanges.set(path, list.slice(0, COCHANGES_PER_FILE_CAP));
        }
    }
    const hotspots = [...counts.entries()]
        .filter(([path]) => indexed(path))
        .sort((a, b) => b[1] - a[1] || comparePaths(a[0], b[0]))
        .slice(0, HOTSPOTS_KEPT)
        .map(([path]) => path);
    return { counts, cochanges, hotspots, commitCount };
}
// Returns the path on the other side of a co-change record from `selfPath`.
export function partnerOf(record, selfPath) {
    return record.fileA === selfPath ? record.fileB : record.fileA;
}
// Guarantees a non-empty prefix ends with '/', so that 'packages/app' cannot
// match 'packages/app2/...' and stripped paths carry no leading slash.
function normalizePrefix(pathPrefix) {
    if (pathPrefix.length === 0 || pathPrefix.endsWith('/'))
        return pathPrefix;
    return `${pathPrefix}/`;
}
// Parses one NUL-delimited chunk into { timestampMs, files }, or returns null
// when its first line is not an epoch-seconds integer.
function parseRecord(chunk, prefix) {
    const lines = chunk.split('\n');
    const header = lines[0].trim();
    if (!EPOCH_SECONDS.test(header))
        return null;
    const files = new Set();
    for (let i = 1; i < lines.length; i++) {
        let path = lines[i].replace(/\r$/, '');
        if (path.length === 0)
            continue;
        if (prefix.length > 0) {
            path = path.startsWith(prefix)
                ? path.slice(prefix.length)
                : posix.relative(prefix, path);
        }
        if (path.length > 0)
            files.add(path);
    }
    return { timestampMs: Number(header) * 1000, files };
}
// Plain relational string comparison, usable as a sort comparator.
function comparePaths(a, b) {
    return a < b ? -1 : a > b ? 1 : 0;
}
// Appends `value` to the list stored under `key`, creating the list if needed.
function pushTo(map, key, value) {
    const list = map.get(key);
    if (list)
        list.push(value);
    else
        map.set(key, [value]);
}