universal-ast-mapper 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CHANGELOG.md +9 -0
  2. package/README.md +261 -12
  3. package/dist/ai-refactor.js +185 -0
  4. package/dist/ai-testgen.js +105 -0
  5. package/dist/analysis.js +134 -0
  6. package/dist/arch-rules.js +82 -0
  7. package/dist/callgraph.js +467 -0
  8. package/dist/check.js +112 -0
  9. package/dist/cli.js +2284 -0
  10. package/dist/complexity.js +98 -0
  11. package/dist/config.js +53 -0
  12. package/dist/contextpack.js +79 -0
  13. package/dist/coupling.js +35 -0
  14. package/dist/covmerge.js +176 -0
  15. package/dist/crosslang.js +425 -0
  16. package/dist/dashboard.js +259 -0
  17. package/dist/diagram.js +264 -0
  18. package/dist/diskcache.js +97 -0
  19. package/dist/docgen.js +156 -0
  20. package/dist/embeddings.js +136 -0
  21. package/dist/explain.js +123 -0
  22. package/dist/explorer.js +123 -0
  23. package/dist/extractors/c.js +204 -0
  24. package/dist/extractors/common.js +56 -0
  25. package/dist/extractors/cpp.js +272 -0
  26. package/dist/extractors/csharp.js +209 -0
  27. package/dist/extractors/go.js +212 -0
  28. package/dist/extractors/java.js +152 -0
  29. package/dist/extractors/kotlin.js +159 -0
  30. package/dist/extractors/php.js +208 -0
  31. package/dist/extractors/python.js +153 -0
  32. package/dist/extractors/ruby.js +146 -0
  33. package/dist/extractors/rust.js +249 -0
  34. package/dist/extractors/swift.js +192 -0
  35. package/dist/extractors/typescript.js +577 -0
  36. package/dist/fix.js +92 -0
  37. package/dist/gitdiff.js +178 -0
  38. package/dist/graph-analysis.js +279 -0
  39. package/dist/graph.js +165 -0
  40. package/dist/history.js +36 -0
  41. package/dist/html.js +658 -0
  42. package/dist/incremental.js +122 -0
  43. package/dist/index.js +1945 -0
  44. package/dist/indexstore.js +105 -0
  45. package/dist/layers.js +36 -0
  46. package/dist/lsp.js +238 -0
  47. package/dist/modulecoupling.js +0 -0
  48. package/dist/parser.js +84 -0
  49. package/dist/patch.js +199 -0
  50. package/dist/plugins.js +88 -0
  51. package/dist/pool.js +114 -0
  52. package/dist/prompts.js +67 -0
  53. package/dist/registry.js +87 -0
  54. package/dist/report.js +441 -0
  55. package/dist/resolver.js +222 -0
  56. package/dist/roots.js +47 -0
  57. package/dist/search.js +68 -0
  58. package/dist/security.js +178 -0
  59. package/dist/semantic.js +365 -0
  60. package/dist/serve.js +185 -0
  61. package/dist/sfc.js +27 -0
  62. package/dist/similar.js +98 -0
  63. package/dist/skeleton.js +132 -0
  64. package/dist/smells.js +285 -0
  65. package/dist/sourcemap.js +60 -0
  66. package/dist/testgen.js +280 -0
  67. package/dist/testmap.js +167 -0
  68. package/dist/tsconfig.js +212 -0
  69. package/dist/typeflow.js +124 -0
  70. package/dist/types.js +5 -0
  71. package/dist/unused-params.js +127 -0
  72. package/dist/webapp.js +341 -0
  73. package/dist/worker.js +27 -0
  74. package/dist/workspace.js +330 -0
  75. package/package.json +2 -1
package/dist/roots.js ADDED
@@ -0,0 +1,47 @@
1
+ import fs from "node:fs";
2
+ import path from "node:path";
3
/**
 * Read the allowed project roots and the "unlocked" switch from the environment.
 *
 * `AST_MAP_ROOT` may list several paths separated by the platform path
 * delimiter; entries are trimmed, blank ones are dropped and the rest are made
 * absolute. When the variable is unset or yields no usable entry, the current
 * working directory is the single root.
 *
 * @param {Record<string, string | undefined>} [env=process.env] Environment to read.
 * @returns {{ roots: string[], unlocked: boolean }} Absolute roots (never empty)
 *   and whether `AST_MAP_UNLOCKED` is exactly "1".
 */
export function parseRootsFromEnv(env = process.env) {
    // Only the env value is a delimiter-separated list. The cwd fallback is one
    // literal path, so it must not be split or trimmed: a delimiter character
    // or edge whitespace in the directory name would otherwise corrupt it.
    const roots = (env.AST_MAP_ROOT ?? "")
        .split(path.delimiter)
        .map((entry) => entry.trim())
        .filter((entry) => entry.length > 0)
        .map((entry) => path.resolve(entry));
    return {
        roots: roots.length > 0 ? roots : [path.resolve(process.cwd())],
        unlocked: env.AST_MAP_UNLOCKED === "1",
    };
}
15
/**
 * Express `abs` relative to `root`, or report that it lies outside of it.
 *
 * @param {string} root Absolute directory path.
 * @param {string} abs Absolute path to test.
 * @returns {string | null} The base name of `abs` when it is the root itself,
 *   the relative path when it is inside the root, `null` otherwise.
 */
function within(root, abs) {
    const rel = path.relative(root, abs);
    if (rel === "")
        return path.basename(abs);
    // A path escapes only when its first segment is exactly "..". A plain
    // prefix test would also reject in-root names such as "..cache".
    const escapes = rel === ".." || rel.startsWith(`..${path.sep}`);
    // path.relative yields an absolute path when no relative route exists
    // (e.g. a different drive on Windows); that is outside the root as well.
    if (escapes || path.isAbsolute(rel))
        return null;
    return rel;
}
23
/**
 * Map a client-supplied path onto one of the configured roots.
 *
 * Relative input is resolved against the first root; the resulting absolute
 * path is then tested against every root in order and the first root that
 * contains it wins. In unlocked mode a path outside all roots is still
 * accepted when it exists: a directory becomes its own root, a file uses its
 * parent directory.
 *
 * @param {string} input Path as supplied by the client.
 * @param {{ roots: string[], unlocked: boolean }} cfg Allowed roots and unlock flag.
 * @returns {{ abs: string, rel: string, root: string }} Resolved location.
 * @throws {Error} When the path is outside every root and unlocked mode is
 *   off, or when unlocked mode is on but the path does not exist.
 */
export function resolvePathInRoots(input, cfg) {
    const abs = path.resolve(cfg.roots[0], input);
    for (let i = 0; i < cfg.roots.length; i += 1) {
        const root = cfg.roots[i];
        const rel = within(root, abs);
        if (rel !== null) {
            return { abs, rel, root };
        }
    }
    if (!cfg.unlocked) {
        const plural = cfg.roots.length > 1 ? "s" : "";
        const message = [
            `Path "${input}" is outside the allowed root${plural} `,
            `(${cfg.roots.join(", ")}). Either set AST_MAP_ROOT to that project `,
            `(multiple roots allowed, separated by "${path.delimiter}"), or set `,
            `AST_MAP_UNLOCKED=1 to allow any absolute path.`,
        ].join("");
        throw new Error(message);
    }
    if (!fs.existsSync(abs)) {
        throw new Error(`Path "${input}" does not exist (resolved to ${abs}).`);
    }
    const isDirectory = fs.statSync(abs).isDirectory();
    return {
        abs,
        rel: path.basename(abs),
        root: isDirectory ? abs : path.dirname(abs),
    };
}
package/dist/search.js ADDED
@@ -0,0 +1,68 @@
1
+ import path from "node:path";
2
+ import { buildSkeleton, collectSourceFiles } from "./skeleton.js";
3
+ import { resolveOptions, loadProjectConfig } from "./config.js";
4
+ // ─── Helpers ──────────────────────────────────────────────────────────────────
5
/**
 * Recursively yield every symbol in a file, including nested ones.
 *
 * Nested symbols get a dot-notation name built from their ancestors
 * ("Class.method"). `exported` defaults to `false`, and `signature` is only
 * present on the result when the symbol has a truthy one.
 *
 * @param {Iterable<object>} symbols Symbols at the current nesting level.
 * @param {string} file File label copied onto every yielded match.
 * @param {string} [parentName] Dotted name of the enclosing symbol, if any.
 */
function* flattenSymbols(symbols, file, parentName) {
    for (const sym of symbols) {
        const fullName = parentName ? `${parentName}.${sym.name}` : sym.name;
        yield {
            file,
            symbol: fullName,
            kind: sym.kind,
            exported: sym.exported ?? false,
            range: sym.range,
            ...(sym.signature ? { signature: sym.signature } : {}),
        };
        // A symbol without a `children` array is treated as a leaf instead of
        // throwing, so one such symbol does not abort the rest of the file.
        if (sym.children?.length > 0) {
            yield* flattenSymbols(sym.children, file, fullName);
        }
    }
}
22
/**
 * Build a name predicate for the requested match mode.
 *
 * - "exact": the name equals the pattern or ends with ".pattern" (case-sensitive).
 * - "regex": the pattern is compiled as a case-insensitive regular expression.
 * - anything else: case-insensitive substring test.
 *
 * @param {string} pattern Text or regular-expression source to look for.
 * @param {string} [matchType] "exact", "regex", or any other value for "contains".
 * @returns {(name: string) => boolean} Predicate over a (possibly dotted) symbol name.
 */
function makeMatcher(pattern, matchType) {
    switch (matchType) {
        case "exact": {
            const dottedSuffix = `.${pattern}`;
            return (name) => name === pattern || name.endsWith(dottedSuffix);
        }
        case "regex": {
            const expression = new RegExp(pattern, "i");
            return (name) => expression.test(name);
        }
        default: {
            const needle = pattern.toLowerCase();
            return (name) => name.toLowerCase().includes(needle);
        }
    }
}
34
/**
 * Find symbols whose name matches `pattern` in every source file under a directory.
 *
 * Nested symbols (methods inside classes, etc.) are visited too and reported
 * with dot-notation names. A file whose skeleton cannot be built is skipped
 * without failing the search.
 *
 * @param dirAbs   Absolute path of the directory to scan.
 * @param pattern  Name to search for (interpreted per `matchType`).
 * @param root     Project root; file paths are made relative to it.
 * @param options  `matchType` (default "contains"), `kind` filter,
 *                 `exportedOnly` (default false), `detail` (default "outline").
 * @returns Matches in file order, then symbol order within each file.
 */
export async function searchSymbols(dirAbs, pattern, root, options = {}) {
    const { matchType = "contains", kind, exportedOnly = false, detail = "outline" } = options;
    const nameMatches = makeMatcher(pattern, matchType);
    // Name first, then kind, then export status.
    const accepts = (candidate) => nameMatches(candidate.symbol) &&
        !(kind && candidate.kind !== kind) &&
        !(exportedOnly && !candidate.exported);
    const opts = resolveOptions({ detail, emitHtml: false }, loadProjectConfig(root));
    const results = [];
    for (const file of collectSourceFiles(dirAbs, opts)) {
        // Report paths with forward slashes on every platform.
        const fileRel = path.relative(root, file).split(path.sep).join("/");
        try {
            const skel = await buildSkeleton(file, fileRel, opts);
            for (const candidate of flattenSymbols(skel.symbols, skel.file)) {
                if (accepts(candidate)) {
                    results.push(candidate);
                }
            }
        }
        catch {
            // skip unreadable / unparseable files
        }
    }
    return results;
}
package/dist/security.js ADDED
@@ -0,0 +1,178 @@
1
+ // ─── Static security scanner ──────────────────────────────────────────────────
2
+ // Line-by-line regex scanning — no AST needed. Finds dangerous patterns
3
+ // in source code across JavaScript, TypeScript, Python, etc.
4
+ // ─── Rule definitions ─────────────────────────────────────────────────────────
5
// Each row is [id, severity, message, pattern, extras?]. `extras` carries the
// optional `exclude` (suppresses a matching line) and `fileFilter` (restricts
// the rule to matching paths) regexes. Rows are expanded into rule objects in
// the order written here.
export const SECURITY_RULES = [
    [
        "eval",
        "critical",
        "Use of eval() allows arbitrary code execution",
        // a call: eval(
        /\beval\s*\(/,
        // ...unless the line also has a member access on eval (eval.call, eval.toString)
        { exclude: /\beval\s*\.\s*\w+/ },
    ],
    [
        "inner-html",
        "high",
        "Direct assignment to innerHTML can lead to XSS",
        // plain `.innerHTML =` only; `+=` and `==` do not match
        /\.innerHTML\s*=[^=+]/,
    ],
    [
        "document-write",
        "high",
        "document.write() can overwrite the page and lead to XSS",
        /\bdocument\s*\.\s*write\s*\(/,
    ],
    [
        "dangerously-set-inner-html",
        "high",
        "dangerouslySetInnerHTML bypasses React's XSS protection",
        /dangerouslySetInnerHTML/,
    ],
    [
        "child-process",
        "medium",
        "Use of child_process module can lead to command injection if inputs are not sanitized",
        // require("child_process") or an import line mentioning child_process
        /require\s*\(\s*['"]child_process['"]\s*\)|import\s+.*\bchild_process\b/,
    ],
    [
        "shell-exec",
        "high",
        "exec/execSync with a non-literal argument is vulnerable to command injection",
        // first argument starts with a backtick or `$`, or is a bare word
        // followed by `+`, `,` or `)`
        /\b(?:exec|execSync)\s*\(\s*(?:[`$]|\w+\s*[+,)])/,
    ],
    [
        "weak-crypto",
        "medium",
        "MD5 and SHA-1 are cryptographically weak and should not be used for security purposes",
        /createHash\s*\(\s*['"](?:md5|sha1)['"]\s*\)/i,
    ],
    [
        "hardcoded-secret",
        "high",
        "Hardcoded secret/credential detected",
        // a password/secret/api_key/apiKey/token/passwd name assigned (= or :)
        // a quoted literal of 8+ characters
        /(?:password|secret|api_key|apiKey|token|passwd)\s*[=:]\s*['"][^'"]{8,}['"]/i,
        // the line is skipped when it contains a common placeholder marker
        { exclude: /(?:your[-_]?key|xxx+|changeme|placeholder|example|test|dummy|sample|fake|mock|<|>|\*)/i },
    ],
    [
        "sql-injection",
        "high",
        "SQL query built with string concatenation may be vulnerable to injection",
        // query( or execute( followed by a `+` before the first `)` on the line
        /\b(?:query|execute)\s*\(\s*[`"']?[^)]*\+/,
    ],
    [
        "http-url",
        "low",
        "Hardcoded HTTP (non-HTTPS) URL detected",
        // quoted http:// URL, except a few local / documentation hosts
        /['"`]http:\/\/(?!(?:localhost|127\.0\.0\.1|0\.0\.0\.0|example\.com|schema\.org))/,
    ],
    [
        "no-rate-limit",
        "medium",
        "Express route handler without apparent rate limiting",
        // .get( / .post( / ... called with a string literal first argument
        /\.\s*(?:get|post|put|patch|delete|all)\s*\(\s*['"`]/,
        // only applies to JS/TS files
        { fileFilter: /\.[jt]sx?$/ },
    ],
    [
        "prototype-pollution",
        "high",
        "Potential prototype pollution via __proto__, constructor.prototype, or unsafe Object.assign",
        /(?:__proto__|constructor\s*\.\s*prototype|Object\.assign\s*\(\s*\{\s*\}[^)]*(?:req|params|body|input|data|user))/,
    ],
].map(([id, severity, message, pattern, extras = {}]) => ({
    id,
    severity,
    message,
    pattern,
    ...extras,
}));
90
+ // ─── Comment-line detection ───────────────────────────────────────────────────
91
/**
 * Tell whether an already-trimmed line starts with a comment marker.
 *
 * Recognised openers are `//`, `#`, `*` (continuation lines of JSDoc / block
 * comments) and `/*`. Only the start of the line is inspected.
 *
 * @param {string} trimmed Line with surrounding whitespace removed.
 * @returns {boolean} True when the line starts with one of the markers.
 */
function isCommentLine(trimmed) {
    const openers = ["//", "#", "*", "/*"];
    return openers.some((opener) => trimmed.startsWith(opener));
}
101
// Ids of rules that must also be applied to comment lines. Empty for now: no
// rule needs it, but adding an id here opts that rule in.
const SCAN_COMMENTS_FOR = new Set();
104
+ // ─── Core scanner ─────────────────────────────────────────────────────────────
105
/**
 * Scan one file's text, line by line, against a set of regex rules.
 *
 * Comment lines are skipped unless a rule id is listed in SCAN_COMMENTS_FOR.
 * Matches of the "no-rate-limit" rule are not reported immediately: they are
 * collected and only reported when no rate-limit keyword appears within five
 * lines on either side. Those deferred issues therefore come after all others
 * in the returned array.
 *
 * @param source  - Full file contents.
 * @param relPath - Relative file path (used for fileFilter matching and issue reporting).
 * @param rules   - Rule set to apply (defaults to SECURITY_RULES).
 * @returns Issues with file, rule, severity, message, 1-based line and a
 *          snippet (trimmed line, at most 120 characters).
 */
export function scanFileForSecurityIssues(source, relPath, rules = SECURITY_RULES) {
    const lines = source.split("\n");
    // Drop rules whose fileFilter rejects this path once, up front.
    const activeRules = rules.filter(({ fileFilter }) => fileFilter === undefined || fileFilter.test(relPath));
    const issues = [];
    // 0-based indices of lines that look like route registrations.
    const routeLines = [];
    const report = (rule, index) => {
        issues.push({
            file: relPath,
            rule: rule.id,
            severity: rule.severity,
            message: rule.message,
            line: index + 1, // 1-based
            snippet: lines[index].trim().slice(0, 120),
        });
    };
    lines.forEach((raw, index) => {
        const inComment = isCommentLine(raw.trim());
        for (const rule of activeRules) {
            if (inComment && !SCAN_COMMENTS_FOR.has(rule.id)) {
                continue;
            }
            const hit = rule.pattern.test(raw) && !(rule.exclude && rule.exclude.test(raw));
            if (!hit) {
                continue;
            }
            if (rule.id === "no-rate-limit") {
                // Needs surrounding context; decided after the main pass.
                routeLines.push(index);
            }
            else {
                report(rule, index);
            }
        }
    });
    // ── no-rate-limit: window check ──────────────────────────────────────────
    const rateLimitRule = activeRules.find((rule) => rule.id === "no-rate-limit");
    if (rateLimitRule && routeLines.length > 0) {
        const WINDOW = 5;
        const rateLimitKeyword = /rateLimit|throttle|limiter/i;
        for (const index of routeLines) {
            const from = Math.max(0, index - WINDOW);
            const to = Math.min(lines.length - 1, index + WINDOW);
            const guarded = lines
                .slice(from, to + 1)
                .some((line) => rateLimitKeyword.test(line));
            if (!guarded) {
                report(rateLimitRule, index);
            }
        }
    }
    return issues;
}
package/dist/semantic.js ADDED
@@ -0,0 +1,365 @@
1
+ /**
2
+ * Semantic symbol search — find symbols by *meaning*, not exact name.
3
+ *
4
+ * No embeddings, no network, no model downloads. Pure lexical semantics:
5
+ * 1. Identifier tokenization — camelCase / PascalCase / snake_case /
6
+ * kebab-case / digits / acronym boundaries ("HTTPServer" → http, server).
7
+ * 2. Concept expansion — a built-in thesaurus of programming
8
+ * synonym groups (fetch≈get≈load≈retrieve, remove≈delete≈destroy, …).
9
+ * 3. Light stemming — plural/gerund/past suffixes folded so
10
+ * "parsing" matches "parse", "users" matches "user".
11
+ * 4. BM25-style ranking — rare tokens weigh more (IDF over the
12
+ * scanned corpus); name hits outweigh doc/signature/path hits;
13
+ * direct hits outweigh synonym hits outweigh fuzzy hits.
14
+ */
15
+ import path from "node:path";
16
+ import { buildSkeleton, collectSourceFiles } from "./skeleton.js";
17
+ import { resolveOptions, loadProjectConfig } from "./config.js";
18
+ // ─── Synonym groups (programming thesaurus) ────────────────────────────────────
19
+ // Tokens in the same group are considered semantically equivalent (at a small
20
+ // penalty vs. a direct match). Keep each group tight — over-broad groups cause
21
+ // noisy results.
22
// One group per row, words separated by spaces. A word may appear in more
// than one row (e.g. "buffer", "join", "identifier").
const SYNONYM_GROUPS = `
    get fetch load retrieve read lookup resolve
    set update write assign put patch modify change edit
    create make build new generate construct init initialize spawn
    delete remove destroy drop clear purge erase
    find search query locate match scan discover
    send dispatch emit publish post broadcast notify
    receive consume subscribe listen handle process
    start begin launch run execute invoke trigger
    stop end halt kill terminate cancel abort shutdown close
    check validate verify test assert ensure confirm
    parse decode deserialize unmarshal extract tokenize
    format encode serialize marshal stringify render print
    convert transform map translate cast normalize
    user account member person profile customer
    auth authenticate login signin authorize session credential
    config configuration settings options preferences setup
    error exception fault failure err panic
    log logger logging trace audit
    cache memo memoize store buffer
    list enumerate all collection array items
    count total sum aggregate tally
    file document path filename
    dir directory folder
    request req call http
    response res reply result output
    message msg event signal
    connect connection link attach bind join
    disconnect detach unbind release unsubscribe
    save persist commit flush sync
    copy clone duplicate snapshot
    merge combine concat union join
    split divide partition chunk segment
    sort order rank arrange
    filter select exclude where
    compare diff equal equals cmp
    compute calculate calc derive evaluate measure
    watch observe monitor track poll
    wait sleep delay debounce throttle defer
    retry attempt backoff
    lock mutex semaphore guard
    queue stack heap pool buffer
    graph tree node edge vertex
    dependency dep import require
    token symbol identifier ident name
    database db storage repository repo dao
    key id identifier uuid guid
    string str text char
    number num int integer float numeric
    boolean bool flag toggle
    helper util utility utils tool common
    test spec mock stub fixture
    render draw paint display show view
    hide conceal mask suppress
    enable activate on
    disable deactivate off
    add insert append push register
    pop shift dequeue take
    circular cycle cyclic loop recursive
    dead unused orphan unreachable stale
    complexity complex cyclomatic cognitive
    coupling cohesion instability afferent efferent
`
    .trim()
    .split("\n")
    .map((row) => row.trim().split(/\s+/));
85
// Word form → indices of the synonym groups that contain it. Every group word
// is registered under its raw spelling and under its stem, so lookups with
// stemmed tokens can still find the group.
const GROUP_OF = new Map();
for (const [groupIndex, words] of SYNONYM_GROUPS.entries()) {
    for (const word of words) {
        const stemmed = stem(word);
        const forms = stemmed === word ? [word] : [word, stemmed];
        for (const form of forms) {
            const owners = GROUP_OF.get(form);
            if (owners === undefined) {
                GROUP_OF.set(form, [groupIndex]);
            }
            else if (!owners.includes(groupIndex)) {
                // A form can belong to several groups; list each group once.
                owners.push(groupIndex);
            }
        }
    }
}
102
+ // ─── Tokenization ──────────────────────────────────────────────────────────────
103
/**
 * Light stemmer: fold common English suffixes so related word forms collapse,
 * e.g. "users"→"user", "files"→"file", "queries"→"query", "mapping"→"map".
 *
 * This is deliberately crude. "-ing"/"-ed" are cut without restoring a
 * dropped "e", so "parsing" and "parsed" become "pars", not "parse".
 *
 * Rules, first match wins:
 *   - "...ies" → "...y"                                    (length > 4)
 *   - "...ing" / "...ed" removed, then a doubled final letter is undoubled
 *                                                          (length > 4)
 *   - "...es" removed only after ss, sh, ch, x or z        (length > 3)
 *   - trailing "s" removed unless the word ends in "ss"    (length > 3)
 *
 * @param {string} word Lowercase word.
 * @returns {string} The stemmed word; unchanged when no rule applies.
 */
export function stem(word) {
    // Cut `count` trailing characters, then collapse a doubled final letter
    // ("mapp" → "map").
    const cutAndUndouble = (count) => {
        const base = word.slice(0, -count);
        const doubled = base.length > 2 && base[base.length - 1] === base[base.length - 2];
        return doubled ? base.slice(0, -1) : base;
    };
    if (word.length > 4) {
        if (word.endsWith("ies"))
            return word.slice(0, -3) + "y";
        if (word.endsWith("ing"))
            return cutAndUndouble(3);
        if (word.endsWith("ed"))
            return cutAndUndouble(2);
    }
    if (word.length > 3) {
        // "es" is a plural ending only after a sibilant ("boxes", "classes",
        // "matches"). Elsewhere the "e" belongs to the word ("files", "nodes"),
        // so only the "s" may go.
        if (/(?:ss|sh|ch|x|z)es$/.test(word))
            return word.slice(0, -2);
        if (word.endsWith("s") && !word.endsWith("ss"))
            return word.slice(0, -1);
    }
    return word;
}
127
/**
 * Break an identifier (or free text) into lowercase word tokens.
 *
 * Any run of characters other than ASCII letters and digits separates chunks.
 * Inside a chunk, boundaries are inserted at lower/digit→upper transitions,
 * between an acronym and a following capitalised word, and between letters
 * and digits: "getHTTPServerByID" → [get, http, server, by, id].
 *
 * @param {string} identifier Text to tokenize.
 * @returns {string[]} Lowercase tokens in order of appearance.
 */
export function splitIdentifier(identifier) {
    const insertBoundaries = (chunk) => chunk
        .replace(/([a-z0-9])([A-Z])/g, "$1 $2") // aA, 9A
        .replace(/([A-Z]+)([A-Z][a-z])/g, "$1 $2") // HTTPServer → HTTP Server
        .replace(/([A-Za-z])([0-9])/g, "$1 $2") // letter → digit
        .replace(/([0-9])([A-Za-z])/g, "$1 $2"); // digit → letter
    return identifier
        .split(/[^A-Za-z0-9]+/)
        .filter((chunk) => chunk !== "")
        .flatMap((chunk) => insertBoundaries(chunk).split(" "))
        .filter((word) => word !== "")
        .map((word) => word.toLowerCase());
}
150
/**
 * Levenshtein distance between `a` and `b`, giving up early once it is
 * certain to exceed `max`.
 *
 * @param {string} a First string.
 * @param {string} b Second string.
 * @param {number} max Largest distance the caller cares about.
 * @returns {number} `max + 1` when the early exits fire (length difference
 *   above `max`, or a whole row above `max`); otherwise the exact distance.
 */
function editDistance(a, b, max) {
    const cols = b.length;
    if (Math.abs(a.length - cols) > max) {
        return max + 1;
    }
    // Two rolling rows of the DP table; `above` starts as the row for "".
    let above = Array.from({ length: cols + 1 }, (_, j) => j);
    let row = new Array(cols + 1);
    for (let i = 1; i <= a.length; i += 1) {
        row[0] = i;
        let smallest = i;
        for (let j = 1; j <= cols; j += 1) {
            const substitution = above[j - 1] + (a[i - 1] === b[j - 1] ? 0 : 1);
            const cell = Math.min(above[j] + 1, row[j - 1] + 1, substitution);
            row[j] = cell;
            if (cell < smallest) {
                smallest = cell;
            }
        }
        // Row minima never decrease, so the final distance cannot recover.
        if (smallest > max) {
            return max + 1;
        }
        // The finished row becomes `above`; the old one is fully overwritten
        // on the next pass.
        [above, row] = [row, above];
    }
    return above[cols];
}
174
/**
 * Whether two tokens are listed together in at least one synonym group.
 *
 * @param {string} a First token.
 * @param {string} b Second token.
 * @returns {boolean} False when either token is in no group at all.
 */
function sharesGroup(a, b) {
    const groupsOfA = GROUP_OF.get(a);
    // Only look up `b` when `a` is known to belong to some group.
    const groupsOfB = groupsOfA && GROUP_OF.get(b);
    if (!groupsOfB) {
        return false;
    }
    return groupsOfA.some((groupIndex) => groupsOfB.includes(groupIndex));
}
183
// Weight of a token by the field it was found in; name hits count the most.
const FIELD_WEIGHT = {
    name: 3,
    doc: 2,
    signature: 1.5,
    path: 1,
    kind: 1,
};
184
/**
 * Record a stemmed token on a doc, keeping the highest weight seen for it.
 *
 * Tokens whose stem is shorter than two characters are ignored.
 *
 * @param {{ tokens: Map<string, number> }} doc Doc being built.
 * @param {string} raw Unstemmed token.
 * @param {number} weight Field weight of this occurrence.
 */
function addToken(doc, raw, weight) {
    const token = stem(raw);
    if (token.length < 2) {
        return;
    }
    const known = doc.tokens.get(token);
    // Keep the stored weight unless this occurrence is strictly heavier.
    if (known !== undefined && !(weight > known)) {
        return;
    }
    doc.tokens.set(token, weight);
}
192
/**
 * Walk a symbol tree depth-first, yielding each symbol with its dotted name.
 *
 * @param {Iterable<object>} symbols Symbols at the current nesting level.
 * @param {string} file File label; only passed along to recursive calls.
 * @param {string} [parentName] Dotted name of the enclosing symbol, if any.
 */
function* flattenDocs(symbols, file, parentName) {
    for (const sym of symbols) {
        const fullName = parentName ? `${parentName}.${sym.name}` : sym.name;
        yield { sym, fullName };
        // A symbol without a `children` array is treated as a leaf instead of
        // throwing, so one such symbol does not abort the rest of the file.
        if (sym.children?.length > 0)
            yield* flattenDocs(sym.children, file, fullName);
    }
}
200
/**
 * Turn one symbol into a searchable doc: the match record returned to callers
 * plus the weighted token index used for scoring.
 *
 * Tokens are added in the order name, kind, doc comment, signature, path
 * segments. A token seen in several fields keeps its highest field weight.
 *
 * @param {object} sym Symbol (reads name parts via `fullName`, plus kind,
 *   exported, range, signature and doc).
 * @param {string} fullName Dotted symbol name.
 * @param {string} file File path with "/" separators.
 * @returns {{ match: object, tokens: Map<string, number>, nameTokens: Set<string> }}
 */
function buildDoc(sym, fullName, file) {
    const match = {
        file,
        symbol: fullName,
        kind: sym.kind,
        exported: sym.exported ?? false,
        range: sym.range,
    };
    if (sym.signature) {
        match.signature = sym.signature;
    }
    const doc = { match, tokens: new Map(), nameTokens: new Set() };
    // Tokenize `text` and index every token at the given field weight.
    const indexText = (text, weight) => {
        for (const token of splitIdentifier(text)) {
            addToken(doc, token, weight);
        }
    };
    for (const token of splitIdentifier(fullName)) {
        addToken(doc, token, FIELD_WEIGHT.name);
        doc.nameTokens.add(stem(token));
    }
    addToken(doc, sym.kind, FIELD_WEIGHT.kind);
    if (sym.doc) {
        indexText(sym.doc, FIELD_WEIGHT.doc);
    }
    if (sym.signature) {
        indexText(sym.signature, FIELD_WEIGHT.signature);
    }
    for (const segment of file.split("/")) {
        indexText(segment, FIELD_WEIGHT.path);
    }
    return doc;
}
232
+ // ─── Scoring ───────────────────────────────────────────────────────────────────
233
// Multiplier by how a token matched a query concept.
const MATCH_WEIGHT = {
    direct: 1,
    synonym: 0.7,
    fuzzy: 0.45,
};
// Query words that carry no search meaning and are dropped before stemming.
const STOPWORDS = new Set([
    "a an the of in on for to with that this",
    "is are be and or by from at it its as",
    "do does how what which where when i we you",
    "function method code thing stuff something",
].flatMap((row) => row.split(" ")));
241
/** Tokenize a query into unique stemmed concepts, dropping stopwords and stems shorter than 2 chars. */
function extractQueryConcepts(query) {
    const concepts = [];
    for (const raw of splitIdentifier(query)) {
        if (STOPWORDS.has(raw))
            continue;
        const t = stem(raw);
        if (t.length >= 2 && !concepts.includes(t))
            concepts.push(t);
    }
    return concepts;
}
/** Build one searchable doc per symbol under `dirAbs` that passes the kind / exported filters. */
async function collectSymbolDocs(dirAbs, root, kind, exportedOnly) {
    // Detail "full" so doc comments and signatures are available for indexing.
    const opts = resolveOptions({ detail: "full", emitHtml: false }, loadProjectConfig(root));
    const docs = [];
    for (const file of collectSourceFiles(dirAbs, opts)) {
        const fileRel = path.relative(root, file).split(path.sep).join("/");
        try {
            const skel = await buildSkeleton(file, fileRel, opts);
            for (const { sym, fullName } of flattenDocs(skel.symbols, skel.file)) {
                if (kind && sym.kind !== kind)
                    continue;
                if (exportedOnly && !(sym.exported ?? false))
                    continue;
                docs.push(buildDoc(sym, fullName, skel.file));
            }
        }
        catch {
            // Deliberate best-effort: skip unreadable / unparseable files.
        }
    }
    return docs;
}
/** BM25-style IDF per concept, counting docs that contain the concept as a direct token. */
function computeConceptIdf(concepts, docs) {
    const total = docs.length;
    const idf = new Map();
    for (const concept of concepts) {
        let df = 0;
        for (const doc of docs)
            if (doc.tokens.has(concept))
                df++;
        idf.set(concept, Math.log(1 + (total - df + 0.5) / (df + 0.5)));
    }
    return idf;
}
/** True when both words have 4+ characters and are within edit distance 1 of each other. */
function isFuzzyPair(token, concept) {
    return concept.length >= 4 && token.length >= 4 && editDistance(token, concept, 1) <= 1;
}
/** Strongest (match weight × field weight) contribution of any doc token for one concept, with its label. */
function bestConceptMatch(doc, concept) {
    let best = 0;
    let how = null;
    for (const [token, fieldWeight] of doc.tokens) {
        let mw = 0;
        let label = null;
        if (token === concept) {
            mw = MATCH_WEIGHT.direct;
            label = concept;
        }
        else if (sharesGroup(token, concept)) {
            mw = MATCH_WEIGHT.synonym;
            label = `${concept}≈${token}`;
        }
        else if (isFuzzyPair(token, concept)) {
            mw = MATCH_WEIGHT.fuzzy;
            label = `${concept}~${token}`;
        }
        const contribution = mw * fieldWeight;
        // Strictly greater: on a tie the earlier token keeps the label.
        if (contribution > best) {
            best = contribution;
            how = label;
            if (fieldWeight >= FIELD_WEIGHT.name && mw === MATCH_WEIGHT.direct)
                break; // a direct hit in the name cannot be beaten
        }
    }
    return { best, how };
}
/** Score one doc against all concepts; returns the result entry, or null when nothing matched. */
function scoreSymbolDoc(doc, concepts, idf, flatQuery) {
    let score = 0;
    const matchedTerms = [];
    let nameHits = 0;
    for (const concept of concepts) {
        const { best, how } = bestConceptMatch(doc, concept);
        if (best > 0 && how) {
            score += best * (idf.get(concept) ?? 1);
            matchedTerms.push(how);
            if (doc.nameTokens.has(concept))
                nameHits++;
        }
    }
    if (matchedTerms.length === 0)
        return null;
    // Coverage scaling: matching only part of the query halves up to 50% of the score.
    const coverage = matchedTerms.length / concepts.length;
    score *= 0.5 + 0.5 * coverage;
    // Bonus: every concept is a direct name token.
    if (nameHits === concepts.length)
        score *= 1.25;
    // Bonus: the concatenated concepts appear verbatim inside the symbol name.
    if (doc.match.symbol.toLowerCase().includes(flatQuery))
        score *= 1.2;
    // Length normalization: prefer focused names — "login" beats "handleLogin"
    // when both match the same concepts. Penalize name tokens no concept explains.
    let unmatchedNameTokens = 0;
    for (const t of doc.nameTokens) {
        const explained = concepts.some((c) => t === c || sharesGroup(t, c) || isFuzzyPair(t, c));
        if (!explained)
            unmatchedNameTokens++;
    }
    score /= 1 + 0.15 * unmatchedNameTokens;
    return { ...doc.match, score, matchedTerms };
}
/**
 * Search for symbols by meaning across all source files in a directory.
 *
 * The query is tokenized, stopword-filtered and stemmed into concepts. Every
 * symbol is scored by how its name / kind / doc / signature / path tokens
 * match those concepts (directly, via a synonym group, or within one edit),
 * weighted by a BM25-style IDF. Results are sorted by score, then by symbol
 * name, cut to `limit`, and their scores rescaled so the best result is 1.
 *
 * @param dirAbs   Absolute path of directory to scan.
 * @param query    Natural-language-ish query, e.g. "remove expired sessions".
 * @param root     Project root (for relative paths in results).
 * @param options  `limit` (default 20), `kind` filter, `exportedOnly` (default false).
 * @returns Ranked matches, each with `score` (0–1, three decimals) and
 *          `matchedTerms`; empty when the query has no usable concept or no
 *          symbol matches.
 */
export async function semanticSearch(dirAbs, query, root, options = {}) {
    const { limit = 20, kind, exportedOnly = false } = options;
    const concepts = extractQueryConcepts(query);
    if (concepts.length === 0)
        return [];
    const docs = await collectSymbolDocs(dirAbs, root, kind, exportedOnly);
    if (docs.length === 0)
        return [];
    const idf = computeConceptIdf(concepts, docs);
    // Same for every doc, so it is built once rather than once per doc.
    const flatQuery = concepts.join("");
    const scored = [];
    for (const doc of docs) {
        const entry = scoreSymbolDoc(doc, concepts, idf, flatQuery);
        if (entry)
            scored.push(entry);
    }
    scored.sort((a, b) => b.score - a.score || a.symbol.localeCompare(b.symbol));
    const top = scored.slice(0, limit);
    // Normalize scores to 0–1 within the result set.
    const max = top.length > 0 ? top[0].score : 1;
    if (max > 0)
        for (const m of top)
            m.score = Math.round((m.score / max) * 1000) / 1000;
    return top;
}