@massu/core 1.5.7 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,317 @@
1
+ // src/detect/adapters/tree-sitter-loader.ts
2
+ import { createHash } from "crypto";
3
+ import {
4
+ mkdirSync,
5
+ readdirSync,
6
+ readFileSync,
7
+ writeFileSync,
8
+ renameSync,
9
+ unlinkSync,
10
+ lstatSync,
11
+ chmodSync,
12
+ utimesSync
13
+ } from "fs";
14
+ import { homedir } from "os";
15
+ import { dirname, join } from "path";
16
+ import { Language, Parser } from "web-tree-sitter";
17
/**
 * Raised when the SHA-256 of grammar bytes (read from the cache or freshly
 * downloaded) differs from the digest pinned in the manifest.
 *
 * Exposes the offending `language`, the `expected` digest and the `actual`
 * digest so callers can report or inspect the mismatch.
 */
var GrammarSHAMismatchError = class extends Error {
  language;
  expected;
  actual;
  /**
   * @param language Grammar identifier whose bytes failed verification.
   * @param expected Hex digest the manifest pins.
   * @param actual Hex digest computed from the bytes at hand.
   */
  constructor(language, expected, actual) {
    const message = `[tree-sitter-loader] SHA-256 mismatch for grammar "${language}". Expected ${expected}, got ${actual}. REFUSING to load \u2014 see Phase 3.5 audit attack vector #3.`;
    super(message);
    Object.assign(this, {
      name: "GrammarSHAMismatchError",
      language,
      expected,
      actual
    });
  }
};
31
/**
 * Raised when a grammar cannot be obtained at all (no manifest entry, no
 * fetch implementation, or the download failed).
 *
 * The message tells the user that regex introspection is used instead for
 * files in that language. The original failure is kept on `cause`.
 */
var GrammarUnavailableError = class extends Error {
  language;
  cause;
  /**
   * @param language Grammar identifier that could not be loaded.
   * @param cause Underlying failure: an Error, any other truthy value
   *   (stringified into the message), or a falsy value for the generic text.
   */
  constructor(language, cause) {
    let causeMsg;
    if (cause instanceof Error) {
      causeMsg = cause.message;
    } else if (cause) {
      causeMsg = String(cause);
    } else {
      causeMsg = "no cached grammar and download failed";
    }
    super(
      `[tree-sitter-loader] Grammar for "${language}" is unavailable: ${causeMsg}. Falling back to regex introspection for files in ${language}.`
    );
    Object.assign(this, { name: "GrammarUnavailableError", language, cause });
  }
};
44
/**
 * Raised when the cache entry for a grammar exists but is a symlink or some
 * other non-regular file; such entries are never read.
 */
var GrammarCacheSymlinkError = class extends Error {
  cachePath;
  /**
   * @param cachePath Path of the rejected cache entry.
   */
  constructor(cachePath) {
    const message = `[tree-sitter-loader] Refusing to load grammar \u2014 cache path "${cachePath}" is a symlink or non-regular file. (Phase 3.5 finding #3 \u2014 symlink attack vector.)`;
    super(message);
    Object.assign(this, { name: "GrammarCacheSymlinkError", cachePath });
  }
};
54
/**
 * Raised when a manifest entry points at a URL that does not use the
 * `https://` scheme; such URLs are never fetched.
 */
var GrammarUrlNotHttpsError = class extends Error {
  url;
  /**
   * @param url The rejected download URL.
   */
  constructor(url) {
    const message = `[tree-sitter-loader] Refusing to download grammar from non-HTTPS URL: ${url}. Only https:// URLs are accepted. (Phase 3.5 finding #3.)`;
    super(message);
    Object.assign(this, { name: "GrammarUrlNotHttpsError", url });
  }
};
64
/**
 * Pinned grammar manifest: language identifier -> `{ url, sha256, version }`.
 *
 * Every entry comes from the same pinned `tree-sitter-wasms` release on
 * unpkg, so the URL and version are derived from one place and only the file
 * stem and the SHA-256 digest vary per language.
 */
var GRAMMAR_MANIFEST = (() => {
  const version = "0.1.13";
  // Builds one manifest entry; `fileStem` is the part of the unpkg filename
  // between `tree-sitter-` and `.wasm`.
  const entry = (fileStem, sha256) => ({
    url: `https://unpkg.com/tree-sitter-wasms@${version}/out/tree-sitter-${fileStem}.wasm`,
    sha256,
    version
  });
  return {
    // The v1 four.
    python: entry("python", "9056d0fb0c337810d019fae350e8167786119da98f0f282aceae7ab89ee8253b"),
    typescript: entry("typescript", "8515404dceed38e1ed86aa34b09fcf3379fff1b4ff9dd3967bcd6d1eb5ac3d8f"),
    javascript: entry("javascript", "63812b9e275d26851264734868d27a1656bd44a2ef6eb3e85e6b03728c595ab5"),
    swift: entry("swift", "41c4fdb2249a3aa6d87eed0d383081ff09725c2248b4977043a43825980ffcc7"),
    // ----------------------------------------------------------------
    // Plan 3c Phase 7 expansion (2026-05-07):
    //
    // Six additional grammars to support the registry-verified framework
    // adapters (go-chi, rails, aspnet, spring, ktor, phoenix) plus the
    // bundled adapters in the same language families (gin/echo/fiber,
    // sinatra, etc.). They use the SAME pinned tree-sitter-wasms version
    // as the v1 four to keep the dependency surface single-source.
    //
    // SHA-256s computed 2026-05-07 via:
    //   curl -fsSL <url> | shasum -a 256
    //
    // The unpkg filename for C# uses an underscore (`c_sharp`) while the
    // TreeSitterLanguage identifier uses no separator (`csharp`). The map
    // key is the type identifier and the URL is the storage path, so the
    // two do NOT need to match. This is intentional and validated by the
    // manifest shape test in tree-sitter-loader-manifest.test.ts.
    // ----------------------------------------------------------------
    go: entry("go", "9963ca89b616eaf04b08a43bc1fb0f07b85395bec313330851f1f1ead2f755b6"),
    ruby: entry("ruby", "93a5022855314cdb45458c7bb026a24a0ebc3a5ff6439e542e881f14dfa13a39"),
    csharp: entry("c_sharp", "6266a7e32d68a3459104d994dc848df15d5672b0ea8e86d327274b694f8e6991"),
    java: entry("java", "637aac4415fb39a211a4f4292d63c66b5ce9c32fa2cd35464af4f681d91b9a1f"),
    kotlin: entry("kotlin", "b5cb00c8d06ed0f10f1dbe497205b437809d7e87db1f638721a8cfb30e044449"),
    elixir: entry("elixir", "82e91b9759ddca30d8978ebbfa8e347b4451b64c931f9ae62112e6db9b8fac20")
  };
})();
136
/**
 * Resolve the directory that holds cached grammar `.wasm` files.
 *
 * `MASSU_WASM_CACHE_DIR` wins when it is set to a non-blank value and is
 * returned unmodified. An unset, empty or whitespace-only value falls back to
 * `~/.massu/wasm-cache`; without that guard an empty value would make every
 * cache path relative to the current working directory.
 *
 * @returns The cache directory path.
 */
function getCacheDir() {
  const override = process.env.MASSU_WASM_CACHE_DIR;
  if (override !== undefined && override.trim() !== "") {
    return override;
  }
  return join(homedir(), ".massu", "wasm-cache");
}
139
/**
 * Build the cache file path for one grammar. The digest is part of the file
 * name (`<language>-<sha>.wasm`), so different pinned digests map to
 * different files.
 *
 * @param language Grammar identifier.
 * @param sha Hex digest used in the file name.
 * @returns Path inside the cache directory.
 */
function getCachedPath(language, sha) {
  const fileName = `${language}-${sha}.wasm`;
  return join(getCacheDir(), fileName);
}
142
/** Number of cached grammar files kept when no valid override is configured. */
var DEFAULT_CACHE_RETAIN_COUNT = 16;

/**
 * Read how many cached grammar files to retain.
 *
 * `MASSU_WASM_CACHE_RETAIN` is honoured when it converts to a finite number
 * in the range 1..1024 (fractions are floored). Anything else, including an
 * unset or empty variable, yields the default.
 *
 * @returns The retain count as an integer.
 */
function getCacheRetainCount() {
  const raw = process.env.MASSU_WASM_CACHE_RETAIN;
  if (!raw) {
    return DEFAULT_CACHE_RETAIN_COUNT;
  }
  const parsed = Number(raw);
  const usable = Number.isFinite(parsed) && parsed >= 1 && parsed <= 1024;
  return usable ? Math.floor(parsed) : DEFAULT_CACHE_RETAIN_COUNT;
}
151
/**
 * Set a cache file's access and modification times to "now". Eviction orders
 * files by mtime, so touching a file marks it as recently used.
 *
 * Best-effort: any failure is ignored.
 *
 * @param path File whose timestamps are refreshed.
 */
function touchCacheFile(path) {
  const stamp = /* @__PURE__ */ new Date();
  try {
    utimesSync(path, stamp, stamp);
  } catch {
    // Deliberately ignored: a failed touch only affects eviction ordering.
  }
}
158
/**
 * Trim the grammar cache to at most `retain` `.wasm` files, deleting the
 * ones with the oldest modification time first.
 *
 * Only regular files are candidates. Symlinks and other non-regular entries
 * are reported on stderr and left alone. An unreadable cache directory, a
 * failed `lstat` and a failed delete are all ignored.
 *
 * @param retain Maximum number of cached grammars to keep; defaults to the
 *   configured retain count.
 */
function evictBeyondRetainCount(retain = getCacheRetainCount()) {
  const cacheDir = getCacheDir();
  let names;
  try {
    names = readdirSync(cacheDir);
  } catch {
    return;
  }
  const regularWasmFiles = [];
  names
    .filter((entryName) => entryName.endsWith(".wasm"))
    .forEach((entryName) => {
      const path = join(cacheDir, entryName);
      let info;
      try {
        info = lstatSync(path);
      } catch {
        return;
      }
      if (!info.isSymbolicLink() && info.isFile()) {
        regularWasmFiles.push({ path, mtimeMs: info.mtimeMs });
        return;
      }
      console.error(
        `[tree-sitter-loader] cache eviction skipped non-regular file: ${path} (possible symlink attack \u2014 see Phase 3.5 finding F-008).`
      );
    });
  if (regularWasmFiles.length <= retain) {
    return;
  }
  // Newest first, so everything past index `retain` is the oldest surplus.
  regularWasmFiles.sort((left, right) => right.mtimeMs - left.mtimeMs);
  regularWasmFiles.slice(retain).forEach((surplus) => {
    try {
      unlinkSync(surplus.path);
    } catch {
      // Best-effort delete.
    }
  });
}
193
/**
 * Test hook: runs cache eviction with the given retain count, forwarding the
 * argument unchanged (so `undefined` selects the configured default).
 *
 * @param retain Maximum number of cached grammars to keep.
 */
function _evictCacheForTest(retain) {
  const keep = retain;
  evictBeyondRetainCount(keep);
}
196
/**
 * Compute the SHA-256 digest of the given data.
 *
 * @param bytes Data to hash (anything `Hash.update` accepts).
 * @returns Lowercase hexadecimal digest.
 */
function sha256(bytes) {
  const hasher = createHash("sha256");
  hasher.update(bytes);
  return hasher.digest("hex");
}
199
/** Memoised `Parser.init()` promise; `null` until the first call or after a failed attempt. */
var parserInitPromise = null;

/**
 * Initialise the tree-sitter runtime once and share the result.
 *
 * Calls made while initialisation is pending, or after it succeeded, get the
 * same promise. A failed attempt is not memoised: the rejection is passed to
 * the callers of that attempt and the slot is cleared, so a later call
 * retries instead of replaying the old failure forever.
 *
 * @returns Promise that settles when `Parser.init()` settles.
 */
async function ensureParserInitialized() {
  if (parserInitPromise) return parserInitPromise;
  const attempt = Parser.init().catch((err) => {
    // Only clear the slot if it still refers to this attempt.
    if (parserInitPromise === attempt) {
      parserInitPromise = null;
    }
    throw err;
  });
  parserInitPromise = attempt;
  return attempt;
}
205
/** Grammars already loaded in this process, keyed by language identifier. */
var loadedGrammars = /* @__PURE__ */ new Map();

/**
 * Read a grammar from the on-disk cache and verify it against the pinned digest.
 *
 * @returns The verified bytes, or `null` when there is no usable entry
 *   (missing, unreadable or empty file).
 * @throws GrammarCacheSymlinkError when the entry is a symlink or non-regular file.
 * @throws GrammarSHAMismatchError when the cached bytes do not match `expectedSha`.
 */
function readVerifiedCacheBytes(language, expectedSha, cachePath) {
  let cacheLstat;
  try {
    cacheLstat = lstatSync(cachePath);
  } catch {
    return null;
  }
  if (cacheLstat.isSymbolicLink() || !cacheLstat.isFile()) {
    throw new GrammarCacheSymlinkError(cachePath);
  }
  let bytes;
  try {
    bytes = readFileSync(cachePath);
  } catch {
    // An unreadable entry is treated like a cache miss.
    return null;
  }
  if (bytes.byteLength === 0) {
    return null;
  }
  const actualSha = sha256(bytes);
  if (actualSha !== expectedSha) {
    throw new GrammarSHAMismatchError(language, expectedSha, actualSha);
  }
  return bytes;
}

/**
 * Download a grammar from its manifest URL and verify it against the pinned digest.
 *
 * @returns The verified bytes as a `Uint8Array`.
 * @throws GrammarUrlNotHttpsError when the URL is not `https://`.
 * @throws GrammarUnavailableError when no fetch implementation exists or the request fails.
 * @throws GrammarSHAMismatchError when the downloaded bytes do not match the manifest.
 */
async function downloadVerifiedGrammar(language, manifest, options) {
  if (!/^https:\/\//i.test(manifest.url)) {
    throw new GrammarUrlNotHttpsError(manifest.url);
  }
  const fetchImpl = options.fetchImpl ?? globalThis.fetch;
  if (!fetchImpl) {
    throw new GrammarUnavailableError(
      language,
      new Error("No fetch implementation available (Node < 18?)")
    );
  }
  let body;
  try {
    const res = await fetchImpl(manifest.url);
    if (!res.ok) {
      throw new Error(`HTTP ${res.status ?? "unknown"} from ${manifest.url}`);
    }
    body = new Uint8Array(await res.arrayBuffer());
  } catch (e) {
    throw new GrammarUnavailableError(language, e);
  }
  const downloadedSha = sha256(body);
  if (downloadedSha !== manifest.sha256) {
    throw new GrammarSHAMismatchError(language, manifest.sha256, downloadedSha);
  }
  return body;
}

/**
 * Best-effort write of verified grammar bytes into the cache, followed by eviction.
 *
 * The bytes go to a per-process temp file that is then renamed over the final
 * path. The temp file is created exclusively (`wx`), so an entry planted at
 * that path (for example a symlink) is never written through. It is removed
 * whenever the write or the rename fails, so no partial file is left behind.
 * Any failure is logged and swallowed; the caller still loads from memory.
 */
function persistGrammarToCache(language, cachePath, body) {
  try {
    const cacheDir = dirname(cachePath);
    mkdirSync(cacheDir, { recursive: true, mode: 0o700 });
    try {
      // mkdirSync does not change the mode of a directory that already exists.
      chmodSync(cacheDir, 0o700);
    } catch {
    }
    const tmpPath = `${cachePath}.tmp.${process.pid}`;
    try {
      try {
        // Clear a stale temp file left by an earlier run that reused this pid.
        unlinkSync(tmpPath);
      } catch {
      }
      writeFileSync(tmpPath, body, { mode: 0o600, flag: "wx" });
      try {
        chmodSync(tmpPath, 0o600);
      } catch {
      }
      renameSync(tmpPath, cachePath);
    } catch (e) {
      try {
        unlinkSync(tmpPath);
      } catch {
      }
      throw e;
    }
    try {
      chmodSync(cachePath, 0o600);
    } catch {
    }
    evictBeyondRetainCount();
  } catch (e) {
    console.error(
      `[tree-sitter-loader] cache write failed for ${language}: ${e instanceof Error ? e.message : String(e)} \u2014 loading directly from memory.`
    );
  }
}

/**
 * Load the Tree-sitter grammar for `language`, using (in order) the
 * in-process map, the verified on-disk cache, and finally a verified
 * download that is then cached.
 *
 * @param language Grammar identifier (a key of the manifest).
 * @param options Optional overrides:
 *   `manifestOverride` — per-language manifest entries that take precedence
 *   over `GRAMMAR_MANIFEST`; `fetchImpl` — fetch function used instead of
 *   `globalThis.fetch`.
 * @returns The loaded `Language`.
 * @throws GrammarUnavailableError no manifest entry, no fetch, or download failure.
 * @throws GrammarCacheSymlinkError the cache entry is a symlink or non-regular file.
 * @throws GrammarSHAMismatchError cached or downloaded bytes fail verification.
 * @throws GrammarUrlNotHttpsError the manifest URL is not `https://`.
 */
async function loadGrammar(language, options = {}) {
  await ensureParserInitialized();
  const alreadyLoaded = loadedGrammars.get(language);
  if (alreadyLoaded) return alreadyLoaded;
  const manifest = options.manifestOverride?.[language] ?? GRAMMAR_MANIFEST[language];
  if (!manifest) {
    throw new GrammarUnavailableError(
      language,
      new Error(`No manifest entry for language "${language}". v1 supports: ${Object.keys(GRAMMAR_MANIFEST).join(", ")}.`)
    );
  }
  const cachePath = getCachedPath(language, manifest.sha256);
  const cachedBytes = readVerifiedCacheBytes(language, manifest.sha256, cachePath);
  if (cachedBytes) {
    const fromCache = await Language.load(cachedBytes);
    loadedGrammars.set(language, fromCache);
    // Refresh the mtime so eviction sees this entry as recently used.
    touchCacheFile(cachePath);
    return fromCache;
  }
  const body = await downloadVerifiedGrammar(language, manifest, options);
  persistGrammarToCache(language, cachePath, body);
  const lang = await Language.load(body);
  loadedGrammars.set(language, lang);
  return lang;
}
304
/**
 * Empty the in-process grammar map so the next `loadGrammar` call for any
 * language goes through the cache/download path again. Files on disk are not
 * touched.
 */
function __resetLoadedGrammars() {
  const grammars = loadedGrammars;
  grammars.clear();
}
307
+ export {
308
+ GRAMMAR_MANIFEST,
309
+ GrammarCacheSymlinkError,
310
+ GrammarSHAMismatchError,
311
+ GrammarUnavailableError,
312
+ GrammarUrlNotHttpsError,
313
+ __resetLoadedGrammars,
314
+ _evictCacheForTest,
315
+ ensureParserInitialized,
316
+ loadGrammar
317
+ };
@@ -0,0 +1,151 @@
1
+ /**
2
+ * Plan 3b — Phase 1: AST Adapter contract types.
3
+ *
4
+ * Lives at `packages/core/src/detect/adapters/types.ts`. All types are local —
5
+ * NONE re-exported from `web-tree-sitter`.
6
+ *
7
+ * Adapter authors import from this module only; the runner (`runner.ts`)
8
+ * orchestrates execution and the loader (`tree-sitter-loader.ts`) handles
9
+ * grammar acquisition.
10
+ *
11
+ * Per-field confidence is enforced (NOT per-adapter): a single weak field
12
+ * MUST NOT poison the rest. The runner consumes `confidence` per-adapter for
13
+ * the moment, but the merge rule reads each `conventions[field]` against the
14
+ * provenance trail to decide what survives.
15
+ */
16
/**
 * Closed set of Tree-sitter grammar names that massu ships first-party
 * adapters for.
 *
 * This is a plain string-literal union declared locally; it is NOT
 * re-exported from `web-tree-sitter` (which exposes `Language` as a class,
 * not a list of names). Phase 1 ships adapters for
 * python/typescript/javascript/swift only — the remaining names are reserved
 * for Plan 3c.
 */
export type TreeSitterLanguage =
  | 'python'
  | 'typescript'
  | 'javascript'
  | 'swift'
  | 'rust'
  | 'go'
  | 'ruby'
  | 'php'
  | 'java'
  | 'kotlin'
  | 'elixir'
  | 'erlang'
  | 'csharp'
  | 'cpp'
  | 'haskell'
  | 'ocaml';
25
/**
 * Read-only bundle of project signals, assembled by the runner BEFORE any
 * adapter is dispatched.
 *
 * Because everything is gathered up-front, an adapter's `matches()` can be
 * answered cheaply from this bundle without doing file IO.
 */
export interface DetectionSignals {
  /** Parsed `package.json` (root or first workspace); undefined if absent. */
  packageJson?: Record<string, unknown>;
  /** Parsed `pyproject.toml`; undefined if absent. */
  pyprojectToml?: Record<string, unknown>;
  /** Raw text of `Gemfile`; undefined if absent. */
  gemfile?: string;
  /** Parsed `Cargo.toml`; undefined if absent. */
  cargoToml?: Record<string, unknown>;
  /** Raw text of `go.mod`; undefined if absent. */
  goMod?: string;
  /** Raw text of `mix.exs` (the Elixir/Mix manifest); undefined if absent. */
  mixExs?: string;
  /**
   * Raw text of the FIRST `.csproj` file found at the project root (.NET
   * project file, XML); undefined if absent. Adapters that need more than
   * one csproj (multi-project solutions) can re-scan from `presentFiles`.
   */
  csproj?: string;
  /** Raw text of `pom.xml` (the Maven build manifest); undefined if absent. */
  pomXml?: string;
  /**
   * Raw text of `build.gradle` OR `build.gradle.kts`, whichever exists at
   * the project root. When both are present the `.kts` (Kotlin DSL) file is
   * preferred (Kotlin DSL is the modern default per Gradle 7+ docs).
   * Undefined if neither exists.
   */
  gradleBuild?: string;
  /** Names of the directories directly under the project root (one level). */
  presentDirs: Set<string>;
  /** Basenames of the files directly under the project root (one level). */
  presentFiles: Set<string>;
}
65
/**
 * One sampled source file, as handed to an adapter by the runner.
 *
 * The runner reads the file ahead of time and supplies the text in
 * `content`; adapters MUST NOT re-read it from disk inside `introspect()`.
 * `size` is the pre-read length in bytes.
 */
export interface SourceFile {
  path: string;
  content: string;
  language: TreeSitterLanguage;
  size: number;
}
77
/**
 * Audit-trail entry, produced for every captured field. Reading
 * `detected.<adapter>._provenance` lets a user see exactly which file, line
 * and query produced a value.
 */
export interface Provenance {
  field: string;
  sourceFile: string;
  line: number;
  query: string;
}
88
/** What a single adapter returns from `introspect()`. */
export interface AdapterResult {
  /**
   * Extracted conventions; this object becomes `detected.<adapter.id>` in
   * `massu.config.yaml`. Field names are chosen by the adapter and values
   * are `unknown` so strings, arrays or nested records can all be returned.
   */
  conventions: Record<string, unknown>;
  /**
   * Per-field provenance trail. The runner writes it to
   * `detected.<adapter.id>._provenance` so a downstream auditor can verify
   * any extracted value.
   */
  provenance: Array<Provenance>;
  /**
   * Confidence level of the extraction:
   * - 'high'  : single canonical match, query produced exactly one result
   * - 'medium': multiple matches, all agree
   * - 'low'   : multiple matches with disagreement (still emitted, with warning)
   * - 'none'  : no matches, timed out, or threw — fields are dropped
   */
  confidence: 'high' | 'medium' | 'low' | 'none';
}
109
/** Contract every codebase adapter implements. */
export interface CodebaseAdapter {
  /** Stable adapter id, e.g. "python-fastapi"; names the `detected.<id>` block. */
  id: string;
  /** Languages this adapter consumes. The runner uses this to skip work. */
  languages: Array<TreeSitterLanguage>;
  /**
   * Cheap pre-check against the signal bundle — must NOT do file IO.
   * Returns true if any signal suggests this adapter should run.
   */
  matches(signals: DetectionSignals): boolean;
  /**
   * Run AST queries over the sampled files (already read by the runner) and
   * return the extracted conventions. May throw — the runner isolates
   * failures.
   */
  introspect(files: SourceFile[], rootDir: string): Promise<AdapterResult>;
}
125
/**
 * Output of the runner: adapter id → that adapter's conventions block, with
 * the `_provenance` map merged in. The introspector then folds this into the
 * `detected.<adapter.id>` namespace, alongside the existing
 * `detected.python` / `detected.swift` / `detected.typescript` regex blocks.
 */
export interface MergedAdapterOutput {
  /** Resolved conventions, keyed by adapter id. */
  byAdapter: Record<string, AdapterResolved>;
  /** Ids of adapters that were skipped (didn't match); kept for diagnostic logging. */
  skipped: Array<string>;
  /** Adapters that threw during introspect — the runner isolates these. */
  errored: { adapterId: string; error: string }[];
}
142
/**
 * An `AdapterResult` after resolution and merging. The provenance list is
 * folded into `_provenance`: one key per field, whose value has the form
 * `path:line :: query`.
 */
export interface AdapterResolved {
  conventions: Record<string, unknown>;
  /** field-name -> "relativePath:line :: queryName". Empty when no fields. */
  _provenance: Record<string, string>;
  confidence: 'high' | 'medium' | 'low' | 'none';
}
File without changes