@getmikk/core 1.8.1 → 1.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,21 +2,147 @@ import * as path from 'node:path'
2
2
  import { createRequire } from 'node:module'
3
3
  import { hashContent } from '../../hash/file-hasher.js'
4
4
  import { BaseParser } from '../base-parser.js'
5
- import type { ParsedFile, ParsedFunction, ParsedClass, ParsedParam, ParsedImport, ParsedExport, ParsedRoute } from '../types.js'
5
+ import type { ParsedFile, ParsedFunction, ParsedClass, ParsedParam, ParsedImport } from '../types.js'
6
6
  import * as Queries from './queries.js'
7
7
 
8
8
  // Safely require web-tree-sitter via CJS
9
9
  const getRequire = () => {
10
- if (typeof require !== 'undefined') return require;
11
- return createRequire(import.meta.url);
12
- };
13
- const _require = getRequire();
10
+ if (typeof require !== 'undefined') return require
11
+ return createRequire(import.meta.url)
12
+ }
13
+ const _require = getRequire()
14
14
  const ParserModule = _require('web-tree-sitter')
15
15
  const Parser = ParserModule.Parser || ParserModule
16
16
 
17
+ // ---------------------------------------------------------------------------
18
+ // Language-specific export visibility rules
19
+ // ---------------------------------------------------------------------------
20
+
21
/**
 * Return the part of a definition's source text that precedes the declared
 * name, i.e. where visibility modifiers live.
 *
 * Occurrences of the name directly after `@` or `[` are ignored so that an
 * annotation/attribute sharing the name does not cut the header short. When
 * the name cannot be located, the text before the first `{` is used instead
 * (or the whole text if there is no brace).
 */
function declarationHeader(name: string, nodeText: string): string {
    if (name !== '') {
        const escaped = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
        const pattern = new RegExp('(?<![\\w$@\\[])' + escaped + '(?![\\w$])')
        const at = nodeText.search(pattern)
        if (at >= 0) return nodeText.slice(0, at)
    }
    const brace = nodeText.indexOf('{')
    return brace >= 0 ? nodeText.slice(0, brace) : nodeText
}

/**
 * Decide whether a function or type definition is exported, using each
 * language's own convention.
 *
 * - Python: public unless the name starts with an underscore.
 * - Java / C#: the `public` keyword must appear before the declared name.
 * - Rust: the `pub` keyword (including forms such as `pub(crate)`) must
 *   appear before the declared name.
 * - Go: exported when the name starts with an uppercase letter.
 * - Everything else (C, C++, PHP, Ruby, ...): `false`, no reliable static rule.
 *
 * Only the declaration header is inspected for keywords. Looking at the whole
 * node text would flag a non-public class as exported as soon as any member,
 * string or nested item in its body mentions `public` / `pub`.
 *
 * @param ext      File extension including the leading dot (e.g. `.go`).
 * @param name     Identifier being defined.
 * @param nodeText Source text of the definition node (header and body).
 * @returns `true` when the definition is considered exported.
 */
function isExportedByLanguage(ext: string, name: string, nodeText: string): boolean {
    switch (ext) {
        case '.py':
            return !name.startsWith('_')
        case '.java':
        case '.cs':
            return /\bpublic\b/.test(declarationHeader(name, nodeText))
        case '.go': {
            // Uppercase-initial check that rejects caseless characters such as `_` or digits.
            const first = name.charAt(0)
            return first !== '' && first === first.toUpperCase() && first !== first.toLowerCase()
        }
        case '.rs':
            return /\bpub\b/.test(declarationHeader(name, nodeText))
        default:
            return false
    }
}
43
+
44
+ // ---------------------------------------------------------------------------
45
+ // Parameter extraction from tree-sitter nodes
46
+ // ---------------------------------------------------------------------------
47
+
48
/**
 * Best-effort parameter extraction from a function definition node.
 *
 * Walks the definition's descendants looking for parameter nodes and records
 * one entry per distinct parameter name, in source order. The walk does not
 * enter the function body, so parameters of nested functions or lambdas and
 * keyword arguments of calls made inside the body are not attributed to the
 * enclosing function.
 *
 * @param defNode Tree-sitter node of the function/method definition
 *                (untyped; only `type`, `text` and `children` are read).
 * @returns The parameters found; an empty array when the node is missing,
 *          has no children, or the walk fails. Never throws.
 */
function extractParamsFromNode(defNode: any): ParsedParam[] {
    const params: ParsedParam[] = []
    if (!defNode || !defNode.children) return params

    // Node types that denote a declared parameter. Call-site nodes
    // (`keyword_argument`) and member declarations (`field_declaration`) are
    // deliberately absent: they are not parameters of the function.
    const parameterTypes = new Set<string>([
        'parameter', 'formal_parameter', 'simple_parameter',
        'variadic_parameter', 'typed_parameter', 'typed_default_parameter',
    ])
    // Node types assumed to hold a function body across the bundled grammars
    // (NOTE(review): list is best-effort; an unlisted body type simply falls
    // back to walking the body as before).
    const bodyTypes = new Set<string>([
        'block', 'compound_statement', 'statement_block',
        'body_statement', 'function_body', 'constructor_body',
    ])
    const seen = new Set<string>()

    const walk = (node: any): void => {
        if (!node) return
        const kind: string = node.type ?? ''
        if (parameterTypes.has(kind)) {
            // Name comes from the first identifier-like child; fall back to the raw text.
            const identNode = findFirstChild(node, n => n.type === 'identifier' || n.type === 'name')
            const typeNode = findFirstChild(node, n =>
                n.type === 'type' || n.type === 'type_annotation' ||
                n.type === 'type_identifier' || n.type === 'predefined_type'
            )
            const name: string = identNode?.text ?? node.text ?? ''
            const type: string = typeNode?.text ?? 'any'
            if (name !== '' && !seen.has(name)) {
                seen.add(name)
                params.push({ name, type, optional: false })
            }
            return // Don't recurse into parameter children
        }
        // Stop at the body: everything below belongs to inner scopes.
        if (node !== defNode && bodyTypes.has(kind)) return
        if (node.children) {
            for (const child of node.children) walk(child)
        }
    }

    try {
        walk(defNode)
    } catch {
        // Best-effort contract: a malformed node yields "no parameters", not a crash.
        return []
    }
    return params
}
88
+
89
/**
 * Return the first direct child of `node` accepted by `predicate`.
 *
 * Only immediate children are examined (no recursion). Yields `null` when
 * the node is missing, has no `children`, or no child matches.
 */
function findFirstChild(node: any, predicate: (n: any) => boolean): any {
    const kids = node?.children
    if (!kids) {
        return null
    }
    for (const candidate of kids) {
        const accepted = predicate(candidate)
        if (accepted) {
            return candidate
        }
    }
    return null
}
96
+
97
+ // ---------------------------------------------------------------------------
98
+ // Scope-aware call resolver
99
+ // ---------------------------------------------------------------------------
100
+
101
/**
 * Attribute each recorded call to the function it was made from.
 *
 * For every `{ name, line }` entry, the owner is the function whose
 * `startLine..endLine` range contains the line and is the shortest such
 * range (the first one wins on ties). The call name is appended to the
 * owner's `calls` unless it is already listed there.
 *
 * @param functions   Functions with `startLine` / `endLine` already set; their
 *                    `calls` arrays are mutated in place.
 * @param callEntries List of call names with the 1-based line they occur on.
 * @returns Names of calls that fall outside every function (module scope),
 *          in input order and without de-duplication.
 */
function assignCallsToFunctions(
    functions: ParsedFunction[],
    callEntries: Array<{ name: string; line: number }>
): string[] {
    const moduleScope: string[] = []

    for (const call of callEntries) {
        let owner: ParsedFunction | null = null
        let ownerSpan = Infinity

        for (const candidate of functions) {
            const inside = call.line >= candidate.startLine && call.line <= candidate.endLine
            if (!inside) continue

            const span = candidate.endLine - candidate.startLine
            if (!(span < ownerSpan)) continue

            owner = candidate
            ownerSpan = span
        }

        if (owner === null) {
            moduleScope.push(call.name)
            continue
        }
        if (owner.calls.indexOf(call.name) === -1) {
            owner.calls.push(call.name)
        }
    }

    return moduleScope
}
137
+
138
+ // ---------------------------------------------------------------------------
139
+ // Main parser class
140
+ // ---------------------------------------------------------------------------
141
+
17
142
  export class TreeSitterParser extends BaseParser {
18
143
  private parser: any = null
19
144
  private languages = new Map<string, any>()
145
+ private nameCounter = new Map<string, number>()
20
146
 
21
147
  getSupportedExtensions(): string[] {
22
148
  return ['.py', '.java', '.c', '.cpp', '.cc', '.h', '.hpp', '.cs', '.go', '.rs', '.php', '.rb']
@@ -30,12 +156,12 @@ export class TreeSitterParser extends BaseParser {
30
156
  }
31
157
 
32
158
  async parse(filePath: string, content: string): Promise<ParsedFile> {
159
+ this.nameCounter.clear()
33
160
  await this.init()
34
161
  const ext = path.extname(filePath).toLowerCase()
35
162
  const config = await this.getLanguageConfig(ext)
36
163
 
37
164
  if (!config || !config.lang) {
38
- // Fallback to empty if language not supported or grammar failed to load
39
165
  return this.buildEmptyFile(filePath, content, ext)
40
166
  }
41
167
 
@@ -47,7 +173,10 @@ export class TreeSitterParser extends BaseParser {
47
173
  const functions: ParsedFunction[] = []
48
174
  const classesMap = new Map<string, ParsedClass>()
49
175
  const imports: ParsedImport[] = []
50
- const calls = new Set<string>()
176
+ // callEntries stores name + line so we can scope them to the right function
177
+ const callEntries: Array<{ name: string; line: number }> = []
178
+ // Track processed function IDs to avoid collisions from overloads
179
+ const seenFnIds = new Set<string>()
51
180
 
52
181
  for (const match of matches) {
53
182
  const captures: Record<string, any> = {}
@@ -55,13 +184,17 @@ export class TreeSitterParser extends BaseParser {
55
184
  captures[c.name] = c.node
56
185
  }
57
186
 
58
- // Calls
187
+ // --- Calls: record name and line position ---
59
188
  if (captures['call.name']) {
60
- calls.add(captures['call.name'].text)
189
+ const callNode = captures['call.name']
190
+ callEntries.push({
191
+ name: callNode.text,
192
+ line: (callNode.startPosition?.row ?? 0) + 1,
193
+ })
61
194
  continue
62
195
  }
63
196
 
64
- // Imports
197
+ // --- Imports ---
65
198
  if (captures['import.source']) {
66
199
  const src = captures['import.source'].text.replace(/['"]/g, '')
67
200
  imports.push({
@@ -69,31 +202,52 @@ export class TreeSitterParser extends BaseParser {
69
202
  resolvedPath: '',
70
203
  names: [],
71
204
  isDefault: false,
72
- isDynamic: false
205
+ isDynamic: false,
73
206
  })
74
207
  continue
75
208
  }
76
209
 
77
- // Functions / Methods
210
+ // --- Functions / Methods ---
78
211
  if (captures['definition.function'] || captures['definition.method']) {
79
212
  const nameNode = captures['name']
80
213
  const defNode = captures['definition.function'] || captures['definition.method']
81
-
214
+
82
215
  if (nameNode && defNode) {
83
216
  const fnName = nameNode.text
217
+ const startLine = defNode.startPosition.row + 1
218
+ const endLine = defNode.endPosition.row + 1
219
+ const nodeText = defNode.text ?? ''
220
+ const count = (this.nameCounter.get(fnName) ?? 0) + 1
221
+ this.nameCounter.set(fnName, count)
222
+
223
+ // Unique ID: use stable format with counter for collisions
224
+ let fnId = count === 1 ? `fn:${filePath}:${fnName}` : `fn:${filePath}:${fnName}#${count}`
225
+ if (seenFnIds.has(fnId)) {
226
+ continue
227
+ }
228
+ seenFnIds.add(fnId)
229
+
230
+ const exported = isExportedByLanguage(ext, fnName, nodeText)
231
+ const isAsync = /\basync\b/.test(nodeText)
232
+
233
+ // Detect return type — language-specific heuristics
234
+ const returnType = extractReturnType(ext, defNode)
235
+
236
+ const params = extractParamsFromNode(defNode)
237
+
84
238
  functions.push({
85
- id: `fn:${filePath}:${fnName}`,
239
+ id: fnId,
86
240
  name: fnName,
87
241
  file: filePath,
88
- startLine: defNode.startPosition.row + 1,
89
- endLine: defNode.endPosition.row + 1,
90
- params: [],
91
- returnType: 'any',
92
- isExported: true, // simplified for universal parser
93
- isAsync: false,
94
- calls: [], // We aggregate at file level currently
95
- hash: hashContent(defNode.text),
96
- purpose: '',
242
+ startLine,
243
+ endLine,
244
+ params,
245
+ returnType,
246
+ isExported: exported,
247
+ isAsync,
248
+ calls: [], // populated after all functions are collected
249
+ hash: hashContent(nodeText),
250
+ purpose: extractDocComment(content, startLine),
97
251
  edgeCasesHandled: [],
98
252
  errorHandling: [],
99
253
  detailedLines: [],
@@ -101,46 +255,71 @@ export class TreeSitterParser extends BaseParser {
101
255
  }
102
256
  }
103
257
 
104
- // Classes / Structs / Interfaces
105
- if (captures['definition.class'] || captures['definition.struct'] || captures['definition.interface']) {
258
+ // --- Classes / Structs / Interfaces ---
259
+ if (
260
+ captures['definition.class'] ||
261
+ captures['definition.struct'] ||
262
+ captures['definition.interface']
263
+ ) {
106
264
  const nameNode = captures['name']
107
- const defNode = captures['definition.class'] || captures['definition.struct'] || captures['definition.interface']
108
-
265
+ const defNode =
266
+ captures['definition.class'] ||
267
+ captures['definition.struct'] ||
268
+ captures['definition.interface']
269
+
109
270
  if (nameNode && defNode) {
110
271
  const clsName = nameNode.text
111
- if (!classesMap.has(clsName)) {
112
- classesMap.set(clsName, {
113
- id: `cls:${filePath}:${clsName}`,
272
+ const startLine = defNode.startPosition.row + 1
273
+ const endLine = defNode.endPosition.row + 1
274
+ const nodeText = defNode.text ?? ''
275
+ const clsId = `class:${filePath}:${clsName}` // consistent with ts-extractor
276
+
277
+ if (!classesMap.has(clsId)) {
278
+ classesMap.set(clsId, {
279
+ id: clsId,
114
280
  name: clsName,
115
281
  file: filePath,
116
- startLine: defNode.startPosition.row + 1,
117
- endLine: defNode.endPosition.row + 1,
282
+ startLine,
283
+ endLine,
118
284
  methods: [],
119
- isExported: true,
285
+ isExported: isExportedByLanguage(ext, clsName, nodeText),
120
286
  })
121
287
  }
122
288
  }
123
289
  }
124
290
  }
125
291
 
126
- // Attach global calls to the first function as a heuristic, or store in a dummy
127
- if (functions.length > 0) {
128
- functions[0].calls = Array.from(calls)
129
- }
292
+ // Assign calls to their enclosing function scopes.
293
+ const unassignedCalls = assignCallsToFunctions(functions, callEntries)
130
294
 
131
- let finalLang: ParsedFile['language'] = 'go'
132
- switch (ext) {
133
- case '.py': finalLang = 'python'; break
134
- case '.java': finalLang = 'java'; break
135
- case '.c': case '.h': finalLang = 'c'; break
136
- case '.cpp': case '.cc': case '.hpp': finalLang = 'cpp'; break
137
- case '.cs': finalLang = 'csharp'; break
138
- case '.go': finalLang = 'go'; break
139
- case '.rs': finalLang = 'rust'; break
140
- case '.php': finalLang = 'php'; break
141
- case '.rb': finalLang = 'ruby'; break
295
+ // Only add a synthetic module-level function if there are actually calls made outside any function.
296
+ if (unassignedCalls.length > 0) {
297
+ const lineCount = content.split('\n').length
298
+ functions.push({
299
+ id: `fn:${filePath}:<module>:1`,
300
+ name: '<module>',
301
+ file: filePath,
302
+ startLine: 1,
303
+ endLine: lineCount || 1,
304
+ params: [],
305
+ returnType: 'void',
306
+ isExported: false, // Don't export the synthetic module function
307
+ isAsync: false,
308
+ calls: Array.from(new Set(unassignedCalls)),
309
+ hash: '',
310
+ purpose: 'Module-level initialization code',
311
+ edgeCasesHandled: [],
312
+ errorHandling: [],
313
+ detailedLines: [],
314
+ })
142
315
  }
143
316
 
317
+ const finalLang = extensionToLanguage(ext)
318
+
319
+ // Link methods: functions whose names contain '.' belong to a class
320
+ // (Go receiver methods, Java/C# member methods detected via method capture)
321
+ linkMethodsToClasses(functions, classesMap)
322
+
144
323
  return {
145
324
  path: filePath,
146
325
  language: finalLang,
@@ -148,45 +327,42 @@ export class TreeSitterParser extends BaseParser {
148
327
  classes: Array.from(classesMap.values()),
149
328
  generics: [],
150
329
  imports,
151
- exports: functions.map(f => ({ name: f.name, type: 'function', file: filePath })),
330
+ exports: functions.filter(f => f.isExported).map(f => ({
331
+ name: f.name,
332
+ type: 'function' as const,
333
+ file: filePath,
334
+ })),
152
335
  routes: [],
153
336
  hash: hashContent(content),
154
- parsedAt: Date.now()
337
+ parsedAt: Date.now(),
155
338
  }
156
339
  }
157
340
 
158
- resolveImports(files: ParsedFile[], projectRoot: string): ParsedFile[] {
159
- // Universal resolver: just link absolute paths if they exist locally
160
- // Basic heuristic for all 11 languages
341
+ resolveImports(files: ParsedFile[], _projectRoot: string): ParsedFile[] {
342
+ // Tree-sitter resolver: no cross-file resolution implemented.
343
+ // Imports are left with resolvedPath = '' which signals unresolved to the graph builder.
344
+ // A future pass can resolve Go/Python/Java imports using language-specific rules.
161
345
  return files
162
346
  }
163
347
 
164
348
  private buildEmptyFile(filePath: string, content: string, ext: string): ParsedFile {
165
- let finalLang: ParsedFile['language'] = 'unknown'
166
- switch (ext) {
167
- case '.py': finalLang = 'python'; break
168
- case '.java': finalLang = 'java'; break
169
- case '.c': case '.h': finalLang = 'c'; break
170
- case '.cpp': case '.cc': case '.hpp': finalLang = 'cpp'; break
171
- case '.cs': finalLang = 'csharp'; break
172
- case '.go': finalLang = 'go'; break
173
- case '.rs': finalLang = 'rust'; break
174
- case '.php': finalLang = 'php'; break
175
- case '.rb': finalLang = 'ruby'; break
176
- }
177
349
  return {
178
350
  path: filePath,
179
- language: finalLang,
180
- functions: [], classes: [], generics: [], imports: [], exports: [], routes: [],
351
+ language: extensionToLanguage(ext),
352
+ functions: [],
353
+ classes: [],
354
+ generics: [],
355
+ imports: [],
356
+ exports: [],
357
+ routes: [],
181
358
  hash: hashContent(content),
182
- parsedAt: Date.now()
359
+ parsedAt: Date.now(),
183
360
  }
184
361
  }
185
362
 
186
363
  private async loadLang(name: string): Promise<any> {
187
364
  if (this.languages.has(name)) return this.languages.get(name)
188
365
  try {
189
- // Get module root path to locate wasms
190
366
  const tcPath = _require.resolve('tree-sitter-wasms/package.json')
191
367
  const wasmPath = path.join(path.dirname(tcPath), 'out', `tree-sitter-${name}.wasm`)
192
368
  const lang = await Parser.Language.load(wasmPath)
@@ -226,3 +402,114 @@ export class TreeSitterParser extends BaseParser {
226
402
  }
227
403
  }
228
404
  }
405
+
406
+ // ---------------------------------------------------------------------------
407
+ // Helpers
408
+ // ---------------------------------------------------------------------------
409
+
410
/**
 * Map a file extension (with leading dot, matched case-sensitively) to the
 * language tag stored on a parsed file. Extensions not listed here yield
 * `'unknown'`.
 */
function extensionToLanguage(ext: string): ParsedFile['language'] {
    if (ext === '.py') return 'python'
    if (ext === '.java') return 'java'
    if (ext === '.c' || ext === '.h') return 'c'
    if (ext === '.cpp' || ext === '.cc' || ext === '.hpp') return 'cpp'
    if (ext === '.cs') return 'csharp'
    if (ext === '.go') return 'go'
    if (ext === '.rs') return 'rust'
    if (ext === '.php') return 'php'
    if (ext === '.rb') return 'ruby'
    return 'unknown'
}
424
+
425
/** Index of the quote closing the string that opens at `open` (or `text.length` if unterminated). */
function closingQuoteIndex(text: string, open: number): number {
    const quote = text.charAt(open)
    let i = open + 1
    while (i < text.length && text.charAt(i) !== quote) {
        if (text.charAt(i) === '\\') i++ // skip the escaped character
        i++
    }
    return i
}

/** Index just past the bracket group opening at `open`, or -1 when it never closes. */
function endOfBracketGroup(text: string, open: number, quotes: string): number {
    let depth = 0
    for (let i = open; i < text.length; i++) {
        const ch = text.charAt(i)
        if (quotes.includes(ch)) {
            i = closingQuoteIndex(text, i)
        } else if (ch === '(' || ch === '[' || ch === '{') {
            depth++
        } else if (ch === ')' || ch === ']' || ch === '}') {
            depth--
            if (depth <= 0) return i + 1
        }
    }
    return -1
}

/**
 * Index of the `(` that opens the parameter list, searching from `from`.
 * Parentheses inside `<...>` / `[...]` (generic parameters) are skipped.
 * Returns -1 when a body or statement end is reached first.
 */
function startOfParameterList(text: string, from: number): number {
    let depth = 0
    for (let i = from; i < text.length; i++) {
        const ch = text.charAt(i)
        if (ch === '-' && text.charAt(i + 1) === '>') {
            i++ // an arrow's `>` is not a closing angle bracket
        } else if (ch === '<' || ch === '[') {
            depth++
        } else if (ch === '>' || ch === ']') {
            if (depth > 0) depth--
        } else if (depth === 0) {
            if (ch === '(') return i
            if (ch === '{' || ch === ';') return -1
        }
    }
    return -1
}

/** Return type written as `-> Type` right after the parameter list; '' when absent. */
function arrowReturnType(ext: string, text: string): string {
    // `'` only delimits strings in Python; in Rust it introduces lifetimes.
    const quotes = ext === '.py' ? '\'"' : '"'
    const keyword = ext === '.py' ? /\bdef\b/ : ext === '.rs' ? /\bfn\b/ : null
    const from = keyword ? Math.max(text.search(keyword), 0) : 0

    const open = startOfParameterList(text, from)
    if (open < 0) return ''
    const close = endOfBracketGroup(text, open, quotes)
    if (close < 0) return ''

    // Allow trailing qualifier words (e.g. C++ `const noexcept`) before the arrow.
    const arrow = /^\s*(?:[A-Za-z_]\w*\s+)*->\s*/.exec(text.slice(close))
    if (!arrow) return ''

    const start = close + arrow[0].length
    const stops = ext === '.py' ? ':' : '{;'
    let depth = 0
    let end = start
    for (; end < text.length; end++) {
        const ch = text.charAt(end)
        if (quotes.includes(ch)) {
            end = closingQuoteIndex(text, end)
        } else if (ch === '-' && text.charAt(end + 1) === '>') {
            end++
        } else if (ch === '(' || ch === '[' || ch === '<') {
            depth++
        } else if (ch === ')' || ch === ']' || ch === '>') {
            if (depth === 0) break // left the signature
            depth--
        } else if (depth === 0 && stops.includes(ch)) {
            break
        }
    }

    let type = text.slice(start, end).replace(/\s+/g, ' ').trim()
    if (ext === '.rs') type = (type.split(/\swhere\b/)[0] ?? '').trim()
    return type
}

/** Result type of a Go function/method declaration; '' when it has none. */
function goResultType(text: string): string {
    const quotes = '"`'
    const head = /^\s*func\b\s*/.exec(text)
    if (!head) return ''

    let pos = head[0].length
    if (text.charAt(pos) === '(') {
        // Method: skip the receiver so it is not mistaken for the parameter list.
        pos = endOfBracketGroup(text, pos, quotes)
        if (pos < 0) return ''
    }
    const open = startOfParameterList(text, pos)
    if (open < 0) return ''
    const close = endOfBracketGroup(text, open, quotes)
    if (close < 0) return ''

    const after = text.slice(close).replace(/^[ \t]+/, '')
    if (after.startsWith('(')) {
        const tupleEnd = endOfBracketGroup(after, 0, quotes)
        return tupleEnd < 0 ? '' : after.slice(0, tupleEnd).trim()
    }

    let depth = 0
    let end = after.length
    for (let i = 0; i < after.length; i++) {
        const ch = after.charAt(i)
        if (ch === '(' || ch === '[') {
            depth++
        } else if (ch === ')' || ch === ']') {
            if (depth > 0) depth--
        } else if (depth === 0 && ch === '{') {
            // `interface{...}` / `struct{...}` braces belong to the type, any other brace opens the body.
            if (/(?:interface|struct)\s*$/.test(after.slice(0, i))) {
                const typeEnd = endOfBracketGroup(after, i, quotes)
                if (typeEnd < 0) { end = i; break }
                i = typeEnd - 1
            } else {
                end = i
                break
            }
        } else if (depth === 0 && ch === '\n') {
            end = i
            break
        }
    }
    return after.slice(0, end).trim()
}

/**
 * Extract a return type hint from a function definition's source text.
 *
 * Only the signature is inspected, never the body:
 * - Go (`.go`): the result that follows the parameter list, either a single
 *   type (`error`) or a parenthesised tuple (`(int, error)`); a method
 *   receiver is skipped first.
 * - Any other extension: a type written as `-> Type` directly after the
 *   parameter list (Python, Rust, C++ trailing return types). A Rust `where`
 *   clause is dropped.
 *
 * Falls back to 'unknown' rather than 'any' to distinguish "not parsed"
 * from "genuinely untyped". Java/C#-style prefix return types are not
 * handled and yield 'unknown'.
 *
 * @param ext     File extension including the leading dot.
 * @param defNode Definition node; only its `text` property is read.
 * @returns The type text with whitespace collapsed, or 'unknown'.
 */
function extractReturnType(ext: string, defNode: any): string {
    const raw: unknown = defNode?.text
    const text = typeof raw === 'string' ? raw : ''
    if (text === '') return 'unknown'

    const found = ext === '.go' ? goResultType(text) : arrowReturnType(ext, text)
    return found !== '' ? found : 'unknown'
}
446
+
447
/**
 * Extract a one-line summary from the comment that ends on the line
 * immediately above a definition.
 *
 * Recognised forms:
 * - a line comment starting with `# `, `// ` or `/// ` (its text is returned);
 * - a block comment whose closing marker ends that line, on one line or
 *   spanning several. The first meaningful line of the block is returned,
 *   i.e. the summary rather than a trailing tag line. Decoration-only lines
 *   (runs of `-`, `_`, `=`, `*`) are skipped.
 *
 * A block comment is only accepted when its opening line starts with the
 * comment opener, so code followed by a trailing comment is not mistaken for
 * documentation.
 *
 * @param content   Full file content.
 * @param startLine 1-based line on which the definition starts.
 * @returns The summary text, or '' when no such comment is found.
 */
function extractDocComment(content: string, startLine: number): string {
    const lines = content.split('\n')
    const targetIdx = startLine - 2 // 0-indexed line before the function
    if (targetIdx < 0 || targetIdx >= lines.length) return ''

    const prev = (lines[targetIdx] ?? '').trim()

    // Single-line comment styles
    for (const prefix of ['# ', '// ', '/// ']) {
        if (prev.startsWith(prefix)) return prev.slice(prefix.length).trim()
    }

    if (!prev.endsWith('*/')) return ''

    // Walk up to the line that opens this block comment.
    let openIdx = -1
    for (let i = targetIdx; i >= 0; i--) {
        const line = (lines[i] ?? '').trim()
        if (line.startsWith('/*')) {
            openIdx = i
            break
        }
        // Hitting the end of an earlier comment means ours has no proper opener.
        if (i !== targetIdx && line.includes('*/')) break
    }
    if (openIdx < 0) return ''

    // Scan top-down so the summary wins over later lines such as @param/@return.
    for (let i = openIdx; i <= targetIdx; i++) {
        let body = (lines[i] ?? '').trim()
        if (body.startsWith('/*')) body = body.slice(2)
        if (body.endsWith('*/')) body = body.slice(0, -2)
        const cleaned = body.trim().replace(/^\*+\s?/, '').trim()
        if (cleaned && !/^[\-_=*]{3,}$/.test(cleaned)) return cleaned
    }
    return ''
}
473
+
474
/**
 * Attach functions that lie inside a class's line range to that class's
 * `methods` array (best-effort heuristic).
 *
 * A function is linked when:
 * - it is not the synthetic `<module>` function and its name has no `.`
 *   (dotted names are treated as already categorised);
 * - it is not nested inside another real function (local helpers stay out);
 * - it starts after a class's first line and ends no later than its last.
 *
 * When several classes qualify, the one with the smallest line range wins.
 * The function objects are pushed by reference and are not removed from
 * `functions`; a function already present in the class (same `id`) is not
 * added twice.
 *
 * @param functions  All functions of the file, possibly including `<module>`.
 * @param classesMap Classes of the file; their `methods` arrays are mutated.
 */
function linkMethodsToClasses(
    functions: ParsedFunction[],
    classesMap: Map<string, ParsedClass>
): void {
    const classes = Array.from(classesMap.values())
    if (classes.length === 0) return

    // The synthetic <module> function spans the whole file. It must not act as
    // an enclosing function, otherwise every function would look nested and
    // nothing would ever be linked.
    const realFunctions = functions.filter(f => f.name !== '<module>')

    for (const fn of realFunctions) {
        // Already categorised if name contains "." (e.g. "MyClass.method")
        if (fn.name.includes('.')) continue

        // Skip functions nested inside other functions (local helpers).
        // Containment must be strict: two entries with the same line range are
        // not nested in each other.
        const isNestedInFunction = realFunctions.some(outer =>
            outer.id !== fn.id &&
            outer.startLine <= fn.startLine && fn.endLine <= outer.endLine &&
            (outer.startLine < fn.startLine || fn.endLine < outer.endLine)
        )
        if (isNestedInFunction) continue

        // Find the innermost (smallest range) class that contains this function
        let bestCls: ParsedClass | null = null
        let bestRange = Infinity
        for (const cls of classes) {
            if (fn.startLine > cls.startLine && fn.endLine <= cls.endLine) {
                const range = cls.endLine - cls.startLine
                if (range < bestRange) {
                    bestCls = cls
                    bestRange = range
                }
            }
        }
        if (bestCls && !bestCls.methods.some(m => m.id === fn.id)) {
            bestCls.methods.push(fn)
        }
    }
}
@@ -15,6 +15,7 @@ export interface ParsedFunction {
15
15
  id: string // "fn:auth/verify.ts:verifyToken"
16
16
  name: string // "verifyToken"
17
17
  file: string // "src/auth/verify.ts"
18
+ moduleId?: string
18
19
  startLine: number // 14
19
20
  endLine: number // 28
20
21
  params: ParsedParam[] // [{name: "token", type: "string"}]
@@ -52,6 +53,7 @@ export interface ParsedClass {
52
53
  id: string
53
54
  name: string
54
55
  file: string
56
+ moduleId?: string
55
57
  startLine: number
56
58
  endLine: number
57
59
  methods: ParsedFunction[]