@mnemonik/shared 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,727 @@
1
+ /**
2
+ * Code Scanner - Parse and chunk source files for embedding
3
+ */
4
+
5
+ import { readdir, readFile, stat, lstat, realpath } from 'fs/promises';
6
+ import { join, relative, extname } from 'path';
7
+ import { createHash } from 'crypto';
8
+ import { debug as logDebug } from './logger.js';
9
+ import { withTimeout } from './asyncUtils.js';
10
+
/**
 * Upper bound, in milliseconds, applied to each wrapped filesystem call
 * (v2.46). Keeps a scan from hanging on a slow or unresponsive filesystem.
 */
const FILE_OP_TIMEOUT_MS = 5 * 1000;
15
+
/**
 * A single piece of a scanned file, ready to be embedded.
 */
export interface CodeChunk {
  /** Text of the chunk. */
  content: string;
  /** Path of the originating file, relative to the scan root. */
  filePath: string;
  /** Language name derived from the file extension (e.g. "typescript"). */
  language: string;
  /** First line covered by the chunk (1-based). */
  startLine: number;
  /** Last line covered by the chunk (1-based, inclusive). */
  endLine: number;
  /** How the chunk was produced: a recognised construct, or a raw line window. */
  chunkType: 'function' | 'class' | 'module' | 'raw';
  /** Truncated SHA-256 hex digest, used for drift detection. */
  contentHash: string;
  /** File-level details plus optional symbol information. */
  metadata: {
    /** Final path segment of the file. */
    fileName: string;
    /** File extension including the leading dot. */
    extension: string;
    /** Size of the whole file as reported by `stat`. */
    size: number;
    /** v3.3: Function/class signature (e.g. "function foo(bar: string): number") */
    signature?: string;
    /** v3.3: Symbol name (e.g. "foo") */
    symbolName?: string;
  };
}
32
+
/**
 * Tunables accepted by the scanner. Every field is optional; omitted fields
 * fall back to the module defaults.
 */
export interface ScanOptions {
  /** Largest chunk to produce, measured in characters. */
  maxChunkSize?: number;
  /** Chunks shorter than this many characters are discarded. */
  minChunkSize?: number;
  /** Path names or `*` globs that exclude files and directories from a scan. */
  ignorePatterns?: string[];
  /** File extensions (with leading dot) that are eligible for scanning. */
  includeExtensions?: string[];
}
39
+
/**
 * Settings used for every option the caller does not supply.
 */
const DEFAULT_OPTIONS: Required<ScanOptions> = {
  // 8000 characters is roughly 2000 tokens
  maxChunkSize: 8000,
  minChunkSize: 100,

  ignorePatterns: [
    // JavaScript / Node
    'node_modules', '.next',
    // Python virtual environments and tool caches
    'venv', '.venv', 'env', '.env', '__pycache__', '.tox', '.mypy_cache', '.pytest_cache',
    // Build outputs
    'dist', 'build', 'target', 'bin', 'obj', 'coverage',
    // Version control and caches
    '.git', '.cache', '.DS_Store',
    // Logs and lock files
    '*.log', '*.lock', 'package-lock.json', 'yarn.lock',
    // Minified / bundled artefacts: large and not useful as context
    '*.min.js', '*.min.css', '*.bundle.js', '*.legacy.js', '*.map',
  ],

  includeExtensions: [
    '.ts', '.tsx', '.js', '.jsx',
    '.py', '.rs', '.go', '.java',
    '.c', '.cpp', '.h', '.cs',
    '.rb', '.php', '.swift', '.kt',
    '.md',
  ],
};
99
+
/** Result of structured extraction before file-level fields are attached. */
type StructuredChunk = Omit<CodeChunk, 'filePath' | 'language' | 'metadata'> & {
  signature?: string;
  symbolName?: string;
};

/**
 * Code Scanner - walks a directory tree (or an explicit file list), reads the
 * files whose extension is enabled, and splits each one into chunks for
 * embedding.
 *
 * Files are chunked in one of three ways:
 *  - Markdown is split on headers.
 *  - TypeScript/JavaScript, Python and Rust are matched against per-language
 *    regex patterns; brace-delimited bodies are completed by brace matching.
 *  - Everything else (or files with poor structured coverage) falls back to
 *    overlapping windows of raw lines.
 */
export class CodeScanner {
  /**
   * Maximum directory depth for recursive scanning.
   * v2.43: Prevents runaway recursion on deep/symlinked structures.
   */
  private static readonly MAX_DEPTH = 10;

  /** v2.43: Files larger than this many bytes are skipped without being read. */
  private static readonly MAX_FILE_SIZE = 10 * 1024 * 1024; // 10MB

  /** Compiled glob matchers keyed by pattern text, so each glob is compiled once. */
  private static readonly GLOB_CACHE = new Map<string, RegExp>();

  /** Keywords after which a `/` starts a regex literal rather than a division. */
  private static readonly REGEX_PRECEDING_KEYWORDS: readonly string[] = [
    'return',
    'typeof',
    'void',
    'delete',
    'throw',
    'new',
    'case',
    'in',
    'instanceof',
    'yield',
    'await',
  ];

  /** Extension (lower-case, with dot) to language name. */
  private static readonly LANGUAGE_BY_EXTENSION: Readonly<Record<string, string>> = {
    '.ts': 'typescript',
    '.tsx': 'typescript',
    '.js': 'javascript',
    '.jsx': 'javascript',
    '.py': 'python',
    '.rs': 'rust',
    '.go': 'go',
    '.java': 'java',
    '.c': 'c',
    '.cpp': 'cpp',
    '.h': 'c',
    '.cs': 'csharp',
    '.rb': 'ruby',
    '.php': 'php',
    '.swift': 'swift',
    '.kt': 'kotlin',
    '.md': 'markdown',
  };

  private options: Required<ScanOptions>;

  /**
   * @param options Overrides for the default scan settings. A field that is
   *   omitted or explicitly `undefined` keeps its default value.
   */
  constructor(options: ScanOptions = {}) {
    // Resolve field by field: a plain object spread would let an explicit
    // `undefined` overwrite the default and break the Required<> contract.
    this.options = {
      maxChunkSize: options.maxChunkSize ?? DEFAULT_OPTIONS.maxChunkSize,
      minChunkSize: options.minChunkSize ?? DEFAULT_OPTIONS.minChunkSize,
      ignorePatterns: options.ignorePatterns ?? DEFAULT_OPTIONS.ignorePatterns,
      includeExtensions: options.includeExtensions ?? DEFAULT_OPTIONS.includeExtensions,
    };
  }

  /**
   * Scan a directory recursively and extract code chunks.
   * v2.43: Recursion stops at MAX_DEPTH levels.
   *
   * @param rootPath Directory to scan; chunk file paths are relative to it.
   * @returns Chunks of every included, non-ignored file that could be read.
   */
  async scanDirectory(rootPath: string): Promise<CodeChunk[]> {
    const chunks: CodeChunk[] = [];
    await this.traverseDirectory(rootPath, rootPath, chunks, 0);
    return chunks;
  }

  /**
   * Scan specific files and extract code chunks.
   * Pass rootPath to compute proper relative file paths in chunk metadata.
   *
   * @param filePaths Files to scan.
   * @param rootPath Project root. When empty, each file's own path is used.
   * @returns Chunks of every included, non-ignored file that could be read.
   */
  async scanFiles(filePaths: string[], rootPath: string): Promise<CodeChunk[]> {
    const chunks: CodeChunk[] = [];

    for (const filePath of filePaths) {
      try {
        // Test ignore patterns against the root-relative path. Testing the
        // absolute path would drop every file of a project that merely lives
        // below a directory called e.g. "build", "bin" or "env".
        const ignoreTarget = rootPath ? relative(rootPath, filePath) : filePath;
        if (this.shouldIgnore(ignoreTarget)) {
          continue;
        }

        const ext = extname(filePath);
        if (this.options.includeExtensions.includes(ext)) {
          const fileChunks = await this.parseFile(filePath, rootPath || filePath);
          chunks.push(...fileChunks);
        }
      } catch (error) {
        logDebug('Error scanning file', { filePath, error });
      }
    }

    return chunks;
  }

  /**
   * Recursively traverse a directory, appending chunks of matching files.
   * v2.43: Added depth parameter with max limit.
   * v2.46: Filesystem calls are wrapped with a timeout.
   *
   * A failure on one entry (broken symlink, permission error, timeout) is
   * logged and only that entry is skipped; the remaining entries of the
   * directory are still processed.
   *
   * @param currentPath Directory being listed.
   * @param rootPath Scan root, used for relative paths and the symlink check.
   * @param chunks Accumulator that receives the extracted chunks.
   * @param depth Current recursion depth (0 for the root).
   */
  private async traverseDirectory(
    currentPath: string,
    rootPath: string,
    chunks: CodeChunk[],
    depth: number
  ): Promise<void> {
    // v2.43: Prevent infinite recursion
    if (depth >= CodeScanner.MAX_DEPTH) {
      logDebug('Max directory depth reached, skipping', { path: currentPath, depth });
      return;
    }

    let entries: string[];
    try {
      entries = await withTimeout(
        readdir(currentPath),
        FILE_OP_TIMEOUT_MS,
        `readdir timed out: ${currentPath}`
      );
    } catch (error) {
      logDebug('Error traversing directory', { path: currentPath, error });
      return;
    }

    // Resolved lazily and at most once per directory; only needed for symlinks.
    let resolvedRoot: string | undefined;

    for (const entry of entries) {
      const fullPath = join(currentPath, entry);

      try {
        if (this.shouldIgnore(relative(rootPath, fullPath))) {
          continue;
        }

        const lstats = await withTimeout(
          lstat(fullPath),
          FILE_OP_TIMEOUT_MS,
          `lstat timed out: ${fullPath}`
        );
        let stats = lstats;

        if (lstats.isSymbolicLink()) {
          const resolved = await withTimeout(
            realpath(fullPath),
            FILE_OP_TIMEOUT_MS,
            `realpath timed out: ${fullPath}`
          );
          if (resolvedRoot === undefined) {
            resolvedRoot = await withTimeout(
              realpath(rootPath),
              FILE_OP_TIMEOUT_MS,
              `realpath timed out: ${rootPath}`
            );
          }
          if (!CodeScanner.isWithinRoot(resolvedRoot, resolved)) {
            logDebug('Skipping symlink escaping project root', { fullPath, resolved, rootPath });
            continue;
          }
          // Follow the link to learn whether it points at a file or a directory.
          stats = await withTimeout(
            stat(fullPath),
            FILE_OP_TIMEOUT_MS,
            `stat timed out: ${fullPath}`
          );
        }

        if (stats.isDirectory()) {
          await this.traverseDirectory(fullPath, rootPath, chunks, depth + 1);
        } else if (stats.isFile()) {
          const ext = extname(fullPath);
          if (this.options.includeExtensions.includes(ext)) {
            const fileChunks = await this.parseFile(fullPath, rootPath);
            chunks.push(...fileChunks);
          }
        }
      } catch (error) {
        logDebug('Error processing directory entry', { path: fullPath, error });
      }
    }
  }

  /**
   * Whether `resolvedTarget` is the root itself or lies below it. Both
   * arguments must already be resolved (symlink-free) paths.
   */
  private static isWithinRoot(resolvedRoot: string, resolvedTarget: string): boolean {
    const rel = relative(resolvedRoot, resolvedTarget);
    if (rel === '') return true;
    // A leading ".." segment means the target is outside the root. A name that
    // merely starts with two dots (e.g. "..cache") is still inside.
    if (rel === '..' || rel.startsWith('../') || rel.startsWith('..\\')) return false;
    // On Windows a target on another drive comes back as an absolute path.
    return !/^(?:[A-Za-z]:|[\\/])/.test(rel);
  }

  /** Final segment of a path, accepting both `/` and `\` separators. */
  private static baseName(path: string): string {
    return path.split(/[\\/]/).pop() || '';
  }

  /**
   * Check if a path should be ignored.
   *
   * - Glob patterns (containing `*`) are anchored on path-segment boundaries,
   *   so `*.map` matches `dist/app.js.map` but not `src/user.mapper.ts`.
   *   In a glob without `/`, `*` stays within one segment; in a glob with `/`
   *   it may span segments.
   * - Literal patterns must equal one or more whole consecutive segments
   *   (e.g. `.env` does not match `.environment.ts`).
   * - Backslash separators are treated like `/`.
   *
   * @param path Path to test, normally relative to the scan root.
   */
  private shouldIgnore(path: string): boolean {
    const normalized = path.replace(/\\/g, '/');
    // Surrounding slashes let a plain substring test stand in for
    // "equals a whole run of segments".
    const haystack = '/' + normalized + '/';

    return this.options.ignorePatterns.some((pattern) => {
      if (pattern.includes('*')) {
        return CodeScanner.globToRegExp(pattern).test(normalized);
      }
      const literal = pattern.replace(/\\/g, '/').replace(/^\/+|\/+$/g, '');
      return literal.length > 0 && haystack.includes('/' + literal + '/');
    });
  }

  /** Compile (and cache) a `*` glob into a segment-anchored regular expression. */
  private static globToRegExp(pattern: string): RegExp {
    let regex = CodeScanner.GLOB_CACHE.get(pattern);
    if (regex === undefined) {
      // Escape regex special chars first, then expand every `*`.
      const escaped = pattern.replace(/[.+?^${}()|[\]\\]/g, '\\$&');
      const wildcard = pattern.includes('/') ? '.*' : '[^/]*';
      const body = escaped.replace(/\*/g, wildcard);
      regex = new RegExp('(?:^|/)' + body + '(?:$|/)');
      CodeScanner.GLOB_CACHE.set(pattern, regex);
    }
    return regex;
  }

  /**
   * Parse a file and extract code chunks.
   * v2.43: Files above MAX_FILE_SIZE are skipped before being read.
   * v2.46: stat and readFile are wrapped with a timeout.
   *
   * @param filePath File to read.
   * @param rootPath Root used to compute the path stored in each chunk.
   * @returns The file's chunks, or an empty array if it is too large or any
   *   step fails (the error is logged, not thrown).
   */
  private async parseFile(filePath: string, rootPath: string): Promise<CodeChunk[]> {
    try {
      const stats = await withTimeout(
        stat(filePath),
        FILE_OP_TIMEOUT_MS,
        `stat timed out: ${filePath}`
      );
      if (stats.size > CodeScanner.MAX_FILE_SIZE) {
        logDebug('Skipping file exceeding size limit', {
          filePath,
          size: stats.size,
          limit: CodeScanner.MAX_FILE_SIZE,
        });
        return [];
      }

      const content = await withTimeout(
        readFile(filePath, 'utf-8'),
        FILE_OP_TIMEOUT_MS,
        `readFile timed out: ${filePath}`
      );
      // `relative` yields '' when root and file coincide (scanFiles without a
      // root); fall back to the given path so chunks keep a usable file path.
      const relativePath = relative(rootPath, filePath) || filePath;
      const language = this.detectLanguage(filePath);

      if (language === 'markdown') {
        return this.chunkMarkdown(content, relativePath, stats.size);
      }

      const structuredChunks = this.extractStructuredChunks(content, language);
      if (structuredChunks.length === 0) {
        // Nothing recognisable: fall back to raw chunking
        return this.chunkRaw(content, relativePath, language, stats.size);
      }

      const fileMetadata = {
        fileName: CodeScanner.baseName(filePath),
        extension: extname(filePath),
        size: stats.size,
      };

      const mapped: CodeChunk[] = structuredChunks.map(
        ({ signature, symbolName, ...chunk }) => ({
          ...chunk,
          filePath: relativePath,
          language,
          metadata: {
            ...fileMetadata,
            ...(signature ? { signature } : {}),
            ...(symbolName ? { symbolName } : {}),
          },
        })
      );

      // Coverage check: if structured chunks cover less than 50% of file lines,
      // supplement with raw chunks for uncovered regions. This prevents a single
      // small match from blocking all raw chunking in large files.
      const totalLines = content.split('\n').length;
      const coveredLines = new Set<number>();
      for (const chunk of structuredChunks) {
        for (let line = chunk.startLine; line <= chunk.endLine; line++) {
          coveredLines.add(line);
        }
      }

      if (coveredLines.size / totalLines < 0.5 && totalLines > 50) {
        const rawChunks = this.chunkRaw(content, relativePath, language, stats.size);
        // Only keep raw chunks that don't overlap with any structured chunk
        const supplemental = rawChunks.filter((raw) =>
          structuredChunks.every(
            (structured) =>
              raw.startLine > structured.endLine || raw.endLine < structured.startLine
          )
        );
        mapped.push(...supplemental);
      }

      return mapped;
    } catch (error) {
      logDebug('Error parsing file', { filePath, error });
      return [];
    }
  }

  /**
   * Detect language from file extension (case-insensitive).
   *
   * @returns The language name, or 'unknown' for unmapped extensions.
   */
  private detectLanguage(filePath: string): string {
    const ext = extname(filePath).toLowerCase();
    return CodeScanner.LANGUAGE_BY_EXTENSION[ext] ?? 'unknown';
  }

  /**
   * Chunk markdown files by headers.
   *
   * A new chunk starts at every ATX header (`#` to `######`) that is not
   * inside a fenced code block. Sections shorter than minChunkSize are
   * dropped. A section that grows past maxChunkSize is cut at the current
   * line and emitted as a 'raw' chunk.
   *
   * @param content Full markdown text.
   * @param filePath Path recorded on each chunk.
   * @param size File size recorded in chunk metadata.
   */
  private chunkMarkdown(content: string, filePath: string, size: number): CodeChunk[] {
    const chunks: CodeChunk[] = [];
    const lines = content.split('\n');
    const fileName = CodeScanner.baseName(filePath);

    const emit = (
      body: string[],
      startLine: number,
      endLine: number,
      chunkType: 'module' | 'raw',
      minLength: number
    ): void => {
      const text = body.join('\n').trim();
      if (text.length < minLength) return;
      chunks.push({
        content: text,
        filePath,
        language: 'markdown',
        startLine,
        endLine,
        chunkType,
        contentHash: this.hash(text),
        metadata: { fileName, extension: '.md', size },
      });
    };

    let current: string[] = [];
    // Length of current.join('\n'), maintained incrementally so the size check
    // does not re-join the whole section on every line.
    let currentLength = 0;
    let currentStartLine = 1;
    // Marker of the currently open fenced code block, or null outside a fence.
    let fence: string | null = null;

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      if (line === undefined) continue;

      let isHeader = false;
      const marker = /^ {0,3}(`{3,}|~{3,})/.exec(line)?.[1];
      if (marker !== undefined) {
        if (fence === null) {
          fence = marker;
        } else if (marker[0] === fence[0] && marker.length >= fence.length) {
          fence = null;
        }
      } else if (fence === null) {
        // `# ...` inside a fence is code (e.g. a shell comment), not a header.
        isHeader = /^#{1,6}\s/.test(line);
      }

      // A new header closes the section collected so far.
      if (isHeader && current.length > 0) {
        emit(current, currentStartLine, i, 'module', this.options.minChunkSize);
        current = [];
        currentLength = 0;
        currentStartLine = i + 1;
      }

      currentLength += line.length + (current.length > 0 ? 1 : 0);
      current.push(line);

      // If the section gets too big, force a split after the current line.
      if (currentLength > this.options.maxChunkSize) {
        // Minimum length 1: never emit a chunk that is empty after trimming.
        emit(current, currentStartLine, i + 1, 'raw', 1);
        current = [];
        currentLength = 0;
        currentStartLine = i + 2;
      }
    }

    // Remaining content
    if (current.length > 0) {
      emit(current, currentStartLine, lines.length, 'module', this.options.minChunkSize);
    }

    return chunks;
  }

  /**
   * Find the index of the closing brace matching the opening brace at openIndex.
   * Handles nested braces and skips braces inside comments, string literals,
   * template literals (including nested `${ ... }` expressions) and regex
   * literals. With language 'rust', `'` is treated as a char literal or
   * lifetime and the template/regex handling is disabled.
   *
   * @param content Text to scan.
   * @param openIndex Index of the opening `{`.
   * @param language Language of the text; only 'rust' changes the behaviour.
   * @returns Index of the matching `}`, or -1 if openIndex is not a `{` or no
   *   match is found.
   */
  private findMatchingBrace(content: string, openIndex: number, language?: string): number {
    if (content[openIndex] !== '{') return -1;

    const isRust = language === 'rust';
    const len = content.length;
    let depth = 1;
    let i = openIndex + 1;

    while (i < len) {
      const c = content[i];
      const next = content[i + 1];

      // Single-line comment
      if (c === '/' && next === '/') {
        const eol = content.indexOf('\n', i);
        if (eol === -1) return -1;
        i = eol + 1;
        continue;
      }

      // Multi-line comment
      if (c === '/' && next === '*') {
        const end = content.indexOf('*/', i + 2);
        if (end === -1) return -1;
        i = end + 2;
        continue;
      }

      if (c === '"') {
        // Rust string literals may span lines; JS/TS ones may not.
        i = this.skipQuoted(content, i, !isRust);
        continue;
      }

      if (c === "'") {
        i = isRust ? this.skipRustQuote(content, i) : this.skipQuoted(content, i, true);
        continue;
      }

      if (!isRust && c === '`') {
        i = this.skipTemplate(content, i);
        continue;
      }

      if (!isRust && c === '/' && this.isRegexStart(content, i)) {
        i = this.skipRegex(content, i);
        continue;
      }

      if (c === '{') {
        depth++;
      } else if (c === '}') {
        depth--;
        if (depth === 0) return i;
      }
      i++;
    }

    return -1;
  }

  /**
   * Skip a quoted literal starting at `start`.
   *
   * @param singleLine When true, a raw newline before the closing quote means
   *   the quote was not a literal opener (e.g. an apostrophe in JSX text);
   *   only the quote itself is skipped so the rest of the line is scanned.
   * @returns Index of the first character after the literal.
   */
  private skipQuoted(content: string, start: number, singleLine: boolean): number {
    const quote = content[start];
    const len = content.length;
    let i = start + 1;

    while (i < len) {
      const ch = content[i];
      if (ch === '\\') {
        i += 2; // escaped character (also covers a backslash line continuation)
        continue;
      }
      if (ch === quote) return i + 1;
      if (singleLine && ch === '\n') return start + 1;
      i++;
    }
    return len;
  }

  /**
   * Skip a Rust `'` token: either a char literal (`'x'`, `'\n'`, `'\u{41}'`)
   * or the quote of a lifetime / loop label (`'a`, `'static`).
   *
   * @returns Index of the first character after the literal, or after the
   *   quote alone when it introduces a lifetime.
   */
  private skipRustQuote(content: string, start: number): number {
    if (content[start + 1] === '\\') {
      // Escaped char literal: the closing quote follows the escape body.
      const close = content.indexOf("'", start + 3);
      const eol = content.indexOf('\n', start);
      if (close !== -1 && (eol === -1 || close < eol)) return close + 1;
      return start + 1;
    }
    if (content[start + 2] === "'") return start + 3;

    // Char literal holding a code point outside the BMP (a surrogate pair).
    const code = content.charCodeAt(start + 1);
    if (code >= 0xd800 && code <= 0xdbff && content[start + 3] === "'") return start + 4;

    return start + 1;
  }

  /**
   * Skip a template literal starting at the backtick at `start`, descending
   * into `${ ... }` expressions so nested template literals are handled.
   *
   * @returns Index of the first character after the closing backtick.
   */
  private skipTemplate(content: string, start: number): number {
    const len = content.length;
    let i = start + 1;

    while (i < len) {
      const ch = content[i];
      if (ch === '\\') {
        i += 2;
        continue;
      }
      if (ch === '`') return i + 1;
      if (ch === '$' && content[i + 1] === '{') {
        const close = this.findMatchingBrace(content, i + 1);
        // Unbalanced expression: treat `${` as plain template text.
        i = close === -1 ? i + 2 : close + 1;
        continue;
      }
      i++;
    }
    return len;
  }

  /**
   * Heuristic: does the `/` at `index` start a regex literal? True when the
   * previous non-blank character is an operator/opening punctuator, or when
   * it ends a keyword that precedes an expression (return, typeof, ...).
   */
  private isRegexStart(content: string, index: number): boolean {
    let j = index - 1;
    while (j >= 0 && (content[j] === ' ' || content[j] === '\t')) j--;
    const prev = j >= 0 ? content[j] ?? '\n' : '\n';

    if ('=({[,;:!&|?+->~^%\n'.includes(prev)) return true;
    if (!/[a-z]/.test(prev)) return false;

    let wordStart = j;
    while (wordStart > 0 && /[a-z]/.test(content[wordStart - 1] ?? '')) wordStart--;

    // The lower-case run must be a whole word. Otherwise the tail of an
    // identifier such as `xMin` or `is_new` would be read as the keyword
    // `in` / `new` and a division would be mistaken for a regex.
    const before = wordStart > 0 ? content[wordStart - 1] ?? '' : '';
    if (/[\w$.]/.test(before)) return false;

    return CodeScanner.REGEX_PRECEDING_KEYWORDS.includes(content.substring(wordStart, j + 1));
  }

  /**
   * Skip a regex literal starting at the `/` at `start`.
   *
   * @returns Index after the closing `/`. If a newline or the end of input is
   *   reached first, the `/` was not a regex opener (regex literals cannot
   *   span lines) and only the `/` itself is skipped.
   */
  private skipRegex(content: string, start: number): number {
    const len = content.length;
    let i = start + 1;

    while (i < len) {
      const ch = content[i];
      if (ch === '\\') {
        i += 2;
        continue;
      }
      if (ch === '\n') break;
      if (ch === '/') return i + 1;
      if (ch === '[') {
        // Character class: an unescaped `/` inside it does not end the regex.
        i++;
        while (i < len && content[i] !== ']' && content[i] !== '\n') {
          if (content[i] === '\\') i++;
          i++;
        }
        if (content[i] === ']') i++;
        continue;
      }
      i++;
    }
    return start + 1;
  }

  /**
   * Extract structured chunks (functions, classes).
   * v2.76: Uses brace-matching for TS/JS/Rust so nested braces are not truncated.
   * v3.3: Records the signature (first line) and symbol name of each chunk.
   *
   * Matches outside [minChunkSize, maxChunkSize] are dropped. Chunks of
   * different patterns may overlap (e.g. a class and its methods).
   *
   * @param content Full file text.
   * @param language Language name as returned by detectLanguage.
   */
  private extractStructuredChunks(content: string, language: string): StructuredChunk[] {
    const chunks: StructuredChunk[] = [];
    const useBraceMatch = ['typescript', 'javascript', 'rust'].includes(language);

    for (const pattern of this.getLanguagePatterns(language)) {
      const regex = new RegExp(pattern.regex, 'gm');
      // Patterns ending in `{` match only the header; the body is found by
      // matching that brace.
      const headerOnly = useBraceMatch && pattern.regex.endsWith('\\{');
      let match: RegExpExecArray | null;

      while ((match = regex.exec(content)) !== null) {
        if (match[0].length === 0) {
          regex.lastIndex++; // guard against an endless loop on empty matches
          continue;
        }

        let matchContent = match[0];
        if (headerOnly) {
          const openBraceIndex = match.index + match[0].length - 1;
          const closeIndex = this.findMatchingBrace(content, openBraceIndex, language);
          if (closeIndex >= 0) {
            matchContent = content.slice(match.index, closeIndex + 1);
          }
        }

        if (
          matchContent.length < this.options.minChunkSize ||
          matchContent.length > this.options.maxChunkSize
        ) {
          continue;
        }

        const startLine = content.substring(0, match.index).split('\n').length;
        const endLine = startLine + matchContent.split('\n').length - 1;

        const firstLine = (matchContent.split('\n')[0] ?? '').trim();
        const signature = firstLine.replace(/\{$/, '').trim() || undefined;
        // Declaration keyword first; otherwise the identifier in front of the
        // parameter list (class methods have no keyword).
        const nameMatch =
          /\b(?:function|class|const|interface|type|enum|fn|struct|def)\s+(\w+)/.exec(firstLine) ??
          /(\w+)\s*(?:<[^>]*>)?\s*\(/.exec(firstLine);
        const symbolName = nameMatch?.[1] || undefined;

        const text = matchContent.trim();
        chunks.push({
          content: text,
          startLine,
          endLine,
          chunkType: pattern.type,
          // Hash exactly what is stored in `content`, as the markdown chunker does.
          contentHash: this.hash(text),
          signature,
          symbolName,
        });
      }
    }

    return chunks;
  }

  /**
   * Get regex patterns for a language. Patterns are compiled with the `g` and
   * `m` flags by the caller. Patterns ending in `\{` match only a header; the
   * caller completes the body by brace matching.
   *
   * @returns The patterns for the language, or an empty list if unsupported.
   */
  private getLanguagePatterns(language: string): Array<{
    regex: string;
    type: 'function' | 'class' | 'module';
  }> {
    switch (language) {
      case 'typescript':
      case 'javascript':
        return [
          // Classes (body extracted via brace-matching)
          {
            regex: '(?:export\\s+)?(?:abstract\\s+)?class\\s+\\w+[^{]*\\{',
            type: 'class',
          },
          // Functions (body extracted via brace-matching)
          {
            regex: '(?:export\\s+)?(?:async\\s+)?function\\s+\\w+[^{]*\\{',
            type: 'function',
          },
          // Arrow functions, optionally with a simple return type annotation
          {
            regex:
              '(?:export\\s+)?const\\s+\\w+\\s*=\\s*(?:async\\s+)?\\([^)]*\\)(?:\\s*:[^=;{}()]+)?\\s*=>\\s*\\{',
            type: 'function',
          },
          // Class methods: indented, with optional modifiers.
          // - `[ \t]+` (not `\s+`) keeps the match on the method's own line.
          // - The keyword exclusion ends in `\b` so names such as `format` or
          //   `newUser` are not rejected for starting with `for` / `new`.
          // - `[^{};]*` stops at `;` so a call statement followed by a later
          //   block is not mistaken for a method header.
          {
            regex:
              '^[ \\t]+(?:(?:private|protected|public|static|abstract|override|readonly|async|get|set)\\s+)*(?!(?:if|for|while|switch|catch|return|throw|new|import|export)\\b)\\w+\\s*(?:<[^>]*>)?\\s*\\([^)]*\\)[^{};]*\\{',
            type: 'function',
          },
        ];

      case 'python':
        // `(?![^])` means "end of input". A plain `$` would match at the end
        // of the header line under the `m` flag and truncate every chunk.
        return [
          // Classes: up to the next top-level class/def
          { regex: 'class\\s+\\w+[^:]*:[^]*?(?=\\nclass\\s|\\ndef\\s|(?![^]))', type: 'class' },
          // Functions: up to the next top-level def/class
          { regex: 'def\\s+\\w+[^:]*:[^]*?(?=\\ndef\\s|\\nclass\\s|(?![^]))', type: 'function' },
        ];

      case 'rust':
        return [
          // Functions (body extracted via brace-matching)
          { regex: '(?:pub\\s+)?fn\\s+\\w+[^{]*\\{', type: 'function' },
          // Structs (up to the first closing brace; no nested braces)
          { regex: '(?:pub\\s+)?struct\\s+\\w+[^}]*\\}', type: 'class' },
        ];

      default:
        return [];
    }
  }

  /**
   * Fall back to raw chunking: fixed-size windows of lines with 10% overlap.
   * The window height is maxChunkSize / 80 lines (assuming ~80 characters per
   * line) and is never less than one line.
   *
   * @param content Full file text.
   * @param filePath Path recorded on each chunk.
   * @param language Language recorded on each chunk.
   * @param size File size recorded in chunk metadata.
   */
  private chunkRaw(content: string, filePath: string, language: string, size: number): CodeChunk[] {
    const chunks: CodeChunk[] = [];
    const lines = content.split('\n');
    // At least one line per window: a zero-height window would make the step
    // zero and the loop below would never terminate.
    const chunkSizeLines = Math.max(1, Math.floor(this.options.maxChunkSize / 80));
    const overlapLines = Math.floor(chunkSizeLines * 0.1); // 10% overlap
    const step = chunkSizeLines - overlapLines;
    const fileName = CodeScanner.baseName(filePath);
    const extension = extname(filePath);

    for (let i = 0; i < lines.length; i += step) {
      const chunkLines = lines.slice(i, i + chunkSizeLines);
      const chunkContent = chunkLines.join('\n');

      if (chunkContent.length >= this.options.minChunkSize) {
        const text = chunkContent.trim();
        chunks.push({
          content: text,
          filePath,
          language,
          startLine: i + 1,
          endLine: i + chunkLines.length,
          chunkType: 'raw',
          // Hash exactly what is stored in `content`, as the markdown chunker does.
          contentHash: this.hash(text),
          metadata: { fileName, extension, size },
        });
      }

      // This window already reached the last line; another step would only
      // re-emit the overlap region as a redundant chunk.
      if (i + chunkSizeLines >= lines.length) break;
    }

    return chunks;
  }

  /**
   * Generate content hash for drift detection.
   *
   * @returns The first 16 hex characters of the SHA-256 digest of `content`.
   */
  private hash(content: string): string {
    return createHash('sha256').update(content).digest('hex').substring(0, 16);
  }
}