@musashishao/agent-kit 1.2.2 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/.agent/mcp-gateway/README.md +121 -0
  2. package/.agent/mcp-gateway/dist/index.d.ts +11 -0
  3. package/.agent/mcp-gateway/dist/index.js +504 -0
  4. package/.agent/mcp-gateway/dist/sync/debouncer.d.ts +56 -0
  5. package/.agent/mcp-gateway/dist/sync/debouncer.js +112 -0
  6. package/.agent/mcp-gateway/dist/sync/incremental_syncer.d.ts +58 -0
  7. package/.agent/mcp-gateway/dist/sync/incremental_syncer.js +172 -0
  8. package/.agent/mcp-gateway/dist/sync/index.d.ts +6 -0
  9. package/.agent/mcp-gateway/dist/sync/index.js +6 -0
  10. package/.agent/mcp-gateway/dist/sync/timestamp_checker.d.ts +69 -0
  11. package/.agent/mcp-gateway/dist/sync/timestamp_checker.js +169 -0
  12. package/.agent/mcp-gateway/package.json +28 -0
  13. package/.agent/mcp-gateway/src/index.ts +608 -0
  14. package/.agent/mcp-gateway/src/sync/debouncer.ts +129 -0
  15. package/.agent/mcp-gateway/src/sync/incremental_syncer.ts +237 -0
  16. package/.agent/mcp-gateway/src/sync/index.ts +7 -0
  17. package/.agent/mcp-gateway/src/sync/timestamp_checker.ts +194 -0
  18. package/.agent/scripts/ak_cli.py +549 -0
  19. package/.agent/scripts/setup_host.py +557 -0
  20. package/.agent/scripts/verify_install.py +174 -0
  21. package/.agent/skills/app-builder/SKILL.md +51 -1
  22. package/.agent/skills/app-builder/scripts/generate_ai_infra.py +510 -0
  23. package/.agent/skills/documentation-templates/SKILL.md +9 -1
  24. package/.agent/skills/documentation-templates/agents-template.md +202 -0
  25. package/.agent/skills/graph-mapper/SKILL.md +211 -0
  26. package/.agent/skills/graph-mapper/scripts/generate_graph.py +705 -0
  27. package/.agent/skills/rag-engineering/SKILL.md +342 -0
  28. package/.agent/skills/rag-engineering/chunking-strategies.md +229 -0
  29. package/.agent/skills/rag-engineering/contextual-retrieval.md +261 -0
  30. package/.agent/skills/rag-engineering/hybrid-search.md +356 -0
  31. package/.agent/skills/rag-engineering/scripts/chunk_code.py +916 -0
  32. package/.agent/templates/mcp_configs/claude_desktop.json +14 -0
  33. package/.agent/templates/mcp_configs/cursor.json +13 -0
  34. package/.agent/templates/mcp_configs/vscode.json +13 -0
  35. package/.agent/workflows/create.md +70 -2
  36. package/bin/cli.js +91 -0
  37. package/docs/AI_DATA_INFRASTRUCTURE.md +288 -0
  38. package/docs/CHANGELOG_AI_INFRA.md +111 -0
  39. package/docs/PLAN-universal-intelligence.md +48 -0
  40. package/package.json +7 -2
@@ -0,0 +1,916 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Code Chunker - Intelligent code chunking by logical boundaries.
4
+
5
+ Splits code files into meaningful chunks (functions, classes, modules)
6
+ instead of arbitrary character/token limits.
7
+
8
+ Usage:
9
+ python chunk_code.py --src ./src --output chunks.json --lang auto
10
+ """
11
+
12
+ import os
13
+ import re
14
+ import json
15
+ import hashlib
16
+ import argparse
17
+ from pathlib import Path
18
+ from typing import List, Dict, Optional, Tuple
19
+ from dataclasses import dataclass, asdict
20
+ from datetime import datetime
21
+
22
+
23
@dataclass
class Chunk:
    """A single chunk of source content plus its retrieval metadata."""
    # Deterministic 16-hex-char identifier (see generate_chunk_id).
    id: str
    # Raw text of the chunk.
    content: str
    # Free-form metadata: file_path, file_type, chunk_type, name,
    # start_line/end_line, char_count, line_count (see _create_chunk
    # implementations for the exact keys each chunker writes).
    metadata: Dict
29
+
30
+
31
def generate_chunk_id(file_path: str, content: str) -> str:
    """Return a deterministic 16-hex-character ID for a chunk.

    The ID is the truncated SHA-256 of ``"<file_path>:<content>"``.
    The FULL content is hashed — not just a prefix — so two distinct
    chunks from the same file whose first characters coincide (e.g.
    split parts that begin with the same overlap lines) cannot collide.
    """
    hash_input = f"{file_path}:{content}"
    return hashlib.sha256(hash_input.encode()).hexdigest()[:16]
35
+
36
+
37
class TypeScriptChunker:
    """Chunk TypeScript/JavaScript files by logical boundaries.

    Top-level functions, classes, and interface/type declarations are
    detected with line-anchored regexes; each becomes one chunk, and
    any unit larger than ``max_chunk_size`` is further split with a
    small line overlap.
    """

    EXTENSIONS = {'.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs'}

    def __init__(self, max_chunk_size: int = 1500, overlap: int = 100):
        self.max_chunk_size = max_chunk_size  # in characters
        self.overlap = overlap

    def chunk_file(self, file_path: Path, base_path: Path) -> List[Chunk]:
        """Chunk a single file. Returns [] for unreadable files."""
        try:
            content = file_path.read_text(encoding='utf-8')
        except (UnicodeDecodeError, PermissionError):
            return []

        relative_path = str(file_path.relative_to(base_path))
        chunks = []

        # Try to extract functions/classes; fall back to whole-file chunking.
        code_units = self._extract_code_units(content)

        if code_units:
            for unit in code_units:
                chunk_content = unit['code']

                # Oversized units are split; parts keep the unit's line span.
                if len(chunk_content) > self.max_chunk_size:
                    sub_chunks = self._split_large_chunk(chunk_content)
                    for i, sub in enumerate(sub_chunks):
                        chunks.append(self._create_chunk(
                            content=sub,
                            file_path=relative_path,
                            unit_type=unit['type'],
                            unit_name=f"{unit['name']}_part{i+1}",
                            start_line=unit['start_line'],
                            end_line=unit['end_line']
                        ))
                else:
                    chunks.append(self._create_chunk(
                        content=chunk_content,
                        file_path=relative_path,
                        unit_type=unit['type'],
                        unit_name=unit['name'],
                        start_line=unit['start_line'],
                        end_line=unit['end_line']
                    ))
        else:
            # No recognizable units: emit the whole file as one module
            # chunk, splitting by size when necessary.
            if len(content) <= self.max_chunk_size:
                chunks.append(self._create_chunk(
                    content=content,
                    file_path=relative_path,
                    unit_type='module',
                    unit_name=file_path.stem,
                    start_line=1,
                    end_line=content.count('\n') + 1
                ))
            else:
                sub_chunks = self._split_large_chunk(content)
                for i, sub in enumerate(sub_chunks):
                    chunks.append(self._create_chunk(
                        content=sub,
                        file_path=relative_path,
                        unit_type='module_part',
                        unit_name=f"{file_path.stem}_part{i+1}",
                        start_line=1,
                        end_line=content.count('\n') + 1
                    ))

        return chunks

    def _extract_code_units(self, content: str) -> List[Dict]:
        """Extract top-level functions, classes, and type declarations."""
        units = []
        lines = content.split('\n')

        # (pattern, unit_type) pairs — order matters: most specific first.
        patterns = [
            # Exported function
            (r'^export\s+(?:async\s+)?function\s+(\w+)', 'function'),
            # Regular function
            (r'^(?:async\s+)?function\s+(\w+)', 'function'),
            # Arrow function (const/let/var)
            (r'^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?\([^)]*\)\s*=>', 'function'),
            # Class
            (r'^(?:export\s+)?class\s+(\w+)', 'class'),
            # Interface/Type (TypeScript)
            (r'^(?:export\s+)?(?:interface|type)\s+(\w+)', 'type'),
        ]

        i = 0
        while i < len(lines):
            line = lines[i]

            for pattern, unit_type in patterns:
                match = re.match(pattern, line.strip())
                if match:
                    name = match.group(1)
                    start_line = i + 1

                    # Find the end of this block (brace matching).
                    end_line = self._find_block_end(lines, i)

                    code = '\n'.join(lines[i:end_line])
                    units.append({
                        'type': unit_type,
                        'name': name,
                        'code': code,
                        'start_line': start_line,
                        'end_line': end_line
                    })

                    # Resume scanning after the unit we just consumed.
                    i = end_line
                    break
            else:
                i += 1

        return units

    def _find_block_end(self, lines: List[str], start: int) -> int:
        """Return the exclusive end index of the block starting at ``start``.

        Matches ``{``/``}`` pairs line by line. FIX: declarations that
        never open a brace (``type X = string;``, single-line arrow
        functions) previously fell through and swallowed the rest of the
        file; they now end at the first ``;``-terminated line.
        NOTE(review): braces inside string literals or comments are
        still counted — a known limitation of this lightweight scanner.
        """
        brace_count = 0
        found_open = False

        for i in range(start, len(lines)):
            line = lines[i]
            for char in line:
                if char == '{':
                    brace_count += 1
                    found_open = True
                elif char == '}':
                    brace_count -= 1

            if found_open and brace_count == 0:
                return i + 1
            # Brace-less declaration terminated by a semicolon.
            if not found_open and line.rstrip().endswith(';'):
                return i + 1

        return len(lines)

    def _split_large_chunk(self, content: str) -> List[str]:
        """Split large content into size-bounded chunks with a 3-line overlap."""
        chunks = []
        lines = content.split('\n')

        current_chunk = []
        current_size = 0

        for line in lines:
            line_size = len(line) + 1  # +1 for newline

            if current_size + line_size > self.max_chunk_size and current_chunk:
                chunks.append('\n'.join(current_chunk))
                # Carry the last few lines forward for context continuity.
                overlap_lines = current_chunk[-3:] if len(current_chunk) > 3 else current_chunk
                current_chunk = overlap_lines.copy()
                current_size = sum(len(l) + 1 for l in current_chunk)

            current_chunk.append(line)
            current_size += line_size

        if current_chunk:
            chunks.append('\n'.join(current_chunk))

        return chunks

    def _create_chunk(
        self,
        content: str,
        file_path: str,
        unit_type: str,
        unit_name: str,
        start_line: int,
        end_line: int
    ) -> Chunk:
        """Create a Chunk object with the standard TS metadata keys."""
        return Chunk(
            id=generate_chunk_id(file_path, content),
            content=content,
            metadata={
                'file_path': file_path,
                'file_type': 'typescript',
                'chunk_type': unit_type,
                'name': unit_name,
                'start_line': start_line,
                'end_line': end_line,
                'char_count': len(content),
                'line_count': content.count('\n') + 1
            }
        )
228
+
229
+
230
class PythonChunker:
    """Chunk Python files by logical boundaries.

    Uses the ``ast`` module to find top-level functions and classes;
    falls back to an indentation-aware regex scan when the source does
    not parse.
    """

    def __init__(self, max_chunk_size: int = 1500, overlap: int = 100):
        self.max_chunk_size = max_chunk_size  # in characters
        self.overlap = overlap

    def chunk_file(self, file_path: Path, base_path: Path) -> List[Chunk]:
        """Chunk a single Python file. Returns [] for unreadable files."""
        try:
            content = file_path.read_text(encoding='utf-8')
        except (UnicodeDecodeError, PermissionError):
            return []

        relative_path = str(file_path.relative_to(base_path))
        chunks = []

        try:
            import ast
            tree = ast.parse(content)
            chunks = self._chunk_with_ast(content, tree, relative_path)
        except (SyntaxError, ValueError):
            # FIX: ast.parse raises ValueError (not SyntaxError) for
            # inputs such as null bytes; fall back to regex extraction
            # instead of crashing the whole run.
            chunks = self._chunk_with_regex(content, relative_path)

        return chunks

    def _chunk_with_ast(self, content: str, tree, file_path: str) -> List[Chunk]:
        """Extract chunks using the AST.

        Only top-level functions and classes become chunks; when none
        exist the entire file is emitted as a module chunk. Module-level
        statements between definitions are not captured.
        """
        import ast

        chunks = []
        lines = content.split('\n')

        for node in ast.iter_child_nodes(tree):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                chunk_type = 'function'
                name = node.name
            elif isinstance(node, ast.ClassDef):
                chunk_type = 'class'
                name = node.name
            else:
                continue

            # Slice the original source by the node's line span.
            start_line = node.lineno
            end_line = node.end_lineno or start_line
            code = '\n'.join(lines[start_line - 1:end_line])

            if len(code) > self.max_chunk_size:
                # Large classes split naturally at method boundaries.
                if isinstance(node, ast.ClassDef):
                    method_chunks = self._split_class_by_methods(
                        code, node, lines, file_path, name
                    )
                    chunks.extend(method_chunks)
                else:
                    # A single oversized function (rare): split by size.
                    sub_chunks = self._split_large_content(code)
                    for i, sub in enumerate(sub_chunks):
                        chunks.append(self._create_chunk(
                            content=sub,
                            file_path=file_path,
                            chunk_type=f'{chunk_type}_part',
                            name=f'{name}_part{i+1}',
                            start_line=start_line,
                            end_line=end_line
                        ))
            else:
                chunks.append(self._create_chunk(
                    content=code,
                    file_path=file_path,
                    chunk_type=chunk_type,
                    name=name,
                    start_line=start_line,
                    end_line=end_line
                ))

        # No top-level defs at all: treat the file as one module chunk.
        if not chunks:
            if len(content) <= self.max_chunk_size:
                chunks.append(self._create_chunk(
                    content=content,
                    file_path=file_path,
                    chunk_type='module',
                    name=Path(file_path).stem,
                    start_line=1,
                    end_line=len(lines)
                ))
            else:
                sub_chunks = self._split_large_content(content)
                for i, sub in enumerate(sub_chunks):
                    chunks.append(self._create_chunk(
                        content=sub,
                        file_path=file_path,
                        chunk_type='module_part',
                        name=f'{Path(file_path).stem}_part{i+1}',
                        start_line=1,
                        end_line=len(lines)
                    ))

        return chunks

    def _split_class_by_methods(
        self,
        code: str,
        class_node,
        lines: List[str],
        file_path: str,
        class_name: str
    ) -> List[Chunk]:
        """Split a large class into a header chunk plus one chunk per method."""
        import ast

        chunks = []

        # Header = class line, docstring, and class-level variables
        # (everything up to the first method).
        class_start = class_node.lineno - 1
        first_method_line = None

        for item in class_node.body:
            if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
                first_method_line = item.lineno - 1
                break

        # Explicit None check: 0 would be a valid (falsy) line index.
        if first_method_line is not None:
            class_header = '\n'.join(lines[class_start:first_method_line])
            chunks.append(self._create_chunk(
                content=class_header,
                file_path=file_path,
                chunk_type='class_header',
                name=f'{class_name}_header',
                start_line=class_start + 1,
                end_line=first_method_line
            ))

        # Each method becomes its own chunk, named Class.method.
        for item in class_node.body:
            if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
                start = item.lineno - 1
                end = item.end_lineno or start + 1
                method_code = '\n'.join(lines[start:end])

                chunks.append(self._create_chunk(
                    content=method_code,
                    file_path=file_path,
                    chunk_type='method',
                    name=f'{class_name}.{item.name}',
                    start_line=start + 1,
                    end_line=end
                ))

        return chunks

    def _chunk_with_regex(self, content: str, file_path: str) -> List[Chunk]:
        """Fallback chunking for unparseable files: indentation-scoped defs."""
        chunks = []
        lines = content.split('\n')

        # Top-level or nested function/class definitions.
        def_pattern = re.compile(r'^(class|def|async\s+def)\s+(\w+)')

        current_block = []
        current_type = None
        current_name = None
        block_start = 0
        base_indent = None

        for i, line in enumerate(lines):
            match = def_pattern.match(line)

            if match:
                # A new definition terminates the previous block.
                if current_block and current_name:
                    chunks.append(self._create_chunk(
                        content='\n'.join(current_block),
                        file_path=file_path,
                        chunk_type=current_type or 'code',
                        name=current_name,
                        start_line=block_start + 1,
                        end_line=i
                    ))

                current_type = 'class' if match.group(1) == 'class' else 'function'
                current_name = match.group(2)
                current_block = [line]
                block_start = i
                base_indent = len(line) - len(line.lstrip())
            elif current_block:
                # A non-blank, non-comment line at or above the def's
                # indent level marks the end of the block.
                stripped = line.lstrip()
                if stripped:
                    current_indent = len(line) - len(stripped)
                    if current_indent <= base_indent and not line.strip().startswith('#'):
                        chunks.append(self._create_chunk(
                            content='\n'.join(current_block),
                            file_path=file_path,
                            chunk_type=current_type or 'code',
                            name=current_name,
                            start_line=block_start + 1,
                            end_line=i
                        ))
                        current_block = []
                        current_name = None
                        current_type = None
                    else:
                        current_block.append(line)
                else:
                    current_block.append(line)

        # Flush the trailing block.
        if current_block and current_name:
            chunks.append(self._create_chunk(
                content='\n'.join(current_block),
                file_path=file_path,
                chunk_type=current_type or 'code',
                name=current_name,
                start_line=block_start + 1,
                end_line=len(lines)
            ))

        return chunks

    def _split_large_content(self, content: str) -> List[str]:
        """Split large content into size-bounded chunks with a 3-line overlap."""
        chunks = []
        lines = content.split('\n')

        current_chunk = []
        current_size = 0

        for line in lines:
            line_size = len(line) + 1  # +1 for newline

            if current_size + line_size > self.max_chunk_size and current_chunk:
                chunks.append('\n'.join(current_chunk))
                # Carry the last few lines forward for context continuity.
                overlap_lines = current_chunk[-3:] if len(current_chunk) > 3 else current_chunk
                current_chunk = overlap_lines.copy()
                current_size = sum(len(l) + 1 for l in current_chunk)

            current_chunk.append(line)
            current_size += line_size

        if current_chunk:
            chunks.append('\n'.join(current_chunk))

        return chunks

    def _create_chunk(
        self,
        content: str,
        file_path: str,
        chunk_type: str,
        name: str,
        start_line: int,
        end_line: int
    ) -> Chunk:
        """Create a Chunk object with the standard Python metadata keys."""
        return Chunk(
            id=generate_chunk_id(file_path, content),
            content=content,
            metadata={
                'file_path': file_path,
                'file_type': 'python',
                'chunk_type': chunk_type,
                'name': name,
                'start_line': start_line,
                'end_line': end_line,
                'char_count': len(content),
                'line_count': content.count('\n') + 1
            }
        )
506
+
507
+
508
class MarkdownChunker:
    """Chunk Markdown files by heading sections.

    Each ATX heading (``#`` .. ``######``) starts a section; parent
    headings are tracked so every chunk carries a breadcrumb-style
    ``context_path`` for retrieval.
    """

    EXTENSIONS = {'.md', '.mdx', '.markdown'}

    def __init__(self, max_chunk_size: int = 2000, overlap: int = 100):
        self.max_chunk_size = max_chunk_size  # in characters
        self.overlap = overlap

    def chunk_file(self, file_path: Path, base_path: Path) -> List[Chunk]:
        """Chunk a Markdown file by headings. Returns [] if unreadable."""
        try:
            content = file_path.read_text(encoding='utf-8')
        except (UnicodeDecodeError, PermissionError):
            return []

        relative_path = str(file_path.relative_to(base_path))
        chunks = []

        # Extract sections by heading
        sections = self._extract_sections(content)

        if sections:
            for section in sections:
                chunk_content = section['content']

                # Oversized sections split at paragraph boundaries.
                if len(chunk_content) > self.max_chunk_size:
                    sub_chunks = self._split_by_paragraphs(chunk_content)
                    for i, sub in enumerate(sub_chunks):
                        chunks.append(self._create_chunk(
                            content=sub,
                            file_path=relative_path,
                            section_title=f"{section['title']}_part{i+1}",
                            heading_level=section['level'],
                            parent_headings=section['parents'],
                            start_line=section['start_line'],
                            end_line=section['end_line']
                        ))
                else:
                    chunks.append(self._create_chunk(
                        content=chunk_content,
                        file_path=relative_path,
                        section_title=section['title'],
                        heading_level=section['level'],
                        parent_headings=section['parents'],
                        start_line=section['start_line'],
                        end_line=section['end_line']
                    ))
        else:
            # No headings at all: whole file as one chunk, or by paragraphs.
            if len(content) <= self.max_chunk_size:
                chunks.append(self._create_chunk(
                    content=content,
                    file_path=relative_path,
                    section_title=file_path.stem,
                    heading_level=0,
                    parent_headings=[],
                    start_line=1,
                    end_line=content.count('\n') + 1
                ))
            else:
                sub_chunks = self._split_by_paragraphs(content)
                for i, sub in enumerate(sub_chunks):
                    chunks.append(self._create_chunk(
                        content=sub,
                        file_path=relative_path,
                        section_title=f"{file_path.stem}_part{i+1}",
                        heading_level=0,
                        parent_headings=[],
                        start_line=1,
                        end_line=content.count('\n') + 1
                    ))

        return chunks

    def _extract_sections(self, content: str) -> List[Dict]:
        """Extract heading-delimited sections.

        Returns [] when the document has no headings (the caller then
        falls back to paragraph chunking). FIX: text preceding the first
        heading used to be silently discarded; it is now preserved as a
        synthetic level-0 ``(preamble)`` section so no content is lost.
        """
        sections = []
        lines = content.split('\n')

        heading_pattern = re.compile(r'^(#{1,6})\s+(.+)$')

        current_section = None
        current_lines = []
        preamble_lines = []  # lines seen before the first heading
        parent_stack = []  # track parent headings for context

        for i, line in enumerate(lines):
            match = heading_pattern.match(line)

            if match:
                # Save previous section
                if current_section:
                    current_section['content'] = '\n'.join(current_lines).strip()
                    current_section['end_line'] = i
                    if current_section['content']:
                        sections.append(current_section)
                elif any(l.strip() for l in preamble_lines):
                    # First heading reached: keep any non-blank preamble.
                    sections.append({
                        'title': '(preamble)',
                        'level': 0,
                        'parents': [],
                        'start_line': 1,
                        'end_line': i,
                        'content': '\n'.join(preamble_lines).strip()
                    })

                # Update parent stack
                level = len(match.group(1))
                title = match.group(2).strip()

                # Pop parents at the same or deeper level.
                while parent_stack and parent_stack[-1]['level'] >= level:
                    parent_stack.pop()

                parents = [p['title'] for p in parent_stack]

                # Start new section
                current_section = {
                    'title': title,
                    'level': level,
                    'parents': parents.copy(),
                    'start_line': i + 1,
                    'end_line': i + 1,
                    'content': ''
                }
                current_lines = [line]

                # This heading becomes a potential parent of what follows.
                parent_stack.append({'level': level, 'title': title})
            elif current_section:
                current_lines.append(line)
            else:
                preamble_lines.append(line)

        # Flush the final section.
        if current_section:
            current_section['content'] = '\n'.join(current_lines).strip()
            current_section['end_line'] = len(lines)
            if current_section['content']:
                sections.append(current_section)

        return sections

    def _split_by_paragraphs(self, content: str) -> List[str]:
        """Split content at blank-line paragraph boundaries (no overlap)."""
        chunks = []
        paragraphs = re.split(r'\n\s*\n', content)

        current_chunk = []
        current_size = 0

        for para in paragraphs:
            para_size = len(para) + 2  # +2 for the paragraph separator

            if current_size + para_size > self.max_chunk_size and current_chunk:
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = []
                current_size = 0

            current_chunk.append(para)
            current_size += para_size

        if current_chunk:
            chunks.append('\n\n'.join(current_chunk))

        return chunks

    def _create_chunk(
        self,
        content: str,
        file_path: str,
        section_title: str,
        heading_level: int,
        parent_headings: List[str],
        start_line: int,
        end_line: int
    ) -> Chunk:
        """Create a Chunk object with rich heading context."""
        # Breadcrumb string ("Parent > Child") for better retrieval.
        context_path = ' > '.join(parent_headings + [section_title]) if parent_headings else section_title

        return Chunk(
            id=generate_chunk_id(file_path, content),
            content=content,
            metadata={
                'file_path': file_path,
                'file_type': 'markdown',
                'chunk_type': f'heading_{heading_level}' if heading_level > 0 else 'paragraph',
                'name': section_title,
                'context_path': context_path,
                'heading_level': heading_level,
                'parent_headings': parent_headings,
                'start_line': start_line,
                'end_line': end_line,
                'char_count': len(content),
                'line_count': content.count('\n') + 1
            }
        )
696
+
697
+
698
class TextChunker:
    """Chunk plain text files into paragraph-based pieces."""

    EXTENSIONS = {'.txt', '.text', '.log'}

    def __init__(self, max_chunk_size: int = 1500, overlap: int = 100):
        self.max_chunk_size = max_chunk_size  # in characters
        # Accepted for signature parity with the other chunkers; this
        # chunker splits strictly at paragraph boundaries without overlap.
        self.overlap = overlap

    def chunk_file(self, file_path: Path, base_path: Path) -> List[Chunk]:
        """Return the chunks for one text file ([] when unreadable)."""
        try:
            text = file_path.read_text(encoding='utf-8')
        except (UnicodeDecodeError, PermissionError):
            return []

        rel = str(file_path.relative_to(base_path))
        total_lines = text.count('\n') + 1

        # Small files become a single chunk named after the file stem.
        if len(text) <= self.max_chunk_size:
            return [self._create_chunk(
                content=text,
                file_path=rel,
                name=file_path.stem,
                start_line=1,
                end_line=total_lines
            )]

        # Otherwise greedily pack whole paragraphs up to the size limit;
        # each resulting group is a "<stem>_partN" chunk.
        return [
            self._create_chunk(
                content=group,
                file_path=rel,
                name=f"{file_path.stem}_part{idx}",
                start_line=1,
                end_line=total_lines
            )
            for idx, group in enumerate(self._pack_paragraphs(text), start=1)
        ]

    def _pack_paragraphs(self, text: str) -> List[str]:
        """Greedily group blank-line-separated paragraphs under the size cap."""
        groups = []
        pending = []
        pending_size = 0

        for para in re.split(r'\n\s*\n', text):
            size = len(para) + 2  # +2 accounts for the paragraph separator
            if pending and pending_size + size > self.max_chunk_size:
                groups.append('\n\n'.join(pending))
                pending, pending_size = [], 0
            pending.append(para)
            pending_size += size

        if pending:
            groups.append('\n\n'.join(pending))
        return groups

    def _create_chunk(
        self,
        content: str,
        file_path: str,
        name: str,
        start_line: int,
        end_line: int
    ) -> Chunk:
        """Build a Chunk carrying the standard plain-text metadata keys."""
        return Chunk(
            id=generate_chunk_id(file_path, content),
            content=content,
            metadata={
                'file_path': file_path,
                'file_type': 'text',
                'chunk_type': 'paragraph',
                'name': name,
                'start_line': start_line,
                'end_line': end_line,
                'char_count': len(content),
                'line_count': content.count('\n') + 1
            }
        )
785
+
786
+
787
def chunk_directory(
    src_path: Path,
    lang: str,
    exclude_patterns: List[str],
    max_chunk_size: int = 1500
) -> List[Chunk]:
    """Chunk all supported files under ``src_path``.

    ``lang`` selects the mode: ``'typescript'`` restricts to JS/TS
    extensions, ``'python'`` to ``.py``; any other value (``'universal'``)
    processes every supported file type. FIX: this parameter was
    previously accepted but never used, so ``--lang typescript/python``
    (and auto-detection) had no effect on which files were chunked.
    """
    # Initialize one chunker per content family.
    ts_chunker = TypeScriptChunker(max_chunk_size=max_chunk_size)
    py_chunker = PythonChunker(max_chunk_size=max_chunk_size)
    md_chunker = MarkdownChunker(max_chunk_size=max_chunk_size)
    txt_chunker = TextChunker(max_chunk_size=max_chunk_size)

    # Map extensions to chunkers.
    extension_map = {}
    for ext in TypeScriptChunker.EXTENSIONS:
        extension_map[ext] = ts_chunker
    extension_map['.py'] = py_chunker
    for ext in MarkdownChunker.EXTENSIONS:
        extension_map[ext] = md_chunker
    for ext in TextChunker.EXTENSIONS:
        extension_map[ext] = txt_chunker

    # Honor the requested mode when deciding which extensions to visit.
    if lang == 'typescript':
        allowed_extensions = set(TypeScriptChunker.EXTENSIONS)
    elif lang == 'python':
        allowed_extensions = {'.py'}
    else:
        # 'universal' (and any unrecognized value) keeps the previous
        # behavior of processing every supported file type.
        allowed_extensions = set(extension_map.keys())

    all_chunks = []

    for file_path in src_path.rglob('*'):
        if not file_path.is_file():
            continue

        # Skip unsupported / mode-excluded extensions.
        if file_path.suffix not in allowed_extensions:
            continue

        # Skip excluded paths (simple substring match).
        path_str = str(file_path)
        if any(pattern in path_str for pattern in exclude_patterns):
            continue

        chunker = extension_map.get(file_path.suffix)
        if chunker:
            chunks = chunker.chunk_file(file_path, src_path)
            all_chunks.extend(chunks)

    return all_chunks
836
+
837
+
838
def main():
    """CLI entry point: chunk a source tree and write a chunks.json file.

    Returns 0 on success, 1 when the source directory is missing
    (the value is passed to exit() by the __main__ guard).
    """
    parser = argparse.ArgumentParser(
        description='Universal Chunker - Code, Markdown, and Text files'
    )
    parser.add_argument('--src', default='./src', help='Source directory')
    parser.add_argument('--output', default='.agent/rag/chunks.json', help='Output file')
    parser.add_argument('--lang', choices=['typescript', 'python', 'auto', 'universal'],
                        default='universal', help='Language mode (universal = all file types)')
    parser.add_argument('--max-size', type=int, default=1500, help='Max chunk size in chars')
    parser.add_argument('--exclude', default='node_modules,__pycache__,.git,dist,build,.agent',
                        help='Patterns to exclude')

    args = parser.parse_args()

    src_path = Path(args.src).resolve()
    output_path = Path(args.output)
    # Exclusions are comma-separated substrings matched against full paths.
    exclude_patterns = args.exclude.split(',')

    if not src_path.exists():
        print(f"Error: Source directory '{src_path}' does not exist")
        return 1

    # Mode selection: 'auto' picks whichever of TS/Python has more files.
    lang = args.lang
    if lang == 'auto':
        ts_files = list(src_path.rglob('*.ts')) + list(src_path.rglob('*.tsx'))
        py_files = list(src_path.rglob('*.py'))
        # Ties go to typescript (>=).
        lang = 'typescript' if len(ts_files) >= len(py_files) else 'python'
        print(f"Auto-detected language: {lang}")
    elif lang == 'universal':
        print("Universal mode: Processing Code, Markdown, and Text files")

    print(f"Chunking {src_path}...")
    chunks = chunk_directory(src_path, lang, exclude_patterns, args.max_size)
    print(f"Created {len(chunks)} chunks")

    if len(chunks) == 0:
        print("Warning: No chunks created. Check if source directory has supported files.")
        print("Supported: .ts, .tsx, .js, .jsx, .py, .md, .mdx, .txt")

    # Save output (create parent directories as needed).
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Tally chunks by source file type for the metadata header.
    file_types = {}
    for c in chunks:
        ft = c.metadata.get('file_type', 'unknown')
        file_types[ft] = file_types.get(ft, 0) + 1

    output_data = {
        'metadata': {
            'generated_at': datetime.now().isoformat(),
            'source_path': str(src_path),
            'mode': lang,
            'total_chunks': len(chunks),
            'max_chunk_size': args.max_size,
            'file_types': file_types
        },
        'chunks': [asdict(c) for c in chunks]
    }

    output_path.write_text(json.dumps(output_data, indent=2), encoding='utf-8')
    print(f"Saved to: {output_path}")

    # Print a per-chunk-type summary (function/class/heading_N/...).
    chunk_types = {}
    for c in chunks:
        t = c.metadata['chunk_type']
        chunk_types[t] = chunk_types.get(t, 0) + 1

    print("\nChunk summary:")
    for t, count in sorted(chunk_types.items()):
        print(f"  {t}: {count}")

    return 0


if __name__ == '__main__':
    exit(main())