@musashishao/agent-kit 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -258,6 +258,22 @@ def cmd_init(args: argparse.Namespace) -> int:
  # Command: sync
  # ============================================================================

+ def find_source_dir(project_root: Path) -> Path:
+     """Intelligently find the source directory."""
+     # Priority defaults
+     for folder in ["src", "app", "lib", "scripts", "components"]:
+         if (project_root / folder).exists():
+             return project_root / folder
+
+     # Fallback to root if there are source files in root
+     source_extensions = {".ts", ".tsx", ".js", ".jsx", ".py", ".go", ".c", ".cpp", ".cs"}
+     for item in project_root.iterdir():
+         if item.is_file() and item.suffix in source_extensions:
+             return project_root
+
+     return project_root / "src"  # Ultimate fallback
+
+
  def cmd_sync(args: argparse.Namespace) -> int:
      """Sync AI infrastructure data."""
      project_root = Path(args.project_root).resolve()
@@ -271,6 +287,10 @@ def cmd_sync(args: argparse.Namespace) -> int:
          print("❌ .agent directory not found. Run 'ak init' first.")
          return 1

+     # Determine source directory
+     src_dir = find_source_dir(project_root)
+     print(f"🔍 Detected source directory: {src_dir.relative_to(project_root) if src_dir != project_root else '.'}")
+
      # Determine what to sync
      targets = []
      if args.target == "all":
@@ -285,11 +305,12 @@ def cmd_sync(args: argparse.Namespace) -> int:
          print("\n📊 Updating dependency graph...")
          graph_script = kit_path / "skills" / "graph-mapper" / "scripts" / "generate_graph.py"

-         src_dir = project_root / "src"
-         if not src_dir.exists():
-             src_dir = project_root / "app"
-
-         if src_dir.exists() and graph_script.exists():
+         if not graph_script.exists():
+             print(f" ❌ Graph script not found at: {graph_script}")
+             success = False
+         elif not src_dir.exists() and src_dir != project_root:
+             print(f" ⚠️ Source directory {src_dir} not found")
+         else:
             result = subprocess.run(
                 [
                     "python3", str(graph_script),
@@ -303,21 +324,18 @@ def cmd_sync(args: argparse.Namespace) -> int:
             if result.returncode == 0:
                 print(" ✅ Graph updated")
             else:
-                 print(f" ❌ Graph sync failed: {result.stderr[:200]}")
+                 print(f" ❌ Graph sync failed: {result.stderr}")
                 success = False
-         else:
-             print(" ⚠️ Source directory or script not found")

      # Sync RAG
      if "rag" in targets:
          print("\n📚 Updating RAG chunks...")
          rag_script = kit_path / "skills" / "rag-engineering" / "scripts" / "chunk_code.py"

-         src_dir = project_root / "src"
-         if not src_dir.exists():
-             src_dir = project_root / "app"
-
-         if src_dir.exists() and rag_script.exists():
+         if not rag_script.exists():
+             print(f" ❌ RAG script not found at: {rag_script}")
+             success = False
+         else:
             result = subprocess.run(
                 [
                     "python3", str(rag_script),
@@ -330,10 +348,8 @@ def cmd_sync(args: argparse.Namespace) -> int:
             if result.returncode == 0:
                 print(" ✅ RAG chunks updated")
             else:
-                 print(f" ❌ RAG sync failed: {result.stderr[:200]}")
+                 print(f" ❌ RAG sync failed: {result.stderr}")
                 success = False
-         else:
-             print(" ⚠️ Source directory or script not found")

      # Update timestamp cache
      cache_file = agent_dir / ".cache" / "timestamps.json"
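
The effect of the new find_source_dir fallback chain is easiest to see in isolation. A minimal sketch, assuming only pathlib and the function exactly as defined in the hunk above (the temporary project layout is invented for illustration):

    from pathlib import Path
    import tempfile

    root = Path(tempfile.mkdtemp())            # hypothetical project with no src/ or app/
    (root / "main.py").write_text("print('hi')\n")

    # No priority folder exists, but a source file sits at the root,
    # so the root itself is returned; with no source files at all,
    # the ultimate fallback would be root / "src".
    print(find_source_dir(root))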
@@ -311,6 +311,165 @@ class PythonAnalyzer:
          return nodes


+ class MarkdownAnalyzer:
+     """Analyzes Markdown files for internal links and references."""
+
+     EXTENSIONS = {'.md', '.mdx', '.markdown'}
+
+     # Regex patterns for link detection
+     LINK_PATTERNS = [
+         # Standard markdown link: [text](path)
+         r'\[([^\]]+)\]\(([^)]+)\)',
+         # Wikilink: [[path]] or [[path|text]]
+         r'\[\[([^\]|]+)(?:\|[^\]]+)?\]\]',
+     ]
+
+     # Pattern for image refs (also a form of dependency)
+     IMAGE_PATTERN = r'!\[([^\]]*)\]\(([^)]+)\)'
+
+     def __init__(self, base_path: Path, exclude_patterns: List[str]):
+         self.base_path = base_path
+         self.exclude_patterns = exclude_patterns
+
+     def should_exclude(self, path: Path) -> bool:
+         """Check if path should be excluded."""
+         path_str = str(path)
+         for pattern in self.exclude_patterns:
+             if pattern in path_str:
+                 return True
+         return False
+
+     def analyze_file(self, file_path: Path) -> Optional[Node]:
+         """Analyze a single Markdown file for links."""
+         if self.should_exclude(file_path):
+             return None
+
+         try:
+             content = file_path.read_text(encoding='utf-8')
+         except (UnicodeDecodeError, PermissionError):
+             return None
+
+         relative_path = str(file_path.relative_to(self.base_path))
+         node_type = self._detect_type(relative_path, content)
+
+         # Extract internal links
+         links = []
+
+         # Standard markdown links
+         for match in re.findall(self.LINK_PATTERNS[0], content):
+             link_path = match[1]
+             resolved = self._resolve_link(link_path, file_path)
+             if resolved:
+                 links.append(resolved)
+
+         # Wikilinks
+         for match in re.findall(self.LINK_PATTERNS[1], content):
+             resolved = self._resolve_wikilink(match)
+             if resolved:
+                 links.append(resolved)
+
+         # Extract "exports" (main topics/headings)
+         exports = []
+         # Get main title (first h1)
+         title_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
+         if title_match:
+             exports.append(title_match.group(1).strip())
+
+         # Get h2 sections as additional exports
+         for match in re.findall(r'^##\s+(.+)$', content, re.MULTILINE):
+             exports.append(match.strip())
+
+         return Node(
+             id=relative_path,
+             type=node_type,
+             path=relative_path,
+             imports=list(set(links)),  # Links = imports in docs context
+             exports=exports[:10]  # Limit to first 10 headings
+         )
+
+     def _detect_type(self, path: str, content: str) -> str:
+         """Detect the type of documentation file."""
+         path_lower = path.lower()
+
+         if 'readme' in path_lower:
+             return 'readme'
+         elif 'changelog' in path_lower or 'history' in path_lower:
+             return 'changelog'
+         elif 'contributing' in path_lower:
+             return 'contributing'
+         elif 'license' in path_lower:
+             return 'license'
+         elif '/docs/' in path_lower or path_lower.startswith('docs/'):
+             return 'documentation'
+         elif '/guides/' in path_lower or '/tutorials/' in path_lower:
+             return 'guide'
+         elif '/api/' in path_lower or 'api' in path_lower:
+             return 'api_doc'
+         elif 'plan' in path_lower or 'roadmap' in path_lower:
+             return 'plan'
+         else:
+             return 'document'
+
+     def _resolve_link(self, link_path: str, from_file: Path) -> Optional[str]:
+         """Resolve a markdown link to a relative path."""
+         # Skip external links
+         if link_path.startswith(('http://', 'https://', 'mailto:', '#')):
+             return None
+
+         # Skip anchor-only links
+         if link_path.startswith('#'):
+             return None
+
+         # Remove anchor from path
+         if '#' in link_path:
+             link_path = link_path.split('#')[0]
+
+         if not link_path:
+             return None
+
+         # Resolve relative path
+         from_dir = from_file.parent
+         resolved = (from_dir / link_path).resolve()
+
+         try:
+             relative = str(resolved.relative_to(self.base_path))
+             # Check if file exists
+             if resolved.exists():
+                 return relative
+             return None
+         except ValueError:
+             return None
+
+     def _resolve_wikilink(self, link_name: str) -> Optional[str]:
+         """Resolve a wikilink to a file path."""
+         # Search for file matching the wikilink name
+         search_name = link_name.strip()
+
+         # Try exact match with .md extension
+         for ext in self.EXTENSIONS:
+             for file_path in self.base_path.rglob(f"*{ext}"):
+                 if file_path.stem.lower() == search_name.lower():
+                     try:
+                         return str(file_path.relative_to(self.base_path))
+                     except ValueError:
+                         continue
+
+         return None
+
+     def analyze_directory(self, directory: Path) -> List[Node]:
+         """Analyze all Markdown files in a directory."""
+         nodes = []
+
+         for ext in self.EXTENSIONS:
+             for file_path in directory.rglob(f'*{ext}'):
+                 if file_path.is_file():
+                     node = self.analyze_file(file_path)
+                     if node:
+                         nodes.append(node)
+
+         return nodes
+
+
  def build_edges(nodes: List[Node]) -> List[Edge]:
      """Build edges from node imports."""
      edges = []
@@ -424,13 +583,16 @@ def generate_markdown(graph: Graph, output_path: Path):


  def main():
-     parser = argparse.ArgumentParser(description='Generate dependency graph')
+     parser = argparse.ArgumentParser(
+         description='Universal Dependency Graph Generator - Code and Documentation'
+     )
      parser.add_argument('--src', default='./src', help='Source directory')
      parser.add_argument('--output', default='.agent/graph.json', help='Output file')
      parser.add_argument('--format', choices=['json', 'markdown', 'both'], default='both')
-     parser.add_argument('--lang', choices=['typescript', 'python', 'auto'], default='auto')
+     parser.add_argument('--lang', choices=['typescript', 'python', 'auto', 'universal'],
+                         default='universal', help='Language mode (universal = Code + Markdown)')
      parser.add_argument('--depth', type=int, default=3, help='Max depth for impact analysis')
-     parser.add_argument('--exclude', default='node_modules,__pycache__,.git,dist,build',
+     parser.add_argument('--exclude', default='node_modules,__pycache__,.git,dist,build,.agent',
                          help='Comma-separated patterns to exclude')

      args = parser.parse_args()
@@ -443,37 +605,77 @@ def main():
          print(f"Error: Source directory '{src_path}' does not exist")
          return 1

-     # Detect language if auto
+     # Collect all nodes
+     all_nodes = []
      lang = args.lang
-     if lang == 'auto':
+
+     if lang == 'universal':
+         print("Universal mode: Analyzing Code + Markdown files")
+
+         # Analyze TypeScript/JavaScript
+         ts_analyzer = TypeScriptAnalyzer(src_path, exclude_patterns)
+         ts_nodes = ts_analyzer.analyze_directory(src_path)
+         all_nodes.extend(ts_nodes)
+         print(f" TypeScript/JS: {len(ts_nodes)} files")
+
+         # Analyze Python
+         py_analyzer = PythonAnalyzer(src_path, exclude_patterns)
+         py_nodes = py_analyzer.analyze_directory(src_path)
+         all_nodes.extend(py_nodes)
+         print(f" Python: {len(py_nodes)} files")
+
+         # Analyze Markdown
+         md_analyzer = MarkdownAnalyzer(src_path, exclude_patterns)
+         md_nodes = md_analyzer.analyze_directory(src_path)
+         all_nodes.extend(md_nodes)
+         print(f" Markdown: {len(md_nodes)} files")
+
+     elif lang == 'auto':
         ts_files = list(src_path.rglob('*.ts')) + list(src_path.rglob('*.tsx'))
         py_files = list(src_path.rglob('*.py'))
         lang = 'typescript' if len(ts_files) >= len(py_files) else 'python'
         print(f"Auto-detected language: {lang}")
-
-     # Analyze based on language
-     if lang == 'typescript':
-         analyzer = TypeScriptAnalyzer(src_path, exclude_patterns)
+
+         if lang == 'typescript':
+             analyzer = TypeScriptAnalyzer(src_path, exclude_patterns)
+         else:
+             analyzer = PythonAnalyzer(src_path, exclude_patterns)
+
+         all_nodes = analyzer.analyze_directory(src_path)
      else:
-         analyzer = PythonAnalyzer(src_path, exclude_patterns)
+         # Specific language
+         if lang == 'typescript':
+             analyzer = TypeScriptAnalyzer(src_path, exclude_patterns)
+         else:
+             analyzer = PythonAnalyzer(src_path, exclude_patterns)
+
+         all_nodes = analyzer.analyze_directory(src_path)

-     print(f"Analyzing {src_path}...")
-     nodes = analyzer.analyze_directory(src_path)
-     print(f"Found {len(nodes)} files")
+     print(f"Total: {len(all_nodes)} files")
+
+     if len(all_nodes) == 0:
+         print("Warning: No files found. Check source directory and exclude patterns.")
+         print("Supported: .ts, .tsx, .js, .jsx, .py, .md, .mdx")

      # Build graph
-     edges = build_edges(nodes)
-     print(f"Found {len(edges)} dependencies")
+     edges = build_edges(all_nodes)
+     print(f"Found {len(edges)} dependencies/links")
+
+     # Categorize by type
+     type_counts = {}
+     for node in all_nodes:
+         type_counts[node.type] = type_counts.get(node.type, 0) + 1

      graph = Graph(
-         nodes=nodes,
+         nodes=all_nodes,
          edges=edges,
          metadata={
              "generated_at": datetime.now().isoformat(),
              "source_path": str(src_path),
-             "language": lang,
-             "total_files": len(nodes),
-             "total_edges": len(edges)
+             "mode": lang,
+             "total_files": len(all_nodes),
+             "total_edges": len(edges),
+             "file_types": type_counts
          }
      )

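The two LINK_PATTERNS regexes added above drive all of MarkdownAnalyzer's link detection. This minimal sketch shows what they capture on an invented snippet, using only the standard re module:

    import re

    LINK = r'\[([^\]]+)\]\(([^)]+)\)'            # [text](path)
    WIKI = r'\[\[([^\]|]+)(?:\|[^\]]+)?\]\]'     # [[path]] or [[path|text]]

    text = "See [setup](docs/setup.md) and [[architecture|the design]]."
    print(re.findall(LINK, text))   # [('setup', 'docs/setup.md')]
    print(re.findall(WIKI, text))   # ['architecture']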
@@ -505,29 +505,320 @@ class PythonChunker:
          )


+ class MarkdownChunker:
+     """Chunk Markdown files by heading sections."""
+
+     EXTENSIONS = {'.md', '.mdx', '.markdown'}
+
+     def __init__(self, max_chunk_size: int = 2000, overlap: int = 100):
+         self.max_chunk_size = max_chunk_size
+         self.overlap = overlap
+
+     def chunk_file(self, file_path: Path, base_path: Path) -> List[Chunk]:
+         """Chunk a Markdown file by headings."""
+         try:
+             content = file_path.read_text(encoding='utf-8')
+         except (UnicodeDecodeError, PermissionError):
+             return []
+
+         relative_path = str(file_path.relative_to(base_path))
+         chunks = []
+
+         # Extract sections by heading
+         sections = self._extract_sections(content)
+
+         if sections:
+             for section in sections:
+                 chunk_content = section['content']
+
+                 # If section is too large, split it
+                 if len(chunk_content) > self.max_chunk_size:
+                     sub_chunks = self._split_by_paragraphs(chunk_content)
+                     for i, sub in enumerate(sub_chunks):
+                         chunks.append(self._create_chunk(
+                             content=sub,
+                             file_path=relative_path,
+                             section_title=f"{section['title']}_part{i+1}",
+                             heading_level=section['level'],
+                             parent_headings=section['parents'],
+                             start_line=section['start_line'],
+                             end_line=section['end_line']
+                         ))
+                 else:
+                     chunks.append(self._create_chunk(
+                         content=chunk_content,
+                         file_path=relative_path,
+                         section_title=section['title'],
+                         heading_level=section['level'],
+                         parent_headings=section['parents'],
+                         start_line=section['start_line'],
+                         end_line=section['end_line']
+                     ))
+         else:
+             # No headings found, treat entire file as one chunk or split by paragraphs
+             if len(content) <= self.max_chunk_size:
+                 chunks.append(self._create_chunk(
+                     content=content,
+                     file_path=relative_path,
+                     section_title=file_path.stem,
+                     heading_level=0,
+                     parent_headings=[],
+                     start_line=1,
+                     end_line=content.count('\n') + 1
+                 ))
+             else:
+                 sub_chunks = self._split_by_paragraphs(content)
+                 for i, sub in enumerate(sub_chunks):
+                     chunks.append(self._create_chunk(
+                         content=sub,
+                         file_path=relative_path,
+                         section_title=f"{file_path.stem}_part{i+1}",
+                         heading_level=0,
+                         parent_headings=[],
+                         start_line=1,
+                         end_line=content.count('\n') + 1
+                     ))
+
+         return chunks
+
+     def _extract_sections(self, content: str) -> List[Dict]:
+         """Extract sections based on Markdown headings."""
+         sections = []
+         lines = content.split('\n')
+
+         heading_pattern = re.compile(r'^(#{1,6})\s+(.+)$')
+
+         current_section = None
+         current_lines = []
+         parent_stack = []  # Track parent headings for context
+
+         for i, line in enumerate(lines):
+             match = heading_pattern.match(line)
+
+             if match:
+                 # Save previous section
+                 if current_section:
+                     current_section['content'] = '\n'.join(current_lines).strip()
+                     current_section['end_line'] = i
+                     if current_section['content']:
+                         sections.append(current_section)
+
+                 # Update parent stack
+                 level = len(match.group(1))
+                 title = match.group(2).strip()
+
+                 # Pop parents that are same or deeper level
+                 while parent_stack and parent_stack[-1]['level'] >= level:
+                     parent_stack.pop()
+
+                 parents = [p['title'] for p in parent_stack]
+
+                 # Start new section
+                 current_section = {
+                     'title': title,
+                     'level': level,
+                     'parents': parents.copy(),
+                     'start_line': i + 1,
+                     'end_line': i + 1,
+                     'content': ''
+                 }
+                 current_lines = [line]
+
+                 # Add this heading to parent stack
+                 parent_stack.append({'level': level, 'title': title})
+             elif current_section:
+                 current_lines.append(line)
+
+         # Don't forget last section
+         if current_section:
+             current_section['content'] = '\n'.join(current_lines).strip()
+             current_section['end_line'] = len(lines)
+             if current_section['content']:
+                 sections.append(current_section)
+
+         return sections
+
+     def _split_by_paragraphs(self, content: str) -> List[str]:
+         """Split content by paragraphs when too large."""
+         chunks = []
+         paragraphs = re.split(r'\n\s*\n', content)
+
+         current_chunk = []
+         current_size = 0
+
+         for para in paragraphs:
+             para_size = len(para) + 2  # +2 for paragraph break
+
+             if current_size + para_size > self.max_chunk_size and current_chunk:
+                 chunks.append('\n\n'.join(current_chunk))
+                 current_chunk = []
+                 current_size = 0
+
+             current_chunk.append(para)
+             current_size += para_size
+
+         if current_chunk:
+             chunks.append('\n\n'.join(current_chunk))
+
+         return chunks
+
+     def _create_chunk(
+         self,
+         content: str,
+         file_path: str,
+         section_title: str,
+         heading_level: int,
+         parent_headings: List[str],
+         start_line: int,
+         end_line: int
+     ) -> Chunk:
+         """Create a Chunk object with rich context."""
+         # Build context string for better retrieval
+         context_path = ' > '.join(parent_headings + [section_title]) if parent_headings else section_title
+
+         return Chunk(
+             id=generate_chunk_id(file_path, content),
+             content=content,
+             metadata={
+                 'file_path': file_path,
+                 'file_type': 'markdown',
+                 'chunk_type': f'heading_{heading_level}' if heading_level > 0 else 'paragraph',
+                 'name': section_title,
+                 'context_path': context_path,
+                 'heading_level': heading_level,
+                 'parent_headings': parent_headings,
+                 'start_line': start_line,
+                 'end_line': end_line,
+                 'char_count': len(content),
+                 'line_count': content.count('\n') + 1
+             }
+         )
+
+
+ class TextChunker:
+     """Chunk plain text files by paragraphs."""
+
+     EXTENSIONS = {'.txt', '.text', '.log'}
+
+     def __init__(self, max_chunk_size: int = 1500, overlap: int = 100):
+         self.max_chunk_size = max_chunk_size
+         self.overlap = overlap
+
+     def chunk_file(self, file_path: Path, base_path: Path) -> List[Chunk]:
+         """Chunk a text file by paragraphs."""
+         try:
+             content = file_path.read_text(encoding='utf-8')
+         except (UnicodeDecodeError, PermissionError):
+             return []
+
+         relative_path = str(file_path.relative_to(base_path))
+         chunks = []
+
+         if len(content) <= self.max_chunk_size:
+             chunks.append(self._create_chunk(
+                 content=content,
+                 file_path=relative_path,
+                 name=file_path.stem,
+                 start_line=1,
+                 end_line=content.count('\n') + 1
+             ))
+         else:
+             # Split by paragraphs
+             paragraphs = re.split(r'\n\s*\n', content)
+             current_chunk = []
+             current_size = 0
+             chunk_index = 0
+
+             for para in paragraphs:
+                 para_size = len(para) + 2
+
+                 if current_size + para_size > self.max_chunk_size and current_chunk:
+                     chunk_index += 1
+                     chunks.append(self._create_chunk(
+                         content='\n\n'.join(current_chunk),
+                         file_path=relative_path,
+                         name=f"{file_path.stem}_part{chunk_index}",
+                         start_line=1,
+                         end_line=content.count('\n') + 1
+                     ))
+                     current_chunk = []
+                     current_size = 0
+
+                 current_chunk.append(para)
+                 current_size += para_size
+
+             if current_chunk:
+                 chunk_index += 1
+                 chunks.append(self._create_chunk(
+                     content='\n\n'.join(current_chunk),
+                     file_path=relative_path,
+                     name=f"{file_path.stem}_part{chunk_index}",
+                     start_line=1,
+                     end_line=content.count('\n') + 1
+                 ))
+
+         return chunks
+
+     def _create_chunk(
+         self,
+         content: str,
+         file_path: str,
+         name: str,
+         start_line: int,
+         end_line: int
+     ) -> Chunk:
+         """Create a Chunk object."""
+         return Chunk(
+             id=generate_chunk_id(file_path, content),
+             content=content,
+             metadata={
+                 'file_path': file_path,
+                 'file_type': 'text',
+                 'chunk_type': 'paragraph',
+                 'name': name,
+                 'start_line': start_line,
+                 'end_line': end_line,
+                 'char_count': len(content),
+                 'line_count': content.count('\n') + 1
+             }
+         )
+
+
  def chunk_directory(
      src_path: Path,
      lang: str,
      exclude_patterns: List[str],
      max_chunk_size: int = 1500
  ) -> List[Chunk]:
-     """Chunk all files in a directory."""
+     """Chunk all files in a directory - Universal support for Code, Docs, and Text."""
      all_chunks = []

-     # Select chunker
-     if lang == 'typescript':
-         chunker = TypeScriptChunker(max_chunk_size=max_chunk_size)
-         extensions = TypeScriptChunker.EXTENSIONS
-     else:
-         chunker = PythonChunker(max_chunk_size=max_chunk_size)
-         extensions = {'.py'}
+     # Initialize all chunkers
+     ts_chunker = TypeScriptChunker(max_chunk_size=max_chunk_size)
+     py_chunker = PythonChunker(max_chunk_size=max_chunk_size)
+     md_chunker = MarkdownChunker(max_chunk_size=max_chunk_size)
+     txt_chunker = TextChunker(max_chunk_size=max_chunk_size)
+
+     # Map extensions to chunkers
+     extension_map = {}
+     for ext in TypeScriptChunker.EXTENSIONS:
+         extension_map[ext] = ts_chunker
+     extension_map['.py'] = py_chunker
+     for ext in MarkdownChunker.EXTENSIONS:
+         extension_map[ext] = md_chunker
+     for ext in TextChunker.EXTENSIONS:
+         extension_map[ext] = txt_chunker
+
+     # All supported extensions
+     all_extensions = set(extension_map.keys())

      # Process files
      for file_path in src_path.rglob('*'):
          if not file_path.is_file():
              continue

-         if file_path.suffix not in extensions:
+         # Check if extension is supported
+         if file_path.suffix not in all_extensions:
              continue

          # Check exclusions
@@ -535,19 +826,25 @@ def chunk_directory(
          if any(pattern in path_str for pattern in exclude_patterns):
              continue

-         chunks = chunker.chunk_file(file_path, src_path)
-         all_chunks.extend(chunks)
+         # Select appropriate chunker
+         chunker = extension_map.get(file_path.suffix)
+         if chunker:
+             chunks = chunker.chunk_file(file_path, src_path)
+             all_chunks.extend(chunks)

      return all_chunks


  def main():
-     parser = argparse.ArgumentParser(description='Chunk code files')
+     parser = argparse.ArgumentParser(
+         description='Universal Chunker - Code, Markdown, and Text files'
+     )
      parser.add_argument('--src', default='./src', help='Source directory')
      parser.add_argument('--output', default='.agent/rag/chunks.json', help='Output file')
-     parser.add_argument('--lang', choices=['typescript', 'python', 'auto'], default='auto')
+     parser.add_argument('--lang', choices=['typescript', 'python', 'auto', 'universal'],
+                         default='universal', help='Language mode (universal = all file types)')
      parser.add_argument('--max-size', type=int, default=1500, help='Max chunk size in chars')
-     parser.add_argument('--exclude', default='node_modules,__pycache__,.git,dist,build',
+     parser.add_argument('--exclude', default='node_modules,__pycache__,.git,dist,build,.agent',
                          help='Patterns to exclude')

      args = parser.parse_args()
@@ -560,28 +857,41 @@ def main():
          print(f"Error: Source directory '{src_path}' does not exist")
          return 1

-     # Auto-detect language
+     # Mode selection
      lang = args.lang
      if lang == 'auto':
          ts_files = list(src_path.rglob('*.ts')) + list(src_path.rglob('*.tsx'))
          py_files = list(src_path.rglob('*.py'))
          lang = 'typescript' if len(ts_files) >= len(py_files) else 'python'
          print(f"Auto-detected language: {lang}")
+     elif lang == 'universal':
+         print("Universal mode: Processing Code, Markdown, and Text files")

      print(f"Chunking {src_path}...")
      chunks = chunk_directory(src_path, lang, exclude_patterns, args.max_size)
      print(f"Created {len(chunks)} chunks")

+     if len(chunks) == 0:
+         print("Warning: No chunks created. Check if source directory has supported files.")
+         print("Supported: .ts, .tsx, .js, .jsx, .py, .md, .mdx, .txt")
+
      # Save output
      output_path.parent.mkdir(parents=True, exist_ok=True)

+     # Categorize chunks by file type
+     file_types = {}
+     for c in chunks:
+         ft = c.metadata.get('file_type', 'unknown')
+         file_types[ft] = file_types.get(ft, 0) + 1
+
      output_data = {
          'metadata': {
              'generated_at': datetime.now().isoformat(),
              'source_path': str(src_path),
-             'language': lang,
+             'mode': lang,
              'total_chunks': len(chunks),
-             'max_chunk_size': args.max_size
+             'max_chunk_size': args.max_size,
+             'file_types': file_types
          },
          'chunks': [asdict(c) for c in chunks]
      }
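
The parent-stack bookkeeping in _extract_sections is what gives each chunk its context. Below is a simplified, stand-alone re-implementation of just that idea (not the shipped class), run on an invented document, to show how nested headings become a context path:

    import re

    doc = "# Guide\n\n## Install\npip install x\n\n### Linux\napt install x\n\n## Usage\nrun it\n"
    heading = re.compile(r'^(#{1,6})\s+(.+)$')

    stack, paths = [], []
    for line in doc.split('\n'):
        m = heading.match(line)
        if not m:
            continue
        level, title = len(m.group(1)), m.group(2).strip()
        while stack and stack[-1][0] >= level:     # pop same-or-deeper parents
            stack.pop()
        paths.append(' > '.join([t for _, t in stack] + [title]))
        stack.append((level, title))

    print(paths)  # ['Guide', 'Guide > Install', 'Guide > Install > Linux', 'Guide > Usage']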
@@ -0,0 +1,48 @@
+ # PLAN: Universal Intelligence Engine (UIE)
+
+ > Goal: Upgrade Agent Kit to support every kind of working folder (Docs, Research, Code) with comprehensive knowledge understanding.
+
+ ## Phase 1: Universal RAG (Smart Chunking for every format) ✅ COMPLETED
+ - [x] **Upgrade `chunk_code.py` into a Universal Chunker**:
+   - [x] Add `MarkdownChunker`: split into segments by heading (`#`, `##`, `###`).
+   - [x] Add `TextChunker`: split by paragraph.
+   - [x] Keep the existing AST-based code chunking (TS, Python).
+ - [x] **Contextual Metadata for Docs**:
+   - [x] Automatically extract each file's main title as context for every chunk inside it.
+   - [x] Record a `context_path` (Parent > Child heading) so the AI knows the surrounding context.
+
+ ## Phase 2: Knowledge Graph for Documents (Relationship Mapping) ✅ COMPLETED
+ - [x] **Upgrade `generate_graph.py`**:
+   - [x] Add `MarkdownAnalyzer`: scan internal links `[text](file.md)` and wikilinks `[[file]]`.
+   - [x] Treat exported headings (H1, H2) as a document's "API".
+   - [x] `universal` mode: combine Code + Markdown in a single graph.
+
+ ## Phase 3: MCP Gateway Enhancements
+ - [ ] **Content-Type Awareness**:
+   - [ ] The `analyze_dependencies` tool returns "References" instead of "Imports" for docs.
+   - [ ] Alias the `search_code_logic` tool as `search_knowledge`.
+ - [ ] **Auto-Detection**: the gateway reports the project type (Creative, Tech, Mixed).
+
+ ## Phase 4: CLI Evolution (`ak` command)
+ - [ ] **Smarter `ak init`**:
+   - [ ] Do not require a `src` or `app` directory.
+   - [ ] Automatically create `AGENTS.md` from the "General Knowledge" template when no code is detected.
+ - [ ] **Comprehensive `ak sync`**:
+   - [ ] Scan every file (except the ignore list) so that no knowledge is missed.
+
+ ## Phase 5: Verification & Beta Test
+ - [ ] **Test Case 1**: A project of only 100 Markdown files (a user guide).
+ - [ ] **Test Case 2**: A mixed project (Next.js + Docs + API Specs).
+ - [ ] **Test Case 3**: A research project (many .txt files and scattered notes).
+
+ ---
+
+ ## Agent Assignments
+ - **Python Specialist**: Handle the Chunker and Graph Mapper logic (Phases 1 & 2).
+ - **TypeScript Expert**: Update the MCP Gateway (Phase 3).
+ - **Orchestrator**: Update the CLI and finalize the Docs (Phase 4).
+
+ ## Verification Checklist
+ - [ ] `ai status` shows ✅ RAG and ✅ Graph even in a folder with no code.
+ - [ ] The AI can find information buried deep in a sub-section of a long Markdown file.
+ - [ ] The dependency graph shows the links between documentation files.
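
Tying the PLAN's "Contextual Metadata" item back to the chunker above: a chunk cut from a nested Markdown section would carry metadata shaped roughly like the dict below (the field names come from MarkdownChunker._create_chunk in this diff; the values are invented for illustration):

    example_chunk_metadata = {
        'file_path': 'docs/guide.md',
        'file_type': 'markdown',
        'chunk_type': 'heading_3',
        'name': 'Linux',
        'context_path': 'Guide > Install > Linux',
        'heading_level': 3,
        'parent_headings': ['Guide', 'Install'],
        'start_line': 12,
        'end_line': 20,
        'char_count': 342,
        'line_count': 9,
    }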
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@musashishao/agent-kit",
-   "version": "1.3.0",
+   "version": "1.4.0",
    "description": "AI Agent templates - Skills, Agents, Workflows, and AI-Ready Data Infrastructure Gateway",
    "main": "index.js",
    "bin": {