@musashishao/agent-kit 1.3.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -179,17 +179,25 @@ def cmd_init(args: argparse.Namespace) -> int:
  print(f"\n[{steps_completed + 1}/{total_steps}] Generating dependency graph...")
  graph_script = kit_path / "skills" / "graph-mapper" / "scripts" / "generate_graph.py"
 
- src_dir = project_root / "src"
- if not src_dir.exists():
- src_dir = project_root / "app"
+ # Smart source detection
+ src_dir = None
+ for folder in ["src", "app", "lib", "scripts", "components", "docs"]:
+ if (project_root / folder).exists():
+ src_dir = project_root / folder
+ break
+
+ if src_dir is None:
+ # Fallback to project root
+ src_dir = project_root
 
- if src_dir.exists() and graph_script.exists():
+ if graph_script.exists():
  result = subprocess.run(
  [
  "python3", str(graph_script),
  "--src", str(src_dir),
  "--output", str(agent_dir / "graph.json"),
  "--format", "both",
+ "--lang", "universal",
  ],
  capture_output=True,
  text=True,
@@ -199,7 +207,7 @@ def cmd_init(args: argparse.Namespace) -> int:
  else:
  print(" ⚠️ No source code found or script failed")
  else:
- print(" ⚠️ No src/ or app/ directory found, skipping")
+ print(" ⚠️ Graph script not found, skipping")
  steps_completed += 1
 
  # Step 5: Configure AI hosts
@@ -258,6 +266,29 @@ def cmd_init(args: argparse.Namespace) -> int:
  # Command: sync
  # ============================================================================
 
+ def find_source_dir(project_root: Path) -> Path:
+ """Intelligently find the source directory."""
+ # Priority folders
+ for folder in ["src", "app", "lib", "scripts", "components"]:
+ if (project_root / folder).exists():
+ return project_root / folder
+
+ # Check if there are source files in root
+ source_extensions = {".ts", ".tsx", ".js", ".jsx", ".py", ".go", ".c", ".cpp", ".cs", ".md"}
+ has_source_files = False
+ for item in project_root.iterdir():
+ if item.is_file() and item.suffix in source_extensions:
+ has_source_files = True
+ break
+
+ # Check for docs folder
+ if (project_root / "docs").exists():
+ return project_root / "docs"
+
+ # Fallback: use project root if it has any content, otherwise return root anyway
+ return project_root
+
+
  def cmd_sync(args: argparse.Namespace) -> int:
  """Sync AI infrastructure data."""
  project_root = Path(args.project_root).resolve()
@@ -271,6 +302,10 @@ def cmd_sync(args: argparse.Namespace) -> int:
  print("❌ .agent directory not found. Run 'ak init' first.")
  return 1
 
+ # Determine source directory
+ src_dir = find_source_dir(project_root)
+ print(f"🔍 Detected source directory: {src_dir.relative_to(project_root) if src_dir != project_root else '.'}")
+
  # Determine what to sync
  targets = []
  if args.target == "all":
@@ -285,17 +320,19 @@ def cmd_sync(args: argparse.Namespace) -> int:
  print("\n📊 Updating dependency graph...")
  graph_script = kit_path / "skills" / "graph-mapper" / "scripts" / "generate_graph.py"
 
- src_dir = project_root / "src"
- if not src_dir.exists():
- src_dir = project_root / "app"
-
- if src_dir.exists() and graph_script.exists():
+ if not graph_script.exists():
+ print(f" ❌ Graph script not found at: {graph_script}")
+ success = False
+ elif not src_dir.exists():
+ print(f" ⚠️ Source directory {src_dir} not found")
+ else:
  result = subprocess.run(
  [
  "python3", str(graph_script),
  "--src", str(src_dir),
  "--output", str(agent_dir / "graph.json"),
  "--format", "both",
+ "--lang", "universal",
  ],
  capture_output=True,
  text=True,
@@ -303,26 +340,26 @@ def cmd_sync(args: argparse.Namespace) -> int:
  if result.returncode == 0:
  print(" ✅ Graph updated")
  else:
- print(f" ❌ Graph sync failed: {result.stderr[:200]}")
+ print(f" ❌ Graph sync failed: {result.stderr}")
  success = False
- else:
- print(" ⚠️ Source directory or script not found")
 
  # Sync RAG
  if "rag" in targets:
  print("\n📚 Updating RAG chunks...")
  rag_script = kit_path / "skills" / "rag-engineering" / "scripts" / "chunk_code.py"
 
- src_dir = project_root / "src"
- if not src_dir.exists():
- src_dir = project_root / "app"
-
- if src_dir.exists() and rag_script.exists():
+ if not rag_script.exists():
+ print(f" ❌ RAG script not found at: {rag_script}")
+ success = False
+ elif not src_dir.exists():
+ print(f" ⚠️ Source directory {src_dir} not found")
+ else:
  result = subprocess.run(
  [
  "python3", str(rag_script),
  "--src", str(src_dir),
  "--output", str(agent_dir / "rag" / "chunks.json"),
+ "--lang", "universal",
  ],
  capture_output=True,
  text=True,
@@ -330,10 +367,8 @@ def cmd_sync(args: argparse.Namespace) -> int:
  if result.returncode == 0:
  print(" ✅ RAG chunks updated")
  else:
- print(f" ❌ RAG sync failed: {result.stderr[:200]}")
+ print(f" ❌ RAG sync failed: {result.stderr}")
  success = False
- else:
- print(" ⚠️ Source directory or script not found")
 
  # Update timestamp cache
  cache_file = agent_dir / ".cache" / "timestamps.json"
@@ -311,6 +311,165 @@ class PythonAnalyzer:
  return nodes
 
 
+ class MarkdownAnalyzer:
+ """Analyzes Markdown files for internal links and references."""
+
+ EXTENSIONS = {'.md', '.mdx', '.markdown'}
+
+ # Regex patterns for link detection
+ LINK_PATTERNS = [
+ # Standard markdown link: [text](path)
+ r'\[([^\]]+)\]\(([^)]+)\)',
+ # Wikilink: [[path]] or [[path|text]]
+ r'\[\[([^\]|]+)(?:\|[^\]]+)?\]\]',
+ ]
+
+ # Pattern for image refs (also a form of dependency)
+ IMAGE_PATTERN = r'!\[([^\]]*)\]\(([^)]+)\)'
+
+ def __init__(self, base_path: Path, exclude_patterns: List[str]):
+ self.base_path = base_path
+ self.exclude_patterns = exclude_patterns
+
+ def should_exclude(self, path: Path) -> bool:
+ """Check if path should be excluded."""
+ path_str = str(path)
+ for pattern in self.exclude_patterns:
+ if pattern in path_str:
+ return True
+ return False
+
+ def analyze_file(self, file_path: Path) -> Optional[Node]:
+ """Analyze a single Markdown file for links."""
+ if self.should_exclude(file_path):
+ return None
+
+ try:
+ content = file_path.read_text(encoding='utf-8')
+ except (UnicodeDecodeError, PermissionError):
+ return None
+
+ relative_path = str(file_path.relative_to(self.base_path))
+ node_type = self._detect_type(relative_path, content)
+
+ # Extract internal links
+ links = []
+
+ # Standard markdown links
+ for match in re.findall(self.LINK_PATTERNS[0], content):
+ link_path = match[1]
+ resolved = self._resolve_link(link_path, file_path)
+ if resolved:
+ links.append(resolved)
+
+ # Wikilinks
+ for match in re.findall(self.LINK_PATTERNS[1], content):
+ resolved = self._resolve_wikilink(match)
+ if resolved:
+ links.append(resolved)
+
+ # Extract "exports" (main topics/headings)
+ exports = []
+ # Get main title (first h1)
+ title_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
+ if title_match:
+ exports.append(title_match.group(1).strip())
+
+ # Get h2 sections as additional exports
+ for match in re.findall(r'^##\s+(.+)$', content, re.MULTILINE):
+ exports.append(match.strip())
+
+ return Node(
+ id=relative_path,
+ type=node_type,
+ path=relative_path,
+ imports=list(set(links)), # Links = imports in docs context
+ exports=exports[:10] # Limit to first 10 headings
+ )
+
+ def _detect_type(self, path: str, content: str) -> str:
+ """Detect the type of documentation file."""
+ path_lower = path.lower()
+
+ if 'readme' in path_lower:
+ return 'readme'
+ elif 'changelog' in path_lower or 'history' in path_lower:
+ return 'changelog'
+ elif 'contributing' in path_lower:
+ return 'contributing'
+ elif 'license' in path_lower:
+ return 'license'
+ elif '/docs/' in path_lower or path_lower.startswith('docs/'):
+ return 'documentation'
+ elif '/guides/' in path_lower or '/tutorials/' in path_lower:
+ return 'guide'
+ elif '/api/' in path_lower or 'api' in path_lower:
+ return 'api_doc'
+ elif 'plan' in path_lower or 'roadmap' in path_lower:
+ return 'plan'
+ else:
+ return 'document'
+
+ def _resolve_link(self, link_path: str, from_file: Path) -> Optional[str]:
+ """Resolve a markdown link to a relative path."""
+ # Skip external links
+ if link_path.startswith(('http://', 'https://', 'mailto:', '#')):
+ return None
+
+ # Skip anchor-only links
+ if link_path.startswith('#'):
+ return None
+
+ # Remove anchor from path
+ if '#' in link_path:
+ link_path = link_path.split('#')[0]
+
+ if not link_path:
+ return None
+
+ # Resolve relative path
+ from_dir = from_file.parent
+ resolved = (from_dir / link_path).resolve()
+
+ try:
+ relative = str(resolved.relative_to(self.base_path))
+ # Check if file exists
+ if resolved.exists():
+ return relative
+ return None
+ except ValueError:
+ return None
+
+ def _resolve_wikilink(self, link_name: str) -> Optional[str]:
+ """Resolve a wikilink to a file path."""
+ # Search for file matching the wikilink name
+ search_name = link_name.strip()
+
+ # Try exact match with .md extension
+ for ext in self.EXTENSIONS:
+ for file_path in self.base_path.rglob(f"*{ext}"):
+ if file_path.stem.lower() == search_name.lower():
+ try:
+ return str(file_path.relative_to(self.base_path))
+ except ValueError:
+ continue
+
+ return None
+
+ def analyze_directory(self, directory: Path) -> List[Node]:
+ """Analyze all Markdown files in a directory."""
+ nodes = []
+
+ for ext in self.EXTENSIONS:
+ for file_path in directory.rglob(f'*{ext}'):
+ if file_path.is_file():
+ node = self.analyze_file(file_path)
+ if node:
+ nodes.append(node)
+
+ return nodes
+
+
  def build_edges(nodes: List[Node]) -> List[Edge]:
  """Build edges from node imports."""
  edges = []
@@ -424,13 +583,16 @@ def generate_markdown(graph: Graph, output_path: Path):
 
 
  def main():
- parser = argparse.ArgumentParser(description='Generate dependency graph')
+ parser = argparse.ArgumentParser(
+ description='Universal Dependency Graph Generator - Code and Documentation'
+ )
  parser.add_argument('--src', default='./src', help='Source directory')
  parser.add_argument('--output', default='.agent/graph.json', help='Output file')
  parser.add_argument('--format', choices=['json', 'markdown', 'both'], default='both')
- parser.add_argument('--lang', choices=['typescript', 'python', 'auto'], default='auto')
+ parser.add_argument('--lang', choices=['typescript', 'python', 'auto', 'universal'],
+ default='universal', help='Language mode (universal = Code + Markdown)')
  parser.add_argument('--depth', type=int, default=3, help='Max depth for impact analysis')
- parser.add_argument('--exclude', default='node_modules,__pycache__,.git,dist,build',
+ parser.add_argument('--exclude', default='node_modules,__pycache__,.git,dist,build,.agent',
  help='Comma-separated patterns to exclude')
 
  args = parser.parse_args()
@@ -443,37 +605,77 @@ def main():
  print(f"Error: Source directory '{src_path}' does not exist")
  return 1
 
- # Detect language if auto
+ # Collect all nodes
+ all_nodes = []
  lang = args.lang
- if lang == 'auto':
+
+ if lang == 'universal':
+ print("Universal mode: Analyzing Code + Markdown files")
+
+ # Analyze TypeScript/JavaScript
+ ts_analyzer = TypeScriptAnalyzer(src_path, exclude_patterns)
+ ts_nodes = ts_analyzer.analyze_directory(src_path)
+ all_nodes.extend(ts_nodes)
+ print(f" TypeScript/JS: {len(ts_nodes)} files")
+
+ # Analyze Python
+ py_analyzer = PythonAnalyzer(src_path, exclude_patterns)
+ py_nodes = py_analyzer.analyze_directory(src_path)
+ all_nodes.extend(py_nodes)
+ print(f" Python: {len(py_nodes)} files")
+
+ # Analyze Markdown
+ md_analyzer = MarkdownAnalyzer(src_path, exclude_patterns)
+ md_nodes = md_analyzer.analyze_directory(src_path)
+ all_nodes.extend(md_nodes)
+ print(f" Markdown: {len(md_nodes)} files")
+
+ elif lang == 'auto':
  ts_files = list(src_path.rglob('*.ts')) + list(src_path.rglob('*.tsx'))
  py_files = list(src_path.rglob('*.py'))
  lang = 'typescript' if len(ts_files) >= len(py_files) else 'python'
  print(f"Auto-detected language: {lang}")
-
- # Analyze based on language
- if lang == 'typescript':
- analyzer = TypeScriptAnalyzer(src_path, exclude_patterns)
+
+ if lang == 'typescript':
+ analyzer = TypeScriptAnalyzer(src_path, exclude_patterns)
+ else:
+ analyzer = PythonAnalyzer(src_path, exclude_patterns)
+
+ all_nodes = analyzer.analyze_directory(src_path)
  else:
- analyzer = PythonAnalyzer(src_path, exclude_patterns)
+ # Specific language
+ if lang == 'typescript':
+ analyzer = TypeScriptAnalyzer(src_path, exclude_patterns)
+ else:
+ analyzer = PythonAnalyzer(src_path, exclude_patterns)
+
+ all_nodes = analyzer.analyze_directory(src_path)
 
- print(f"Analyzing {src_path}...")
- nodes = analyzer.analyze_directory(src_path)
- print(f"Found {len(nodes)} files")
+ print(f"Total: {len(all_nodes)} files")
+
+ if len(all_nodes) == 0:
+ print("Warning: No files found. Check source directory and exclude patterns.")
+ print("Supported: .ts, .tsx, .js, .jsx, .py, .md, .mdx")
 
  # Build graph
- edges = build_edges(nodes)
- print(f"Found {len(edges)} dependencies")
+ edges = build_edges(all_nodes)
+ print(f"Found {len(edges)} dependencies/links")
+
+ # Categorize by type
+ type_counts = {}
+ for node in all_nodes:
+ type_counts[node.type] = type_counts.get(node.type, 0) + 1
 
  graph = Graph(
- nodes=nodes,
+ nodes=all_nodes,
  edges=edges,
  metadata={
  "generated_at": datetime.now().isoformat(),
  "source_path": str(src_path),
- "language": lang,
- "total_files": len(nodes),
- "total_edges": len(edges)
+ "mode": lang,
+ "total_files": len(all_nodes),
+ "total_edges": len(edges),
+ "file_types": type_counts
  }
  )
 
@@ -505,29 +505,320 @@ class PythonChunker:
  )
 
 
+ class MarkdownChunker:
+ """Chunk Markdown files by heading sections."""
+
+ EXTENSIONS = {'.md', '.mdx', '.markdown'}
+
+ def __init__(self, max_chunk_size: int = 2000, overlap: int = 100):
+ self.max_chunk_size = max_chunk_size
+ self.overlap = overlap
+
+ def chunk_file(self, file_path: Path, base_path: Path) -> List[Chunk]:
+ """Chunk a Markdown file by headings."""
+ try:
+ content = file_path.read_text(encoding='utf-8')
+ except (UnicodeDecodeError, PermissionError):
+ return []
+
+ relative_path = str(file_path.relative_to(base_path))
+ chunks = []
+
+ # Extract sections by heading
+ sections = self._extract_sections(content)
+
+ if sections:
+ for section in sections:
+ chunk_content = section['content']
+
+ # If section is too large, split it
+ if len(chunk_content) > self.max_chunk_size:
+ sub_chunks = self._split_by_paragraphs(chunk_content)
+ for i, sub in enumerate(sub_chunks):
+ chunks.append(self._create_chunk(
+ content=sub,
+ file_path=relative_path,
+ section_title=f"{section['title']}_part{i+1}",
+ heading_level=section['level'],
+ parent_headings=section['parents'],
+ start_line=section['start_line'],
+ end_line=section['end_line']
+ ))
+ else:
+ chunks.append(self._create_chunk(
+ content=chunk_content,
+ file_path=relative_path,
+ section_title=section['title'],
+ heading_level=section['level'],
+ parent_headings=section['parents'],
+ start_line=section['start_line'],
+ end_line=section['end_line']
+ ))
+ else:
+ # No headings found, treat entire file as one chunk or split by paragraphs
+ if len(content) <= self.max_chunk_size:
+ chunks.append(self._create_chunk(
+ content=content,
+ file_path=relative_path,
+ section_title=file_path.stem,
+ heading_level=0,
+ parent_headings=[],
+ start_line=1,
+ end_line=content.count('\n') + 1
+ ))
+ else:
+ sub_chunks = self._split_by_paragraphs(content)
+ for i, sub in enumerate(sub_chunks):
+ chunks.append(self._create_chunk(
+ content=sub,
+ file_path=relative_path,
+ section_title=f"{file_path.stem}_part{i+1}",
+ heading_level=0,
+ parent_headings=[],
+ start_line=1,
+ end_line=content.count('\n') + 1
+ ))
+
+ return chunks
+
+ def _extract_sections(self, content: str) -> List[Dict]:
+ """Extract sections based on Markdown headings."""
+ sections = []
+ lines = content.split('\n')
+
+ heading_pattern = re.compile(r'^(#{1,6})\s+(.+)$')
+
+ current_section = None
+ current_lines = []
+ parent_stack = [] # Track parent headings for context
+
+ for i, line in enumerate(lines):
+ match = heading_pattern.match(line)
+
+ if match:
+ # Save previous section
+ if current_section:
+ current_section['content'] = '\n'.join(current_lines).strip()
+ current_section['end_line'] = i
+ if current_section['content']:
+ sections.append(current_section)
+
+ # Update parent stack
+ level = len(match.group(1))
+ title = match.group(2).strip()
+
+ # Pop parents that are same or deeper level
+ while parent_stack and parent_stack[-1]['level'] >= level:
+ parent_stack.pop()
+
+ parents = [p['title'] for p in parent_stack]
+
+ # Start new section
+ current_section = {
+ 'title': title,
+ 'level': level,
+ 'parents': parents.copy(),
+ 'start_line': i + 1,
+ 'end_line': i + 1,
+ 'content': ''
+ }
+ current_lines = [line]
+
+ # Add this heading to parent stack
+ parent_stack.append({'level': level, 'title': title})
+ elif current_section:
+ current_lines.append(line)
+
+ # Don't forget last section
+ if current_section:
+ current_section['content'] = '\n'.join(current_lines).strip()
+ current_section['end_line'] = len(lines)
+ if current_section['content']:
+ sections.append(current_section)
+
+ return sections
+
+ def _split_by_paragraphs(self, content: str) -> List[str]:
+ """Split content by paragraphs when too large."""
+ chunks = []
+ paragraphs = re.split(r'\n\s*\n', content)
+
+ current_chunk = []
+ current_size = 0
+
+ for para in paragraphs:
+ para_size = len(para) + 2 # +2 for paragraph break
+
+ if current_size + para_size > self.max_chunk_size and current_chunk:
+ chunks.append('\n\n'.join(current_chunk))
+ current_chunk = []
+ current_size = 0
+
+ current_chunk.append(para)
+ current_size += para_size
+
+ if current_chunk:
+ chunks.append('\n\n'.join(current_chunk))
+
+ return chunks
+
+ def _create_chunk(
+ self,
+ content: str,
+ file_path: str,
+ section_title: str,
+ heading_level: int,
+ parent_headings: List[str],
+ start_line: int,
+ end_line: int
+ ) -> Chunk:
+ """Create a Chunk object with rich context."""
+ # Build context string for better retrieval
+ context_path = ' > '.join(parent_headings + [section_title]) if parent_headings else section_title
+
+ return Chunk(
+ id=generate_chunk_id(file_path, content),
+ content=content,
+ metadata={
+ 'file_path': file_path,
+ 'file_type': 'markdown',
+ 'chunk_type': f'heading_{heading_level}' if heading_level > 0 else 'paragraph',
+ 'name': section_title,
+ 'context_path': context_path,
+ 'heading_level': heading_level,
+ 'parent_headings': parent_headings,
+ 'start_line': start_line,
+ 'end_line': end_line,
+ 'char_count': len(content),
+ 'line_count': content.count('\n') + 1
+ }
+ )
+
+
+ class TextChunker:
+ """Chunk plain text files by paragraphs."""
+
+ EXTENSIONS = {'.txt', '.text', '.log'}
+
+ def __init__(self, max_chunk_size: int = 1500, overlap: int = 100):
+ self.max_chunk_size = max_chunk_size
+ self.overlap = overlap
+
+ def chunk_file(self, file_path: Path, base_path: Path) -> List[Chunk]:
+ """Chunk a text file by paragraphs."""
+ try:
+ content = file_path.read_text(encoding='utf-8')
+ except (UnicodeDecodeError, PermissionError):
+ return []
+
+ relative_path = str(file_path.relative_to(base_path))
+ chunks = []
+
+ if len(content) <= self.max_chunk_size:
+ chunks.append(self._create_chunk(
+ content=content,
+ file_path=relative_path,
+ name=file_path.stem,
+ start_line=1,
+ end_line=content.count('\n') + 1
+ ))
+ else:
+ # Split by paragraphs
+ paragraphs = re.split(r'\n\s*\n', content)
+ current_chunk = []
+ current_size = 0
+ chunk_index = 0
+
+ for para in paragraphs:
+ para_size = len(para) + 2
+
+ if current_size + para_size > self.max_chunk_size and current_chunk:
+ chunk_index += 1
+ chunks.append(self._create_chunk(
+ content='\n\n'.join(current_chunk),
+ file_path=relative_path,
+ name=f"{file_path.stem}_part{chunk_index}",
+ start_line=1,
+ end_line=content.count('\n') + 1
+ ))
+ current_chunk = []
+ current_size = 0
+
+ current_chunk.append(para)
+ current_size += para_size
+
+ if current_chunk:
+ chunk_index += 1
+ chunks.append(self._create_chunk(
+ content='\n\n'.join(current_chunk),
+ file_path=relative_path,
+ name=f"{file_path.stem}_part{chunk_index}",
+ start_line=1,
+ end_line=content.count('\n') + 1
+ ))
+
+ return chunks
+
+ def _create_chunk(
+ self,
+ content: str,
+ file_path: str,
+ name: str,
+ start_line: int,
+ end_line: int
+ ) -> Chunk:
+ """Create a Chunk object."""
+ return Chunk(
+ id=generate_chunk_id(file_path, content),
+ content=content,
+ metadata={
+ 'file_path': file_path,
+ 'file_type': 'text',
+ 'chunk_type': 'paragraph',
+ 'name': name,
+ 'start_line': start_line,
+ 'end_line': end_line,
+ 'char_count': len(content),
+ 'line_count': content.count('\n') + 1
+ }
+ )
+
+
  def chunk_directory(
  src_path: Path,
  lang: str,
  exclude_patterns: List[str],
  max_chunk_size: int = 1500
  ) -> List[Chunk]:
- """Chunk all files in a directory."""
+ """Chunk all files in a directory - Universal support for Code, Docs, and Text."""
  all_chunks = []
 
- # Select chunker
- if lang == 'typescript':
- chunker = TypeScriptChunker(max_chunk_size=max_chunk_size)
- extensions = TypeScriptChunker.EXTENSIONS
- else:
- chunker = PythonChunker(max_chunk_size=max_chunk_size)
- extensions = {'.py'}
+ # Initialize all chunkers
+ ts_chunker = TypeScriptChunker(max_chunk_size=max_chunk_size)
+ py_chunker = PythonChunker(max_chunk_size=max_chunk_size)
+ md_chunker = MarkdownChunker(max_chunk_size=max_chunk_size)
+ txt_chunker = TextChunker(max_chunk_size=max_chunk_size)
+
+ # Map extensions to chunkers
+ extension_map = {}
+ for ext in TypeScriptChunker.EXTENSIONS:
+ extension_map[ext] = ts_chunker
+ extension_map['.py'] = py_chunker
+ for ext in MarkdownChunker.EXTENSIONS:
+ extension_map[ext] = md_chunker
+ for ext in TextChunker.EXTENSIONS:
+ extension_map[ext] = txt_chunker
+
+ # All supported extensions
+ all_extensions = set(extension_map.keys())
 
  # Process files
  for file_path in src_path.rglob('*'):
  if not file_path.is_file():
  continue
 
- if file_path.suffix not in extensions:
+ # Check if extension is supported
+ if file_path.suffix not in all_extensions:
  continue
 
  # Check exclusions
@@ -535,19 +826,25 @@ def chunk_directory(
  if any(pattern in path_str for pattern in exclude_patterns):
  continue
 
- chunks = chunker.chunk_file(file_path, src_path)
- all_chunks.extend(chunks)
+ # Select appropriate chunker
+ chunker = extension_map.get(file_path.suffix)
+ if chunker:
+ chunks = chunker.chunk_file(file_path, src_path)
+ all_chunks.extend(chunks)
 
  return all_chunks
 
 
  def main():
- parser = argparse.ArgumentParser(description='Chunk code files')
+ parser = argparse.ArgumentParser(
+ description='Universal Chunker - Code, Markdown, and Text files'
+ )
  parser.add_argument('--src', default='./src', help='Source directory')
  parser.add_argument('--output', default='.agent/rag/chunks.json', help='Output file')
- parser.add_argument('--lang', choices=['typescript', 'python', 'auto'], default='auto')
+ parser.add_argument('--lang', choices=['typescript', 'python', 'auto', 'universal'],
+ default='universal', help='Language mode (universal = all file types)')
  parser.add_argument('--max-size', type=int, default=1500, help='Max chunk size in chars')
- parser.add_argument('--exclude', default='node_modules,__pycache__,.git,dist,build',
+ parser.add_argument('--exclude', default='node_modules,__pycache__,.git,dist,build,.agent',
  help='Patterns to exclude')
 
  args = parser.parse_args()
@@ -560,28 +857,41 @@ def main():
  print(f"Error: Source directory '{src_path}' does not exist")
  return 1
 
- # Auto-detect language
+ # Mode selection
  lang = args.lang
  if lang == 'auto':
  ts_files = list(src_path.rglob('*.ts')) + list(src_path.rglob('*.tsx'))
  py_files = list(src_path.rglob('*.py'))
  lang = 'typescript' if len(ts_files) >= len(py_files) else 'python'
  print(f"Auto-detected language: {lang}")
+ elif lang == 'universal':
+ print("Universal mode: Processing Code, Markdown, and Text files")
 
  print(f"Chunking {src_path}...")
  chunks = chunk_directory(src_path, lang, exclude_patterns, args.max_size)
  print(f"Created {len(chunks)} chunks")
 
+ if len(chunks) == 0:
+ print("Warning: No chunks created. Check if source directory has supported files.")
+ print("Supported: .ts, .tsx, .js, .jsx, .py, .md, .mdx, .txt")
+
  # Save output
  output_path.parent.mkdir(parents=True, exist_ok=True)
 
+ # Categorize chunks by file type
+ file_types = {}
+ for c in chunks:
+ ft = c.metadata.get('file_type', 'unknown')
+ file_types[ft] = file_types.get(ft, 0) + 1
+
  output_data = {
  'metadata': {
  'generated_at': datetime.now().isoformat(),
  'source_path': str(src_path),
- 'language': lang,
+ 'mode': lang,
  'total_chunks': len(chunks),
- 'max_chunk_size': args.max_size
+ 'max_chunk_size': args.max_size,
+ 'file_types': file_types
  },
  'chunks': [asdict(c) for c in chunks]
  }
@@ -0,0 +1,48 @@
+ # PLAN: Universal Intelligence Engine (UIE)
+
+ > Goal: Upgrade Agent Kit to support every kind of working folder (Docs, Research, Code) with comprehensive knowledge understanding.
+
+ ## Phase 1: Universal RAG (Smart Chunking for every format) ✅ COMPLETED
+ - [x] **Upgrade `chunk_code.py` into a Universal Chunker**:
+ - [x] Add `MarkdownChunker`: split segments by heading (`#`, `##`, `###`).
+ - [x] Add `TextChunker`: split by paragraph.
+ - [x] Keep AST-based chunking for code (TS, Python).
+ - [x] **Contextual metadata for docs**:
+ - [x] Automatically extract the file's main title as context for each chunk inside it.
+ - [x] Record `context_path` (Parent > Child heading) so the AI knows the surrounding context.
+
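The heading-aware chunks described in Phase 1 land in `.agent/rag/chunks.json`, the default `--output` of `chunk_code.py` in this diff. A minimal sketch of inspecting that file and printing each Markdown chunk's `context_path`, assuming the JSON layout built in the script's `main()` above (a `metadata` object plus a `chunks` list):

```python
# Minimal sketch: inspect heading-aware Markdown chunks from chunk_code.py --lang universal.
# Assumes the default output path and the JSON layout assembled in main() above.
import json
from pathlib import Path

data = json.loads(Path(".agent/rag/chunks.json").read_text(encoding="utf-8"))
print(f"mode={data['metadata']['mode']}, chunks={data['metadata']['total_chunks']}")

for chunk in data["chunks"]:
    meta = chunk["metadata"]
    if meta["file_type"] == "markdown":
        # context_path is the "Parent > Child" heading trail recorded by MarkdownChunker
        print(f"{meta['file_path']}: {meta['context_path']} ({meta['char_count']} chars)")
```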
+ ## Phase 2: Knowledge Graph for Documents (Relationship Mapping) ✅ COMPLETED
+ - [x] **Upgrade `generate_graph.py`**:
+ - [x] Add `MarkdownAnalyzer`: scan internal links `[text](file.md)` and wikilinks `[[file]]`.
+ - [x] Treat headings (H1, H2) as the document's exported "API".
+ - [x] `universal` mode: combine Code + Markdown in a single graph.
+
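Phase 2's `universal` mode is wired into `cmd_sync` above through the new `--lang universal` flag. A minimal sketch of invoking `generate_graph.py` the same way and summarizing the result; the kit location and the assumption that `graph.json` serializes the `Graph` fields shown above (`nodes`, `edges`, `metadata`) are illustrative, not confirmed by this diff:

```python
# Minimal sketch: run the graph mapper in universal mode and summarize graph.json.
# The kit path and output layout are assumptions based on the calls in cmd_sync above.
import json
import subprocess
from pathlib import Path

kit_path = Path(".agent/kit")  # hypothetical location; use your actual kit path
graph_script = kit_path / "skills" / "graph-mapper" / "scripts" / "generate_graph.py"

subprocess.run(
    [
        "python3", str(graph_script),
        "--src", ".",
        "--output", ".agent/graph.json",
        "--format", "both",
        "--lang", "universal",
    ],
    check=True,
)

meta = json.loads(Path(".agent/graph.json").read_text(encoding="utf-8"))["metadata"]
# file_types mixes code and document node types (e.g. 'readme', 'documentation', 'plan')
print(meta["total_files"], meta["total_edges"], meta["file_types"])
```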
+ ## Phase 3: MCP Gateway Enhancements
+ - [ ] **Content-Type Awareness**:
+ - [ ] The `analyze_dependencies` tool returns "References" instead of "Imports" for docs.
+ - [ ] Alias the `search_code_logic` tool as `search_knowledge`.
+ - [ ] **Auto-Detection**: the gateway reports the project type (Creative, Tech, Mixed).
+
+ ## Phase 4: CLI Evolution (`ak` command)
+ - [ ] **Smarter `ak init`**:
+ - [ ] Do not require a `src` or `app` directory.
+ - [ ] Automatically create `AGENTS.md` from the "General Knowledge" template when no code is detected.
+ - [ ] **Comprehensive `ak sync`**:
+ - [ ] Scan every file (except the ignore list) so no knowledge is missed.
+
+ ## Phase 5: Verification & Beta Test
+ - [ ] **Test Case 1**: a project of only 100 Markdown files (user guides).
+ - [ ] **Test Case 2**: a mixed project (Next.js + Docs + API Specs).
+ - [ ] **Test Case 3**: a research project (many .txt files and scattered notes).
+
+ ---
+
+ ## Agent Assignments
+ - **Python Specialist**: handle the Chunker and Graph Mapper logic (Phases 1 & 2).
+ - **TypeScript Expert**: update the MCP Gateway (Phase 3).
+ - **Orchestrator**: update the CLI and finalize the docs (Phase 4).
+
+ ## Verification Checklist
+ - [ ] `ai status` shows ✅ RAG and ✅ Graph even in a folder with no code.
+ - [ ] The AI can find information buried deep in a sub-section of a long Markdown file.
+ - [ ] The dependency graph shows the links between documentation files.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@musashishao/agent-kit",
- "version": "1.3.0",
+ "version": "1.4.1",
  "description": "AI Agent templates - Skills, Agents, Workflows, and AI-Ready Data Infrastructure Gateway",
  "main": "index.js",
  "bin": {