@musashishao/agent-kit 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.agent/scripts/ak_cli.py
CHANGED
@@ -258,6 +258,22 @@ def cmd_init(args: argparse.Namespace) -> int:
 # Command: sync
 # ============================================================================
 
+def find_source_dir(project_root: Path) -> Path:
+    """Intelligently find the source directory."""
+    # Priority defaults
+    for folder in ["src", "app", "lib", "scripts", "components"]:
+        if (project_root / folder).exists():
+            return project_root / folder
+
+    # Fallback to root if there are source files in root
+    source_extensions = {".ts", ".tsx", ".js", ".jsx", ".py", ".go", ".c", ".cpp", ".cs"}
+    for item in project_root.iterdir():
+        if item.is_file() and item.suffix in source_extensions:
+            return project_root
+
+    return project_root / "src"  # Ultimate fallback
+
+
 def cmd_sync(args: argparse.Namespace) -> int:
     """Sync AI infrastructure data."""
     project_root = Path(args.project_root).resolve()
@@ -271,6 +287,10 @@ def cmd_sync(args: argparse.Namespace) -> int:
         print("❌ .agent directory not found. Run 'ak init' first.")
         return 1
 
+    # Determine source directory
+    src_dir = find_source_dir(project_root)
+    print(f"🔍 Detected source directory: {src_dir.relative_to(project_root) if src_dir != project_root else '.'}")
+
     # Determine what to sync
     targets = []
     if args.target == "all":
@@ -285,11 +305,12 @@ def cmd_sync(args: argparse.Namespace) -> int:
         print("\n📊 Updating dependency graph...")
         graph_script = kit_path / "skills" / "graph-mapper" / "scripts" / "generate_graph.py"
 
-
-
-
-
-
+        if not graph_script.exists():
+            print(f" ❌ Graph script not found at: {graph_script}")
+            success = False
+        elif not src_dir.exists() and src_dir != project_root:
+            print(f" ⚠️ Source directory {src_dir} not found")
+        else:
             result = subprocess.run(
                 [
                     "python3", str(graph_script),
@@ -303,21 +324,18 @@ def cmd_sync(args: argparse.Namespace) -> int:
             if result.returncode == 0:
                 print(" ✅ Graph updated")
             else:
-                print(f" ❌ Graph sync failed: {result.stderr
+                print(f" ❌ Graph sync failed: {result.stderr}")
                 success = False
-        else:
-            print(" ⚠️ Source directory or script not found")
 
     # Sync RAG
     if "rag" in targets:
         print("\n📚 Updating RAG chunks...")
         rag_script = kit_path / "skills" / "rag-engineering" / "scripts" / "chunk_code.py"
 
-
-
-
-
-        if src_dir.exists() and rag_script.exists():
+        if not rag_script.exists():
+            print(f" ❌ RAG script not found at: {rag_script}")
+            success = False
+        else:
             result = subprocess.run(
                 [
                     "python3", str(rag_script),
@@ -330,10 +348,8 @@ def cmd_sync(args: argparse.Namespace) -> int:
             if result.returncode == 0:
                 print(" ✅ RAG chunks updated")
             else:
-                print(f" ❌ RAG sync failed: {result.stderr
+                print(f" ❌ RAG sync failed: {result.stderr}")
                 success = False
-        else:
-            print(" ⚠️ Source directory or script not found")
 
     # Update timestamp cache
     cache_file = agent_dir / ".cache" / "timestamps.json"
@@ -311,6 +311,165 @@ class PythonAnalyzer:
         return nodes
 
 
+class MarkdownAnalyzer:
+    """Analyzes Markdown files for internal links and references."""
+
+    EXTENSIONS = {'.md', '.mdx', '.markdown'}
+
+    # Regex patterns for link detection
+    LINK_PATTERNS = [
+        # Standard markdown link: [text](path)
+        r'\[([^\]]+)\]\(([^)]+)\)',
+        # Wikilink: [[path]] or [[path|text]]
+        r'\[\[([^\]|]+)(?:\|[^\]]+)?\]\]',
+    ]
+
+    # Pattern for image refs (also a form of dependency)
+    IMAGE_PATTERN = r'!\[([^\]]*)\]\(([^)]+)\)'
+
+    def __init__(self, base_path: Path, exclude_patterns: List[str]):
+        self.base_path = base_path
+        self.exclude_patterns = exclude_patterns
+
+    def should_exclude(self, path: Path) -> bool:
+        """Check if path should be excluded."""
+        path_str = str(path)
+        for pattern in self.exclude_patterns:
+            if pattern in path_str:
+                return True
+        return False
+
+    def analyze_file(self, file_path: Path) -> Optional[Node]:
+        """Analyze a single Markdown file for links."""
+        if self.should_exclude(file_path):
+            return None
+
+        try:
+            content = file_path.read_text(encoding='utf-8')
+        except (UnicodeDecodeError, PermissionError):
+            return None
+
+        relative_path = str(file_path.relative_to(self.base_path))
+        node_type = self._detect_type(relative_path, content)
+
+        # Extract internal links
+        links = []
+
+        # Standard markdown links
+        for match in re.findall(self.LINK_PATTERNS[0], content):
+            link_path = match[1]
+            resolved = self._resolve_link(link_path, file_path)
+            if resolved:
+                links.append(resolved)
+
+        # Wikilinks
+        for match in re.findall(self.LINK_PATTERNS[1], content):
+            resolved = self._resolve_wikilink(match)
+            if resolved:
+                links.append(resolved)
+
+        # Extract "exports" (main topics/headings)
+        exports = []
+        # Get main title (first h1)
+        title_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
+        if title_match:
+            exports.append(title_match.group(1).strip())
+
+        # Get h2 sections as additional exports
+        for match in re.findall(r'^##\s+(.+)$', content, re.MULTILINE):
+            exports.append(match.strip())
+
+        return Node(
+            id=relative_path,
+            type=node_type,
+            path=relative_path,
+            imports=list(set(links)),  # Links = imports in docs context
+            exports=exports[:10]  # Limit to first 10 headings
+        )
+
+    def _detect_type(self, path: str, content: str) -> str:
+        """Detect the type of documentation file."""
+        path_lower = path.lower()
+
+        if 'readme' in path_lower:
+            return 'readme'
+        elif 'changelog' in path_lower or 'history' in path_lower:
+            return 'changelog'
+        elif 'contributing' in path_lower:
+            return 'contributing'
+        elif 'license' in path_lower:
+            return 'license'
+        elif '/docs/' in path_lower or path_lower.startswith('docs/'):
+            return 'documentation'
+        elif '/guides/' in path_lower or '/tutorials/' in path_lower:
+            return 'guide'
+        elif '/api/' in path_lower or 'api' in path_lower:
+            return 'api_doc'
+        elif 'plan' in path_lower or 'roadmap' in path_lower:
+            return 'plan'
+        else:
+            return 'document'
+
+    def _resolve_link(self, link_path: str, from_file: Path) -> Optional[str]:
+        """Resolve a markdown link to a relative path."""
+        # Skip external links
+        if link_path.startswith(('http://', 'https://', 'mailto:', '#')):
+            return None
+
+        # Skip anchor-only links
+        if link_path.startswith('#'):
+            return None
+
+        # Remove anchor from path
+        if '#' in link_path:
+            link_path = link_path.split('#')[0]
+
+        if not link_path:
+            return None
+
+        # Resolve relative path
+        from_dir = from_file.parent
+        resolved = (from_dir / link_path).resolve()
+
+        try:
+            relative = str(resolved.relative_to(self.base_path))
+            # Check if file exists
+            if resolved.exists():
+                return relative
+            return None
+        except ValueError:
+            return None
+
+    def _resolve_wikilink(self, link_name: str) -> Optional[str]:
+        """Resolve a wikilink to a file path."""
+        # Search for file matching the wikilink name
+        search_name = link_name.strip()
+
+        # Try exact match with .md extension
+        for ext in self.EXTENSIONS:
+            for file_path in self.base_path.rglob(f"*{ext}"):
+                if file_path.stem.lower() == search_name.lower():
+                    try:
+                        return str(file_path.relative_to(self.base_path))
+                    except ValueError:
+                        continue
+
+        return None
+
+    def analyze_directory(self, directory: Path) -> List[Node]:
+        """Analyze all Markdown files in a directory."""
+        nodes = []
+
+        for ext in self.EXTENSIONS:
+            for file_path in directory.rglob(f'*{ext}'):
+                if file_path.is_file():
+                    node = self.analyze_file(file_path)
+                    if node:
+                        nodes.append(node)
+
+        return nodes
+
+
 def build_edges(nodes: List[Node]) -> List[Edge]:
     """Build edges from node imports."""
     edges = []
@@ -424,13 +583,16 @@ def generate_markdown(graph: Graph, output_path: Path):
 
 
 def main():
-    parser = argparse.ArgumentParser(
+    parser = argparse.ArgumentParser(
+        description='Universal Dependency Graph Generator - Code and Documentation'
+    )
     parser.add_argument('--src', default='./src', help='Source directory')
     parser.add_argument('--output', default='.agent/graph.json', help='Output file')
     parser.add_argument('--format', choices=['json', 'markdown', 'both'], default='both')
-    parser.add_argument('--lang', choices=['typescript', 'python', 'auto'
+    parser.add_argument('--lang', choices=['typescript', 'python', 'auto', 'universal'],
+                        default='universal', help='Language mode (universal = Code + Markdown)')
     parser.add_argument('--depth', type=int, default=3, help='Max depth for impact analysis')
-    parser.add_argument('--exclude', default='node_modules,__pycache__,.git,dist,build',
+    parser.add_argument('--exclude', default='node_modules,__pycache__,.git,dist,build,.agent',
                         help='Comma-separated patterns to exclude')
 
     args = parser.parse_args()
@@ -443,37 +605,77 @@ def main():
         print(f"Error: Source directory '{src_path}' does not exist")
         return 1
 
-    #
+    # Collect all nodes
+    all_nodes = []
     lang = args.lang
-
+
+    if lang == 'universal':
+        print("Universal mode: Analyzing Code + Markdown files")
+
+        # Analyze TypeScript/JavaScript
+        ts_analyzer = TypeScriptAnalyzer(src_path, exclude_patterns)
+        ts_nodes = ts_analyzer.analyze_directory(src_path)
+        all_nodes.extend(ts_nodes)
+        print(f" TypeScript/JS: {len(ts_nodes)} files")
+
+        # Analyze Python
+        py_analyzer = PythonAnalyzer(src_path, exclude_patterns)
+        py_nodes = py_analyzer.analyze_directory(src_path)
+        all_nodes.extend(py_nodes)
+        print(f" Python: {len(py_nodes)} files")
+
+        # Analyze Markdown
+        md_analyzer = MarkdownAnalyzer(src_path, exclude_patterns)
+        md_nodes = md_analyzer.analyze_directory(src_path)
+        all_nodes.extend(md_nodes)
+        print(f" Markdown: {len(md_nodes)} files")
+
+    elif lang == 'auto':
         ts_files = list(src_path.rglob('*.ts')) + list(src_path.rglob('*.tsx'))
         py_files = list(src_path.rglob('*.py'))
         lang = 'typescript' if len(ts_files) >= len(py_files) else 'python'
         print(f"Auto-detected language: {lang}")
-
-
-
-
+
+        if lang == 'typescript':
+            analyzer = TypeScriptAnalyzer(src_path, exclude_patterns)
+        else:
+            analyzer = PythonAnalyzer(src_path, exclude_patterns)
+
+        all_nodes = analyzer.analyze_directory(src_path)
     else:
-
+        # Specific language
+        if lang == 'typescript':
+            analyzer = TypeScriptAnalyzer(src_path, exclude_patterns)
+        else:
+            analyzer = PythonAnalyzer(src_path, exclude_patterns)
+
+        all_nodes = analyzer.analyze_directory(src_path)
 
-    print(f"
-
-
+    print(f"Total: {len(all_nodes)} files")
+
+    if len(all_nodes) == 0:
+        print("Warning: No files found. Check source directory and exclude patterns.")
+        print("Supported: .ts, .tsx, .js, .jsx, .py, .md, .mdx")
 
     # Build graph
-    edges = build_edges(
-    print(f"Found {len(edges)} dependencies")
+    edges = build_edges(all_nodes)
+    print(f"Found {len(edges)} dependencies/links")
+
+    # Categorize by type
+    type_counts = {}
+    for node in all_nodes:
+        type_counts[node.type] = type_counts.get(node.type, 0) + 1
 
     graph = Graph(
-        nodes=
+        nodes=all_nodes,
         edges=edges,
         metadata={
             "generated_at": datetime.now().isoformat(),
             "source_path": str(src_path),
-            "
-            "total_files": len(
-            "total_edges": len(edges)
+            "mode": lang,
+            "total_files": len(all_nodes),
+            "total_edges": len(edges),
+            "file_types": type_counts
         }
     )
 
@@ -505,29 +505,320 @@ class PythonChunker:
         )
 
 
+class MarkdownChunker:
+    """Chunk Markdown files by heading sections."""
+
+    EXTENSIONS = {'.md', '.mdx', '.markdown'}
+
+    def __init__(self, max_chunk_size: int = 2000, overlap: int = 100):
+        self.max_chunk_size = max_chunk_size
+        self.overlap = overlap
+
+    def chunk_file(self, file_path: Path, base_path: Path) -> List[Chunk]:
+        """Chunk a Markdown file by headings."""
+        try:
+            content = file_path.read_text(encoding='utf-8')
+        except (UnicodeDecodeError, PermissionError):
+            return []
+
+        relative_path = str(file_path.relative_to(base_path))
+        chunks = []
+
+        # Extract sections by heading
+        sections = self._extract_sections(content)
+
+        if sections:
+            for section in sections:
+                chunk_content = section['content']
+
+                # If section is too large, split it
+                if len(chunk_content) > self.max_chunk_size:
+                    sub_chunks = self._split_by_paragraphs(chunk_content)
+                    for i, sub in enumerate(sub_chunks):
+                        chunks.append(self._create_chunk(
+                            content=sub,
+                            file_path=relative_path,
+                            section_title=f"{section['title']}_part{i+1}",
+                            heading_level=section['level'],
+                            parent_headings=section['parents'],
+                            start_line=section['start_line'],
+                            end_line=section['end_line']
+                        ))
+                else:
+                    chunks.append(self._create_chunk(
+                        content=chunk_content,
+                        file_path=relative_path,
+                        section_title=section['title'],
+                        heading_level=section['level'],
+                        parent_headings=section['parents'],
+                        start_line=section['start_line'],
+                        end_line=section['end_line']
+                    ))
+        else:
+            # No headings found, treat entire file as one chunk or split by paragraphs
+            if len(content) <= self.max_chunk_size:
+                chunks.append(self._create_chunk(
+                    content=content,
+                    file_path=relative_path,
+                    section_title=file_path.stem,
+                    heading_level=0,
+                    parent_headings=[],
+                    start_line=1,
+                    end_line=content.count('\n') + 1
+                ))
+            else:
+                sub_chunks = self._split_by_paragraphs(content)
+                for i, sub in enumerate(sub_chunks):
+                    chunks.append(self._create_chunk(
+                        content=sub,
+                        file_path=relative_path,
+                        section_title=f"{file_path.stem}_part{i+1}",
+                        heading_level=0,
+                        parent_headings=[],
+                        start_line=1,
+                        end_line=content.count('\n') + 1
+                    ))
+
+        return chunks
+
+    def _extract_sections(self, content: str) -> List[Dict]:
+        """Extract sections based on Markdown headings."""
+        sections = []
+        lines = content.split('\n')
+
+        heading_pattern = re.compile(r'^(#{1,6})\s+(.+)$')
+
+        current_section = None
+        current_lines = []
+        parent_stack = []  # Track parent headings for context
+
+        for i, line in enumerate(lines):
+            match = heading_pattern.match(line)
+
+            if match:
+                # Save previous section
+                if current_section:
+                    current_section['content'] = '\n'.join(current_lines).strip()
+                    current_section['end_line'] = i
+                    if current_section['content']:
+                        sections.append(current_section)
+
+                # Update parent stack
+                level = len(match.group(1))
+                title = match.group(2).strip()
+
+                # Pop parents that are same or deeper level
+                while parent_stack and parent_stack[-1]['level'] >= level:
+                    parent_stack.pop()
+
+                parents = [p['title'] for p in parent_stack]
+
+                # Start new section
+                current_section = {
+                    'title': title,
+                    'level': level,
+                    'parents': parents.copy(),
+                    'start_line': i + 1,
+                    'end_line': i + 1,
+                    'content': ''
+                }
+                current_lines = [line]
+
+                # Add this heading to parent stack
+                parent_stack.append({'level': level, 'title': title})
+            elif current_section:
+                current_lines.append(line)
+
+        # Don't forget last section
+        if current_section:
+            current_section['content'] = '\n'.join(current_lines).strip()
+            current_section['end_line'] = len(lines)
+            if current_section['content']:
+                sections.append(current_section)
+
+        return sections
+
+    def _split_by_paragraphs(self, content: str) -> List[str]:
+        """Split content by paragraphs when too large."""
+        chunks = []
+        paragraphs = re.split(r'\n\s*\n', content)
+
+        current_chunk = []
+        current_size = 0
+
+        for para in paragraphs:
+            para_size = len(para) + 2  # +2 for paragraph break
+
+            if current_size + para_size > self.max_chunk_size and current_chunk:
+                chunks.append('\n\n'.join(current_chunk))
+                current_chunk = []
+                current_size = 0
+
+            current_chunk.append(para)
+            current_size += para_size
+
+        if current_chunk:
+            chunks.append('\n\n'.join(current_chunk))
+
+        return chunks
+
+    def _create_chunk(
+        self,
+        content: str,
+        file_path: str,
+        section_title: str,
+        heading_level: int,
+        parent_headings: List[str],
+        start_line: int,
+        end_line: int
+    ) -> Chunk:
+        """Create a Chunk object with rich context."""
+        # Build context string for better retrieval
+        context_path = ' > '.join(parent_headings + [section_title]) if parent_headings else section_title
+
+        return Chunk(
+            id=generate_chunk_id(file_path, content),
+            content=content,
+            metadata={
+                'file_path': file_path,
+                'file_type': 'markdown',
+                'chunk_type': f'heading_{heading_level}' if heading_level > 0 else 'paragraph',
+                'name': section_title,
+                'context_path': context_path,
+                'heading_level': heading_level,
+                'parent_headings': parent_headings,
+                'start_line': start_line,
+                'end_line': end_line,
+                'char_count': len(content),
+                'line_count': content.count('\n') + 1
+            }
+        )
+
+
+class TextChunker:
+    """Chunk plain text files by paragraphs."""
+
+    EXTENSIONS = {'.txt', '.text', '.log'}
+
+    def __init__(self, max_chunk_size: int = 1500, overlap: int = 100):
+        self.max_chunk_size = max_chunk_size
+        self.overlap = overlap
+
+    def chunk_file(self, file_path: Path, base_path: Path) -> List[Chunk]:
+        """Chunk a text file by paragraphs."""
+        try:
+            content = file_path.read_text(encoding='utf-8')
+        except (UnicodeDecodeError, PermissionError):
+            return []
+
+        relative_path = str(file_path.relative_to(base_path))
+        chunks = []
+
+        if len(content) <= self.max_chunk_size:
+            chunks.append(self._create_chunk(
+                content=content,
+                file_path=relative_path,
+                name=file_path.stem,
+                start_line=1,
+                end_line=content.count('\n') + 1
+            ))
+        else:
+            # Split by paragraphs
+            paragraphs = re.split(r'\n\s*\n', content)
+            current_chunk = []
+            current_size = 0
+            chunk_index = 0
+
+            for para in paragraphs:
+                para_size = len(para) + 2
+
+                if current_size + para_size > self.max_chunk_size and current_chunk:
+                    chunk_index += 1
+                    chunks.append(self._create_chunk(
+                        content='\n\n'.join(current_chunk),
+                        file_path=relative_path,
+                        name=f"{file_path.stem}_part{chunk_index}",
+                        start_line=1,
+                        end_line=content.count('\n') + 1
+                    ))
+                    current_chunk = []
+                    current_size = 0
+
+                current_chunk.append(para)
+                current_size += para_size
+
+            if current_chunk:
+                chunk_index += 1
+                chunks.append(self._create_chunk(
+                    content='\n\n'.join(current_chunk),
+                    file_path=relative_path,
+                    name=f"{file_path.stem}_part{chunk_index}",
+                    start_line=1,
+                    end_line=content.count('\n') + 1
+                ))
+
+        return chunks
+
+    def _create_chunk(
+        self,
+        content: str,
+        file_path: str,
+        name: str,
+        start_line: int,
+        end_line: int
+    ) -> Chunk:
+        """Create a Chunk object."""
+        return Chunk(
+            id=generate_chunk_id(file_path, content),
+            content=content,
+            metadata={
+                'file_path': file_path,
+                'file_type': 'text',
+                'chunk_type': 'paragraph',
+                'name': name,
+                'start_line': start_line,
+                'end_line': end_line,
+                'char_count': len(content),
+                'line_count': content.count('\n') + 1
+            }
+        )
+
+
 def chunk_directory(
     src_path: Path,
     lang: str,
     exclude_patterns: List[str],
     max_chunk_size: int = 1500
 ) -> List[Chunk]:
-    """Chunk all files in a directory."""
+    """Chunk all files in a directory - Universal support for Code, Docs, and Text."""
     all_chunks = []
 
-    #
-
-
-
-
-
-
+    # Initialize all chunkers
+    ts_chunker = TypeScriptChunker(max_chunk_size=max_chunk_size)
+    py_chunker = PythonChunker(max_chunk_size=max_chunk_size)
+    md_chunker = MarkdownChunker(max_chunk_size=max_chunk_size)
+    txt_chunker = TextChunker(max_chunk_size=max_chunk_size)
+
+    # Map extensions to chunkers
+    extension_map = {}
+    for ext in TypeScriptChunker.EXTENSIONS:
+        extension_map[ext] = ts_chunker
+    extension_map['.py'] = py_chunker
+    for ext in MarkdownChunker.EXTENSIONS:
+        extension_map[ext] = md_chunker
+    for ext in TextChunker.EXTENSIONS:
+        extension_map[ext] = txt_chunker
+
+    # All supported extensions
+    all_extensions = set(extension_map.keys())
 
     # Process files
     for file_path in src_path.rglob('*'):
         if not file_path.is_file():
             continue
 
-        if
+        # Check if extension is supported
+        if file_path.suffix not in all_extensions:
             continue
 
         # Check exclusions
@@ -535,19 +826,25 @@ def chunk_directory(
         if any(pattern in path_str for pattern in exclude_patterns):
             continue
 
-
-
+        # Select appropriate chunker
+        chunker = extension_map.get(file_path.suffix)
+        if chunker:
+            chunks = chunker.chunk_file(file_path, src_path)
+            all_chunks.extend(chunks)
 
     return all_chunks
 
 
 def main():
-    parser = argparse.ArgumentParser(
+    parser = argparse.ArgumentParser(
+        description='Universal Chunker - Code, Markdown, and Text files'
+    )
     parser.add_argument('--src', default='./src', help='Source directory')
     parser.add_argument('--output', default='.agent/rag/chunks.json', help='Output file')
-    parser.add_argument('--lang', choices=['typescript', 'python', 'auto'
+    parser.add_argument('--lang', choices=['typescript', 'python', 'auto', 'universal'],
+                        default='universal', help='Language mode (universal = all file types)')
    parser.add_argument('--max-size', type=int, default=1500, help='Max chunk size in chars')
-    parser.add_argument('--exclude', default='node_modules,__pycache__,.git,dist,build',
+    parser.add_argument('--exclude', default='node_modules,__pycache__,.git,dist,build,.agent',
                         help='Patterns to exclude')
 
     args = parser.parse_args()
@@ -560,28 +857,41 @@ def main():
         print(f"Error: Source directory '{src_path}' does not exist")
         return 1
 
-    #
+    # Mode selection
     lang = args.lang
     if lang == 'auto':
         ts_files = list(src_path.rglob('*.ts')) + list(src_path.rglob('*.tsx'))
         py_files = list(src_path.rglob('*.py'))
         lang = 'typescript' if len(ts_files) >= len(py_files) else 'python'
         print(f"Auto-detected language: {lang}")
+    elif lang == 'universal':
+        print("Universal mode: Processing Code, Markdown, and Text files")
 
     print(f"Chunking {src_path}...")
     chunks = chunk_directory(src_path, lang, exclude_patterns, args.max_size)
     print(f"Created {len(chunks)} chunks")
 
+    if len(chunks) == 0:
+        print("Warning: No chunks created. Check if source directory has supported files.")
+        print("Supported: .ts, .tsx, .js, .jsx, .py, .md, .mdx, .txt")
+
     # Save output
     output_path.parent.mkdir(parents=True, exist_ok=True)
 
+    # Categorize chunks by file type
+    file_types = {}
+    for c in chunks:
+        ft = c.metadata.get('file_type', 'unknown')
+        file_types[ft] = file_types.get(ft, 0) + 1
+
     output_data = {
         'metadata': {
             'generated_at': datetime.now().isoformat(),
             'source_path': str(src_path),
-            '
+            'mode': lang,
             'total_chunks': len(chunks),
-            'max_chunk_size': args.max_size
+            'max_chunk_size': args.max_size,
+            'file_types': file_types
         },
         'chunks': [asdict(c) for c in chunks]
     }
@@ -0,0 +1,48 @@
# PLAN: Universal Intelligence Engine (UIE)

> Goal: Upgrade Agent Kit to support any kind of working folder (Docs, Research, Code) with comprehensive knowledge understanding.

## Phase 1: Universal RAG (Smart Chunking for every format) ✅ COMPLETED
- [x] **Upgrade `chunk_code.py` into a Universal Chunker**:
  - [x] Add `MarkdownChunker`: split segments by heading (`#`, `##`, `###`).
  - [x] Add `TextChunker`: split by paragraph.
  - [x] Keep the AST-based chunking for code (TS, Python).
- [x] **Contextual Metadata for Docs**:
  - [x] Automatically extract each file's main title as context for every chunk inside it.
  - [x] Record a `context_path` (Parent > Child heading) so the AI knows the surrounding context (see the sketch after this list).
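
For illustration, a minimal sketch of the heading-based chunking and `context_path` idea described above. The heading regex and the `Parent > Child` join mirror the `MarkdownChunker` added in this release; the standalone function and the sample document are hypothetical and simplified (the shipped class also splits oversized sections and records line numbers).

```python
import re
from typing import Dict, List

HEADING = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)

def sketch_heading_chunks(markdown: str) -> List[Dict]:
    """Hypothetical helper: split Markdown at headings and record a context_path."""
    chunks, stack = [], []  # stack holds (level, title) of currently open parent headings
    matches = list(HEADING.finditer(markdown))
    for i, m in enumerate(matches):
        level, title = len(m.group(1)), m.group(2).strip()
        while stack and stack[-1][0] >= level:  # close headings at the same or deeper level
            stack.pop()
        context_path = ' > '.join(t for _, t in stack + [(level, title)])
        end = matches[i + 1].start() if i + 1 < len(matches) else len(markdown)
        chunks.append({'name': title, 'context_path': context_path,
                       'content': markdown[m.start():end].strip()})
        stack.append((level, title))
    return chunks

doc = "# Guide\n\n## Install\nRun it.\n\n## Usage\n### Flags\nDetails.\n"
for c in sketch_heading_chunks(doc):
    print(c['context_path'])
# Guide / Guide > Install / Guide > Usage / Guide > Usage > Flags
```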

## Phase 2: Knowledge Graph for Documents (Relationship Mapping) ✅ COMPLETED
- [x] **Upgrade `generate_graph.py`**:
  - [x] Add `MarkdownAnalyzer`: scan internal links `[text](file.md)` and wikilinks `[[file]]` (see the sketch after this list).
  - [x] Recognize heading exports (H1, H2) as the document's "API".
  - [x] `universal` mode: combine Code + Markdown in a single graph.
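
As a small illustration, the two link patterns used by the new `MarkdownAnalyzer` (copied verbatim from the diff above) can be exercised on their own. The wrapper function below is hypothetical; the real analyzer additionally resolves each target to an existing file.

```python
import re
from typing import List

MD_LINK = r'\[([^\]]+)\]\(([^)]+)\)'          # [text](path), as in LINK_PATTERNS[0]
WIKILINK = r'\[\[([^\]|]+)(?:\|[^\]]+)?\]\]'  # [[path]] or [[path|text]], as in LINK_PATTERNS[1]

def sketch_extract_links(content: str) -> List[str]:
    """Hypothetical helper: collect the link targets the analyzer treats as 'imports'."""
    targets = [path for _text, path in re.findall(MD_LINK, content)]
    targets += re.findall(WIKILINK, content)
    # The shipped analyzer also skips external/anchor links and checks that the file exists.
    return [t for t in targets if not t.startswith(('http://', 'https://', 'mailto:', '#'))]

print(sketch_extract_links(
    "See [setup](docs/setup.md), [[roadmap]], and [site](https://example.com)."
))  # ['docs/setup.md', 'roadmap']
```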

## Phase 3: MCP Gateway Enhancements
- [ ] **Content-Type Awareness**:
  - [ ] The `analyze_dependencies` tool returns "References" instead of "Imports" for docs.
  - [ ] Alias the `search_code_logic` tool as `search_knowledge`.
- [ ] **Auto-Detection**: the gateway reports the project type (Creative, Tech, Mixed).

## Phase 4: CLI Evolution (`ak` command)
- [ ] **Smart `ak init`**:
  - [ ] Do not require a `src` or `app` folder to exist.
  - [ ] Automatically create `AGENTS.md` from the "General Knowledge" template when no code is detected.
- [ ] **Comprehensive `ak sync`**:
  - [ ] Scan every file (except the ignore list) so that no knowledge is missed (see the sketch after this list).
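
Related to the `ak sync` item above: the `find_source_dir` helper added to `ak_cli.py` in this release is what lets sync pick a sensible directory even without `src` or `app`. A minimal sketch of calling it directly; the function body is copied from the diff, while the docs-only temp project is a hypothetical example.

```python
import tempfile
from pathlib import Path

# Copied from the find_source_dir added to .agent/scripts/ak_cli.py, so the sketch is self-contained.
def find_source_dir(project_root: Path) -> Path:
    for folder in ["src", "app", "lib", "scripts", "components"]:
        if (project_root / folder).exists():
            return project_root / folder
    source_extensions = {".ts", ".tsx", ".js", ".jsx", ".py", ".go", ".c", ".cpp", ".cs"}
    for item in project_root.iterdir():
        if item.is_file() and item.suffix in source_extensions:
            return project_root
    return project_root / "src"  # ultimate fallback

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    (root / "docs").mkdir()                       # hypothetical docs-only project
    (root / "docs" / "guide.md").write_text("# Guide\n", encoding="utf-8")
    print(find_source_dir(root))                  # no match found, falls back to <root>/src
```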

## Phase 5: Verification & Beta Test
- [ ] **Test Case 1**: A project of only 100 Markdown files (a user guide).
- [ ] **Test Case 2**: A mixed project (Next.js + Docs + API Specs).
- [ ] **Test Case 3**: A research project (many .txt files and scattered notes).

---

## Agent Assignments
- **Python Specialist**: Handle the Chunker and Graph Mapper logic (Phases 1 & 2).
- **TypeScript Expert**: Update the MCP Gateway (Phase 3).
- **Orchestrator**: Update the CLI and finalize the docs (Phase 4).

## Verification Checklist
- [ ] `ai status` shows ✅ RAG and ✅ Graph even in a folder with no code.
- [ ] The AI can find information buried deep inside a sub-section of a long Markdown file.
- [ ] The dependency graph shows the links between documentation files.