@musashishao/agent-kit 1.2.2 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent/mcp-gateway/README.md +121 -0
- package/.agent/mcp-gateway/dist/index.d.ts +11 -0
- package/.agent/mcp-gateway/dist/index.js +504 -0
- package/.agent/mcp-gateway/dist/sync/debouncer.d.ts +56 -0
- package/.agent/mcp-gateway/dist/sync/debouncer.js +112 -0
- package/.agent/mcp-gateway/dist/sync/incremental_syncer.d.ts +58 -0
- package/.agent/mcp-gateway/dist/sync/incremental_syncer.js +172 -0
- package/.agent/mcp-gateway/dist/sync/index.d.ts +6 -0
- package/.agent/mcp-gateway/dist/sync/index.js +6 -0
- package/.agent/mcp-gateway/dist/sync/timestamp_checker.d.ts +69 -0
- package/.agent/mcp-gateway/dist/sync/timestamp_checker.js +169 -0
- package/.agent/mcp-gateway/package.json +28 -0
- package/.agent/mcp-gateway/src/index.ts +608 -0
- package/.agent/mcp-gateway/src/sync/debouncer.ts +129 -0
- package/.agent/mcp-gateway/src/sync/incremental_syncer.ts +237 -0
- package/.agent/mcp-gateway/src/sync/index.ts +7 -0
- package/.agent/mcp-gateway/src/sync/timestamp_checker.ts +194 -0
- package/.agent/scripts/ak_cli.py +549 -0
- package/.agent/scripts/setup_host.py +557 -0
- package/.agent/scripts/verify_install.py +174 -0
- package/.agent/skills/app-builder/SKILL.md +51 -1
- package/.agent/skills/app-builder/scripts/generate_ai_infra.py +510 -0
- package/.agent/skills/documentation-templates/SKILL.md +9 -1
- package/.agent/skills/documentation-templates/agents-template.md +202 -0
- package/.agent/skills/graph-mapper/SKILL.md +211 -0
- package/.agent/skills/graph-mapper/scripts/generate_graph.py +705 -0
- package/.agent/skills/rag-engineering/SKILL.md +342 -0
- package/.agent/skills/rag-engineering/chunking-strategies.md +229 -0
- package/.agent/skills/rag-engineering/contextual-retrieval.md +261 -0
- package/.agent/skills/rag-engineering/hybrid-search.md +356 -0
- package/.agent/skills/rag-engineering/scripts/chunk_code.py +916 -0
- package/.agent/templates/mcp_configs/claude_desktop.json +14 -0
- package/.agent/templates/mcp_configs/cursor.json +13 -0
- package/.agent/templates/mcp_configs/vscode.json +13 -0
- package/.agent/workflows/create.md +70 -2
- package/bin/cli.js +91 -0
- package/docs/AI_DATA_INFRASTRUCTURE.md +288 -0
- package/docs/CHANGELOG_AI_INFRA.md +111 -0
- package/docs/PLAN-universal-intelligence.md +48 -0
- package/package.json +7 -2
|
@@ -0,0 +1,916 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Code Chunker - Intelligent code chunking by logical boundaries.
|
|
4
|
+
|
|
5
|
+
Splits code files into meaningful chunks (functions, classes, modules)
|
|
6
|
+
instead of arbitrary character/token limits.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python chunk_code.py --src ./src --output chunks.json --lang auto
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
import re
|
|
14
|
+
import json
|
|
15
|
+
import hashlib
|
|
16
|
+
import argparse
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import List, Dict, Optional, Tuple
|
|
19
|
+
from dataclasses import dataclass, asdict
|
|
20
|
+
from datetime import datetime
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class Chunk:
    """Represents a code chunk.

    One retrievable unit produced by the chunkers below; serialized to
    JSON via dataclasses.asdict in main().
    """
    id: str  # 16-hex-char identifier produced by generate_chunk_id()
    content: str  # raw source/text of the chunk
    metadata: Dict  # per-chunker fields: file_path, file_type, chunk_type, name, line span, sizes
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def generate_chunk_id(file_path: str, content: str) -> str:
    """Generate a unique, deterministic chunk ID.

    Hashes the file path together with the FULL content (the previous
    version hashed only ``content[:100]``, so two distinct chunks from
    the same file sharing their first 100 characters — e.g. near-identical
    sections or overlapping split parts — collided).

    Returns:
        First 16 hex characters of a SHA-256 digest.
    """
    hash_input = f"{file_path}:{content}"
    return hashlib.sha256(hash_input.encode()).hexdigest()[:16]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class TypeScriptChunker:
|
|
38
|
+
"""Chunk TypeScript/JavaScript files by logical boundaries."""
|
|
39
|
+
|
|
40
|
+
EXTENSIONS = {'.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs'}
|
|
41
|
+
|
|
42
|
+
def __init__(self, max_chunk_size: int = 1500, overlap: int = 100):
|
|
43
|
+
self.max_chunk_size = max_chunk_size # in characters
|
|
44
|
+
self.overlap = overlap
|
|
45
|
+
|
|
46
|
+
def chunk_file(self, file_path: Path, base_path: Path) -> List[Chunk]:
|
|
47
|
+
"""Chunk a single file."""
|
|
48
|
+
try:
|
|
49
|
+
content = file_path.read_text(encoding='utf-8')
|
|
50
|
+
except (UnicodeDecodeError, PermissionError):
|
|
51
|
+
return []
|
|
52
|
+
|
|
53
|
+
relative_path = str(file_path.relative_to(base_path))
|
|
54
|
+
chunks = []
|
|
55
|
+
|
|
56
|
+
# Try to extract functions and classes
|
|
57
|
+
code_units = self._extract_code_units(content)
|
|
58
|
+
|
|
59
|
+
if code_units:
|
|
60
|
+
# Chunk by code units
|
|
61
|
+
for unit in code_units:
|
|
62
|
+
chunk_content = unit['code']
|
|
63
|
+
|
|
64
|
+
# If unit is too large, split it
|
|
65
|
+
if len(chunk_content) > self.max_chunk_size:
|
|
66
|
+
sub_chunks = self._split_large_chunk(chunk_content)
|
|
67
|
+
for i, sub in enumerate(sub_chunks):
|
|
68
|
+
chunks.append(self._create_chunk(
|
|
69
|
+
content=sub,
|
|
70
|
+
file_path=relative_path,
|
|
71
|
+
unit_type=unit['type'],
|
|
72
|
+
unit_name=f"{unit['name']}_part{i+1}",
|
|
73
|
+
start_line=unit['start_line'],
|
|
74
|
+
end_line=unit['end_line']
|
|
75
|
+
))
|
|
76
|
+
else:
|
|
77
|
+
chunks.append(self._create_chunk(
|
|
78
|
+
content=chunk_content,
|
|
79
|
+
file_path=relative_path,
|
|
80
|
+
unit_type=unit['type'],
|
|
81
|
+
unit_name=unit['name'],
|
|
82
|
+
start_line=unit['start_line'],
|
|
83
|
+
end_line=unit['end_line']
|
|
84
|
+
))
|
|
85
|
+
else:
|
|
86
|
+
# Fallback: chunk entire file
|
|
87
|
+
if len(content) <= self.max_chunk_size:
|
|
88
|
+
chunks.append(self._create_chunk(
|
|
89
|
+
content=content,
|
|
90
|
+
file_path=relative_path,
|
|
91
|
+
unit_type='module',
|
|
92
|
+
unit_name=file_path.stem,
|
|
93
|
+
start_line=1,
|
|
94
|
+
end_line=content.count('\n') + 1
|
|
95
|
+
))
|
|
96
|
+
else:
|
|
97
|
+
# Split by size with overlap
|
|
98
|
+
sub_chunks = self._split_large_chunk(content)
|
|
99
|
+
for i, sub in enumerate(sub_chunks):
|
|
100
|
+
chunks.append(self._create_chunk(
|
|
101
|
+
content=sub,
|
|
102
|
+
file_path=relative_path,
|
|
103
|
+
unit_type='module_part',
|
|
104
|
+
unit_name=f"{file_path.stem}_part{i+1}",
|
|
105
|
+
start_line=1,
|
|
106
|
+
end_line=content.count('\n') + 1
|
|
107
|
+
))
|
|
108
|
+
|
|
109
|
+
return chunks
|
|
110
|
+
|
|
111
|
+
def _extract_code_units(self, content: str) -> List[Dict]:
|
|
112
|
+
"""Extract functions and classes from code."""
|
|
113
|
+
units = []
|
|
114
|
+
lines = content.split('\n')
|
|
115
|
+
|
|
116
|
+
# Patterns for detecting code blocks
|
|
117
|
+
patterns = [
|
|
118
|
+
# Exported function
|
|
119
|
+
(r'^export\s+(?:async\s+)?function\s+(\w+)', 'function'),
|
|
120
|
+
# Regular function
|
|
121
|
+
(r'^(?:async\s+)?function\s+(\w+)', 'function'),
|
|
122
|
+
# Arrow function (const/let/var)
|
|
123
|
+
(r'^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?\([^)]*\)\s*=>', 'function'),
|
|
124
|
+
# Class
|
|
125
|
+
(r'^(?:export\s+)?class\s+(\w+)', 'class'),
|
|
126
|
+
# Interface/Type (TypeScript)
|
|
127
|
+
(r'^(?:export\s+)?(?:interface|type)\s+(\w+)', 'type'),
|
|
128
|
+
]
|
|
129
|
+
|
|
130
|
+
i = 0
|
|
131
|
+
while i < len(lines):
|
|
132
|
+
line = lines[i]
|
|
133
|
+
|
|
134
|
+
for pattern, unit_type in patterns:
|
|
135
|
+
match = re.match(pattern, line.strip())
|
|
136
|
+
if match:
|
|
137
|
+
name = match.group(1)
|
|
138
|
+
start_line = i + 1
|
|
139
|
+
|
|
140
|
+
# Find the end of this block (matching braces)
|
|
141
|
+
end_line = self._find_block_end(lines, i)
|
|
142
|
+
|
|
143
|
+
code = '\n'.join(lines[i:end_line])
|
|
144
|
+
units.append({
|
|
145
|
+
'type': unit_type,
|
|
146
|
+
'name': name,
|
|
147
|
+
'code': code,
|
|
148
|
+
'start_line': start_line,
|
|
149
|
+
'end_line': end_line
|
|
150
|
+
})
|
|
151
|
+
|
|
152
|
+
i = end_line
|
|
153
|
+
break
|
|
154
|
+
else:
|
|
155
|
+
i += 1
|
|
156
|
+
|
|
157
|
+
return units
|
|
158
|
+
|
|
159
|
+
def _find_block_end(self, lines: List[str], start: int) -> int:
|
|
160
|
+
"""Find the end of a code block by matching braces."""
|
|
161
|
+
brace_count = 0
|
|
162
|
+
found_open = False
|
|
163
|
+
|
|
164
|
+
for i in range(start, len(lines)):
|
|
165
|
+
line = lines[i]
|
|
166
|
+
for char in line:
|
|
167
|
+
if char == '{':
|
|
168
|
+
brace_count += 1
|
|
169
|
+
found_open = True
|
|
170
|
+
elif char == '}':
|
|
171
|
+
brace_count -= 1
|
|
172
|
+
|
|
173
|
+
if found_open and brace_count == 0:
|
|
174
|
+
return i + 1
|
|
175
|
+
|
|
176
|
+
return len(lines)
|
|
177
|
+
|
|
178
|
+
def _split_large_chunk(self, content: str) -> List[str]:
|
|
179
|
+
"""Split large content into smaller chunks with overlap."""
|
|
180
|
+
chunks = []
|
|
181
|
+
lines = content.split('\n')
|
|
182
|
+
|
|
183
|
+
current_chunk = []
|
|
184
|
+
current_size = 0
|
|
185
|
+
|
|
186
|
+
for line in lines:
|
|
187
|
+
line_size = len(line) + 1 # +1 for newline
|
|
188
|
+
|
|
189
|
+
if current_size + line_size > self.max_chunk_size and current_chunk:
|
|
190
|
+
chunks.append('\n'.join(current_chunk))
|
|
191
|
+
# Keep overlap lines
|
|
192
|
+
overlap_lines = current_chunk[-3:] if len(current_chunk) > 3 else current_chunk
|
|
193
|
+
current_chunk = overlap_lines.copy()
|
|
194
|
+
current_size = sum(len(l) + 1 for l in current_chunk)
|
|
195
|
+
|
|
196
|
+
current_chunk.append(line)
|
|
197
|
+
current_size += line_size
|
|
198
|
+
|
|
199
|
+
if current_chunk:
|
|
200
|
+
chunks.append('\n'.join(current_chunk))
|
|
201
|
+
|
|
202
|
+
return chunks
|
|
203
|
+
|
|
204
|
+
def _create_chunk(
|
|
205
|
+
self,
|
|
206
|
+
content: str,
|
|
207
|
+
file_path: str,
|
|
208
|
+
unit_type: str,
|
|
209
|
+
unit_name: str,
|
|
210
|
+
start_line: int,
|
|
211
|
+
end_line: int
|
|
212
|
+
) -> Chunk:
|
|
213
|
+
"""Create a Chunk object with metadata."""
|
|
214
|
+
return Chunk(
|
|
215
|
+
id=generate_chunk_id(file_path, content),
|
|
216
|
+
content=content,
|
|
217
|
+
metadata={
|
|
218
|
+
'file_path': file_path,
|
|
219
|
+
'file_type': 'typescript',
|
|
220
|
+
'chunk_type': unit_type,
|
|
221
|
+
'name': unit_name,
|
|
222
|
+
'start_line': start_line,
|
|
223
|
+
'end_line': end_line,
|
|
224
|
+
'char_count': len(content),
|
|
225
|
+
'line_count': content.count('\n') + 1
|
|
226
|
+
}
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
class PythonChunker:
|
|
231
|
+
"""Chunk Python files by logical boundaries."""
|
|
232
|
+
|
|
233
|
+
def __init__(self, max_chunk_size: int = 1500, overlap: int = 100):
|
|
234
|
+
self.max_chunk_size = max_chunk_size
|
|
235
|
+
self.overlap = overlap
|
|
236
|
+
|
|
237
|
+
def chunk_file(self, file_path: Path, base_path: Path) -> List[Chunk]:
|
|
238
|
+
"""Chunk a single Python file."""
|
|
239
|
+
try:
|
|
240
|
+
content = file_path.read_text(encoding='utf-8')
|
|
241
|
+
except (UnicodeDecodeError, PermissionError):
|
|
242
|
+
return []
|
|
243
|
+
|
|
244
|
+
relative_path = str(file_path.relative_to(base_path))
|
|
245
|
+
chunks = []
|
|
246
|
+
|
|
247
|
+
# Try to use AST for extraction
|
|
248
|
+
try:
|
|
249
|
+
import ast
|
|
250
|
+
tree = ast.parse(content)
|
|
251
|
+
chunks = self._chunk_with_ast(content, tree, relative_path)
|
|
252
|
+
except SyntaxError:
|
|
253
|
+
# Fallback to regex-based extraction
|
|
254
|
+
chunks = self._chunk_with_regex(content, relative_path)
|
|
255
|
+
|
|
256
|
+
return chunks
|
|
257
|
+
|
|
258
|
+
def _chunk_with_ast(self, content: str, tree, file_path: str) -> List[Chunk]:
|
|
259
|
+
"""Extract chunks using Python AST."""
|
|
260
|
+
import ast
|
|
261
|
+
|
|
262
|
+
chunks = []
|
|
263
|
+
lines = content.split('\n')
|
|
264
|
+
|
|
265
|
+
for node in ast.iter_child_nodes(tree):
|
|
266
|
+
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
|
267
|
+
chunk_type = 'function'
|
|
268
|
+
name = node.name
|
|
269
|
+
elif isinstance(node, ast.ClassDef):
|
|
270
|
+
chunk_type = 'class'
|
|
271
|
+
name = node.name
|
|
272
|
+
else:
|
|
273
|
+
continue
|
|
274
|
+
|
|
275
|
+
# Get source segment
|
|
276
|
+
start_line = node.lineno
|
|
277
|
+
end_line = node.end_lineno or start_line
|
|
278
|
+
code = '\n'.join(lines[start_line - 1:end_line])
|
|
279
|
+
|
|
280
|
+
# Handle large chunks
|
|
281
|
+
if len(code) > self.max_chunk_size:
|
|
282
|
+
# For classes, try to split by methods
|
|
283
|
+
if isinstance(node, ast.ClassDef):
|
|
284
|
+
method_chunks = self._split_class_by_methods(
|
|
285
|
+
code, node, lines, file_path, name
|
|
286
|
+
)
|
|
287
|
+
chunks.extend(method_chunks)
|
|
288
|
+
else:
|
|
289
|
+
# Split function (rare case)
|
|
290
|
+
sub_chunks = self._split_large_content(code)
|
|
291
|
+
for i, sub in enumerate(sub_chunks):
|
|
292
|
+
chunks.append(self._create_chunk(
|
|
293
|
+
content=sub,
|
|
294
|
+
file_path=file_path,
|
|
295
|
+
chunk_type=f'{chunk_type}_part',
|
|
296
|
+
name=f'{name}_part{i+1}',
|
|
297
|
+
start_line=start_line,
|
|
298
|
+
end_line=end_line
|
|
299
|
+
))
|
|
300
|
+
else:
|
|
301
|
+
chunks.append(self._create_chunk(
|
|
302
|
+
content=code,
|
|
303
|
+
file_path=file_path,
|
|
304
|
+
chunk_type=chunk_type,
|
|
305
|
+
name=name,
|
|
306
|
+
start_line=start_line,
|
|
307
|
+
end_line=end_line
|
|
308
|
+
))
|
|
309
|
+
|
|
310
|
+
# If no chunks extracted, treat as module
|
|
311
|
+
if not chunks:
|
|
312
|
+
if len(content) <= self.max_chunk_size:
|
|
313
|
+
chunks.append(self._create_chunk(
|
|
314
|
+
content=content,
|
|
315
|
+
file_path=file_path,
|
|
316
|
+
chunk_type='module',
|
|
317
|
+
name=Path(file_path).stem,
|
|
318
|
+
start_line=1,
|
|
319
|
+
end_line=len(lines)
|
|
320
|
+
))
|
|
321
|
+
else:
|
|
322
|
+
sub_chunks = self._split_large_content(content)
|
|
323
|
+
for i, sub in enumerate(sub_chunks):
|
|
324
|
+
chunks.append(self._create_chunk(
|
|
325
|
+
content=sub,
|
|
326
|
+
file_path=file_path,
|
|
327
|
+
chunk_type='module_part',
|
|
328
|
+
name=f'{Path(file_path).stem}_part{i+1}',
|
|
329
|
+
start_line=1,
|
|
330
|
+
end_line=len(lines)
|
|
331
|
+
))
|
|
332
|
+
|
|
333
|
+
return chunks
|
|
334
|
+
|
|
335
|
+
def _split_class_by_methods(
|
|
336
|
+
self,
|
|
337
|
+
code: str,
|
|
338
|
+
class_node,
|
|
339
|
+
lines: List[str],
|
|
340
|
+
file_path: str,
|
|
341
|
+
class_name: str
|
|
342
|
+
) -> List[Chunk]:
|
|
343
|
+
"""Split a large class into method-level chunks."""
|
|
344
|
+
import ast
|
|
345
|
+
|
|
346
|
+
chunks = []
|
|
347
|
+
|
|
348
|
+
# First chunk: class definition + docstring + class variables
|
|
349
|
+
class_start = class_node.lineno - 1
|
|
350
|
+
first_method_line = None
|
|
351
|
+
|
|
352
|
+
for item in class_node.body:
|
|
353
|
+
if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
|
354
|
+
first_method_line = item.lineno - 1
|
|
355
|
+
break
|
|
356
|
+
|
|
357
|
+
if first_method_line:
|
|
358
|
+
class_header = '\n'.join(lines[class_start:first_method_line])
|
|
359
|
+
chunks.append(self._create_chunk(
|
|
360
|
+
content=class_header,
|
|
361
|
+
file_path=file_path,
|
|
362
|
+
chunk_type='class_header',
|
|
363
|
+
name=f'{class_name}_header',
|
|
364
|
+
start_line=class_start + 1,
|
|
365
|
+
end_line=first_method_line
|
|
366
|
+
))
|
|
367
|
+
|
|
368
|
+
# Each method as separate chunk
|
|
369
|
+
for item in class_node.body:
|
|
370
|
+
if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
|
371
|
+
start = item.lineno - 1
|
|
372
|
+
end = item.end_lineno or start + 1
|
|
373
|
+
method_code = '\n'.join(lines[start:end])
|
|
374
|
+
|
|
375
|
+
chunks.append(self._create_chunk(
|
|
376
|
+
content=method_code,
|
|
377
|
+
file_path=file_path,
|
|
378
|
+
chunk_type='method',
|
|
379
|
+
name=f'{class_name}.{item.name}',
|
|
380
|
+
start_line=start + 1,
|
|
381
|
+
end_line=end
|
|
382
|
+
))
|
|
383
|
+
|
|
384
|
+
return chunks
|
|
385
|
+
|
|
386
|
+
def _chunk_with_regex(self, content: str, file_path: str) -> List[Chunk]:
|
|
387
|
+
"""Fallback regex-based chunking."""
|
|
388
|
+
chunks = []
|
|
389
|
+
lines = content.split('\n')
|
|
390
|
+
|
|
391
|
+
# Simple pattern for function/class definitions
|
|
392
|
+
def_pattern = re.compile(r'^(class|def|async\s+def)\s+(\w+)')
|
|
393
|
+
|
|
394
|
+
current_block = []
|
|
395
|
+
current_type = None
|
|
396
|
+
current_name = None
|
|
397
|
+
block_start = 0
|
|
398
|
+
base_indent = None
|
|
399
|
+
|
|
400
|
+
for i, line in enumerate(lines):
|
|
401
|
+
match = def_pattern.match(line)
|
|
402
|
+
|
|
403
|
+
if match:
|
|
404
|
+
# Save previous block
|
|
405
|
+
if current_block and current_name:
|
|
406
|
+
chunks.append(self._create_chunk(
|
|
407
|
+
content='\n'.join(current_block),
|
|
408
|
+
file_path=file_path,
|
|
409
|
+
chunk_type=current_type or 'code',
|
|
410
|
+
name=current_name,
|
|
411
|
+
start_line=block_start + 1,
|
|
412
|
+
end_line=i
|
|
413
|
+
))
|
|
414
|
+
|
|
415
|
+
# Start new block
|
|
416
|
+
current_type = 'class' if match.group(1) == 'class' else 'function'
|
|
417
|
+
current_name = match.group(2)
|
|
418
|
+
current_block = [line]
|
|
419
|
+
block_start = i
|
|
420
|
+
base_indent = len(line) - len(line.lstrip())
|
|
421
|
+
elif current_block:
|
|
422
|
+
# Check if still in block (indent-based)
|
|
423
|
+
stripped = line.lstrip()
|
|
424
|
+
if stripped: # Non-empty line
|
|
425
|
+
current_indent = len(line) - len(stripped)
|
|
426
|
+
if current_indent <= base_indent and not line.strip().startswith('#'):
|
|
427
|
+
# End of block
|
|
428
|
+
chunks.append(self._create_chunk(
|
|
429
|
+
content='\n'.join(current_block),
|
|
430
|
+
file_path=file_path,
|
|
431
|
+
chunk_type=current_type or 'code',
|
|
432
|
+
name=current_name,
|
|
433
|
+
start_line=block_start + 1,
|
|
434
|
+
end_line=i
|
|
435
|
+
))
|
|
436
|
+
current_block = []
|
|
437
|
+
current_name = None
|
|
438
|
+
current_type = None
|
|
439
|
+
else:
|
|
440
|
+
current_block.append(line)
|
|
441
|
+
else:
|
|
442
|
+
current_block.append(line)
|
|
443
|
+
|
|
444
|
+
# Don't forget last block
|
|
445
|
+
if current_block and current_name:
|
|
446
|
+
chunks.append(self._create_chunk(
|
|
447
|
+
content='\n'.join(current_block),
|
|
448
|
+
file_path=file_path,
|
|
449
|
+
chunk_type=current_type or 'code',
|
|
450
|
+
name=current_name,
|
|
451
|
+
start_line=block_start + 1,
|
|
452
|
+
end_line=len(lines)
|
|
453
|
+
))
|
|
454
|
+
|
|
455
|
+
return chunks
|
|
456
|
+
|
|
457
|
+
def _split_large_content(self, content: str) -> List[str]:
|
|
458
|
+
"""Split large content with overlap."""
|
|
459
|
+
chunks = []
|
|
460
|
+
lines = content.split('\n')
|
|
461
|
+
|
|
462
|
+
current_chunk = []
|
|
463
|
+
current_size = 0
|
|
464
|
+
|
|
465
|
+
for line in lines:
|
|
466
|
+
line_size = len(line) + 1
|
|
467
|
+
|
|
468
|
+
if current_size + line_size > self.max_chunk_size and current_chunk:
|
|
469
|
+
chunks.append('\n'.join(current_chunk))
|
|
470
|
+
overlap_lines = current_chunk[-3:] if len(current_chunk) > 3 else current_chunk
|
|
471
|
+
current_chunk = overlap_lines.copy()
|
|
472
|
+
current_size = sum(len(l) + 1 for l in current_chunk)
|
|
473
|
+
|
|
474
|
+
current_chunk.append(line)
|
|
475
|
+
current_size += line_size
|
|
476
|
+
|
|
477
|
+
if current_chunk:
|
|
478
|
+
chunks.append('\n'.join(current_chunk))
|
|
479
|
+
|
|
480
|
+
return chunks
|
|
481
|
+
|
|
482
|
+
def _create_chunk(
|
|
483
|
+
self,
|
|
484
|
+
content: str,
|
|
485
|
+
file_path: str,
|
|
486
|
+
chunk_type: str,
|
|
487
|
+
name: str,
|
|
488
|
+
start_line: int,
|
|
489
|
+
end_line: int
|
|
490
|
+
) -> Chunk:
|
|
491
|
+
"""Create a Chunk object."""
|
|
492
|
+
return Chunk(
|
|
493
|
+
id=generate_chunk_id(file_path, content),
|
|
494
|
+
content=content,
|
|
495
|
+
metadata={
|
|
496
|
+
'file_path': file_path,
|
|
497
|
+
'file_type': 'python',
|
|
498
|
+
'chunk_type': chunk_type,
|
|
499
|
+
'name': name,
|
|
500
|
+
'start_line': start_line,
|
|
501
|
+
'end_line': end_line,
|
|
502
|
+
'char_count': len(content),
|
|
503
|
+
'line_count': content.count('\n') + 1
|
|
504
|
+
}
|
|
505
|
+
)
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
class MarkdownChunker:
|
|
509
|
+
"""Chunk Markdown files by heading sections."""
|
|
510
|
+
|
|
511
|
+
EXTENSIONS = {'.md', '.mdx', '.markdown'}
|
|
512
|
+
|
|
513
|
+
def __init__(self, max_chunk_size: int = 2000, overlap: int = 100):
|
|
514
|
+
self.max_chunk_size = max_chunk_size
|
|
515
|
+
self.overlap = overlap
|
|
516
|
+
|
|
517
|
+
def chunk_file(self, file_path: Path, base_path: Path) -> List[Chunk]:
|
|
518
|
+
"""Chunk a Markdown file by headings."""
|
|
519
|
+
try:
|
|
520
|
+
content = file_path.read_text(encoding='utf-8')
|
|
521
|
+
except (UnicodeDecodeError, PermissionError):
|
|
522
|
+
return []
|
|
523
|
+
|
|
524
|
+
relative_path = str(file_path.relative_to(base_path))
|
|
525
|
+
chunks = []
|
|
526
|
+
|
|
527
|
+
# Extract sections by heading
|
|
528
|
+
sections = self._extract_sections(content)
|
|
529
|
+
|
|
530
|
+
if sections:
|
|
531
|
+
for section in sections:
|
|
532
|
+
chunk_content = section['content']
|
|
533
|
+
|
|
534
|
+
# If section is too large, split it
|
|
535
|
+
if len(chunk_content) > self.max_chunk_size:
|
|
536
|
+
sub_chunks = self._split_by_paragraphs(chunk_content)
|
|
537
|
+
for i, sub in enumerate(sub_chunks):
|
|
538
|
+
chunks.append(self._create_chunk(
|
|
539
|
+
content=sub,
|
|
540
|
+
file_path=relative_path,
|
|
541
|
+
section_title=f"{section['title']}_part{i+1}",
|
|
542
|
+
heading_level=section['level'],
|
|
543
|
+
parent_headings=section['parents'],
|
|
544
|
+
start_line=section['start_line'],
|
|
545
|
+
end_line=section['end_line']
|
|
546
|
+
))
|
|
547
|
+
else:
|
|
548
|
+
chunks.append(self._create_chunk(
|
|
549
|
+
content=chunk_content,
|
|
550
|
+
file_path=relative_path,
|
|
551
|
+
section_title=section['title'],
|
|
552
|
+
heading_level=section['level'],
|
|
553
|
+
parent_headings=section['parents'],
|
|
554
|
+
start_line=section['start_line'],
|
|
555
|
+
end_line=section['end_line']
|
|
556
|
+
))
|
|
557
|
+
else:
|
|
558
|
+
# No headings found, treat entire file as one chunk or split by paragraphs
|
|
559
|
+
if len(content) <= self.max_chunk_size:
|
|
560
|
+
chunks.append(self._create_chunk(
|
|
561
|
+
content=content,
|
|
562
|
+
file_path=relative_path,
|
|
563
|
+
section_title=file_path.stem,
|
|
564
|
+
heading_level=0,
|
|
565
|
+
parent_headings=[],
|
|
566
|
+
start_line=1,
|
|
567
|
+
end_line=content.count('\n') + 1
|
|
568
|
+
))
|
|
569
|
+
else:
|
|
570
|
+
sub_chunks = self._split_by_paragraphs(content)
|
|
571
|
+
for i, sub in enumerate(sub_chunks):
|
|
572
|
+
chunks.append(self._create_chunk(
|
|
573
|
+
content=sub,
|
|
574
|
+
file_path=relative_path,
|
|
575
|
+
section_title=f"{file_path.stem}_part{i+1}",
|
|
576
|
+
heading_level=0,
|
|
577
|
+
parent_headings=[],
|
|
578
|
+
start_line=1,
|
|
579
|
+
end_line=content.count('\n') + 1
|
|
580
|
+
))
|
|
581
|
+
|
|
582
|
+
return chunks
|
|
583
|
+
|
|
584
|
+
def _extract_sections(self, content: str) -> List[Dict]:
|
|
585
|
+
"""Extract sections based on Markdown headings."""
|
|
586
|
+
sections = []
|
|
587
|
+
lines = content.split('\n')
|
|
588
|
+
|
|
589
|
+
heading_pattern = re.compile(r'^(#{1,6})\s+(.+)$')
|
|
590
|
+
|
|
591
|
+
current_section = None
|
|
592
|
+
current_lines = []
|
|
593
|
+
parent_stack = [] # Track parent headings for context
|
|
594
|
+
|
|
595
|
+
for i, line in enumerate(lines):
|
|
596
|
+
match = heading_pattern.match(line)
|
|
597
|
+
|
|
598
|
+
if match:
|
|
599
|
+
# Save previous section
|
|
600
|
+
if current_section:
|
|
601
|
+
current_section['content'] = '\n'.join(current_lines).strip()
|
|
602
|
+
current_section['end_line'] = i
|
|
603
|
+
if current_section['content']:
|
|
604
|
+
sections.append(current_section)
|
|
605
|
+
|
|
606
|
+
# Update parent stack
|
|
607
|
+
level = len(match.group(1))
|
|
608
|
+
title = match.group(2).strip()
|
|
609
|
+
|
|
610
|
+
# Pop parents that are same or deeper level
|
|
611
|
+
while parent_stack and parent_stack[-1]['level'] >= level:
|
|
612
|
+
parent_stack.pop()
|
|
613
|
+
|
|
614
|
+
parents = [p['title'] for p in parent_stack]
|
|
615
|
+
|
|
616
|
+
# Start new section
|
|
617
|
+
current_section = {
|
|
618
|
+
'title': title,
|
|
619
|
+
'level': level,
|
|
620
|
+
'parents': parents.copy(),
|
|
621
|
+
'start_line': i + 1,
|
|
622
|
+
'end_line': i + 1,
|
|
623
|
+
'content': ''
|
|
624
|
+
}
|
|
625
|
+
current_lines = [line]
|
|
626
|
+
|
|
627
|
+
# Add this heading to parent stack
|
|
628
|
+
parent_stack.append({'level': level, 'title': title})
|
|
629
|
+
elif current_section:
|
|
630
|
+
current_lines.append(line)
|
|
631
|
+
|
|
632
|
+
# Don't forget last section
|
|
633
|
+
if current_section:
|
|
634
|
+
current_section['content'] = '\n'.join(current_lines).strip()
|
|
635
|
+
current_section['end_line'] = len(lines)
|
|
636
|
+
if current_section['content']:
|
|
637
|
+
sections.append(current_section)
|
|
638
|
+
|
|
639
|
+
return sections
|
|
640
|
+
|
|
641
|
+
def _split_by_paragraphs(self, content: str) -> List[str]:
|
|
642
|
+
"""Split content by paragraphs when too large."""
|
|
643
|
+
chunks = []
|
|
644
|
+
paragraphs = re.split(r'\n\s*\n', content)
|
|
645
|
+
|
|
646
|
+
current_chunk = []
|
|
647
|
+
current_size = 0
|
|
648
|
+
|
|
649
|
+
for para in paragraphs:
|
|
650
|
+
para_size = len(para) + 2 # +2 for paragraph break
|
|
651
|
+
|
|
652
|
+
if current_size + para_size > self.max_chunk_size and current_chunk:
|
|
653
|
+
chunks.append('\n\n'.join(current_chunk))
|
|
654
|
+
current_chunk = []
|
|
655
|
+
current_size = 0
|
|
656
|
+
|
|
657
|
+
current_chunk.append(para)
|
|
658
|
+
current_size += para_size
|
|
659
|
+
|
|
660
|
+
if current_chunk:
|
|
661
|
+
chunks.append('\n\n'.join(current_chunk))
|
|
662
|
+
|
|
663
|
+
return chunks
|
|
664
|
+
|
|
665
|
+
def _create_chunk(
|
|
666
|
+
self,
|
|
667
|
+
content: str,
|
|
668
|
+
file_path: str,
|
|
669
|
+
section_title: str,
|
|
670
|
+
heading_level: int,
|
|
671
|
+
parent_headings: List[str],
|
|
672
|
+
start_line: int,
|
|
673
|
+
end_line: int
|
|
674
|
+
) -> Chunk:
|
|
675
|
+
"""Create a Chunk object with rich context."""
|
|
676
|
+
# Build context string for better retrieval
|
|
677
|
+
context_path = ' > '.join(parent_headings + [section_title]) if parent_headings else section_title
|
|
678
|
+
|
|
679
|
+
return Chunk(
|
|
680
|
+
id=generate_chunk_id(file_path, content),
|
|
681
|
+
content=content,
|
|
682
|
+
metadata={
|
|
683
|
+
'file_path': file_path,
|
|
684
|
+
'file_type': 'markdown',
|
|
685
|
+
'chunk_type': f'heading_{heading_level}' if heading_level > 0 else 'paragraph',
|
|
686
|
+
'name': section_title,
|
|
687
|
+
'context_path': context_path,
|
|
688
|
+
'heading_level': heading_level,
|
|
689
|
+
'parent_headings': parent_headings,
|
|
690
|
+
'start_line': start_line,
|
|
691
|
+
'end_line': end_line,
|
|
692
|
+
'char_count': len(content),
|
|
693
|
+
'line_count': content.count('\n') + 1
|
|
694
|
+
}
|
|
695
|
+
)
|
|
696
|
+
|
|
697
|
+
|
|
698
|
+
class TextChunker:
|
|
699
|
+
"""Chunk plain text files by paragraphs."""
|
|
700
|
+
|
|
701
|
+
EXTENSIONS = {'.txt', '.text', '.log'}
|
|
702
|
+
|
|
703
|
+
def __init__(self, max_chunk_size: int = 1500, overlap: int = 100):
|
|
704
|
+
self.max_chunk_size = max_chunk_size
|
|
705
|
+
self.overlap = overlap
|
|
706
|
+
|
|
707
|
+
def chunk_file(self, file_path: Path, base_path: Path) -> List[Chunk]:
|
|
708
|
+
"""Chunk a text file by paragraphs."""
|
|
709
|
+
try:
|
|
710
|
+
content = file_path.read_text(encoding='utf-8')
|
|
711
|
+
except (UnicodeDecodeError, PermissionError):
|
|
712
|
+
return []
|
|
713
|
+
|
|
714
|
+
relative_path = str(file_path.relative_to(base_path))
|
|
715
|
+
chunks = []
|
|
716
|
+
|
|
717
|
+
if len(content) <= self.max_chunk_size:
|
|
718
|
+
chunks.append(self._create_chunk(
|
|
719
|
+
content=content,
|
|
720
|
+
file_path=relative_path,
|
|
721
|
+
name=file_path.stem,
|
|
722
|
+
start_line=1,
|
|
723
|
+
end_line=content.count('\n') + 1
|
|
724
|
+
))
|
|
725
|
+
else:
|
|
726
|
+
# Split by paragraphs
|
|
727
|
+
paragraphs = re.split(r'\n\s*\n', content)
|
|
728
|
+
current_chunk = []
|
|
729
|
+
current_size = 0
|
|
730
|
+
chunk_index = 0
|
|
731
|
+
|
|
732
|
+
for para in paragraphs:
|
|
733
|
+
para_size = len(para) + 2
|
|
734
|
+
|
|
735
|
+
if current_size + para_size > self.max_chunk_size and current_chunk:
|
|
736
|
+
chunk_index += 1
|
|
737
|
+
chunks.append(self._create_chunk(
|
|
738
|
+
content='\n\n'.join(current_chunk),
|
|
739
|
+
file_path=relative_path,
|
|
740
|
+
name=f"{file_path.stem}_part{chunk_index}",
|
|
741
|
+
start_line=1,
|
|
742
|
+
end_line=content.count('\n') + 1
|
|
743
|
+
))
|
|
744
|
+
current_chunk = []
|
|
745
|
+
current_size = 0
|
|
746
|
+
|
|
747
|
+
current_chunk.append(para)
|
|
748
|
+
current_size += para_size
|
|
749
|
+
|
|
750
|
+
if current_chunk:
|
|
751
|
+
chunk_index += 1
|
|
752
|
+
chunks.append(self._create_chunk(
|
|
753
|
+
content='\n\n'.join(current_chunk),
|
|
754
|
+
file_path=relative_path,
|
|
755
|
+
name=f"{file_path.stem}_part{chunk_index}",
|
|
756
|
+
start_line=1,
|
|
757
|
+
end_line=content.count('\n') + 1
|
|
758
|
+
))
|
|
759
|
+
|
|
760
|
+
return chunks
|
|
761
|
+
|
|
762
|
+
def _create_chunk(
|
|
763
|
+
self,
|
|
764
|
+
content: str,
|
|
765
|
+
file_path: str,
|
|
766
|
+
name: str,
|
|
767
|
+
start_line: int,
|
|
768
|
+
end_line: int
|
|
769
|
+
) -> Chunk:
|
|
770
|
+
"""Create a Chunk object."""
|
|
771
|
+
return Chunk(
|
|
772
|
+
id=generate_chunk_id(file_path, content),
|
|
773
|
+
content=content,
|
|
774
|
+
metadata={
|
|
775
|
+
'file_path': file_path,
|
|
776
|
+
'file_type': 'text',
|
|
777
|
+
'chunk_type': 'paragraph',
|
|
778
|
+
'name': name,
|
|
779
|
+
'start_line': start_line,
|
|
780
|
+
'end_line': end_line,
|
|
781
|
+
'char_count': len(content),
|
|
782
|
+
'line_count': content.count('\n') + 1
|
|
783
|
+
}
|
|
784
|
+
)
|
|
785
|
+
|
|
786
|
+
|
|
787
|
+
def chunk_directory(
    src_path: Path,
    lang: str,
    exclude_patterns: List[str],
    max_chunk_size: int = 1500
) -> List[Chunk]:
    """Chunk all files in a directory - Universal support for Code, Docs, and Text.

    Dispatch is purely extension-based; ``lang`` is accepted for interface
    compatibility but does not affect which chunker handles a file.
    """
    # One chunker instance per family, shared across all matching files.
    ts_chunker = TypeScriptChunker(max_chunk_size=max_chunk_size)
    py_chunker = PythonChunker(max_chunk_size=max_chunk_size)
    md_chunker = MarkdownChunker(max_chunk_size=max_chunk_size)
    txt_chunker = TextChunker(max_chunk_size=max_chunk_size)

    # Extension -> chunker dispatch table.
    dispatch = {ext: ts_chunker for ext in TypeScriptChunker.EXTENSIONS}
    dispatch.update({ext: md_chunker for ext in MarkdownChunker.EXTENSIONS})
    dispatch.update({ext: txt_chunker for ext in TextChunker.EXTENSIONS})
    dispatch['.py'] = py_chunker

    collected: List[Chunk] = []

    for candidate in src_path.rglob('*'):
        if not candidate.is_file():
            continue

        # Unsupported extension: nothing to do.
        chunker = dispatch.get(candidate.suffix)
        if chunker is None:
            continue

        # Skip excluded paths (substring match, same as before).
        candidate_str = str(candidate)
        if any(pattern in candidate_str for pattern in exclude_patterns):
            continue

        collected.extend(chunker.chunk_file(candidate, src_path))

    return collected
|
|
836
|
+
|
|
837
|
+
|
|
838
|
+
def main():
    """CLI entry point: chunk a source tree and write a chunks.json index.

    Returns a process exit code (0 on success, 1 when --src is missing).
    """
    parser = argparse.ArgumentParser(
        description='Universal Chunker - Code, Markdown, and Text files'
    )
    parser.add_argument('--src', default='./src', help='Source directory')
    parser.add_argument('--output', default='.agent/rag/chunks.json', help='Output file')
    parser.add_argument('--lang', choices=['typescript', 'python', 'auto', 'universal'],
                        default='universal', help='Language mode (universal = all file types)')
    parser.add_argument('--max-size', type=int, default=1500, help='Max chunk size in chars')
    parser.add_argument('--exclude', default='node_modules,__pycache__,.git,dist,build,.agent',
                        help='Patterns to exclude')

    args = parser.parse_args()

    src_path = Path(args.src).resolve()
    output_path = Path(args.output)
    # Comma-separated substrings matched against full file paths.
    exclude_patterns = args.exclude.split(',')

    if not src_path.exists():
        print(f"Error: Source directory '{src_path}' does not exist")
        return 1

    # Mode selection
    lang = args.lang
    if lang == 'auto':
        # Majority vote between TS and Python files; ties go to TypeScript.
        ts_files = list(src_path.rglob('*.ts')) + list(src_path.rglob('*.tsx'))
        py_files = list(src_path.rglob('*.py'))
        lang = 'typescript' if len(ts_files) >= len(py_files) else 'python'
        print(f"Auto-detected language: {lang}")
    elif lang == 'universal':
        print("Universal mode: Processing Code, Markdown, and Text files")

    print(f"Chunking {src_path}...")
    # NOTE(review): chunk_directory dispatches by file extension only, so
    # the resolved `lang` is recorded in the output metadata but does not
    # restrict which files are chunked — confirm this is intended.
    chunks = chunk_directory(src_path, lang, exclude_patterns, args.max_size)
    print(f"Created {len(chunks)} chunks")

    if len(chunks) == 0:
        print("Warning: No chunks created. Check if source directory has supported files.")
        print("Supported: .ts, .tsx, .js, .jsx, .py, .md, .mdx, .txt")

    # Save output
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Categorize chunks by file type
    file_types = {}
    for c in chunks:
        ft = c.metadata.get('file_type', 'unknown')
        file_types[ft] = file_types.get(ft, 0) + 1

    output_data = {
        'metadata': {
            'generated_at': datetime.now().isoformat(),
            'source_path': str(src_path),
            'mode': lang,
            'total_chunks': len(chunks),
            'max_chunk_size': args.max_size,
            'file_types': file_types
        },
        'chunks': [asdict(c) for c in chunks]
    }

    output_path.write_text(json.dumps(output_data, indent=2), encoding='utf-8')
    print(f"Saved to: {output_path}")

    # Print summary
    chunk_types = {}
    for c in chunks:
        t = c.metadata['chunk_type']
        chunk_types[t] = chunk_types.get(t, 0) + 1

    print("\nChunk summary:")
    for t, count in sorted(chunk_types.items()):
        print(f"  {t}: {count}")

    return 0
|
|
913
|
+
|
|
914
|
+
|
|
915
|
+
if __name__ == '__main__':
    # Raise SystemExit directly: the exit() builtin is injected by the
    # `site` module and is not guaranteed to exist (e.g. under python -S),
    # while SystemExit needs no import and propagates main()'s exit code.
    raise SystemExit(main())
|