ai-coding-assistant 0.5.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_coding_assistant-0.5.0.dist-info/METADATA +226 -0
- ai_coding_assistant-0.5.0.dist-info/RECORD +89 -0
- ai_coding_assistant-0.5.0.dist-info/WHEEL +4 -0
- ai_coding_assistant-0.5.0.dist-info/entry_points.txt +3 -0
- ai_coding_assistant-0.5.0.dist-info/licenses/LICENSE +21 -0
- coding_assistant/__init__.py +3 -0
- coding_assistant/__main__.py +19 -0
- coding_assistant/cli/__init__.py +1 -0
- coding_assistant/cli/app.py +158 -0
- coding_assistant/cli/commands/__init__.py +19 -0
- coding_assistant/cli/commands/ask.py +178 -0
- coding_assistant/cli/commands/config.py +438 -0
- coding_assistant/cli/commands/diagram.py +267 -0
- coding_assistant/cli/commands/document.py +410 -0
- coding_assistant/cli/commands/explain.py +192 -0
- coding_assistant/cli/commands/fix.py +249 -0
- coding_assistant/cli/commands/index.py +162 -0
- coding_assistant/cli/commands/refactor.py +245 -0
- coding_assistant/cli/commands/search.py +182 -0
- coding_assistant/cli/commands/serve_docs.py +128 -0
- coding_assistant/cli/repl.py +381 -0
- coding_assistant/cli/theme.py +90 -0
- coding_assistant/codebase/__init__.py +1 -0
- coding_assistant/codebase/crawler.py +93 -0
- coding_assistant/codebase/parser.py +266 -0
- coding_assistant/config/__init__.py +25 -0
- coding_assistant/config/config_manager.py +615 -0
- coding_assistant/config/settings.py +82 -0
- coding_assistant/context/__init__.py +19 -0
- coding_assistant/context/chunker.py +443 -0
- coding_assistant/context/enhanced_retriever.py +322 -0
- coding_assistant/context/hybrid_search.py +311 -0
- coding_assistant/context/ranker.py +355 -0
- coding_assistant/context/retriever.py +119 -0
- coding_assistant/context/window.py +362 -0
- coding_assistant/documentation/__init__.py +23 -0
- coding_assistant/documentation/agents/__init__.py +27 -0
- coding_assistant/documentation/agents/coordinator.py +510 -0
- coding_assistant/documentation/agents/module_documenter.py +111 -0
- coding_assistant/documentation/agents/synthesizer.py +139 -0
- coding_assistant/documentation/agents/task_delegator.py +100 -0
- coding_assistant/documentation/decomposition/__init__.py +21 -0
- coding_assistant/documentation/decomposition/context_preserver.py +477 -0
- coding_assistant/documentation/decomposition/module_detector.py +302 -0
- coding_assistant/documentation/decomposition/partitioner.py +621 -0
- coding_assistant/documentation/generators/__init__.py +14 -0
- coding_assistant/documentation/generators/dataflow_generator.py +440 -0
- coding_assistant/documentation/generators/diagram_generator.py +511 -0
- coding_assistant/documentation/graph/__init__.py +13 -0
- coding_assistant/documentation/graph/dependency_builder.py +468 -0
- coding_assistant/documentation/graph/module_analyzer.py +475 -0
- coding_assistant/documentation/writers/__init__.py +11 -0
- coding_assistant/documentation/writers/markdown_writer.py +322 -0
- coding_assistant/embeddings/__init__.py +0 -0
- coding_assistant/embeddings/generator.py +89 -0
- coding_assistant/embeddings/store.py +187 -0
- coding_assistant/exceptions/__init__.py +50 -0
- coding_assistant/exceptions/base.py +110 -0
- coding_assistant/exceptions/llm.py +249 -0
- coding_assistant/exceptions/recovery.py +263 -0
- coding_assistant/exceptions/storage.py +213 -0
- coding_assistant/exceptions/validation.py +230 -0
- coding_assistant/llm/__init__.py +1 -0
- coding_assistant/llm/client.py +277 -0
- coding_assistant/llm/gemini_client.py +181 -0
- coding_assistant/llm/groq_client.py +160 -0
- coding_assistant/llm/prompts.py +98 -0
- coding_assistant/llm/together_client.py +160 -0
- coding_assistant/operations/__init__.py +13 -0
- coding_assistant/operations/differ.py +369 -0
- coding_assistant/operations/generator.py +347 -0
- coding_assistant/operations/linter.py +430 -0
- coding_assistant/operations/validator.py +406 -0
- coding_assistant/storage/__init__.py +9 -0
- coding_assistant/storage/database.py +363 -0
- coding_assistant/storage/session.py +231 -0
- coding_assistant/utils/__init__.py +31 -0
- coding_assistant/utils/cache.py +477 -0
- coding_assistant/utils/hardware.py +132 -0
- coding_assistant/utils/keystore.py +206 -0
- coding_assistant/utils/logger.py +32 -0
- coding_assistant/utils/progress.py +311 -0
- coding_assistant/validation/__init__.py +13 -0
- coding_assistant/validation/files.py +305 -0
- coding_assistant/validation/inputs.py +335 -0
- coding_assistant/validation/params.py +280 -0
- coding_assistant/validation/sanitizers.py +243 -0
- coding_assistant/vcs/__init__.py +5 -0
- coding_assistant/vcs/git.py +269 -0
coding_assistant/context/__init__.py
@@ -0,0 +1,19 @@
"""Context management module."""

from coding_assistant.context.retriever import SemanticRetriever
from coding_assistant.context.chunker import SmartChunker, CodeChunk
from coding_assistant.context.hybrid_search import HybridSearch
from coding_assistant.context.ranker import ContextRanker
from coding_assistant.context.enhanced_retriever import EnhancedSemanticRetriever
from coding_assistant.context.window import TokenWindowManager, TokenBudget

__all__ = [
    'SemanticRetriever',
    'SmartChunker',
    'CodeChunk',
    'HybridSearch',
    'ContextRanker',
    'EnhancedSemanticRetriever',
    'TokenWindowManager',
    'TokenBudget',
]
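The names re-exported above can be imported from the package root rather than from the individual submodules. A minimal sketch, assuming the wheel is installed as released (only the names listed in __all__ are taken from the file above; nothing about their runtime behavior is implied):

# Illustrative only: these imports mirror the __all__ list in coding_assistant/context/__init__.py.
from coding_assistant.context import (
    SemanticRetriever, SmartChunker, CodeChunk,
    HybridSearch, ContextRanker, EnhancedSemanticRetriever,
    TokenWindowManager, TokenBudget,
)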
coding_assistant/context/chunker.py
@@ -0,0 +1,443 @@
"""Smart code chunking for Python and JavaScript/TypeScript."""

import ast
from typing import List, Dict, Optional
from dataclasses import dataclass


@dataclass
class CodeChunk:
    """Represents a chunk of code."""
    type: str  # 'function', 'class', 'method', 'file'
    name: str
    content: str
    file_path: str
    language: str
    start_line: int
    end_line: int
    docstring: Optional[str] = None
    imports: Optional[str] = None
    metadata: Optional[Dict] = None

    def __repr__(self):
        return f"CodeChunk({self.type}:{self.name} in {self.file_path}:{self.start_line}-{self.end_line})"


class SmartChunker:
    """
    Smart code chunking that preserves semantic units.

    Chunks code at function/class level while preserving imports and context.
    """

    def __init__(self, max_chunk_tokens: int = 500):
        """
        Initialize the smart chunker.

        Args:
            max_chunk_tokens: Maximum tokens per chunk (approximate)
        """
        self.max_chunk_tokens = max_chunk_tokens

    def chunk_code(self, code: str, file_path: str, language: str) -> List[CodeChunk]:
        """
        Chunk code based on language.

        Args:
            code: The source code to chunk
            file_path: Path to the file
            language: Programming language

        Returns:
            List of CodeChunk objects
        """
        language = language.lower()

        if language == 'python':
            return self._chunk_python(code, file_path)
        elif language in ('javascript', 'typescript', 'jsx', 'tsx'):
            return self._chunk_javascript(code, file_path, language)
        else:
            # Fallback: create a single file-level chunk
            return [CodeChunk(
                type='file',
                name=file_path.split('/')[-1],
                content=code[:self.max_chunk_tokens * 4],  # Rough char estimate
                file_path=file_path,
                language=language,
                start_line=1,
                end_line=len(code.split('\n'))
            )]

    def _chunk_python(self, code: str, file_path: str) -> List[CodeChunk]:
        """
        Chunk Python code using AST parsing.

        Extracts functions, methods, and classes as separate chunks,
        preserving imports with each chunk.
        """
        chunks = []

        try:
            tree = ast.parse(code)
        except SyntaxError:
            # If can't parse, return file-level chunk
            return [CodeChunk(
                type='file',
                name=file_path.split('/')[-1],
                content=code,
                file_path=file_path,
                language='python',
                start_line=1,
                end_line=len(code.split('\n'))
            )]

        # Extract imports
        imports = self._extract_python_imports(tree, code)

        # Extract top-level functions and classes
        for node in ast.iter_child_nodes(tree):
            if isinstance(node, ast.FunctionDef):
                chunk = self._python_function_to_chunk(node, code, file_path, imports)
                if chunk:
                    chunks.append(chunk)

            elif isinstance(node, ast.ClassDef):
                # For classes, create one chunk for the whole class
                class_chunk = self._python_class_to_chunk(node, code, file_path, imports)
                if class_chunk:
                    chunks.append(class_chunk)

                # Also create individual chunks for each method
                for class_node in node.body:
                    if isinstance(class_node, ast.FunctionDef):
                        method_chunk = self._python_function_to_chunk(
                            class_node, code, file_path, imports,
                            class_name=node.name
                        )
                        if method_chunk:
                            chunks.append(method_chunk)

        # If no chunks were created, return file-level chunk
        if not chunks:
            chunks.append(CodeChunk(
                type='file',
                name=file_path.split('/')[-1],
                content=code,
                file_path=file_path,
                language='python',
                start_line=1,
                end_line=len(code.split('\n')),
                imports=imports
            ))

        return chunks

    def _extract_python_imports(self, tree: ast.AST, code: str) -> str:
        """Extract import statements from Python code."""
        imports_lines = []
        lines = code.split('\n')

        for node in ast.walk(tree):
            if isinstance(node, (ast.Import, ast.ImportFrom)):
                if hasattr(node, 'lineno'):
                    line_idx = node.lineno - 1
                    if line_idx < len(lines):
                        imports_lines.append(lines[line_idx])

        return '\n'.join(imports_lines)

    def _python_function_to_chunk(self, node: ast.FunctionDef, code: str,
                                  file_path: str, imports: str,
                                  class_name: Optional[str] = None) -> Optional[CodeChunk]:
        """Convert Python function AST node to CodeChunk."""
        lines = code.split('\n')

        # Get function source
        start_line = node.lineno
        end_line = node.end_lineno if hasattr(node, 'end_lineno') else start_line

        if start_line > len(lines) or end_line > len(lines):
            return None

        function_code = '\n'.join(lines[start_line - 1:end_line])

        # Get docstring
        docstring = ast.get_docstring(node)

        # Combine imports with function code
        full_content = imports + '\n\n' + function_code if imports else function_code

        # Determine chunk type and name
        chunk_type = 'method' if class_name else 'function'
        name = f"{class_name}.{node.name}" if class_name else node.name

        return CodeChunk(
            type=chunk_type,
            name=name,
            content=full_content,
            file_path=file_path,
            language='python',
            start_line=start_line,
            end_line=end_line,
            docstring=docstring,
            imports=imports,
            metadata={
                'class': class_name,
                'decorators': [d.id if isinstance(d, ast.Name) else str(d) for d in node.decorator_list],
                'args': [arg.arg for arg in node.args.args]
            }
        )

    def _python_class_to_chunk(self, node: ast.ClassDef, code: str,
                               file_path: str, imports: str) -> Optional[CodeChunk]:
        """Convert Python class AST node to CodeChunk."""
        lines = code.split('\n')

        start_line = node.lineno
        end_line = node.end_lineno if hasattr(node, 'end_lineno') else start_line

        if start_line > len(lines) or end_line > len(lines):
            return None

        class_code = '\n'.join(lines[start_line - 1:end_line])

        # Get docstring
        docstring = ast.get_docstring(node)

        # Get method names
        methods = [n.name for n in node.body if isinstance(n, ast.FunctionDef)]

        # Combine imports with class code
        full_content = imports + '\n\n' + class_code if imports else class_code

        return CodeChunk(
            type='class',
            name=node.name,
            content=full_content,
            file_path=file_path,
            language='python',
            start_line=start_line,
            end_line=end_line,
            docstring=docstring,
            imports=imports,
            metadata={
                'methods': methods,
                'bases': [base.id if isinstance(base, ast.Name) else str(base) for base in node.bases]
            }
        )

    def _chunk_javascript(self, code: str, file_path: str, language: str) -> List[CodeChunk]:
        """
        Chunk JavaScript/TypeScript code.

        Uses regex patterns to identify functions and classes.
        For better accuracy, tree-sitter could be used.
        """
        chunks = []
        lines = code.split('\n')

        # Extract imports (ES6 and CommonJS)
        imports = self._extract_js_imports(code)

        # Try to find functions and classes using regex
        # This is a simplified approach; tree-sitter would be more accurate

        import re

        # Pattern for function declarations
        func_pattern = r'^(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\('
        # Pattern for arrow functions assigned to const/let/var
        arrow_pattern = r'^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s*)?\([^)]*\)\s*=>'
        # Pattern for class declarations
        class_pattern = r'^(?:export\s+)?class\s+(\w+)'

        current_line = 0
        while current_line < len(lines):
            line = lines[current_line].strip()

            # Check for function
            func_match = re.match(func_pattern, line)
            arrow_match = re.match(arrow_pattern, line)
            class_match = re.match(class_pattern, line)

            if func_match or arrow_match:
                # Found a function
                func_name = func_match.group(1) if func_match else arrow_match.group(1)
                start_line = current_line + 1

                # Find end of function (look for closing brace)
                end_line = self._find_js_block_end(lines, current_line)

                if end_line > start_line:
                    function_code = '\n'.join(lines[current_line:end_line])
                    full_content = imports + '\n\n' + function_code if imports else function_code

                    chunks.append(CodeChunk(
                        type='function',
                        name=func_name,
                        content=full_content,
                        file_path=file_path,
                        language=language,
                        start_line=start_line,
                        end_line=end_line,
                        imports=imports
                    ))

                    current_line = end_line
                    continue

            elif class_match:
                # Found a class
                class_name = class_match.group(1)
                start_line = current_line + 1

                # Find end of class
                end_line = self._find_js_block_end(lines, current_line)

                if end_line > start_line:
                    class_code = '\n'.join(lines[current_line:end_line])
                    full_content = imports + '\n\n' + class_code if imports else class_code

                    chunks.append(CodeChunk(
                        type='class',
                        name=class_name,
                        content=full_content,
                        file_path=file_path,
                        language=language,
                        start_line=start_line,
                        end_line=end_line,
                        imports=imports
                    ))

                    current_line = end_line
                    continue

            current_line += 1

        # If no chunks, return file-level chunk
        if not chunks:
            chunks.append(CodeChunk(
                type='file',
                name=file_path.split('/')[-1],
                content=code,
                file_path=file_path,
                language=language,
                start_line=1,
                end_line=len(lines),
                imports=imports
            ))

        return chunks

    def _extract_js_imports(self, code: str) -> str:
        """Extract import statements from JavaScript/TypeScript code."""
        import re

        lines = code.split('\n')
        import_lines = []

        # Patterns for imports
        import_pattern = r'^import\s+'
        require_pattern = r'^(?:const|let|var)\s+.*=\s*require\('

        for line in lines:
            stripped = line.strip()
            if re.match(import_pattern, stripped) or re.match(require_pattern, stripped):
                import_lines.append(line)

        return '\n'.join(import_lines)

    def _find_js_block_end(self, lines: List[str], start_idx: int) -> int:
        """
        Find the end of a JavaScript code block (function/class).

        Uses brace matching to find the closing brace.
        """
        brace_count = 0
        in_block = False

        for i in range(start_idx, len(lines)):
            line = lines[i]

            # Count braces
            for char in line:
                if char == '{':
                    brace_count += 1
                    in_block = True
                elif char == '}':
                    brace_count -= 1

            if in_block and brace_count == 0:
                return i + 1

        # If no matching brace found, return end of file
        return len(lines)

    def chunk_with_token_limit(self, chunks: List[CodeChunk],
                               max_tokens: int = 500) -> List[CodeChunk]:
        """
        Further split chunks that exceed token limit.

        Args:
            chunks: List of CodeChunk objects
            max_tokens: Maximum tokens per chunk

        Returns:
            List of chunks, with large chunks split
        """
        result = []

        for chunk in chunks:
            # Rough estimate: 1 token ≈ 4 characters
            estimated_tokens = len(chunk.content) / 4

            if estimated_tokens <= max_tokens:
                result.append(chunk)
            else:
                # Split into smaller chunks using sliding window
                sub_chunks = self._sliding_window_split(chunk, max_tokens)
                result.extend(sub_chunks)

        return result

    def _sliding_window_split(self, chunk: CodeChunk, max_tokens: int) -> List[CodeChunk]:
        """Split a large chunk using sliding window approach."""
        max_chars = max_tokens * 4  # Rough estimate
        overlap = max_chars // 4  # 25% overlap

        content_lines = chunk.content.split('\n')
        sub_chunks = []

        start = 0
        chunk_num = 0

        while start < len(content_lines):
            # Take window of lines
            end = start
            current_chars = 0

            while end < len(content_lines) and current_chars < max_chars:
                current_chars += len(content_lines[end])
                end += 1

            # Create sub-chunk
            sub_content = '\n'.join(content_lines[start:end])
            sub_chunks.append(CodeChunk(
                type=f"{chunk.type}_part",
                name=f"{chunk.name}_part{chunk_num}",
                content=sub_content,
                file_path=chunk.file_path,
                language=chunk.language,
                start_line=chunk.start_line + start,
                end_line=chunk.start_line + end,
                imports=chunk.imports,
                metadata={**chunk.metadata, 'is_partial': True} if chunk.metadata else {'is_partial': True}
            ))

            chunk_num += 1

            # Move window with overlap
            start = max(start + 1, end - (overlap // 50))  # Overlap in lines

        return sub_chunks