codetree-rag 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codetree/__init__.py +13 -0
- codetree/cli.py +220 -0
- codetree/config.py +110 -0
- codetree/core.py +179 -0
- codetree/indexer.py +322 -0
- codetree/llm.py +116 -0
- codetree/parser.py +352 -0
- codetree/retriever.py +192 -0
- codetree_rag-0.1.0.dist-info/METADATA +496 -0
- codetree_rag-0.1.0.dist-info/RECORD +14 -0
- codetree_rag-0.1.0.dist-info/WHEEL +5 -0
- codetree_rag-0.1.0.dist-info/entry_points.txt +2 -0
- codetree_rag-0.1.0.dist-info/licenses/LICENSE +21 -0
- codetree_rag-0.1.0.dist-info/top_level.txt +1 -0
codetree/indexer.py
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
"""Code indexer - builds tree structure from repository."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
from dataclasses import dataclass, field, asdict
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional, Any
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
|
|
10
|
+
from .config import Config, IndexConfig
|
|
11
|
+
from .parser import CodeParser, FileInfo, LANGUAGE_EXTENSIONS
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class TreeNode:
    """One entry of the code tree: either a directory or a source file.

    File nodes use the parse-result fields (``imports``, ``functions``,
    ``classes``, ``variables``, ``line_count``); directory nodes use
    ``children`` and ``file_count``. Unused fields keep their empty defaults.
    """

    name: str
    type: str  # "directory", "file"
    path: str
    summary: Optional[str] = None
    language: Optional[str] = None

    # Populated for file nodes
    imports: list[str] = field(default_factory=list)
    functions: list[dict] = field(default_factory=list)
    classes: list[dict] = field(default_factory=list)
    variables: list[str] = field(default_factory=list)
    line_count: int = 0

    # Populated for directory nodes
    children: list["TreeNode"] = field(default_factory=list)
    file_count: int = 0

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict, leaving out empty optional fields.

        ``name``, ``type`` and ``path`` are always present. Every other key
        is emitted only when its value is truthy, and only the keys that
        belong to the node's kind (file vs. directory) are considered.
        """
        payload = {"name": self.name, "type": self.type, "path": self.path}

        if self.summary:
            payload["summary"] = self.summary

        if self.type != "file":
            # Any node that is not a file is serialized as a directory.
            if self.children:
                payload["children"] = [child.to_dict() for child in self.children]
            if self.file_count:
                payload["file_count"] = self.file_count
            return payload

        file_keys = (
            "language",
            "imports",
            "functions",
            "classes",
            "variables",
            "line_count",
        )
        for key in file_keys:
            value = getattr(self, key)
            if value:
                payload[key] = value

        return payload
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass
class CodeIndex:
    """The complete code index for a repository.

    Holds the root of the ``TreeNode`` tree together with aggregate
    statistics, and converts to and from plain dicts / JSON.
    """

    root: TreeNode
    repo_path: str
    created_at: str
    version: str = "0.1.0"
    total_files: int = 0
    total_lines: int = 0
    languages: dict[str, int] = field(default_factory=dict)  # language -> file count

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        return {
            "version": self.version,
            "repo_path": self.repo_path,
            "created_at": self.created_at,
            "total_files": self.total_files,
            "total_lines": self.total_lines,
            "languages": self.languages,
            "root": self.root.to_dict(),
        }

    def to_json(self, indent: int = 2) -> str:
        """Convert to a JSON string (non-ASCII characters are kept as-is)."""
        return json.dumps(self.to_dict(), indent=indent, ensure_ascii=False)

    @classmethod
    def from_dict(cls, data: dict) -> "CodeIndex":
        """Create a CodeIndex from a dictionary shaped like ``to_dict()`` output.

        Raises:
            KeyError: if ``root``, ``repo_path`` or ``created_at`` is missing,
                or a node lacks ``name``, ``type`` or ``path``.
        """

        def parse_node(node_data: dict) -> TreeNode:
            # Keys omitted by TreeNode.to_dict() fall back to empty defaults.
            return TreeNode(
                name=node_data["name"],
                type=node_data["type"],
                path=node_data["path"],
                summary=node_data.get("summary"),
                language=node_data.get("language"),
                imports=node_data.get("imports", []),
                functions=node_data.get("functions", []),
                classes=node_data.get("classes", []),
                variables=node_data.get("variables", []),
                line_count=node_data.get("line_count", 0),
                children=[parse_node(c) for c in node_data.get("children", [])],
                file_count=node_data.get("file_count", 0),
            )

        return cls(
            root=parse_node(data["root"]),
            repo_path=data["repo_path"],
            created_at=data["created_at"],
            version=data.get("version", "0.1.0"),
            total_files=data.get("total_files", 0),
            total_lines=data.get("total_lines", 0),
            languages=data.get("languages", {}),
        )

    @classmethod
    def from_json(cls, json_str: str) -> "CodeIndex":
        """Create CodeIndex from JSON string."""
        return cls.from_dict(json.loads(json_str))

    def get_compact_tree(self, max_depth: int = 3) -> str:
        """Get a compact text representation of the tree for LLM context.

        Args:
            max_depth: Deepest level to render; the root is depth 0. Nodes
                below this depth are left out of the output.

        Returns:
            One line per directory/file, indented by depth, with follow-up
            lines for directory summaries and for the first five function
            and class names of each file.
        """
        lines: list[str] = []

        def summarize(items: list[dict], limit: int = 5) -> str:
            # Show at most `limit` names and say how many were left out, so
            # the reader can tell a truncated list from a complete one.
            names = ", ".join(item["name"] for item in items[:limit])
            hidden = len(items) - limit
            return f"{names} (+{hidden} more)" if hidden > 0 else names

        def format_node(node: TreeNode, depth: int = 0) -> None:
            if depth > max_depth:
                return

            indent = " " * depth

            if node.type == "directory":
                lines.append(f"{indent}{node.name}/")
                if node.summary:
                    lines.append(f"{indent} # {node.summary}")
                for child in node.children:
                    format_node(child, depth + 1)
                return

            # File
            lang_tag = f"[{node.language}]" if node.language else ""
            lines.append(f"{indent}{node.name} {lang_tag}")

            # Show functions and classes
            if node.functions:
                lines.append(f"{indent} → functions: {summarize(node.functions)}")
            if node.classes:
                lines.append(f"{indent} → classes: {summarize(node.classes)}")

        format_node(self.root)
        return "\n".join(lines)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
class CodeIndexer:
    """Build code index from a repository."""

    def __init__(self, config: Optional[Config] = None):
        """Create an indexer; loads the default config when none is given."""
        self.config = config or Config.load()
        self.parser = CodeParser()
        self._stats = self._empty_stats()

    @staticmethod
    def _empty_stats() -> dict:
        """Return a fresh, zeroed statistics accumulator."""
        return {
            "total_files": 0,
            "total_lines": 0,
            "languages": {},
        }

    def build_index(self, repo_path: Path) -> CodeIndex:
        """Build a code index from the repository.

        Args:
            repo_path: Root directory of the repository to index.

        Returns:
            A CodeIndex holding the directory tree and aggregate statistics.

        Raises:
            ValueError: if ``repo_path`` does not exist or is not a directory.
        """
        repo_path = Path(repo_path).resolve()

        if not repo_path.exists():
            raise ValueError(f"Repository path does not exist: {repo_path}")
        if not repo_path.is_dir():
            # Fail with the same exception type as a missing path instead of
            # letting iterdir() raise NotADirectoryError further down.
            raise ValueError(f"Repository path is not a directory: {repo_path}")

        # Reset per-run statistics so one indexer can be reused across runs.
        self._stats = self._empty_stats()

        root = self._index_directory(repo_path, repo_path)

        return CodeIndex(
            root=root,
            repo_path=str(repo_path),
            created_at=datetime.now().isoformat(),
            total_files=self._stats["total_files"],
            total_lines=self._stats["total_lines"],
            languages=self._stats["languages"],
        )

    def _index_directory(self, dir_path: Path, repo_root: Path) -> TreeNode:
        """Recursively index a directory.

        Sub-directories that end up with no indexed children are dropped.
        Symlinked directories are not descended into, so a link that points
        back up the tree cannot cause unbounded recursion.
        """
        relative_path = dir_path.relative_to(repo_root)

        children = []
        file_count = 0

        try:
            # Directories first (is_file() is False), then case-insensitive name.
            entries = sorted(dir_path.iterdir(), key=lambda p: (p.is_file(), p.name.lower()))
        except OSError:
            # Unreadable or vanished directory: index it as empty rather than
            # aborting the whole run.
            entries = []

        for entry in entries:
            # Skip excluded patterns
            if self._should_exclude(entry, repo_root):
                continue

            if entry.is_dir():
                if entry.is_symlink():
                    continue
                child_node = self._index_directory(entry, repo_root)
                if child_node.children:
                    children.append(child_node)
                    file_count += child_node.file_count
            elif entry.is_file():
                child_node = self._index_file(entry, repo_root)
                if child_node:
                    children.append(child_node)
                    file_count += 1

        return TreeNode(
            name=dir_path.name or str(repo_root.name),
            type="directory",
            # The repository root itself is stored with an empty path.
            path=str(relative_path) if str(relative_path) != "." else "",
            children=children,
            file_count=file_count,
        )

    def _index_file(self, file_path: Path, repo_root: Path) -> Optional[TreeNode]:
        """Index a single file.

        Returns:
            A file TreeNode, or None when the file is too large, cannot be
            stat'ed, is not in a detected and configured language, or the
            parser returns nothing for it.
        """
        relative_path = file_path.relative_to(repo_root)

        # Check file size
        try:
            if file_path.stat().st_size > self.config.index.max_file_size:
                return None
        except OSError:
            return None

        # Check if it's a supported language
        language = self.parser.detect_language(file_path)
        if not language:
            return None

        if language not in self.config.index.languages:
            return None

        # Parse the file
        file_info = self.parser.parse_file(file_path)
        if not file_info:
            return None

        # Update stats
        self._stats["total_files"] += 1
        self._stats["total_lines"] += file_info.line_count
        self._stats["languages"][language] = self._stats["languages"].get(language, 0) + 1

        return TreeNode(
            name=file_path.name,
            type="file",
            path=str(relative_path),
            language=language,
            imports=file_info.imports[:20],  # Limit imports
            functions=[
                {
                    "name": f.name,
                    "signature": f.signature,
                    "docstring": f.docstring,
                    "line": f.start_line,
                }
                for f in file_info.functions[:50]  # Limit functions
            ],
            classes=[
                {
                    "name": c.name,
                    "signature": c.signature,
                    "docstring": c.docstring,
                    "line": c.start_line,
                }
                for c in file_info.classes[:20]  # Limit classes
            ],
            variables=file_info.variables[:10],
            line_count=file_info.line_count,
        )

    def _should_exclude(self, path: Path, repo_root: Path) -> bool:
        """Check if a path should be excluded.

        A path is excluded when its name equals or starts with a configured
        exclude pattern, when it glob-matches one, or when it is hidden
        (leading dot) and is not ``.github``. ``repo_root`` is accepted for
        signature compatibility and is not used.
        """
        name = path.name

        # Check exclude patterns
        for pattern in self.config.index.exclude:
            if not pattern:
                # An empty pattern would match every name via startswith("")
                # and makes Path.match() raise ValueError; ignore it.
                continue
            if name == pattern or name.startswith(pattern):
                return True
            if path.match(pattern):
                return True

        # Skip hidden files/directories
        if name.startswith(".") and name not in (".github",):
            return True

        return False

    def save_index(self, index: CodeIndex, output_path: Path) -> None:
        """Save index to a JSON file, creating parent directories as needed."""
        # Accept plain strings too, as build_index() does for repo_path.
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(index.to_json())

    def load_index(self, index_path: Path) -> CodeIndex:
        """Load index from a JSON file."""
        with open(index_path, "r", encoding="utf-8") as f:
            return CodeIndex.from_json(f.read())
|
codetree/llm.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""LLM client abstraction for multiple providers."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import Optional
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
from .config import LLMConfig
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class LLMClient(ABC):
    """Common interface every provider-specific LLM client implements."""

    @abstractmethod
    def chat(self, messages: list[dict], **kwargs) -> str:
        """Send a list of chat messages and return the model's reply text.

        Args:
            messages: Chat messages as dicts; the concrete clients in this
                module read the ``"role"`` and ``"content"`` keys.
            **kwargs: Optional per-call overrides interpreted by the
                concrete client.
        """
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class OpenAIClient(LLMClient):
    """OpenAI API client."""

    def __init__(self, config: LLMConfig):
        """Create the underlying OpenAI client from ``config``.

        Raises:
            ImportError: if the ``openai`` package is not installed.
        """
        self.config = config
        # Only the import is guarded, so errors raised while constructing
        # the client are not mistaken for a missing package.
        try:
            from openai import OpenAI
        except ImportError as err:
            raise ImportError("openai package not installed. Run: pip install openai") from err

        self.client = OpenAI(
            api_key=config.api_key,
            base_url=config.base_url,
        )

    def chat(self, messages: list[dict], **kwargs) -> str:
        """Send chat messages and return the first choice's text.

        ``temperature`` and ``max_tokens`` may be overridden per call via
        keyword arguments; otherwise the configured values are used.
        Returns an empty string when the response has no choices or the
        message content is empty.
        """
        response = self.client.chat.completions.create(
            model=self.config.model,
            messages=messages,
            temperature=kwargs.get("temperature", self.config.temperature),
            max_tokens=kwargs.get("max_tokens", self.config.max_tokens),
        )
        if not response.choices:
            return ""
        return response.choices[0].message.content or ""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class AnthropicClient(LLMClient):
    """Anthropic API client."""

    def __init__(self, config: LLMConfig):
        """Create the underlying Anthropic client from ``config``.

        Raises:
            ImportError: if the ``anthropic`` package is not installed.
        """
        self.config = config
        # Only the import is guarded, so errors raised while constructing
        # the client are not mistaken for a missing package.
        try:
            import anthropic
        except ImportError as err:
            raise ImportError("anthropic package not installed. Run: pip install anthropic") from err

        self.client = anthropic.Anthropic(api_key=config.api_key)

    def chat(self, messages: list[dict], **kwargs) -> str:
        """Send chat messages and return the concatenated text of the reply.

        Messages with role ``"system"`` are removed from the conversation
        and sent through the separate ``system`` argument. ``temperature``
        and ``max_tokens`` may be overridden per call via keyword arguments.
        Returns an empty string when the reply contains no text blocks.
        """
        # Convert messages format: system prompts travel separately.
        system_parts = [msg["content"] for msg in messages if msg["role"] == "system"]
        chat_messages = [msg for msg in messages if msg["role"] != "system"]

        request = {
            "model": self.config.model,
            "messages": chat_messages,
            "temperature": kwargs.get("temperature", self.config.temperature),
            "max_tokens": kwargs.get("max_tokens", self.config.max_tokens),
        }

        if len(system_parts) == 1:
            system = system_parts[0]
        else:
            # Keep every system message instead of only the last one.
            system = "\n\n".join(str(part) for part in system_parts)
        if system:
            request["system"] = system

        response = self.client.messages.create(**request)

        # The reply is a list of content blocks; collect all text blocks
        # rather than assuming the first block exists and is text.
        return "".join(
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        )
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class OllamaClient(LLMClient):
    """Ollama local model client."""

    def __init__(self, config: LLMConfig):
        """Remember the config and the server URL (default: local Ollama)."""
        self.config = config
        # Strip a trailing slash so the REST fallback never builds "//api/chat".
        self.base_url = (config.base_url or "http://localhost:11434").rstrip("/")

    def _build_options(self, overrides: dict) -> dict:
        """Map temperature / max_tokens onto Ollama's ``options`` names."""
        candidates = {
            "temperature": overrides.get("temperature", self.config.temperature),
            # Ollama calls the generation-length cap "num_predict".
            "num_predict": overrides.get("max_tokens", self.config.max_tokens),
        }
        return {key: value for key, value in candidates.items() if value is not None}

    def chat(self, messages: list[dict], **kwargs) -> str:
        """Send chat messages to the Ollama server and return the reply text.

        Uses the ``ollama`` Python package when it is installed and falls
        back to a plain HTTP request otherwise. Both paths talk to
        ``self.base_url`` and honor ``temperature`` / ``max_tokens``
        (per-call keyword overrides first, then the configured values).
        """
        options = self._build_options(kwargs)

        # Guard only the import: an ImportError raised during the call
        # itself must not silently trigger the HTTP fallback.
        try:
            import ollama
        except ImportError:
            ollama = None

        if ollama is not None:
            request = {
                "model": self.config.model,
                "messages": messages,
            }
            if options:
                request["options"] = options
            # A Client bound to base_url; the module-level ollama.chat()
            # would ignore the configured server address.
            response = ollama.Client(host=self.base_url).chat(**request)
            return response["message"]["content"]

        # Fallback to requests
        import requests

        payload = {
            "model": self.config.model,
            "messages": messages,
            "stream": False,
        }
        if options:
            payload["options"] = options

        response = requests.post(f"{self.base_url}/api/chat", json=payload)
        response.raise_for_status()
        return response.json()["message"]["content"]
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def create_llm_client(config: LLMConfig) -> LLMClient:
    """Build the client that matches ``config.provider`` (case-insensitive).

    Raises:
        ValueError: if the provider is not one of "openai", "anthropic"
            or "ollama".
    """
    provider = config.provider.lower()

    # Provider name -> client class.
    registry = {
        "openai": OpenAIClient,
        "anthropic": AnthropicClient,
        "ollama": OllamaClient,
    }

    client_cls = registry.get(provider)
    if client_cls is None:
        raise ValueError(f"Unknown LLM provider: {provider}")
    return client_cls(config)
|