codetree-rag 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codetree/indexer.py ADDED
@@ -0,0 +1,322 @@
1
+ """Code indexer - builds tree structure from repository."""
2
+
3
+ import hashlib
4
+ import json
5
+ from dataclasses import dataclass, field, asdict
6
+ from pathlib import Path
7
+ from typing import Optional, Any
8
+ from datetime import datetime
9
+
10
+ from .config import Config, IndexConfig
11
+ from .parser import CodeParser, FileInfo, LANGUAGE_EXTENSIONS
12
+
13
+
14
@dataclass
class TreeNode:
    """One entry of the code tree: either a directory or a source file.

    ``type`` selects which of the optional fields are meaningful and which
    ones ``to_dict`` serializes; anything other than ``"file"`` is treated
    as a directory.
    """

    name: str
    type: str  # "directory", "file"
    path: str
    summary: Optional[str] = None
    language: Optional[str] = None

    # Populated for file nodes
    imports: list[str] = field(default_factory=list)
    functions: list[dict] = field(default_factory=list)
    classes: list[dict] = field(default_factory=list)
    variables: list[str] = field(default_factory=list)
    line_count: int = 0

    # Populated for directory nodes
    children: list["TreeNode"] = field(default_factory=list)
    file_count: int = 0

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict, leaving out empty/falsy fields.

        ``name``, ``type`` and ``path`` are always present. File nodes add
        their non-empty file fields; every other node adds its serialized
        children and file count when non-empty.
        """
        payload = {"name": self.name, "type": self.type, "path": self.path}
        if self.summary:
            payload["summary"] = self.summary

        if self.type != "file":
            # Directory branch: recurse into children.
            if self.children:
                payload["children"] = [child.to_dict() for child in self.children]
            if self.file_count:
                payload["file_count"] = self.file_count
            return payload

        # File branch: the tuple order fixes the key order of the output.
        file_fields = (
            "language",
            "imports",
            "functions",
            "classes",
            "variables",
            "line_count",
        )
        for key in file_fields:
            value = getattr(self, key)
            if value:
                payload[key] = value
        return payload
65
+
66
+
67
@dataclass
class CodeIndex:
    """The complete code index for a repository.

    Holds the root of the tree plus aggregate statistics, and converts
    to and from plain dicts / JSON text.
    """

    root: TreeNode
    repo_path: str
    created_at: str
    version: str = "0.1.0"
    total_files: int = 0
    total_lines: int = 0
    languages: dict[str, int] = field(default_factory=dict)  # language -> file count

    def to_dict(self) -> dict:
        """Convert to a dictionary for JSON serialization."""
        return {
            "version": self.version,
            "repo_path": self.repo_path,
            "created_at": self.created_at,
            "total_files": self.total_files,
            "total_lines": self.total_lines,
            "languages": self.languages,
            "root": self.root.to_dict(),
        }

    def to_json(self, indent: int = 2) -> str:
        """Convert to a JSON string (non-ASCII characters are kept as-is)."""
        return json.dumps(self.to_dict(), indent=indent, ensure_ascii=False)

    @classmethod
    def from_dict(cls, data: dict) -> "CodeIndex":
        """Create a CodeIndex from a dictionary shaped like ``to_dict`` output.

        Raises:
            KeyError: if ``root``, ``repo_path``, ``created_at`` or a node's
                ``name``/``type``/``path`` is missing.
        """

        def parse_node(node_data: dict) -> TreeNode:
            # Children are rebuilt recursively; a missing key means a leaf.
            children = [parse_node(c) for c in node_data.get("children", [])]
            return TreeNode(
                name=node_data["name"],
                type=node_data["type"],
                path=node_data["path"],
                summary=node_data.get("summary"),
                language=node_data.get("language"),
                imports=node_data.get("imports", []),
                functions=node_data.get("functions", []),
                classes=node_data.get("classes", []),
                variables=node_data.get("variables", []),
                line_count=node_data.get("line_count", 0),
                children=children,
                file_count=node_data.get("file_count", 0),
            )

        return cls(
            root=parse_node(data["root"]),
            repo_path=data["repo_path"],
            created_at=data["created_at"],
            version=data.get("version", "0.1.0"),
            total_files=data.get("total_files", 0),
            total_lines=data.get("total_lines", 0),
            languages=data.get("languages", {}),
        )

    @classmethod
    def from_json(cls, json_str: str) -> "CodeIndex":
        """Create a CodeIndex from a JSON string."""
        return cls.from_dict(json.loads(json_str))

    def get_compact_tree(self, max_depth: int = 3) -> str:
        """Get a compact text representation of the tree for LLM context.

        Nodes deeper than ``max_depth`` (the root is depth 0) are omitted.
        Each file lists at most five function names and five class names;
        when more exist, a ``(+N more)`` marker is appended to either list.
        """
        lines = []
        preview_limit = 5

        def preview(entries: list[dict]) -> str:
            # Shared by functions and classes so both report truncation
            # the same way.
            names = ", ".join(entry["name"] for entry in entries[:preview_limit])
            hidden = len(entries) - preview_limit
            return f"{names} (+{hidden} more)" if hidden > 0 else names

        def format_node(node: TreeNode, depth: int = 0) -> None:
            if depth > max_depth:
                return

            indent = " " * depth

            if node.type == "directory":
                lines.append(f"{indent}{node.name}/")
                if node.summary:
                    lines.append(f"{indent} # {node.summary}")
                for child in node.children:
                    format_node(child, depth + 1)
                return

            # File
            lang_tag = f"[{node.language}]" if node.language else ""
            lines.append(f"{indent}{node.name} {lang_tag}")

            if node.functions:
                lines.append(f"{indent} → functions: {preview(node.functions)}")
            if node.classes:
                lines.append(f"{indent} → classes: {preview(node.classes)}")

        format_node(self.root)
        return "\n".join(lines)
165
+
166
+
167
class CodeIndexer:
    """Build code index from a repository."""

    def __init__(self, config: Optional[Config] = None):
        """Create an indexer; ``Config.load()`` is used when no config is given."""
        self.config = config or Config.load()
        self.parser = CodeParser()
        self._stats = self._empty_stats()

    @staticmethod
    def _empty_stats() -> dict:
        """Return a fresh, zeroed statistics accumulator."""
        return {
            "total_files": 0,
            "total_lines": 0,
            "languages": {},
        }

    def build_index(self, repo_path: Path) -> CodeIndex:
        """Build a code index from the repository.

        Args:
            repo_path: Directory to index; resolved to an absolute path.

        Raises:
            ValueError: if the path does not exist or is not a directory.
        """
        repo_path = Path(repo_path).resolve()

        if not repo_path.exists():
            raise ValueError(f"Repository path does not exist: {repo_path}")
        if not repo_path.is_dir():
            # Without this check a file path would fail later inside
            # iterdir() with an unrelated NotADirectoryError.
            raise ValueError(f"Repository path is not a directory: {repo_path}")

        # Statistics are per build, so reset them before each walk.
        self._stats = self._empty_stats()

        root = self._index_directory(repo_path, repo_path)

        return CodeIndex(
            root=root,
            repo_path=str(repo_path),
            created_at=datetime.now().isoformat(),
            total_files=self._stats["total_files"],
            total_lines=self._stats["total_lines"],
            languages=self._stats["languages"],
        )

    def _index_directory(self, dir_path: Path, repo_root: Path) -> TreeNode:
        """Recursively index a directory.

        Entries are visited directories-first, then case-insensitively by
        name. Subdirectories that end up with no indexed children are
        dropped from the result.
        """
        relative_path = dir_path.relative_to(repo_root)

        children = []
        file_count = 0

        try:
            entries = sorted(dir_path.iterdir(), key=lambda p: (p.is_file(), p.name.lower()))
        except PermissionError:
            # Unreadable directories are treated as empty.
            entries = []

        for entry in entries:
            # Skip excluded patterns
            if self._should_exclude(entry, repo_root):
                continue

            if entry.is_dir():
                if entry.is_symlink():
                    # Directory symlinks can point back into the tree; following
                    # them would index the same files repeatedly.
                    continue
                child_node = self._index_directory(entry, repo_root)
                if child_node.children:
                    children.append(child_node)
                    file_count += child_node.file_count
            elif entry.is_file():
                child_node = self._index_file(entry, repo_root)
                if child_node:
                    children.append(child_node)
                    file_count += 1

        return TreeNode(
            name=dir_path.name or str(repo_root.name),
            type="directory",
            path=str(relative_path) if str(relative_path) != "." else "",
            children=children,
            file_count=file_count,
        )

    def _index_file(self, file_path: Path, repo_root: Path) -> Optional[TreeNode]:
        """Index a single file.

        Returns ``None`` when the file is larger than the configured maximum,
        cannot be stat'ed, has no detected language, uses a language that is
        not enabled in the config, or yields no parse result.
        """
        relative_path = file_path.relative_to(repo_root)

        # Check file size
        try:
            if file_path.stat().st_size > self.config.index.max_file_size:
                return None
        except OSError:
            return None

        # Check if it's a supported language
        language = self.parser.detect_language(file_path)
        if not language:
            return None

        if language not in self.config.index.languages:
            return None

        # Parse the file
        file_info = self.parser.parse_file(file_path)
        if not file_info:
            return None

        # Update stats
        self._stats["total_files"] += 1
        self._stats["total_lines"] += file_info.line_count
        self._stats["languages"][language] = self._stats["languages"].get(language, 0) + 1

        return TreeNode(
            name=file_path.name,
            type="file",
            path=str(relative_path),
            language=language,
            imports=file_info.imports[:20],  # Limit imports
            functions=[
                {
                    "name": f.name,
                    "signature": f.signature,
                    "docstring": f.docstring,
                    "line": f.start_line,
                }
                for f in file_info.functions[:50]  # Limit functions
            ],
            classes=[
                {
                    "name": c.name,
                    "signature": c.signature,
                    "docstring": c.docstring,
                    "line": c.start_line,
                }
                for c in file_info.classes[:20]  # Limit classes
            ],
            variables=file_info.variables[:10],
            line_count=file_info.line_count,
        )

    def _should_exclude(self, path: Path, repo_root: Path) -> bool:
        """Check if a path should be excluded.

        A path is excluded when its name equals or starts with a configured
        exclude pattern, when the path glob-matches a pattern, or when the
        name is hidden (leading dot) and is not ``.github``. ``repo_root``
        is accepted for interface compatibility and is not used.
        """
        name = path.name

        # Check exclude patterns
        for pattern in self.config.index.exclude:
            if not pattern:
                # An empty pattern is a prefix of every name (and is rejected
                # by Path.match), so it would exclude the whole repository.
                continue
            if name == pattern or name.startswith(pattern):
                return True
            if path.match(pattern):
                return True

        # Skip hidden files/directories
        if name.startswith(".") and name not in (".github",):
            return True

        return False

    def save_index(self, index: CodeIndex, output_path: Path) -> None:
        """Save index to a JSON file, creating parent directories as needed."""
        output_path = Path(output_path)  # also accept plain string paths
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(index.to_json(), encoding="utf-8")

    def load_index(self, index_path: Path) -> CodeIndex:
        """Load index from a JSON file."""
        return CodeIndex.from_json(Path(index_path).read_text(encoding="utf-8"))
codetree/llm.py ADDED
@@ -0,0 +1,116 @@
1
+ """LLM client abstraction for multiple providers."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Optional
5
+ import json
6
+
7
+ from .config import LLMConfig
8
+
9
+
10
class LLMClient(ABC):
    """Common interface that every provider-specific LLM client implements."""

    @abstractmethod
    def chat(self, messages: list[dict], **kwargs) -> str:
        """Send the chat ``messages`` and return the reply text.

        Subclasses must override this; the base implementation does nothing
        and returns ``None``.
        """
17
+
18
+
19
class OpenAIClient(LLMClient):
    """OpenAI API client."""

    def __init__(self, config: LLMConfig):
        """Create the client from ``config`` (api_key, base_url).

        Raises:
            ImportError: if the ``openai`` package is not installed.
        """
        self.config = config
        # Only the import is guarded, so an ImportError raised while
        # constructing the client is not misreported as a missing package.
        try:
            from openai import OpenAI
        except ImportError as err:
            raise ImportError("openai package not installed. Run: pip install openai") from err

        self.client = OpenAI(
            api_key=config.api_key,
            base_url=config.base_url,
        )

    def chat(self, messages: list[dict], **kwargs) -> str:
        """Send ``messages`` and return the first choice's text.

        ``temperature`` and ``max_tokens`` may be passed as keyword
        arguments; otherwise the configured values are used. Returns an
        empty string when the response has no choices or no content.
        """
        response = self.client.chat.completions.create(
            model=self.config.model,
            messages=messages,
            temperature=kwargs.get("temperature", self.config.temperature),
            max_tokens=kwargs.get("max_tokens", self.config.max_tokens),
        )
        if not response.choices:
            return ""
        return response.choices[0].message.content or ""
41
+
42
+
43
class AnthropicClient(LLMClient):
    """Anthropic API client."""

    def __init__(self, config: LLMConfig):
        """Create the client from ``config`` (api_key).

        Raises:
            ImportError: if the ``anthropic`` package is not installed.
        """
        self.config = config
        # Only the import is guarded, so an ImportError raised while
        # constructing the client is not misreported as a missing package.
        try:
            import anthropic
        except ImportError as err:
            raise ImportError("anthropic package not installed. Run: pip install anthropic") from err

        self.client = anthropic.Anthropic(api_key=config.api_key)

    def chat(self, messages: list[dict], **kwargs) -> str:
        """Send ``messages`` and return the concatenated text of the reply.

        Messages with role ``"system"`` are removed from the conversation
        and passed through the separate ``system`` argument. ``temperature``
        and ``max_tokens`` may be passed as keyword arguments; otherwise the
        configured values are used.
        """
        # Convert messages format
        system_parts = []
        chat_messages = []
        for msg in messages:
            if msg["role"] == "system":
                system_parts.append(msg["content"])
            else:
                chat_messages.append(msg)

        request = {
            "model": self.config.model,
            "messages": chat_messages,
            "temperature": kwargs.get("temperature", self.config.temperature),
            "max_tokens": kwargs.get("max_tokens", self.config.max_tokens),
        }
        if len(system_parts) == 1:
            # A single system prompt is forwarded untouched.
            if system_parts[0]:
                request["system"] = system_parts[0]
        elif system_parts:
            # Keep every system message instead of only the last one.
            request["system"] = "\n\n".join(str(part) for part in system_parts)

        response = self.client.messages.create(**request)

        # Join text blocks rather than indexing content[0], which fails on an
        # empty content list or a non-text first block.
        return "".join(
            block.text
            for block in response.content
            if getattr(block, "type", "text") == "text"
        )
73
+
74
+
75
class OllamaClient(LLMClient):
    """Ollama local model client."""

    def __init__(self, config: LLMConfig):
        """Store ``config``; the server URL defaults to http://localhost:11434."""
        self.config = config
        self.base_url = config.base_url or "http://localhost:11434"

    def chat(self, messages: list[dict], **kwargs) -> str:
        """Send ``messages`` to the Ollama server and return the reply text.

        Uses the ``ollama`` package when it is installed and falls back to a
        plain HTTP request via ``requests`` otherwise. ``temperature`` and
        ``max_tokens`` may be passed as keyword arguments; otherwise the
        configured values are used (``max_tokens`` is sent as Ollama's
        ``num_predict`` option).

        Raises:
            requests.HTTPError: on a non-success HTTP status (fallback path).
        """
        candidate_options = {
            "temperature": kwargs.get("temperature", self.config.temperature),
            "num_predict": kwargs.get("max_tokens", self.config.max_tokens),
        }
        # Unset values are dropped so the server applies its own defaults.
        options = {key: value for key, value in candidate_options.items() if value is not None}

        # Only the import is guarded: an ImportError raised during the call
        # itself must not silently trigger the HTTP fallback.
        try:
            import ollama
        except ImportError:
            ollama = None

        if ollama is not None:
            # An explicit client is needed for the configured host to be used.
            client = ollama.Client(host=self.base_url)
            response = client.chat(
                model=self.config.model,
                messages=messages,
                options=options,
            )
            return response["message"]["content"]

        # Fallback to requests
        import requests

        response = requests.post(
            f"{self.base_url.rstrip('/')}/api/chat",
            json={
                "model": self.config.model,
                "messages": messages,
                "stream": False,
                "options": options,
            },
        )
        response.raise_for_status()
        return response.json()["message"]["content"]
103
+
104
+
105
def create_llm_client(config: LLMConfig) -> LLMClient:
    """Build the LLM client that matches ``config.provider``.

    The provider name is matched case-insensitively against ``openai``,
    ``anthropic`` and ``ollama``.

    Raises:
        ValueError: if the provider name is not one of the above.
    """
    provider = config.provider.lower()

    # Provider name -> client class.
    registry = {
        "openai": OpenAIClient,
        "anthropic": AnthropicClient,
        "ollama": OllamaClient,
    }

    client_cls = registry.get(provider)
    if client_cls is None:
        raise ValueError(f"Unknown LLM provider: {provider}")
    return client_cls(config)