tarang 4.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,449 @@
1
+ """
2
+ Context Collector - Gathers local project context for LLM.
3
+
4
+ This module scans the project and collects relevant files based on:
5
+ 1. Project structure (file list)
6
+ 2. Instruction keywords (relevant files)
7
+ 3. Recently modified files
8
+
9
+ The context is sent to the backend with the instruction,
10
+ enabling the LLM to make informed decisions without
11
+ bidirectional communication.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import fnmatch
16
+ import os
17
+ import re
18
+ from dataclasses import dataclass, field
19
+ from pathlib import Path
20
+ from typing import List, Optional, Set
21
+
22
+
23
@dataclass
class FileContent:
    """A single project file together with its (possibly truncated) text."""
    # Path relative to the project root, as produced by ContextCollector.
    path: str
    # File text; read as UTF-8 and truncated to MAX_CONTENT_LINES by _read_file.
    content: str
    # Number of lines actually included in `content` (post-truncation).
    lines: int = 0
29
+
30
+
31
@dataclass
class ProjectContext:
    """Snapshot of a project that is shipped to the backend with an instruction."""
    cwd: str
    files: List[str] = field(default_factory=list)
    relevant_files: List[FileContent] = field(default_factory=list)
    _indexed_context: Optional[dict] = field(default=None, repr=False)
    _folder_tree: Optional[str] = field(default=None, repr=False)

    def to_dict(self) -> dict:
        """Serialize into the API payload shape.

        Optional sections (folder tree, BM25/KG index) are emitted only
        when they are present and non-empty.
        """
        payload = {
            "cwd": self.cwd,
            "files": self.files,
            "relevant_files": [
                {"path": fc.path, "content": fc.content, "lines": fc.lines}
                for fc in self.relevant_files
            ],
        }
        # Folder tree helps the LLM grasp project layout without tool calls.
        if self._folder_tree:
            payload["folder_tree"] = self._folder_tree
        # Indexed retrieval context (BM25 + knowledge graph), when available.
        if self._indexed_context:
            payload["indexed"] = self._indexed_context
        return payload
60
+
61
+
62
class ContextCollector:
    """
    Collects project context for LLM processing.

    Scans the project tree, always includes root-level "identity" config
    files, then adds files ranked as relevant to the instruction.

    Usage:
        collector = ContextCollector("/path/to/project")
        context = collector.collect("add authentication")
    """

    # Files/directories to ignore. Entries are fnmatch globs, matched
    # against a single path segment (file or directory name) at a time.
    IGNORE_PATTERNS = {
        # Version control
        ".git", ".svn", ".hg",
        # Dependencies
        "node_modules", "venv", ".venv", "env", ".env",
        "__pycache__", ".pytest_cache", ".mypy_cache",
        "vendor", "packages",
        # Build outputs
        "dist", "build", ".next", ".nuxt", "out",
        "target", "bin", "obj",
        # IDE
        ".idea", ".vscode", ".vs",
        # Misc (includes this tool's own backup directory)
        ".tarang", ".tarang_backups",
        "*.pyc", "*.pyo", "*.so", "*.dylib",
        "*.egg-info", "*.egg",
        ".DS_Store", "Thumbs.db",
    }

    # File extensions _read_file is willing to load as text.
    CODE_EXTENSIONS = {
        ".py", ".js", ".ts", ".jsx", ".tsx",
        ".java", ".kt", ".scala",
        ".go", ".rs", ".c", ".cpp", ".h", ".hpp",
        ".rb", ".php", ".swift", ".m",
        ".html", ".css", ".scss", ".sass", ".less",
        ".json", ".yaml", ".yml", ".toml",
        ".md", ".txt", ".rst",
        ".sql", ".sh", ".bash", ".zsh",
        ".vue", ".svelte",
        ".xml", ".gradle",
    }

    # Max file size to read (100KB); larger files are skipped entirely.
    MAX_FILE_SIZE = 100 * 1024

    # Max files to list in the context payload.
    MAX_FILES = 500

    # Max relevant files (identity + instruction-matched) to include.
    MAX_RELEVANT_FILES = 15

    # Max content lines per file before truncation.
    MAX_CONTENT_LINES = 300

    # Config file extensions to auto-include from root (reveals project type).
    # NOTE(review): compound entries like ".config.js" can never equal
    # Path.suffix (which would be ".js") — confirm whether matching on the
    # filename tail was intended.
    CONFIG_EXTENSIONS = {
        ".json", ".toml", ".yaml", ".yml", ".lock",
        ".config.js", ".config.ts",
    }

    # Config filenames (no extension or special names).
    CONFIG_NAMES = {
        "Dockerfile", "Makefile", "Gemfile", "Procfile",
        "requirements.txt", "setup.py", "setup.cfg",
        ".gitignore", ".env.example",
    }

    # Skip these even if they match (too large or not useful).
    SKIP_CONFIG_FILES = {
        "package-lock.json", "yarn.lock", "pnpm-lock.yaml",
        "poetry.lock", "Cargo.lock", "composer.lock",
    }

    def __init__(self, project_root: str):
        # Resolve once so relative_to() comparisons use a stable absolute root.
        self.project_root = Path(project_root).resolve()
138
+
139
+ def collect(self, instruction: str) -> ProjectContext:
140
+ """
141
+ Collect project context based on instruction.
142
+
143
+ Args:
144
+ instruction: User instruction to inform file selection
145
+
146
+ Returns:
147
+ ProjectContext with file list and relevant file contents
148
+ """
149
+ # Get all files
150
+ all_files = self._scan_files()
151
+
152
+ # Build folder structure tree (helps LLM understand project layout)
153
+ folder_tree = self._build_folder_tree(all_files)
154
+
155
+ # ALWAYS include project identity files first (reduces tool calls!)
156
+ identity_files = self._collect_identity_files()
157
+
158
+ # Find relevant files based on instruction
159
+ relevant_paths = self._find_relevant_files(instruction, all_files)
160
+
161
+ # For small projects, include all files if we didn't find specific matches
162
+ if len(all_files) <= 10 and len(relevant_paths) < 3:
163
+ # Small project - read all code files
164
+ relevant_paths = all_files
165
+
166
+ # Combine: identity files first, then instruction-relevant files
167
+ # (avoiding duplicates)
168
+ identity_paths = {f.path for f in identity_files}
169
+ additional_files = []
170
+ for path in relevant_paths:
171
+ if path not in identity_paths:
172
+ content = self._read_file(path)
173
+ if content:
174
+ additional_files.append(content)
175
+ if len(identity_files) + len(additional_files) >= self.MAX_RELEVANT_FILES:
176
+ break
177
+
178
+ relevant_files = identity_files + additional_files
179
+
180
+ context = ProjectContext(
181
+ cwd=str(self.project_root),
182
+ files=all_files[:self.MAX_FILES],
183
+ relevant_files=relevant_files,
184
+ )
185
+
186
+ # Add folder tree as metadata
187
+ context._folder_tree = folder_tree
188
+
189
+ return context
190
+
191
+ def _collect_identity_files(self) -> List[FileContent]:
192
+ """
193
+ Collect root-level config files that reveal project type.
194
+
195
+ Reads actual files in root directory (not a hardcoded list).
196
+ Skips .md files, lock files, and other non-config files.
197
+ """
198
+ identity_files = []
199
+
200
+ # Scan root directory only (not recursive)
201
+ try:
202
+ for item in self.project_root.iterdir():
203
+ if not item.is_file():
204
+ continue
205
+
206
+ filename = item.name
207
+
208
+ # Skip ignored files
209
+ if self._should_ignore(filename):
210
+ continue
211
+
212
+ # Skip lock files and other large files
213
+ if filename in self.SKIP_CONFIG_FILES:
214
+ continue
215
+
216
+ # Skip markdown files (user specified: non .md)
217
+ if filename.endswith(".md"):
218
+ continue
219
+
220
+ # Include if it's a config file (by extension or name)
221
+ is_config = (
222
+ item.suffix.lower() in self.CONFIG_EXTENSIONS
223
+ or filename in self.CONFIG_NAMES
224
+ or filename.startswith(".") # dotfiles like .eslintrc
225
+ )
226
+
227
+ if is_config:
228
+ content = self._read_file(filename)
229
+ if content:
230
+ identity_files.append(content)
231
+
232
+ except OSError:
233
+ pass
234
+
235
+ return identity_files
236
+
237
+ def _build_folder_tree(self, files: List[str], max_depth: int = 3) -> str:
238
+ """
239
+ Build a folder structure tree string.
240
+
241
+ This helps LLM understand project layout without calling list_files.
242
+ """
243
+ # Build directory structure
244
+ dirs: Set[str] = set()
245
+ root_files: List[str] = []
246
+
247
+ for f in files:
248
+ parts = Path(f).parts
249
+ if len(parts) == 1:
250
+ root_files.append(f)
251
+ else:
252
+ # Add all parent directories up to max_depth
253
+ for i in range(1, min(len(parts), max_depth + 1)):
254
+ dirs.add("/".join(parts[:i]))
255
+
256
+ # Build tree string
257
+ lines = ["."]
258
+
259
+ # Root files first
260
+ for f in sorted(root_files)[:10]:
261
+ lines.append(f"├── {f}")
262
+ if len(root_files) > 10:
263
+ lines.append(f"├── ... ({len(root_files) - 10} more files)")
264
+
265
+ # Then directories
266
+ sorted_dirs = sorted(dirs)
267
+ for d in sorted_dirs[:20]:
268
+ depth = d.count("/")
269
+ indent = "│ " * depth
270
+ name = d.split("/")[-1]
271
+ lines.append(f"{indent}├── {name}/")
272
+
273
+ if len(sorted_dirs) > 20:
274
+ lines.append(f"... ({len(sorted_dirs) - 20} more directories)")
275
+
276
+ return "\n".join(lines)
277
+
278
+ def _scan_files(self) -> List[str]:
279
+ """Scan project for all files."""
280
+ files = []
281
+
282
+ for root, dirs, filenames in os.walk(self.project_root):
283
+ # Filter directories
284
+ dirs[:] = [d for d in dirs if not self._should_ignore(d)]
285
+
286
+ for filename in filenames:
287
+ if self._should_ignore(filename):
288
+ continue
289
+
290
+ full_path = Path(root) / filename
291
+ try:
292
+ rel_path = str(full_path.relative_to(self.project_root))
293
+ files.append(rel_path)
294
+ except ValueError:
295
+ continue
296
+
297
+ if len(files) >= self.MAX_FILES:
298
+ break
299
+
300
+ if len(files) >= self.MAX_FILES:
301
+ break
302
+
303
+ return sorted(files)
304
+
305
+ def _should_ignore(self, name: str) -> bool:
306
+ """Check if file/directory should be ignored."""
307
+ for pattern in self.IGNORE_PATTERNS:
308
+ if fnmatch.fnmatch(name, pattern):
309
+ return True
310
+ return False
311
+
312
+ def _find_relevant_files(
313
+ self,
314
+ instruction: str,
315
+ all_files: List[str],
316
+ ) -> List[str]:
317
+ """Find files relevant to the instruction."""
318
+ relevant: Set[str] = set()
319
+
320
+ # Extract keywords from instruction
321
+ keywords = self._extract_keywords(instruction)
322
+
323
+ # Score files by relevance
324
+ scored_files = []
325
+ for file_path in all_files:
326
+ score = self._score_file(file_path, keywords, instruction)
327
+ if score > 0:
328
+ scored_files.append((file_path, score))
329
+
330
+ # Sort by score and return top files
331
+ scored_files.sort(key=lambda x: x[1], reverse=True)
332
+ return [f[0] for f in scored_files[:self.MAX_RELEVANT_FILES]]
333
+
334
+ def _extract_keywords(self, instruction: str) -> List[str]:
335
+ """Extract keywords from instruction."""
336
+ # Remove common words
337
+ stopwords = {
338
+ "the", "a", "an", "is", "are", "was", "were", "be", "been",
339
+ "have", "has", "had", "do", "does", "did", "will", "would",
340
+ "could", "should", "may", "might", "must", "can", "need",
341
+ "to", "of", "in", "for", "on", "with", "at", "by", "from",
342
+ "as", "into", "through", "during", "before", "after",
343
+ "and", "but", "if", "or", "because", "until", "while",
344
+ "this", "that", "these", "those", "i", "me", "my", "we",
345
+ "you", "your", "it", "its", "they", "them", "their",
346
+ "what", "which", "who", "how", "where", "when", "why",
347
+ "add", "create", "make", "build", "implement", "write",
348
+ "fix", "update", "change", "modify", "remove", "delete",
349
+ "please", "help", "want", "like", "using", "use",
350
+ }
351
+
352
+ # Split and filter
353
+ words = re.findall(r'\b\w+\b', instruction.lower())
354
+ keywords = [w for w in words if w not in stopwords and len(w) > 2]
355
+
356
+ return keywords
357
+
358
+ def _score_file(
359
+ self,
360
+ file_path: str,
361
+ keywords: List[str],
362
+ instruction: str,
363
+ ) -> int:
364
+ """Score a file's relevance."""
365
+ score = 0
366
+ file_lower = file_path.lower()
367
+ filename = Path(file_path).name.lower()
368
+ stem = Path(file_path).stem.lower()
369
+
370
+ # Check if file is explicitly mentioned
371
+ if filename in instruction.lower() or stem in instruction.lower():
372
+ score += 100
373
+
374
+ # Check file path for keywords
375
+ for keyword in keywords:
376
+ if keyword in file_lower:
377
+ score += 10
378
+ if keyword in filename:
379
+ score += 20
380
+ if keyword == stem:
381
+ score += 50
382
+
383
+ # Boost common entry points
384
+ entry_points = ["main", "app", "index", "server", "cli", "__init__"]
385
+ if stem in entry_points:
386
+ score += 5
387
+
388
+ # Boost by extension relevance
389
+ ext = Path(file_path).suffix.lower()
390
+ if ext in {".py", ".js", ".ts", ".tsx", ".jsx"}:
391
+ score += 2
392
+ if ext in {".json", ".yaml", ".yml", ".toml"}:
393
+ score += 1
394
+
395
+ return score
396
+
397
+ def _read_file(self, rel_path: str) -> Optional[FileContent]:
398
+ """Read file content."""
399
+ full_path = self.project_root / rel_path
400
+
401
+ # Check if readable
402
+ if not full_path.exists() or not full_path.is_file():
403
+ return None
404
+
405
+ # Check extension
406
+ if full_path.suffix.lower() not in self.CODE_EXTENSIONS:
407
+ return None
408
+
409
+ # Check size
410
+ try:
411
+ size = full_path.stat().st_size
412
+ if size > self.MAX_FILE_SIZE:
413
+ return None
414
+ except OSError:
415
+ return None
416
+
417
+ # Read content
418
+ try:
419
+ content = full_path.read_text(encoding="utf-8", errors="replace")
420
+ lines = content.splitlines()
421
+
422
+ # Truncate if too long
423
+ if len(lines) > self.MAX_CONTENT_LINES:
424
+ lines = lines[:self.MAX_CONTENT_LINES]
425
+ content = "\n".join(lines) + "\n... (truncated)"
426
+
427
+ return FileContent(
428
+ path=rel_path,
429
+ content=content,
430
+ lines=len(lines),
431
+ )
432
+
433
+ except Exception:
434
+ return None
435
+
436
+
437
def collect_context(project_root: str, instruction: str) -> ProjectContext:
    """
    Convenience wrapper: build a collector and gather context in one call.

    Args:
        project_root: Path to project
        instruction: User instruction

    Returns:
        ProjectContext
    """
    return ContextCollector(project_root).collect(instruction)
@@ -0,0 +1,6 @@
1
+ """Tarang Executor - Local file and shell operations."""
2
+
3
+ from tarang.executor.diff_apply import DiffApplicator, DiffResult
4
+ from tarang.executor.linter import ShadowLinter, LintResult
5
+
6
+ __all__ = ["DiffApplicator", "DiffResult", "ShadowLinter", "LintResult"]
@@ -0,0 +1,246 @@
1
+ """
2
+ Diff Applicator - Apply edits from backend to local files.
3
+
4
+ Supports unified diffs, search/replace, and full content replacement.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import shutil
9
+ import subprocess
10
+ import time
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+ from typing import Optional
14
+
15
+
16
@dataclass
class DiffResult:
    """Outcome of a single edit operation (diff, search/replace, or write)."""
    # True when the edit was applied and left on disk.
    success: bool
    # File path relative to the project root, as passed by the caller.
    path: str
    # Human-readable failure reason; None on success.
    error: Optional[str] = None
    # Location of the pre-edit backup (string path), when one was created.
    backup_path: Optional[str] = None
23
+
24
+
25
class DiffApplicator:
    """
    Apply edits from backend to local files.

    Supports:
    - Unified diffs (delegated to the external ``patch`` command)
    - Search/replace edits
    - Full content replacement

    Every destructive operation snapshots the file into ``.tarang_backups``
    first, so any edit can be rolled back.
    """

    def __init__(self, project_root: Path):
        # Root against which all relative edit paths are resolved.
        self.project_root = project_root
        # Backup directory inside the project (the context collector
        # lists ".tarang_backups" in its ignore patterns).
        self.backup_dir = project_root / ".tarang_backups"
40
+
41
+ def apply_diff(self, path: str, diff: str) -> DiffResult:
42
+ """
43
+ Apply a unified diff to a file.
44
+
45
+ Args:
46
+ path: File path relative to project root
47
+ diff: Unified diff content
48
+
49
+ Returns:
50
+ DiffResult with success/error info
51
+ """
52
+ file_path = self.project_root / path
53
+
54
+ # Create backup first
55
+ backup_path = self._create_backup(file_path)
56
+
57
+ try:
58
+ # Try using patch command
59
+ result = subprocess.run(
60
+ ["patch", "-u", str(file_path)],
61
+ input=diff.encode(),
62
+ capture_output=True,
63
+ timeout=30
64
+ )
65
+
66
+ if result.returncode != 0:
67
+ # Restore from backup
68
+ self._restore_backup(file_path, backup_path)
69
+ return DiffResult(
70
+ success=False,
71
+ path=path,
72
+ error=result.stderr.decode() or "Patch failed",
73
+ )
74
+
75
+ return DiffResult(
76
+ success=True,
77
+ path=path,
78
+ backup_path=str(backup_path) if backup_path else None,
79
+ )
80
+
81
+ except FileNotFoundError:
82
+ # patch command not available, restore and fail
83
+ self._restore_backup(file_path, backup_path)
84
+ return DiffResult(
85
+ success=False,
86
+ path=path,
87
+ error="patch command not available",
88
+ )
89
+ except subprocess.TimeoutExpired:
90
+ self._restore_backup(file_path, backup_path)
91
+ return DiffResult(
92
+ success=False,
93
+ path=path,
94
+ error="Patch timed out",
95
+ )
96
+
97
+ def apply_search_replace(
98
+ self,
99
+ path: str,
100
+ search: str,
101
+ replace: str,
102
+ ) -> DiffResult:
103
+ """
104
+ Apply a search/replace edit.
105
+
106
+ Args:
107
+ path: File path relative to project root
108
+ search: Text to find
109
+ replace: Text to replace with
110
+
111
+ Returns:
112
+ DiffResult with success/error info
113
+ """
114
+ file_path = self.project_root / path
115
+
116
+ if not file_path.exists():
117
+ return DiffResult(
118
+ success=False,
119
+ path=path,
120
+ error=f"File not found: {path}",
121
+ )
122
+
123
+ try:
124
+ content = file_path.read_text()
125
+
126
+ if search not in content:
127
+ return DiffResult(
128
+ success=False,
129
+ path=path,
130
+ error=f"Search text not found in {path}",
131
+ )
132
+
133
+ # Create backup
134
+ backup_path = self._create_backup(file_path)
135
+
136
+ # Apply replacement
137
+ new_content = content.replace(search, replace, 1)
138
+ file_path.write_text(new_content)
139
+
140
+ return DiffResult(
141
+ success=True,
142
+ path=path,
143
+ backup_path=str(backup_path) if backup_path else None,
144
+ )
145
+
146
+ except Exception as e:
147
+ return DiffResult(
148
+ success=False,
149
+ path=path,
150
+ error=str(e),
151
+ )
152
+
153
+ def apply_content(self, path: str, content: str) -> DiffResult:
154
+ """
155
+ Write full content to a file.
156
+
157
+ Args:
158
+ path: File path relative to project root
159
+ content: Full file content
160
+
161
+ Returns:
162
+ DiffResult with success/error info
163
+ """
164
+ file_path = self.project_root / path
165
+
166
+ try:
167
+ # Create backup if file exists
168
+ backup_path = self._create_backup(file_path) if file_path.exists() else None
169
+
170
+ # Ensure parent directory exists
171
+ file_path.parent.mkdir(parents=True, exist_ok=True)
172
+
173
+ # Write content
174
+ file_path.write_text(content)
175
+
176
+ return DiffResult(
177
+ success=True,
178
+ path=path,
179
+ backup_path=str(backup_path) if backup_path else None,
180
+ )
181
+
182
+ except Exception as e:
183
+ return DiffResult(
184
+ success=False,
185
+ path=path,
186
+ error=str(e),
187
+ )
188
+
189
+ def rollback(self, result: DiffResult) -> bool:
190
+ """
191
+ Rollback a change using backup.
192
+
193
+ Args:
194
+ result: DiffResult with backup_path
195
+
196
+ Returns:
197
+ True if rollback succeeded
198
+ """
199
+ if not result.backup_path:
200
+ return False
201
+
202
+ return self._restore_backup(
203
+ self.project_root / result.path,
204
+ Path(result.backup_path)
205
+ )
206
+
207
+ def cleanup_backups(self, max_age_hours: int = 24) -> int:
208
+ """
209
+ Clean up old backup files.
210
+
211
+ Args:
212
+ max_age_hours: Maximum age of backups to keep
213
+
214
+ Returns:
215
+ Number of files cleaned up
216
+ """
217
+ if not self.backup_dir.exists():
218
+ return 0
219
+
220
+ cleaned = 0
221
+ cutoff = time.time() - (max_age_hours * 3600)
222
+
223
+ for backup_file in self.backup_dir.glob("*.bak"):
224
+ if backup_file.stat().st_mtime < cutoff:
225
+ backup_file.unlink()
226
+ cleaned += 1
227
+
228
+ return cleaned
229
+
230
+ def _create_backup(self, file_path: Path) -> Optional[Path]:
231
+ """Create a backup of a file."""
232
+ if not file_path.exists():
233
+ return None
234
+
235
+ self.backup_dir.mkdir(exist_ok=True)
236
+ timestamp = int(time.time() * 1000)
237
+ backup_path = self.backup_dir / f"{file_path.name}.{timestamp}.bak"
238
+ shutil.copy2(file_path, backup_path)
239
+ return backup_path
240
+
241
+ def _restore_backup(self, file_path: Path, backup_path: Optional[Path]) -> bool:
242
+ """Restore a file from backup."""
243
+ if backup_path and backup_path.exists():
244
+ shutil.copy2(backup_path, file_path)
245
+ return True
246
+ return False