codebrain-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. codebrain/__init__.py +3 -0
  2. codebrain/__main__.py +6 -0
  3. codebrain/agent_bridge.py +162 -0
  4. codebrain/analyzer.py +943 -0
  5. codebrain/api.py +578 -0
  6. codebrain/api_models.py +102 -0
  7. codebrain/cli.py +1927 -0
  8. codebrain/comprehension.py +1939 -0
  9. codebrain/config.py +46 -0
  10. codebrain/context.py +276 -0
  11. codebrain/export.py +334 -0
  12. codebrain/graph/__init__.py +0 -0
  13. codebrain/graph/query.py +656 -0
  14. codebrain/graph/schema.py +113 -0
  15. codebrain/graph/store.py +295 -0
  16. codebrain/hook_runner.py +71 -0
  17. codebrain/hooks.py +107 -0
  18. codebrain/indexer.py +450 -0
  19. codebrain/llm.py +676 -0
  20. codebrain/logging.py +42 -0
  21. codebrain/mcp_server.py +1635 -0
  22. codebrain/memory/__init__.py +5 -0
  23. codebrain/memory/store.py +270 -0
  24. codebrain/parser/__init__.py +0 -0
  25. codebrain/parser/base.py +27 -0
  26. codebrain/parser/config_parser.py +228 -0
  27. codebrain/parser/models.py +44 -0
  28. codebrain/parser/python_parser.py +658 -0
  29. codebrain/parser/registry.py +144 -0
  30. codebrain/parser/typescript_parser.py +1189 -0
  31. codebrain/parser/typescript_treesitter.py +535 -0
  32. codebrain/py.typed +0 -0
  33. codebrain/resolver.py +171 -0
  34. codebrain/settings.py +88 -0
  35. codebrain/utils.py +59 -0
  36. codebrain/validator.py +563 -0
  37. codebrain/watcher/__init__.py +0 -0
  38. codebrain/watcher/file_watcher.py +173 -0
  39. codebrain-0.1.0.dist-info/METADATA +360 -0
  40. codebrain-0.1.0.dist-info/RECORD +44 -0
  41. codebrain-0.1.0.dist-info/WHEEL +5 -0
  42. codebrain-0.1.0.dist-info/entry_points.txt +6 -0
  43. codebrain-0.1.0.dist-info/licenses/LICENSE +21 -0
  44. codebrain-0.1.0.dist-info/top_level.txt +1 -0
codebrain/indexer.py ADDED
@@ -0,0 +1,450 @@
"""Orchestrator: scan a repository, parse source files via the parser registry, and store the results in the graph."""

from __future__ import annotations

import fnmatch
import re
import sys
import time
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from typing import TYPE_CHECKING, Callable

from codebrain.config import INDEXABLE_EXTENSIONS, SKIP_DIRS
from codebrain.graph.store import GraphStore
from codebrain.logging import get_logger
from codebrain.parser.registry import get_registry
from codebrain.settings import Settings, load_settings
from codebrain.utils import file_hash, normalize_path

if TYPE_CHECKING:
    # Editor's note: assumed home of ParsedFile (codebrain/parser/models.py ships in this wheel).
    from codebrain.parser.models import ParsedFile

_log = get_logger("indexer")

# Default threshold: use parallel parsing when the file count reaches this
_PARALLEL_THRESHOLD = 50


def _parse_file(file_path: Path, repo_root: Path) -> ParsedFile:
    """Route to the correct parser via the plugin registry."""
    return get_registry().parse(file_path, repo_root)


def _parse_one(args: tuple[str, str]) -> tuple[ParsedFile | None, str | None]:
    """Module-level function for ProcessPoolExecutor (must be picklable).

    Returns (ParsedFile, None) on success, or (None, error_string) on failure.
    """
    file_path_str, repo_root_str = args
    file_path = Path(file_path_str)
    repo_root = Path(repo_root_str)
    try:
        pf = _parse_file(file_path, repo_root)
        return (pf, None)
    except Exception as exc:
        rel = normalize_path(file_path, repo_root)
        return (None, f"{rel}: {exc}")


_PARSE_SCRIPT = """\
import sys, pickle, base64
from pathlib import Path
from codebrain.parser.registry import get_registry
r = get_registry()
pf = r.parse(Path(sys.argv[1]), Path(sys.argv[2]))
data = base64.b64encode(pickle.dumps(pf)).decode()
print(data)
"""


def _parse_with_timeout(
    file_path: Path, repo_root: Path, timeout: int = 30,
) -> tuple[ParsedFile | None, str | None]:
    """Parse a file in a subprocess with a hard timeout.

    Uses subprocess.run so we can actually kill the process if the parser
    hangs (e.g. a tree-sitter C extension holding the GIL on Windows).

    Returns (ParsedFile, None) on success or (None, error_string) on failure/timeout.
    """
    import base64
    import pickle
    import subprocess

    rel = normalize_path(file_path, repo_root)
    try:
        result = subprocess.run(
            [sys.executable, "-c", _PARSE_SCRIPT, str(file_path), str(repo_root)],
            capture_output=True, text=True, timeout=timeout,
        )
        if result.returncode != 0:
            err_msg = result.stderr.strip().split("\n")[-1] if result.stderr else "unknown error"
            return (None, f"{rel}: {err_msg}")
        data = result.stdout.strip()
        if not data:
            return (None, f"{rel}: parser returned no output")
        pf = pickle.loads(base64.b64decode(data))
        return (pf, None)
    except subprocess.TimeoutExpired:
        _log.warning("Parse timeout: %s (>%ds), killed", rel, timeout)
        return (None, f"{rel}: parse timeout (>{timeout}s, killed)")
    except Exception as exc:
        return (None, f"{rel}: subprocess error: {exc}")


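# Editor's sketch (not part of the package): the child process above serializes
# the ParsedFile to stdout as base64-wrapped pickle, so the object survives a
# text-mode pipe. The round trip is just:
#
#     payload = base64.b64encode(pickle.dumps(pf)).decode()   # child, printed
#     pf2 = pickle.loads(base64.b64decode(payload))           # parent
#
# Anything else the parser prints to stdout would corrupt the payload; stderr
# stays separate thanks to capture_output=True, so log output there is safe.

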
def _load_gitignore_patterns(repo_root: Path) -> list[tuple[str, bool]]:
    """Read .gitignore and return a list of (pattern, negated) tuples."""
    gitignore = repo_root / ".gitignore"
    if not gitignore.is_file():
        return []
    patterns: list[tuple[str, bool]] = []
    for line in gitignore.read_text(errors="replace").splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        negated = line.startswith("!")
        if negated:
            line = line[1:]
        patterns.append((line, negated))
    return patterns


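# Editor's sketch (not part of the package): for a .gitignore containing:
#
#     # build artifacts
#     *.pyc
#     !keep.pyc
#
# the loader above returns [("*.pyc", False), ("keep.pyc", True)]: the comment
# and blank lines are dropped, and a leading "!" is stripped with negated=True.

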
def _should_skip_dir(dir_name: str, skip_dirs: frozenset[str] = SKIP_DIRS) -> bool:
    """Return True if this directory should be skipped entirely."""
    for pattern in skip_dirs:
        if fnmatch.fnmatch(dir_name, pattern):
            return True
    return False


def _gitignore_to_regex(pattern: str) -> re.Pattern[str]:
    """Convert a single .gitignore pattern to a compiled regex.

    Supports ``**`` (any path segment), anchored patterns (leading ``/``),
    directory-only patterns (trailing ``/``), and standard globs.
    """
    clean = pattern.rstrip("/")
    anchored = clean.startswith("/")
    if anchored:
        clean = clean[1:]

    # Escape regex metacharacters and convert glob tokens in one pass
    parts: list[str] = []
    i = 0
    while i < len(clean):
        c = clean[i]
        if c == "*":
            if i + 1 < len(clean) and clean[i + 1] == "*":
                # ** matches across path segments
                if i + 2 < len(clean) and clean[i + 2] == "/":
                    parts.append("(?:.+/)?")
                    i += 3
                    continue
                parts.append(".*")
                i += 2
                continue
            parts.append("[^/]*")
        elif c == "?":
            parts.append("[^/]")
        elif c in r"\.+^${}()|[]":
            parts.append(f"\\{c}")
        else:
            parts.append(c)
        i += 1

    regex = "".join(parts)
    if anchored:
        regex = f"^{regex}"
    else:
        regex = f"(?:^|/){regex}"
    # Match both the exact path and anything beneath it
    regex = f"(?:{regex})(?:/.*)?$"
    return re.compile(regex)


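# Editor's sketch (not part of the package): how a few common patterns behave
# under the conversion above:
#
#     _gitignore_to_regex("build/").search("build/main.o")     # match: dir contents ignored
#     _gitignore_to_regex("/dist").search("src/dist")          # no match: anchored to root
#     _gitignore_to_regex("**/cache").search("a/b/cache/x")    # match: ** spans segments
#     _gitignore_to_regex("*.log").search("logs/app.log")      # match: [^/]* stays in one segment

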
def _matches_gitignore(rel_path: str, patterns: list[tuple[str, bool]]) -> bool:
    """Check if *rel_path* matches the .gitignore patterns.

    Processes patterns in order; negation patterns (``!``) can un-ignore
    previously matched paths.
    """
    matched = False
    for pattern, negated in patterns:
        regex = _gitignore_to_regex(pattern)
        if regex.search(rel_path):
            matched = not negated
    return matched


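# Editor's sketch (not part of the package): later patterns win, so a negation
# can rescue one file from an ignored directory:
#
#     pats = [("build/", False), ("build/keep.txt", True)]
#     _matches_gitignore("build/main.o", pats)    # True  -> ignored
#     _matches_gitignore("build/keep.txt", pats)  # False -> re-included
#
# Note that _gitignore_to_regex is recompiled for every (pattern, path) pair.
# If profiling warrants it, a cheap optimization is to memoize it:
#
#     import functools
#     _gitignore_to_regex = functools.lru_cache(maxsize=None)(_gitignore_to_regex)

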
def discover_files(
    repo_root: Path,
    settings: Settings | None = None,
) -> list[Path]:
    """Walk the repo and return all indexable files.

    Uses os.walk with directory pruning for performance on large repos.
    Uses *settings* for skip_dirs and indexable_extensions when provided,
    otherwise falls back to config.py defaults.
    """
    import os

    extensions = settings.indexable_extensions if settings else INDEXABLE_EXTENSIONS
    skip_dirs = settings.skip_dirs if settings else SKIP_DIRS

    gitignore_patterns = _load_gitignore_patterns(repo_root)
    files: list[Path] = []
    root_str = str(repo_root)

    # Name-based parseable files (e.g. docker-compose.yml)
    name_parseable = get_registry().supported_names

    repo_resolved = repo_root.resolve()

    for dirpath, dirnames, filenames in os.walk(root_str, followlinks=False):
        # Prune skipped directories IN-PLACE (prevents os.walk from descending)
        dirnames[:] = [
            d for d in dirnames
            if not _should_skip_dir(d, skip_dirs)
        ]

        for fname in filenames:
            # Check extension first (cheapest filter)
            ext = os.path.splitext(fname)[1]
            if ext not in extensions and fname not in name_parseable:
                continue

            full_path = os.path.join(dirpath, fname)
            item = Path(full_path)

            # Skip symlinks pointing outside the repo (security + avoids cycles).
            # is_relative_to avoids the prefix-string pitfall where /repo2 would
            # pass a startswith("/repo") check.
            if item.is_symlink():
                try:
                    target = item.resolve()
                    if not target.is_relative_to(repo_resolved):
                        _log.debug("Skipping external symlink: %s -> %s", fname, target)
                        continue
                except OSError:
                    # Broken symlink
                    _log.debug("Skipping broken symlink: %s", fname)
                    continue

            # Check gitignore
            rel = normalize_path(item, repo_root)
            if _matches_gitignore(rel, gitignore_patterns):
                continue

            # Skip files exceeding max_file_size_kb
            try:
                size_kb = item.stat().st_size / 1024
                if settings and size_kb > settings.max_file_size_kb:
                    _log.warning(
                        "Skipping %s (%.1fMB) — exceeds max_file_size_kb=%d",
                        rel, size_kb / 1024, settings.max_file_size_kb,
                    )
                    continue
            except OSError:
                _log.debug("Skipping inaccessible file: %s", fname)
                continue

            files.append(item)

    return sorted(files)


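# Editor's sketch (not part of the package): typical standalone use, assuming a
# checkout at ./myrepo (the path is hypothetical):
#
#     from pathlib import Path
#     from codebrain.indexer import discover_files
#
#     for f in discover_files(Path("myrepo")):
#         print(f)   # every indexable file, sorted, with .gitignore respected

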
def full_index(
    repo_root: Path,
    db_path: Path | None = None,
    *,
    progress_callback: Callable[[int, int], None] | None = None,
    parallel_threshold: int | None = None,
    max_workers: int | None = None,
) -> dict:
    """Perform a full index of the repository.

    Args:
        progress_callback: Called with (current, total) after each file.
        parallel_threshold: Use ProcessPoolExecutor when the file count reaches this.
            Defaults to value from .codebrain.toml or 50.
        max_workers: Max worker processes (None = from settings or cpu_count).

    Returns a summary dict with counts and timing.
    """
    settings = load_settings(repo_root)

    if db_path is None:
        db_path = repo_root / settings.codebrain_dir / settings.db_filename
    if parallel_threshold is None:
        parallel_threshold = settings.parallel_threshold
    if max_workers is None:
        max_workers = settings.max_workers

    db_path.parent.mkdir(parents=True, exist_ok=True)

    files = discover_files(repo_root, settings)

    # Mark index as in-progress so interrupted runs can be detected
    with GraphStore(db_path) as _meta_store:
        _meta_store.conn.execute(
            "INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)",
            ("index_status", "in_progress"),
        )
        _meta_store.conn.execute(
            "INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)",
            ("index_files_total", str(len(files))),
        )
        _meta_store.conn.commit()

    start = time.perf_counter()

    parsed_count = 0
    skipped_count = 0
    total_nodes = 0
    total_edges = 0
    errors: list[str] = []

    use_parallel = (
        len(files) >= parallel_threshold
        and sys.platform != "win32"
    )

    # Per-file timeout (seconds). C extensions like tree-sitter can hold the
    # GIL, making thread-based timeouts useless. On Windows we stay on the
    # serial path and run those parsers via _parse_with_timeout, which uses a
    # killable subprocess.
    per_file_timeout = 30

    # File extensions where C-extension parsers may hang (tree-sitter).
    # Python uses the stdlib ast module, which does not hang.
    _NEEDS_ISOLATION = frozenset({".ts", ".tsx", ".js", ".jsx"})

    if use_parallel:
        _log.debug("Parallel parsing %d files (threshold=%d)", len(files), parallel_threshold)
        args_list = [(str(fp), str(repo_root)) for fp in files]
        parsed_results = []
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            for i, (pf, err) in enumerate(executor.map(_parse_one, args_list)):
                if err:
                    errors.append(err)
                elif pf is not None:
                    parsed_results.append(pf)
                if progress_callback:
                    progress_callback(i + 1, len(files))

        # Serial database writes
        with GraphStore(db_path) as store:
            for pf in parsed_results:
                store.upsert_file(pf)
                parsed_count += 1
                total_nodes += len(pf.nodes)
                total_edges += len(pf.edges)
    else:
        # Serial path with subprocess isolation for non-Python files (Windows)
        with GraphStore(db_path) as store:
            for i, file_path in enumerate(files):
                rel = normalize_path(file_path, repo_root)
                try:
                    # File may have been deleted between discover and parse
                    if not file_path.is_file():
                        _log.debug("Skipping deleted file: %s", rel)
                        skipped_count += 1
                        if progress_callback:
                            progress_callback(i + 1, len(files))
                        continue

                    needs_isolation = (
                        sys.platform == "win32"
                        and file_path.suffix in _NEEDS_ISOLATION
                    )
                    if needs_isolation:
                        pf, err = _parse_with_timeout(
                            file_path, repo_root, timeout=per_file_timeout,
                        )
                        if err:
                            errors.append(err)
                            if "timeout" in err:
                                skipped_count += 1
                            # pf is None here, so the upsert below is skipped
                            # and the loop-end progress callback still fires.
                    else:
                        _log.debug("Parsing %s", rel)
                        pf = _parse_file(file_path, repo_root)

                    if pf is not None:
                        store.upsert_file(pf)
                        parsed_count += 1
                        total_nodes += len(pf.nodes)
                        total_edges += len(pf.edges)
                except Exception as exc:
                    _log.debug("Error parsing %s: %s", rel, exc)
                    errors.append(f"{rel}: {exc}")
                if progress_callback:
                    progress_callback(i + 1, len(files))

    elapsed = time.perf_counter() - start

    # Mark index as complete
    with GraphStore(db_path) as _meta_store:
        _meta_store.conn.execute(
            "INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)",
            ("index_status", "complete"),
        )
        _meta_store.conn.execute(
            "INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)",
            ("index_files_parsed", str(parsed_count)),
        )
        _meta_store.conn.execute(
            "INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)",
            ("index_timestamp", str(time.time())),
        )
        _meta_store.conn.commit()

    return {
        "files_found": len(files),
        "files_parsed": parsed_count,
        "files_skipped": skipped_count,
        "total_nodes": total_nodes,
        "total_edges": total_edges,
        "errors": errors,
        "elapsed_seconds": round(elapsed, 3),
    }


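# Editor's sketch (not part of the package): minimal programmatic use, assuming
# a checkout at ./myrepo (the path and progress printer are hypothetical):
#
#     from pathlib import Path
#     from codebrain.indexer import full_index
#
#     summary = full_index(
#         Path("myrepo"),
#         progress_callback=lambda cur, tot: print(f"{cur}/{tot}", end="\r"),
#     )
#     print(summary["files_parsed"], "parsed,", len(summary["errors"]), "errors")
#
# Because full_index writes index_status to the meta table before and after the
# run, a reader that finds index_status == "in_progress" on startup can treat
# the index as the leftover of an interrupted run and trigger a re-index.

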
def incremental_update(
    repo_root: Path,
    changed_files: list[Path],
    deleted_files: list[Path],
    store: GraphStore,
) -> dict:
    """Re-parse only changed files and remove deleted ones.

    Returns a summary dict.
    """
    start = time.perf_counter()
    updated = 0
    removed = 0
    errors: list[str] = []

    for file_path in deleted_files:
        rel = normalize_path(file_path, repo_root)
        store.remove_file(rel)
        removed += 1

    for file_path in changed_files:
        rel = normalize_path(file_path, repo_root)
        try:
            # Check hash to skip unchanged
            current_hash = file_hash(file_path)
            stored_hash = store.get_file_hash(rel)
            if current_hash == stored_hash:
                continue

            pf = _parse_file(file_path, repo_root)
            store.upsert_file(pf)
            updated += 1
        except Exception as exc:
            errors.append(f"{rel}: {exc}")

    elapsed = time.perf_counter() - start
    return {
        "files_updated": updated,
        "files_removed": removed,
        "errors": errors,
        "elapsed_seconds": round(elapsed, 3),
    }
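

# Editor's sketch (not part of the package): a file watcher would call
# incremental_update with an open GraphStore; db_path, changed, and deleted
# are hypothetical here:
#
#     with GraphStore(db_path) as store:
#         summary = incremental_update(repo_root, changed, deleted, store)
#         for line in summary["errors"]:
#             _log.warning("incremental parse failed: %s", line)
#
# The file-hash short-circuit means callers can pass a noisy changed_files list
# (e.g. raw editor save events) without paying for redundant re-parses.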