java-codebase-rag 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
path_filtering.py ADDED
@@ -0,0 +1,472 @@
1
+ """Layered path ignore rules for Java indexing and graph enrichment (B5).
2
+
3
+ Resolution order (later overrides earlier; innermost nested wins among peers):
4
+
5
+ 1. ``builtin_default`` — legacy ``COMMON_EXCLUDED_PATH_PATTERNS`` (gitignore-style).
6
+ 2. ``project_root`` — ``<project>/.java-codebase-rag/ignore``.
7
+ 3. ``nested`` — each ``<dir>/.java-codebase-rag/ignore`` along the path from project root
8
+ to the file's parent (outer dirs first, inner dirs last).
9
+ 4. ``gitignore`` — each ``.gitignore`` from project root down to the file's parent
10
+ (when ``use_gitignore`` is true), using :class:`pathspec.GitIgnoreSpec`.
11
+
12
+ Paths outside ``project_root`` are never ignored by this object.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import fnmatch
17
+ import os
18
+ import warnings
19
+ from collections.abc import Iterator, Sequence
20
+ from dataclasses import dataclass
21
+ from pathlib import Path
22
+ from typing import overload
23
+
24
+ from pathspec import GitIgnoreSpec
25
+
26
+ # Pruning for LocalFile sources: skip VCS, build outputs, dependency trees, and
27
+ # test sources (we currently index prod Java only to keep the semantic index clean).
28
+ # Also avoids EMFILE under default ulimits when the engine traverses in parallel.
29
+ #
30
+ # Note on build-output dir names: ``out``, ``build`` and ``target`` are also legal
31
+ # Java package names (e.g. ``com.example.out.api``). The unconditional ``**/out/**``
32
+ # pattern that previously lived here false-matched such packages and silently
33
+ # dropped real source files. These dirs are now pruned only when they sit next to
34
+ # a build-tool indicator (``pom.xml``, ``build.gradle``, ``build.gradle.kts``,
35
+ # ``settings.gradle``, ``settings.gradle.kts``) — see ``_is_build_output_dir``
36
+ # and ``BUILD_DIR_NAMES``. If you genuinely need to skip an arbitrary nested
37
+ # directory, add a ``.java-codebase-rag/ignore`` entry at the project or subtree root.
38
COMMON_EXCLUDED_PATH_PATTERNS: list[str] = [
    "**/.*",  # every dot-prefixed file or directory
    "**/.git/**",
    "**/.idea/**",
    "**/.venv/**",
    "**/node_modules/**",
    "**/*.class",  # compiled bytecode
    "**/src/test/java/**",
    "**/src/test/resources/**",
]

# Candidate build-output directory names. A directory carrying one of these
# names is pruned only if its parent directory also holds at least one of
# ``BUILD_TOOL_INDICATORS``; on its own the name proves nothing.
BUILD_DIR_NAMES: tuple[str, ...] = ("target", "build", "out")

# Marker files identifying a directory as a JVM build module (Maven/Gradle).
# A ``BUILD_DIR_NAMES`` entry found beside one of these is treated as build
# output and removed from the walk.
BUILD_TOOL_INDICATORS: tuple[str, ...] = (
    "pom.xml",
    "build.gradle", "build.gradle.kts",
    "settings.gradle", "settings.gradle.kts",
)

# Directory names dropped from the walk no matter what sits next to them
# (universal nuisance dirs; never a legal package name in practice).
UNCONDITIONAL_PRUNE_DIRS: frozenset[str] = frozenset(
    (".git", ".idea", ".venv", "node_modules")
)
72
+
73
+
74
def _is_build_output_dir(parent_dir: str, dirname: str) -> bool:
    """Decide whether ``<parent_dir>/<dirname>`` should be pruned as build output.

    Both conditions must hold:

    * ``dirname`` is listed in :data:`BUILD_DIR_NAMES`, and
    * ``parent_dir`` directly contains an entry named like one of
      :data:`BUILD_TOOL_INDICATORS`.

    A bare ``out`` / ``build`` / ``target`` directory with no marker beside it is
    left alone, so Java packages such as ``com.example.out.api`` keep their
    sources. If ``parent_dir`` cannot be listed, the answer is ``False``.
    """
    if dirname in BUILD_DIR_NAMES:
        try:
            with os.scandir(parent_dir) as entries:
                present = frozenset(item.name for item in entries)
        except OSError:
            # Unreadable or vanished parent: err on the side of walking the dir.
            return False
        return not present.isdisjoint(BUILD_TOOL_INDICATORS)
    return False
90
+
91
+
92
def compile_excluded_glob_patterns(
    patterns: Sequence[str] | tuple[str, ...],
) -> list[str]:
    """Return the exclude patterns as a fresh list, in their original order.

    No compilation takes place; the input is only copied into a new ``list``.
    """
    return [*patterns]
97
+
98
+
99
def is_relative_path_excluded(rel_posix: str, exclude_globs: list[str]) -> bool:
    """Report whether a project-relative path is hit by any exclude glob.

    Each glob is tried with :func:`fnmatch.fnmatch` against two spellings of
    the path: ``rel_posix`` as given, and ``**/<rel_posix>``. The second
    spelling lets patterns written as ``**/name`` also catch entries that sit
    directly at the project root.
    """
    rooted_spelling = f"**/{rel_posix}"
    return any(
        fnmatch.fnmatch(rel_posix, glob) or fnmatch.fnmatch(rooted_spelling, glob)
        for glob in exclude_globs
    )
107
+
108
+
109
@dataclass(frozen=True)
class IgnoreLayer:
    """Immutable description of one ignore configuration rooted at ``root``.

    The patterns held by ``spec`` are meant to be applied to paths underneath
    ``root``.
    """

    # Directory the patterns are anchored at.
    root: Path
    # Compiled gitignore-style matcher for this layer.
    spec: GitIgnoreSpec
    # Label of the layer the rules came from (e.g. ``"builtin_default"``,
    # ``"project_root"``, ``"nested"`` or ``"gitignore"``).
    source: str
    # File that supplied the rules; ``None`` when no file backs them.
    ignore_file: Path | None = None
117
+
118
+
119
+ def _read_ignore_lines(path: Path) -> list[str]:
120
+ try:
121
+ text = path.read_text(encoding="utf-8", errors="replace")
122
+ except OSError:
123
+ return []
124
+ return text.splitlines()
125
+
126
+
127
+ def _line_has_negation(lines: Sequence[str]) -> bool:
128
+ for raw in lines:
129
+ s = raw.strip()
130
+ if not s or s.startswith("#"):
131
+ continue
132
+ if s.startswith("\\!"):
133
+ continue
134
+ if s.startswith("!"):
135
+ return True
136
+ return False
137
+
138
+
139
+ def _scan_negation_any_bundle_ignore(project_root: Path) -> bool:
140
+ """Return True if any ``.java-codebase-rag/ignore`` contains a negation (``!``) line.
141
+
142
+ Runs one ``rglob`` at :class:`LayeredIgnore` construction. Fine for typical
143
+ repos; very large monorepos pay a full-tree walk on every new ``LayeredIgnore``
144
+ instance (same for :func:`_scan_negation_any_gitignore`).
145
+ """
146
+ root = project_root.resolve()
147
+ try:
148
+ for p in root.rglob(".java-codebase-rag"):
149
+ if not p.is_dir():
150
+ continue
151
+ ign = p / "ignore"
152
+ if ign.is_file() and _line_has_negation(_read_ignore_lines(ign)):
153
+ return True
154
+ except OSError:
155
+ return False
156
+ return False
157
+
158
+
159
+ def _scan_negation_any_gitignore(project_root: Path) -> bool:
160
+ """See :func:`_scan_negation_any_bundle_ignore` (also uses ``rglob``)."""
161
+ root = project_root.resolve()
162
+ try:
163
+ for p in root.rglob(".gitignore"):
164
+ if p.is_file() and _line_has_negation(_read_ignore_lines(p)):
165
+ return True
166
+ except OSError:
167
+ return False
168
+ return False
169
+
170
+
171
+ def _prefix_line_to_project(
172
+ prefix_posix: str,
173
+ raw_line: str,
174
+ ) -> str | None:
175
+ """Map a gitignore line from a subdirectory anchor to project-root-relative."""
176
+ line = raw_line.strip()
177
+ if not line or line.startswith("#"):
178
+ return None
179
+ neg = line.startswith("!")
180
+ body = line[1:] if neg else line
181
+ if body.startswith("\\#") or body.startswith("\\!"):
182
+ body = body[1:]
183
+ anchored = body.startswith("/")
184
+ if anchored:
185
+ body = body[1:]
186
+ if prefix_posix:
187
+ mapped = f"{prefix_posix}/{body}" if body else prefix_posix
188
+ else:
189
+ mapped = body
190
+ return f"!{mapped}" if neg else mapped
191
+
192
+
193
+ def _mega_build_for_rel(
194
+ self_root: Path,
195
+ rel_project: str,
196
+ *,
197
+ use_gitignore: bool,
198
+ builtin_lines: list[str],
199
+ project_ignore_path: Path,
200
+ project_lines: list[str] | None,
201
+ ) -> tuple[list[str], list[tuple[str, Path | None, int, str]]]:
202
+ """Mega gitignore lines (project-relative) + (source, file, line_no, pattern_text)."""
203
+ mega: list[str] = []
204
+ meta: list[tuple[str, Path | None, int, str]] = []
205
+
206
+ def extend_builtin() -> None:
207
+ for i, raw in enumerate(builtin_lines, start=1):
208
+ s = raw.strip()
209
+ if not s or s.startswith("#"):
210
+ continue
211
+ mega.append(raw.rstrip("\n"))
212
+ meta.append(("builtin_default", None, i, s))
213
+
214
+ def extend_file(source: str, path: Path, lines: Sequence[str]) -> None:
215
+ for lineno, raw in enumerate(lines, start=1):
216
+ s = raw.strip()
217
+ if not s or s.startswith("#"):
218
+ continue
219
+ mega.append(raw.rstrip("\n"))
220
+ meta.append((source, path, lineno, s))
221
+
222
+ extend_builtin()
223
+ if project_lines is not None:
224
+ extend_file("project_root", project_ignore_path, project_lines)
225
+
226
+ parts = Path(rel_project).parts
227
+ dir_parts = parts[:-1] if len(parts) > 1 else ()
228
+ for i in range(1, len(dir_parts) + 1):
229
+ anchor = self_root.joinpath(*dir_parts[:i])
230
+ nested_path = anchor / ".java-codebase-rag" / "ignore"
231
+ if not nested_path.is_file():
232
+ continue
233
+ prefix = anchor.relative_to(self_root).as_posix()
234
+ nlines = _read_ignore_lines(nested_path)
235
+ for lineno, raw in enumerate(nlines, start=1):
236
+ mapped = _prefix_line_to_project(prefix, raw)
237
+ if mapped is None:
238
+ continue
239
+ mega.append(mapped)
240
+ meta.append(("nested", nested_path, lineno, raw.strip()))
241
+
242
+ if use_gitignore:
243
+ for i in range(len(dir_parts) + 1):
244
+ anchor = self_root if i == 0 else self_root.joinpath(*dir_parts[:i])
245
+ git_path = anchor / ".gitignore"
246
+ if not git_path.is_file():
247
+ continue
248
+ prefix = anchor.relative_to(self_root).as_posix() if i > 0 else ""
249
+ glines = _read_ignore_lines(git_path)
250
+ for lineno, raw in enumerate(glines, start=1):
251
+ mapped = _prefix_line_to_project(prefix, raw)
252
+ if mapped is None:
253
+ continue
254
+ mega.append(mapped)
255
+ meta.append(("gitignore", git_path, lineno, raw.strip()))
256
+
257
+ return mega, meta
258
+
259
+
260
+ def _winning_row(
261
+ rel: str,
262
+ mega: list[str],
263
+ meta: list[tuple[str, Path | None, int, str]],
264
+ ) -> tuple[str, Path | None, int, str]:
265
+ """The last rule line that changes the cumulative match result (git semantics)."""
266
+ if not mega:
267
+ return "builtin_default", None, 1, ""
268
+ state = False
269
+ last_idx = 0
270
+ for i in range(len(mega)):
271
+ cur = GitIgnoreSpec.from_lines(mega[: i + 1]).match_file(rel)
272
+ if cur != state:
273
+ last_idx = i
274
+ state = cur
275
+ return meta[last_idx]
276
+
277
+
278
class LayeredIgnore:
    """Answer "is this path ignored?" for one project root using layered rules."""

    def __init__(
        self,
        project_root: Path | str,
        *,
        use_gitignore: bool = True,
        builtin_patterns: Sequence[str] | None = None,
    ) -> None:
        """Capture the root-level configuration.

        Args:
            project_root: Project directory; ``~`` is expanded and the path is
                resolved.
            use_gitignore: Whether ``.gitignore`` files take part.
            builtin_patterns: Replacement for
                :data:`COMMON_EXCLUDED_PATH_PATTERNS`; ``None`` keeps the
                defaults.

        The project-level ignore file is read once here. Construction also
        scans the tree for negation rules (see
        :meth:`cocoindex_excluded_patterns`).
        """
        self.project_root = Path(project_root).expanduser().resolve()
        self.use_gitignore = use_gitignore
        self._builtin_lines = list(
            COMMON_EXCLUDED_PATH_PATTERNS if builtin_patterns is None else builtin_patterns
        )
        self._project_ignore_path = self.project_root / ".java-codebase-rag" / "ignore"
        self._project_lines: list[str] | None = (
            _read_ignore_lines(self._project_ignore_path)
            if self._project_ignore_path.is_file()
            else None
        )
        # Any negation rule means pruned directories might be re-included.
        negation_found = _scan_negation_any_bundle_ignore(self.project_root)
        if not negation_found:
            negation_found = use_gitignore and _scan_negation_any_gitignore(self.project_root)
        self._permissive_coco_walk = negation_found

    def cocoindex_excluded_patterns(self) -> list[str]:
        """Exclude patterns for CocoIndex ``PatternFilePathMatcher.excluded_patterns``.

        While no negation rule exists anywhere under the project, a copy of
        the builtin patterns is returned (pre-B5 behaviour). As soon as a
        negation rule was detected at construction, an empty list is returned
        and callers must filter each path with :meth:`is_ignored` instead.
        """
        return [] if self._permissive_coco_walk else list(self._builtin_lines)

    def _rel_project(self, path: Path) -> str | None:
        """Project-relative POSIX form of ``path``; ``None`` if outside the root."""
        try:
            inside = path.resolve().relative_to(self.project_root)
        except ValueError:
            return None
        return inside.as_posix()

    def _path_for_display(self, path: Path | None) -> str:
        """Shortest sensible spelling of ``path`` for messages.

        Tries project-relative first, then relative to the current working
        directory, and finally the path as given. ``None`` becomes ``""``.
        """
        if path is None:
            return ""
        # Bases are looked up lazily so the cwd is only queried as a fallback.
        for locate_base in (lambda: self.project_root, Path.cwd):
            try:
                return path.resolve().relative_to(locate_base()).as_posix()
            except ValueError:
                continue
        return path.as_posix()

    def _mega(self, rel_project: str) -> tuple[list[str], GitIgnoreSpec, list[tuple[str, Path | None, int, str]]]:
        """Rule lines, their compiled spec, and per-line provenance for one path."""
        rule_lines, provenance = _mega_build_for_rel(
            self.project_root,
            rel_project,
            use_gitignore=self.use_gitignore,
            builtin_lines=self._builtin_lines,
            project_ignore_path=self._project_ignore_path,
            project_lines=self._project_lines,
        )
        return rule_lines, GitIgnoreSpec.from_lines(rule_lines), provenance

    def is_ignored(self, path: Path) -> tuple[bool, IgnoreLayer | None]:
        """Return ``(ignored, layer)`` for ``path``.

        ``layer`` describes the rule source that decided the outcome and is
        ``None`` whenever the path is not ignored (including paths outside
        the project root).
        """
        rel = self._rel_project(path)
        if rel is None:
            return False, None
        rule_lines, spec, provenance = self._mega(rel)
        if not rule_lines or not spec.match_file(rel):
            return False, None
        source, origin, _line_no, _pattern = _winning_row(rel, rule_lines, provenance)
        layer = IgnoreLayer(
            root=self.project_root,
            spec=spec,
            source=source,
            ignore_file=origin,
        )
        return True, layer

    def diagnose(self, path: Path) -> str:
        """Render :meth:`diagnose_dict` as four lines of text."""
        report = self.diagnose_dict(path)
        return "\n".join(
            (
                f"ignored={report.get('ignored')}",
                f"layer={report.get('layer')!r}",
                f"matching_pattern={report.get('matching_pattern')!r}",
                str(report.get("explanation", "")),
            )
        )

    def diagnose_dict(self, path: Path) -> dict[str, object]:
        """Structured explanation of the ignore decision (MCP ``diagnose_ignore``).

        Keys: ``ignored`` (bool), ``layer`` (source label or ``None``),
        ``matching_pattern`` (pattern text or ``None``) and ``explanation``.
        """
        rel = self._rel_project(path)
        if rel is None:
            outside_note = (
                f"Path {self._path_for_display(path)!r} is outside the configured "
                "project root — not ignored."
            )
            return {
                "ignored": False,
                "layer": None,
                "matching_pattern": None,
                "explanation": outside_note,
            }

        rule_lines, spec, provenance = self._mega(rel)
        if not rule_lines or not spec.match_file(rel):
            return {
                "ignored": False,
                "layer": None,
                "matching_pattern": None,
                "explanation": f"Path {rel!r} is not ignored by any configured layer.",
            }

        source, origin, line_no, pattern = _winning_row(rel, rule_lines, provenance)
        if origin is None:
            reason = f"Excluded by builtin default ({source}) at builtin line {line_no}: {pattern!r}"
        else:
            reason = (
                f"Excluded by {self._path_for_display(origin)} ({source}) at line {line_no}: {pattern!r}"
            )
        return {
            "ignored": True,
            "layer": source,
            "matching_pattern": pattern,
            "explanation": reason,
        }
421
+
422
+
423
@overload
def iter_java_source_files(root: Path) -> Iterator[Path]: ...


@overload
def iter_java_source_files(root: Path, exclude_globs: list[str]) -> Iterator[Path]: ...


@overload
def iter_java_source_files(root: Path, *, ignore: LayeredIgnore) -> Iterator[Path]: ...


def iter_java_source_files(
    root: Path,
    exclude_globs: list[str] | None = None,
    *,
    ignore: LayeredIgnore | None = None,
) -> Iterator[Path]:
    """Walk ``root`` for ``*.java``, honouring prunes and layered ignore rules.

    Arguments are validated, and the deprecation warning is issued, at call
    time rather than on first iteration, so mistakes surface at the call
    site. The directory walk itself stays lazy.

    Args:
        root: Directory to walk; resolved before walking.
        exclude_globs: Deprecated. Patterns used as the builtin layer of a
            temporary :class:`LayeredIgnore` with ``.gitignore`` handling off.
        ignore: Pre-built :class:`LayeredIgnore` deciding which files to skip.
            When neither argument is given, ``LayeredIgnore(root)`` is used.

    Returns:
        A lazy iterator over the ``*.java`` files that are not ignored.

    Raises:
        TypeError: If both ``exclude_globs`` and ``ignore`` are supplied.
    """
    if exclude_globs is not None and ignore is not None:
        raise TypeError("pass either exclude_globs or ignore=, not both")
    if exclude_globs is not None:
        warnings.warn(
            "iter_java_source_files(root, exclude_globs) is deprecated; "
            "use iter_java_source_files(root, ignore=LayeredIgnore(root, ...)).",
            DeprecationWarning,
            stacklevel=2,
        )
        ignore_ctx = LayeredIgnore(root, builtin_patterns=exclude_globs, use_gitignore=False)
    elif ignore is not None:
        ignore_ctx = ignore
    else:
        ignore_ctx = LayeredIgnore(root)
    return _walk_java_sources(root.resolve(), ignore_ctx)


def _walk_java_sources(root: Path, ignore_ctx: LayeredIgnore) -> Iterator[Path]:
    """Yield non-ignored ``*.java`` files below the already-resolved ``root``."""
    for dirpath, dirnames, filenames in os.walk(root):
        # Universal nuisance dirs (VCS, IDE, deps) are pruned unconditionally.
        # Build-output dirs (``out`` / ``build`` / ``target``) are pruned only when
        # they sit alongside a build-tool indicator file — otherwise names like
        # ``out`` belong to a Java package (e.g. ``com.example.out.api``) and must
        # be walked. See ``_is_build_output_dir``.
        # Assigning through the slice edits the list ``os.walk`` recurses into.
        dirnames[:] = [
            d
            for d in dirnames
            if d not in UNCONDITIONAL_PRUNE_DIRS
            and not _is_build_output_dir(dirpath, d)
        ]
        for fn in filenames:
            if not fn.endswith(".java"):
                continue
            p = Path(dirpath) / fn
            ign, _ = ignore_ctx.is_ignored(p)
            if ign:
                continue
            yield p