codetool-explore 0.5.0__py3-none-win_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codetool_explore/__init__.py +35 -0
- codetool_explore/_bin/codetool-explore-rust-windows-arm64.exe +0 -0
- codetool_explore/api.py +266 -0
- codetool_explore/cli.py +188 -0
- codetool_explore/compression.py +150 -0
- codetool_explore/cursor.py +71 -0
- codetool_explore/errors.py +23 -0
- codetool_explore/explorer.py +497 -0
- codetool_explore/ignore.py +222 -0
- codetool_explore/py.typed +0 -0
- codetool_explore/python_backend/__init__.py +154 -0
- codetool_explore/python_backend/case.py +19 -0
- codetool_explore/python_backend/config.py +35 -0
- codetool_explore/python_backend/constants.py +39 -0
- codetool_explore/python_backend/file_search.py +51 -0
- codetool_explore/python_backend/ignore_rules.py +40 -0
- codetool_explore/python_backend/literal.py +79 -0
- codetool_explore/python_backend/matcher.py +79 -0
- codetool_explore/python_backend/models.py +49 -0
- codetool_explore/python_backend/output.py +82 -0
- codetool_explore/python_backend/regex_search.py +63 -0
- codetool_explore/python_backend/search.py +327 -0
- codetool_explore/python_backend/text.py +39 -0
- codetool_explore/python_backend/walker.py +119 -0
- codetool_explore/ranking.py +384 -0
- codetool_explore/roots.py +148 -0
- codetool_explore/rust_backend.py +308 -0
- codetool_explore/text_output.py +475 -0
- codetool_explore-0.5.0.dist-info/METADATA +240 -0
- codetool_explore-0.5.0.dist-info/RECORD +33 -0
- codetool_explore-0.5.0.dist-info/WHEEL +4 -0
- codetool_explore-0.5.0.dist-info/entry_points.txt +2 -0
- codetool_explore-0.5.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
"""Ignore, glob, and path-normalisation helpers.
|
|
2
|
+
|
|
3
|
+
The implementation deliberately avoids external dependencies. It is not a full
|
|
4
|
+
Git wildmatch clone; it provides fast common ignores plus simple shell-style
|
|
5
|
+
patterns that are good enough for coding-agent search.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import fnmatch
|
|
11
|
+
import os
|
|
12
|
+
from collections.abc import Iterable
|
|
13
|
+
|
|
14
|
+
# Directory names checked by ``is_common_ignored_dir`` and ``should_ignore_path``.
COMMON_IGNORED_DIRS: frozenset[str] = frozenset(
    (
        ".git",
        ".hg",
        ".svn",
        ".tox",
        ".nox",
        ".venv",
        "venv",
        "env",
        "__pycache__",
        ".mypy_cache",
        ".pytest_cache",
        ".ruff_cache",
        ".cache",
        "node_modules",
        "bower_components",
        "dist",
        "build",
        "target",
        ".next",
        ".nuxt",
        ".idea",
        ".vscode",
    )
)

# File names checked by ``is_common_ignored_file`` and ``should_ignore_path``.
COMMON_IGNORED_FILES: frozenset[str] = frozenset((".DS_Store", "Thumbs.db"))

# Ignore files read by ``load_ignore_patterns``, in this order.
IGNORE_FILES: tuple[str, ...] = (".pbiignore", ".gitignore")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def normalize_relpath(path: str) -> str:
    """Rewrite ``path`` with forward slashes and no leading ``./`` parts.

    A path that reduces to ``"."`` is returned as the empty string.
    """

    text = os.fspath(path)
    # Backslashes are always rewritten; the platform separators follow.
    # ``os.altsep`` may be ``None``, hence the truthiness guard.
    for separator in ("\\", os.sep, os.altsep):
        if separator:
            text = text.replace(separator, "/")
    while text[:2] == "./":
        text = text[2:]
    if text == ".":
        return ""
    return text
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def relative_path(path: str, root: str) -> str:
    """Express ``path`` relative to ``root`` using forward slashes.

    When ``os.path.relpath`` raises ``ValueError`` the normalised absolute
    form of ``path`` is returned instead.
    """

    try:
        located = os.path.relpath(path, root)
    except ValueError:
        # Different drives on Windows; fall back to a normalised absolute path.
        located = os.path.abspath(path)
    return normalize_relpath(located)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def normalize_patterns(patterns: str | Iterable[str] | None) -> tuple[str, ...]:
    """Turn a public glob/exclude argument into a tuple of clean patterns.

    ``None`` yields an empty tuple and a single string is treated as one
    pattern. ``None`` entries and entries that are empty after stripping and
    normalisation are dropped.
    """

    if patterns is None:
        return ()
    candidates = (patterns,) if isinstance(patterns, str) else tuple(patterns)
    cleaned = (
        normalize_relpath(str(item).strip())
        for item in candidates
        if item is not None
    )
    return tuple(text for text in cleaned if text)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def load_ignore_patterns(root: str) -> tuple[str, ...]:
    """Collect patterns from each file named in ``IGNORE_FILES`` under ``root``.

    Only a small syntax is understood: blank lines, ``#`` comments and lines
    starting with ``!`` are skipped, and a trailing slash expands to the
    directory itself plus ``<dir>/**``. Files that cannot be opened are
    skipped silently.
    """

    collected: list[str] = []
    for ignore_name in IGNORE_FILES:
        ignore_path = os.path.join(root, ignore_name)
        try:
            with open(ignore_path, encoding="utf-8", errors="replace") as stream:
                raw_lines = stream.readlines()
        except OSError:
            continue
        for raw in raw_lines:
            entry = raw.strip()
            if not entry or entry.startswith(("#", "!")):
                continue
            entry = normalize_relpath(entry)
            if not entry.endswith("/"):
                collected.append(entry)
                continue
            # Trailing slash: record the directory and everything below it.
            directory = entry.rstrip("/")
            collected.append(directory)
            collected.append(f"{directory}/**")
    return tuple(collected)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _path_segments(rel_path: str) -> tuple[str, ...]:
    """Return the non-empty ``/``-separated parts of the normalised path."""

    parts = normalize_relpath(rel_path).split("/")
    return tuple(filter(None, parts))
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def is_common_ignored_dir(name: str) -> bool:
    """Report whether ``name`` is listed in ``COMMON_IGNORED_DIRS``."""

    listed = name in COMMON_IGNORED_DIRS
    return listed
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def is_common_ignored_file(name: str) -> bool:
    """Report whether ``name`` is listed in ``COMMON_IGNORED_FILES``."""

    listed = name in COMMON_IGNORED_FILES
    return listed
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def path_matches_pattern(rel_path: str, pattern: str) -> bool:
    """Test a relative path against one simple shell-style pattern.

    Both arguments are normalised first. An empty pattern never matches.
    A match is reported when any of the following holds:

    * the whole path or its final component matches via ``fnmatchcase``;
    * the pattern has no ``/`` and equals one of the path's segments;
    * the pattern (with a trailing ``/**`` removed) equals the path or is a
      directory prefix of it.
    """

    candidate = normalize_relpath(rel_path)
    glob = normalize_relpath(pattern)
    if not glob:
        return False

    leaf = candidate.rsplit("/", 1)[-1]
    if any(fnmatch.fnmatchcase(subject, glob) for subject in (candidate, leaf)):
        return True

    # Treat bare directory names as "any segment named X".
    if "/" not in glob and glob in _path_segments(candidate):
        return True

    # Treat "dir" and "dir/**" as directory-prefix patterns.
    directory = glob[:-3] if glob.endswith("/**") else glob
    if not directory:
        return False
    return candidate == directory or candidate.startswith(
        directory.rstrip("/") + "/"
    )
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def path_matches_any(rel_path: str, patterns: Iterable[str]) -> bool:
    """Report whether at least one of ``patterns`` matches ``rel_path``."""

    for pattern in patterns:
        if path_matches_pattern(rel_path, pattern):
            return True
    return False
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def pattern_targets_path_or_descendant(path: str, pattern: str) -> bool:
    """Report whether ``pattern`` matches ``path`` or starts with ``path/``.

    Both values are normalised and stripped of surrounding slashes; if either
    ends up empty the answer is ``False``.
    """

    anchor = normalize_relpath(path).strip("/")
    candidate = normalize_relpath(pattern).strip("/")
    if not (anchor and candidate):
        return False
    if path_matches_pattern(anchor, candidate):
        return True
    return candidate.startswith(f"{anchor}/")
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def matches_glob(rel_path: str, glob_patterns: Iterable[str]) -> bool:
    """Decide whether the optional glob filters accept ``rel_path``.

    With no patterns every path is accepted; otherwise at least one pattern
    must match.
    """

    accepted = tuple(glob_patterns)
    return not accepted or path_matches_any(rel_path, accepted)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def should_ignore_path(
    rel_path: str,
    *,
    is_dir: bool,
    exclude_patterns: Iterable[str] = (),
    ignore_patterns: Iterable[str] = (),
    root_ignore_patterns: Iterable[str] = (),
    common_rel_path: str | None = None,
) -> bool:
    """Decide whether a file or directory should be skipped.

    The built-in name lists and ``root_ignore_patterns`` are checked against
    ``common_rel_path`` (falling back to ``rel_path`` when it is ``None``).
    ``exclude_patterns`` and ``ignore_patterns`` are checked against
    ``rel_path``.
    """

    subject = normalize_relpath(rel_path)
    shared = normalize_relpath(subject if common_rel_path is None else common_rel_path)
    leaf = shared.rsplit("/", 1)[-1]

    if is_dir:
        # A directory is skipped when its name or any segment is built-in ignored.
        names = (leaf, *_path_segments(shared))
        if not COMMON_IGNORED_DIRS.isdisjoint(names):
            return True
    elif leaf in COMMON_IGNORED_FILES:
        return True

    explicit = (*exclude_patterns, *ignore_patterns)
    if explicit and path_matches_any(subject, explicit):
        return True

    return bool(root_ignore_patterns) and path_matches_any(
        shared, root_ignore_patterns
    )
|
|
File without changes
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""Pure-Python stdlib backend for workspace search.
|
|
2
|
+
|
|
3
|
+
The package mirrors the Rust helper's ``rust/src`` organization so equivalent
|
|
4
|
+
backend responsibilities live in equivalent module names:
|
|
5
|
+
|
|
6
|
+
* ``constants`` and ``models`` define shared backend data;
|
|
7
|
+
* ``case`` and ``config`` validate public search options;
|
|
8
|
+
* ``walker`` and ``ignore_rules`` enumerate candidate files;
|
|
9
|
+
* ``matcher``, ``literal``, ``regex_search``, ``file_search``, and ``text``
|
|
10
|
+
perform path/content matching; and
|
|
11
|
+
* ``search`` coordinates ranking, pagination, and result assembly.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import re
|
|
17
|
+
|
|
18
|
+
from .case import resolve_case
|
|
19
|
+
from .config import normalize_mode, normalize_path_scope, normalize_target
|
|
20
|
+
from .constants import (
|
|
21
|
+
BINARY_CHECK_BYTES,
|
|
22
|
+
MAX_FILE_BYTES,
|
|
23
|
+
MAX_SNIPPETS_PER_FILE,
|
|
24
|
+
MAX_SNIPPET_CHARS,
|
|
25
|
+
VALID_MODES,
|
|
26
|
+
VALID_PATH_SCOPES,
|
|
27
|
+
VALID_TARGETS,
|
|
28
|
+
)
|
|
29
|
+
from .models import BinaryFileError, CandidateFile, IgnorePatterns
|
|
30
|
+
from .file_search import read_text_candidate
|
|
31
|
+
from .ignore_rules import ignore_patterns_for_root
|
|
32
|
+
from .literal import LiteralMatcher, count_non_overlapping, search_literal_file
|
|
33
|
+
from .matcher import PathMatcher, path_match_subject
|
|
34
|
+
from .output import (
|
|
35
|
+
base_result,
|
|
36
|
+
mark_snippets_for_content_or_path_target,
|
|
37
|
+
path_file_match,
|
|
38
|
+
record_path_only_match,
|
|
39
|
+
)
|
|
40
|
+
from .regex_search import RegexLineMatcher, search_regex_file
|
|
41
|
+
from .search import search_python
|
|
42
|
+
from .text import context_for_lines, crop, decode_line
|
|
43
|
+
from .walker import iter_candidate_files
|
|
44
|
+
|
|
45
|
+
__all__ = [
    # Imported from ``.constants``.
    "BINARY_CHECK_BYTES",
    "MAX_FILE_BYTES",
    "MAX_SNIPPETS_PER_FILE",
    "MAX_SNIPPET_CHARS",
    "VALID_MODES",
    "VALID_PATH_SCOPES",
    "VALID_TARGETS",
    # Imported from ``.models``.
    "BinaryFileError",
    "CandidateFile",
    "IgnorePatterns",
    # Imported from ``.walker``, ``.config``, ``.case`` and ``.search``.
    "iter_candidate_files",
    "normalize_mode",
    "normalize_path_scope",
    "normalize_target",
    "resolve_case",
    "search_python",
]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _read_text_candidate(path: str, size: int) -> bytes:
    """Compatibility shim: wrap ``path``/``size`` and call ``read_text_candidate``.

    ``path`` is used for both the candidate's ``path`` and ``rel_path``.
    """

    candidate = CandidateFile(path=path, rel_path=path, size=size)
    return read_text_candidate(candidate)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _search_literal_file(
    path: str,
    rel_path: str,
    size: int,
    *,
    pattern: str,
    needle: bytes,
    case_sensitive: bool,
    context_lines: int,
    collect_snippets: bool,
) -> tuple[dict[str, object] | None, list[dict[str, object]]]:
    """Compatibility shim that forwards to ``search_literal_file``.

    ``pattern`` is accepted for signature compatibility and is not used.
    """

    del pattern
    candidate = CandidateFile(path=path, rel_path=rel_path, size=size)
    data = read_text_candidate(candidate)
    matcher = LiteralMatcher(needle=needle, case_sensitive=case_sensitive)
    return search_literal_file(
        candidate,
        data,
        matcher,
        context_lines=context_lines,
        collect_snippets=collect_snippets,
    )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _search_regex_file(
    path: str,
    rel_path: str,
    size: int,
    *,
    compiled: re.Pattern[str],
    context_lines: int,
    collect_snippets: bool,
) -> tuple[dict[str, object] | None, list[dict[str, object]]]:
    """Compatibility shim that forwards to ``search_regex_file``."""

    candidate = CandidateFile(path=path, rel_path=rel_path, size=size)
    data = read_text_candidate(candidate)
    matcher = RegexLineMatcher(compiled=compiled)
    return search_regex_file(
        candidate,
        data,
        matcher,
        context_lines=context_lines,
        collect_snippets=collect_snippets,
    )
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _path_matches_literal(
    rel_path: str,
    *,
    pattern: str,
    case_sensitive: bool,
    path_scope: str,
) -> bool:
    """Compatibility shim: build a non-regex ``PathMatcher`` and apply it."""

    matcher = PathMatcher.build(
        pattern,
        regex=False,
        case_sensitive=case_sensitive,
    )
    return matcher.is_match(rel_path, path_scope)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _path_matches_regex(
    rel_path: str,
    *,
    compiled: re.Pattern[str],
    path_scope: str,
) -> bool:
    """Compatibility shim: search ``compiled`` within the path-match subject."""

    subject = path_match_subject(rel_path, path_scope)
    found = compiled.search(subject)
    return found is not None
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# Underscore-prefixed aliases for helpers imported above.
# From ``.output``:
_base_result = base_result
_mark_snippets_for_content_or_path_target = (
    mark_snippets_for_content_or_path_target
)
_path_file_match = path_file_match
_record_path_only_match = record_path_only_match
# From ``.text``:
_context_for_lines = context_for_lines
_crop = crop
_decode_line = decode_line
# From ``.literal``, ``.ignore_rules`` and ``.matcher``:
_count_non_overlapping = count_non_overlapping
_ignore_patterns_for_root = ignore_patterns_for_root
_path_match_subject = path_match_subject
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Case-sensitivity handling for the Python backend."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ..errors import ExploreArgumentError
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def resolve_case(case: str, pattern: str) -> tuple[str, bool]:
    """Resolve a case option into ``(effective_case, case_sensitive)``.

    A falsy ``case`` is treated as ``"smart"``, which is sensitive only when
    ``pattern`` contains an uppercase character. Unknown values raise
    ``ExploreArgumentError``.
    """

    requested = str(case or "smart").lower()
    if requested == "smart":
        for char in pattern:
            if char.isupper():
                return "sensitive", True
        return "insensitive", False
    if requested in ("sensitive", "case-sensitive", "exact"):
        return "sensitive", True
    if requested in ("insensitive", "ignore", "ignorecase", "case-insensitive", "i"):
        return "insensitive", False
    raise ExploreArgumentError("case must be one of: smart, sensitive, insensitive")
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Public-option normalization for the Python backend."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ..errors import ExploreArgumentError
|
|
6
|
+
from .constants import VALID_MODES, VALID_PATH_SCOPES, VALID_TARGETS
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def normalize_mode(mode: str) -> str:
    """Lower-case and validate an output mode; a falsy value means ``"files"``.

    Raises ``ExploreArgumentError`` when the result is not in ``VALID_MODES``.
    """

    mode_name = str(mode or "files").lower()
    if mode_name in VALID_MODES:
        return mode_name
    raise ExploreArgumentError("mode must be one of: files, snippets, count")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def normalize_target(target: str) -> str:
    """Lower-case and validate a search target; a falsy value means ``"content"``.

    Raises ``ExploreArgumentError`` when the result is not in ``VALID_TARGETS``.
    """

    target_name = str(target or "content").lower()
    if target_name in VALID_TARGETS:
        return target_name
    raise ExploreArgumentError(
        "target must be one of: content, path, content_or_path"
    )
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def normalize_path_scope(path_scope: str) -> str:
    """Lower-case and validate a path scope; a falsy value means ``"path"``.

    Raises ``ExploreArgumentError`` when the result is not in
    ``VALID_PATH_SCOPES``.
    """

    scope_name = str(path_scope or "path").lower()
    if scope_name in VALID_PATH_SCOPES:
        return scope_name
    raise ExploreArgumentError("path_scope must be one of: path, basename")
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Constants used by the Python backend."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
# Defaults used as fallbacks by ``max_file_bytes`` / ``binary_check_bytes``.
DEFAULT_MAX_FILE_BYTES = 5 << 20  # 5 * 1024 * 1024
DEFAULT_BINARY_CHECK_BYTES = 8 << 10  # 8 * 1024

# Names looked up at call time by ``_runtime_constant``.
MAX_FILE_BYTES = DEFAULT_MAX_FILE_BYTES
BINARY_CHECK_BYTES = DEFAULT_BINARY_CHECK_BYTES

MAX_SNIPPETS_PER_FILE = 3
MAX_SNIPPET_CHARS = 180

# Accepted values for the public ``mode``, ``target`` and ``path_scope`` options.
VALID_MODES = frozenset(("files", "snippets", "count"))
VALID_TARGETS = frozenset(("content", "path", "content_or_path"))
VALID_PATH_SCOPES = frozenset(("path", "basename"))
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _runtime_constant(name: str, default: int) -> int:
    """Look up a tunable constant at call time.

    A value set on the ``codetool_explore.python_backend`` package wins when
    it differs from ``default``; otherwise this module's global of the same
    name is used, falling back to ``default``. The result is coerced to int.
    """

    package = sys.modules.get("codetool_explore.python_backend")
    override: Any = default
    if package:
        override = getattr(package, name, default)
    if override != default:
        return int(override)
    return int(globals().get(name, default))
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def max_file_bytes() -> int:
    """Return the current ``MAX_FILE_BYTES`` value resolved at call time."""

    limit = _runtime_constant("MAX_FILE_BYTES", DEFAULT_MAX_FILE_BYTES)
    return limit
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def binary_check_bytes() -> int:
    """Return the current ``BINARY_CHECK_BYTES`` value resolved at call time."""

    probe_size = _runtime_constant("BINARY_CHECK_BYTES", DEFAULT_BINARY_CHECK_BYTES)
    return probe_size
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Read text candidates and dispatch content searchers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .constants import binary_check_bytes, max_file_bytes
|
|
6
|
+
from .literal import LiteralMatcher, search_literal_file
|
|
7
|
+
from .matcher import SearchMatcher
|
|
8
|
+
from .models import BinaryFileError, CandidateFile
|
|
9
|
+
from .regex_search import RegexLineMatcher, search_regex_file
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def read_text_candidate(candidate: CandidateFile) -> bytes:
    """Return the full bytes of ``candidate`` unless a guard rejects it.

    Raises ``OverflowError`` when ``candidate.size`` exceeds
    ``max_file_bytes()`` and ``BinaryFileError`` when the first
    ``binary_check_bytes()`` bytes contain a NUL byte.
    """

    limit = max_file_bytes()
    if candidate.size > limit:
        raise OverflowError("huge file")
    with open(candidate.path, "rb") as stream:
        # Probe the head of the file before reading the remainder.
        probe = stream.read(binary_check_bytes())
        if b"\x00" in probe:
            raise BinaryFileError("binary file")
        remainder = stream.read()
    return probe + remainder
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def search_file(
    candidate: CandidateFile,
    matcher: SearchMatcher,
    *,
    context_lines: int,
    collect_snippets: bool,
) -> tuple[dict[str, object] | None, list[dict[str, object]]]:
    """Read one candidate and run the searcher that fits ``matcher``.

    The file is read before the matcher type is inspected. A matcher that is
    neither a ``LiteralMatcher`` nor a ``RegexLineMatcher`` raises
    ``TypeError``.
    """

    data = read_text_candidate(candidate)
    if isinstance(matcher, LiteralMatcher):
        searcher = search_literal_file
    elif isinstance(matcher, RegexLineMatcher):
        searcher = search_regex_file
    else:
        raise TypeError(f"unsupported matcher: {type(matcher).__name__}")
    return searcher(
        candidate,
        data,
        matcher,
        context_lines=context_lines,
        collect_snippets=collect_snippets,
    )
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Ignore-pattern loading for Python backend candidate walking."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
from ..ignore import (
|
|
8
|
+
load_ignore_patterns,
|
|
9
|
+
pattern_targets_path_or_descendant,
|
|
10
|
+
relative_path,
|
|
11
|
+
)
|
|
12
|
+
from .models import IgnorePatterns
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def ignore_patterns_for_root(
    root_abs: str,
    *,
    rel_base_abs: str | None,
    is_file: bool,
) -> IgnorePatterns:
    """Build the common-base and root-local ignore patterns for a search root.

    For a file root, patterns are loaded from its parent directory. Without a
    ``rel_base_abs`` only root patterns are returned. Otherwise patterns from
    ``rel_base_abs`` are kept as "common" unless they target the root's own
    relative prefix or something beneath it. Both lists are de-duplicated
    with first-seen order preserved.
    """

    if is_file:
        filter_root = os.path.dirname(root_abs)
    else:
        filter_root = root_abs
    if rel_base_abs is None:
        return IgnorePatterns(root=load_ignore_patterns(filter_root))

    prefix = relative_path(filter_root, rel_base_abs)
    # Dict keys give ordered de-duplication.
    shared: dict[str, None] = {}
    if os.path.isdir(rel_base_abs):
        for pattern in load_ignore_patterns(rel_base_abs):
            if not pattern_targets_path_or_descendant(prefix, pattern):
                shared[pattern] = None

    local = dict.fromkeys(load_ignore_patterns(filter_root))
    return IgnorePatterns(common=tuple(shared), root=tuple(local))
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Literal content matching for the Python backend."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
from .constants import MAX_SNIPPETS_PER_FILE
|
|
8
|
+
from .models import CandidateFile
|
|
9
|
+
from .text import context_for_lines, crop, decode_line
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
class LiteralMatcher:
    """Immutable pairing of a literal byte needle with its case mode."""

    # Byte sequence to look for.
    needle: bytes
    # When false, ``search_literal_file`` lower-cases needle and lines first.
    case_sensitive: bool
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def count_non_overlapping(haystack: bytes, needle: bytes) -> int:
    """Count non-overlapping occurrences of ``needle`` in ``haystack``.

    Delegates to ``bytes.count``, which scans left to right and never counts
    overlapping matches. An empty ``needle`` yields ``len(haystack) + 1``,
    the same as the previous hand-written loop.
    """

    return haystack.count(needle)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def search_literal_file(
    candidate: CandidateFile,
    data: bytes,
    matcher: LiteralMatcher,
    *,
    context_lines: int,
    collect_snippets: bool,
) -> tuple[dict[str, object] | None, list[dict[str, object]]]:
    """Scan ``data`` line by line for the matcher's literal needle.

    Returns ``(None, [])`` when nothing matches. Otherwise returns a file
    record with ``path``, ``count`` and ``first_line`` (1-based), plus up to
    ``MAX_SNIPPETS_PER_FILE`` snippet records when ``collect_snippets`` is
    true. Case-insensitive matching lower-cases both needle and line bytes.
    """

    fold_case = not matcher.case_sensitive
    needle = matcher.needle.lower() if fold_case else matcher.needle
    all_lines = data.splitlines()

    total = 0
    first_hit: int | None = None
    found: list[dict[str, object]] = []

    for number, raw in enumerate(all_lines, start=1):
        hits = count_non_overlapping(raw.lower() if fold_case else raw, needle)
        if not hits:
            continue
        total += hits
        if first_hit is None:
            first_hit = number
        if not collect_snippets or len(found) >= MAX_SNIPPETS_PER_FILE:
            continue
        entry: dict[str, object] = {
            "path": candidate.rel_path,
            "line": number,
            "snippet": crop(decode_line(raw)),
        }
        # ``context_for_lines`` takes the zero-based index of the hit line.
        surrounding = context_for_lines(all_lines, number - 1, context_lines)
        if surrounding:
            entry["context"] = surrounding
        found.append(entry)

    if total == 0 or first_hit is None:
        return None, []

    summary: dict[str, object] = {
        "path": candidate.rel_path,
        "count": total,
        "first_line": first_hit,
    }
    return summary, found
|