tree-sitter-analyzer 1.2.2 (py3-none-any.whl) → 1.2.4 (py3-none-any.whl)
This diff shows the changes between publicly released package versions as they appear in their public registries; it is provided for informational purposes only.
Potentially problematic release.
This version of tree-sitter-analyzer might be problematic.
- tree_sitter_analyzer/languages/java_plugin.py +71 -1
- tree_sitter_analyzer/mcp/server.py +23 -1
- tree_sitter_analyzer/mcp/tools/fd_rg_utils.py +550 -0
- tree_sitter_analyzer/mcp/tools/find_and_grep_tool.py +436 -0
- tree_sitter_analyzer/mcp/tools/list_files_tool.py +269 -0
- tree_sitter_analyzer/mcp/tools/search_content_tool.py +334 -0
- {tree_sitter_analyzer-1.2.2.dist-info → tree_sitter_analyzer-1.2.4.dist-info}/METADATA +233 -24
- {tree_sitter_analyzer-1.2.2.dist-info → tree_sitter_analyzer-1.2.4.dist-info}/RECORD +10 -6
- {tree_sitter_analyzer-1.2.2.dist-info → tree_sitter_analyzer-1.2.4.dist-info}/WHEEL +0 -0
- {tree_sitter_analyzer-1.2.2.dist-info → tree_sitter_analyzer-1.2.4.dist-info}/entry_points.txt +0 -0

tree_sitter_analyzer/languages/java_plugin.py

@@ -150,7 +150,7 @@ class JavaElementExtractor(ElementExtractor):
     def extract_imports(
         self, tree: "tree_sitter.Tree", source_code: str
     ) -> list[Import]:
-        """Extract Java import statements"""
+        """Extract Java import statements with enhanced robustness"""
         self.source_code = source_code
         self.content_lines = source_code.split("\n")

@@ -172,9 +172,79 @@ class JavaElementExtractor(ElementExtractor):
             # After package and imports come class declarations, so stop
             break

+        # Fallback: if no imports found via tree-sitter, try regex-based extraction
+        if not imports and "import" in source_code:
+            log_debug("No imports found via tree-sitter, trying regex fallback")
+            fallback_imports = self._extract_imports_fallback(source_code)
+            imports.extend(fallback_imports)
+
         log_debug(f"Extracted {len(imports)} imports")
         return imports

+    def _extract_imports_fallback(self, source_code: str) -> list[Import]:
+        """Fallback import extraction using regex when tree-sitter fails"""
+        imports = []
+        lines = source_code.split("\n")
+
+        for line_num, line in enumerate(lines, 1):
+            line = line.strip()
+            if line.startswith("import ") and line.endswith(";"):
+                # Extract import statement
+                import_content = line[:-1]  # Remove semicolon
+
+                if "static" in import_content:
+                    # Static import
+                    static_match = re.search(
+                        r"import\s+static\s+([\w.]+)", import_content
+                    )
+                    if static_match:
+                        import_name = static_match.group(1)
+                        if import_content.endswith(".*"):
+                            import_name = import_name.replace(".*", "")
+                            parts = import_name.split(".")
+                            if len(parts) > 1:
+                                import_name = ".".join(parts[:-1])
+
+                        imports.append(
+                            Import(
+                                name=import_name,
+                                start_line=line_num,
+                                end_line=line_num,
+                                raw_text=line,
+                                language="java",
+                                module_name=import_name,
+                                is_static=True,
+                                is_wildcard=import_content.endswith(".*"),
+                                import_statement=import_content,
+                            )
+                        )
+                else:
+                    # Normal import
+                    normal_match = re.search(r"import\s+([\w.]+)", import_content)
+                    if normal_match:
+                        import_name = normal_match.group(1)
+                        if import_content.endswith(".*"):
+                            if import_name.endswith(".*"):
+                                import_name = import_name[:-2]
+                            elif import_name.endswith("."):
+                                import_name = import_name[:-1]
+
+                        imports.append(
+                            Import(
+                                name=import_name,
+                                start_line=line_num,
+                                end_line=line_num,
+                                raw_text=line,
+                                language="java",
+                                module_name=import_name,
+                                is_static=False,
+                                is_wildcard=import_content.endswith(".*"),
+                                import_statement=import_content,
+                            )
+                        )
+
+        return imports
+
     def extract_packages(
         self, tree: "tree_sitter.Tree", source_code: str
     ) -> list[Package]:
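The fallback only fires when tree-sitter produced no imports even though the source contains the keyword. A standalone sketch of the same regex logic on a few sample lines (it uses only the standard re module, not the package's Import model, so the printed tuples are illustrative rather than the plugin's actual output):

    import re

    java_src = """
    import java.util.List;
    import static org.junit.Assert.*;
    import com.example.util.*;
    """

    for line_num, line in enumerate(java_src.split("\n"), 1):
        line = line.strip()
        if line.startswith("import ") and line.endswith(";"):
            stmt = line[:-1]  # drop the semicolon
            is_static = "static" in stmt
            pattern = r"import\s+static\s+([\w.]+)" if is_static else r"import\s+([\w.]+)"
            match = re.search(pattern, stmt)
            if match:
                name = match.group(1).rstrip(".")  # drop the dot left before a trailing '*'
                print(line_num, name,
                      "static" if is_static else "normal",
                      "wildcard" if stmt.endswith(".*") else "exact")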

tree_sitter_analyzer/mcp/server.py

@@ -60,8 +60,11 @@ from ..utils import setup_logger
 from . import MCP_INFO
 from .resources import CodeFileResource, ProjectStatsResource
 from .tools.analyze_scale_tool import AnalyzeScaleTool
+from .tools.find_and_grep_tool import FindAndGrepTool
+from .tools.list_files_tool import ListFilesTool
 from .tools.query_tool import QueryTool
 from .tools.read_partial_tool import ReadPartialTool
+from .tools.search_content_tool import SearchContentTool
 from .tools.table_format_tool import TableFormatTool

 # Set up logging
@@ -87,11 +90,15 @@ class TreeSitterAnalyzerMCPServer:
         self.security_validator = SecurityValidator(project_root)
         # Use unified analysis engine instead of deprecated AdvancedAnalyzer

-        # Initialize MCP tools with security validation (
+        # Initialize MCP tools with security validation (core tools + fd/rg tools)
         self.query_tool = QueryTool(project_root)  # query_code
         self.read_partial_tool = ReadPartialTool(project_root)  # extract_code_section
         self.table_format_tool = TableFormatTool(project_root)  # analyze_code_structure
         self.analyze_scale_tool = AnalyzeScaleTool(project_root)  # check_code_scale
+        # New fd/rg tools
+        self.list_files_tool = ListFilesTool(project_root)  # list_files
+        self.search_content_tool = SearchContentTool(project_root)  # search_content
+        self.find_and_grep_tool = FindAndGrepTool(project_root)  # find_and_grep

         # Optional universal tool to satisfy initialization tests
         try:
@@ -466,6 +473,9 @@ class TreeSitterAnalyzerMCPServer:
                 },
             ),
             Tool(**self.query_tool.get_tool_definition()),
+            Tool(**self.list_files_tool.get_tool_definition()),
+            Tool(**self.search_content_tool.get_tool_definition()),
+            Tool(**self.find_and_grep_tool.get_tool_definition()),
         ]

         logger.info(f"Returning {len(tools)} tools: {[t.name for t in tools]}")
@@ -545,6 +555,15 @@ class TreeSitterAnalyzerMCPServer:
             elif name == "query_code":
                 result = await self.query_tool.execute(arguments)

+            elif name == "list_files":
+                result = await self.list_files_tool.execute(arguments)
+
+            elif name == "search_content":
+                result = await self.search_content_tool.execute(arguments)
+
+            elif name == "find_and_grep":
+                result = await self.find_and_grep_tool.execute(arguments)
+
             else:
                 raise ValueError(f"Unknown tool: {name}")
@@ -653,6 +672,9 @@ class TreeSitterAnalyzerMCPServer:
         self.read_partial_tool.set_project_path(project_path)
         self.table_format_tool.set_project_path(project_path)
         self.analyze_scale_tool.set_project_path(project_path)
+        self.list_files_tool.set_project_path(project_path)
+        self.search_content_tool.set_project_path(project_path)
+        self.find_and_grep_tool.set_project_path(project_path)

         # Update universal tool if available
         if hasattr(self, "universal_analyze_tool") and self.universal_analyze_tool:
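Each new branch above awaits the matching tool's execute() coroutine. A hedged sketch of driving one of the tools directly, outside the MCP server: the SearchContentTool(project_root) constructor and the awaited execute(arguments) call mirror this diff, but the argument keys below are illustrative placeholders, since the tools' input schemas are not shown here.

    import asyncio
    from tree_sitter_analyzer.mcp.tools.search_content_tool import SearchContentTool

    async def main() -> None:
        tool = SearchContentTool("/path/to/project")  # project root, as in the diff
        # "query" and "roots" are hypothetical keys; consult the tool's schema for real ones
        result = await tool.execute({"query": "TODO", "roots": ["."]})
        print(result)

    asyncio.run(main())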

tree_sitter_analyzer/mcp/tools/fd_rg_utils.py (new file)

@@ -0,0 +1,550 @@
+#!/usr/bin/env python3
+"""
+Shared utilities for fd/ripgrep based MCP tools.
+
+This module centralizes subprocess execution, command building, result caps,
+and JSON line parsing for ripgrep.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+# Safety caps (hard limits)
+MAX_RESULTS_HARD_CAP = 10000
+DEFAULT_RESULTS_LIMIT = 2000
+
+DEFAULT_RG_MAX_FILESIZE = "10M"
+RG_MAX_FILESIZE_HARD_CAP_BYTES = 200 * 1024 * 1024  # 200M
+
+DEFAULT_RG_TIMEOUT_MS = 4000
+RG_TIMEOUT_HARD_CAP_MS = 30000
+
+
+def clamp_int(value: int | None, default_value: int, hard_cap: int) -> int:
+    if value is None:
+        return default_value
+    try:
+        v = int(value)
+    except (TypeError, ValueError):
+        return default_value
+    return max(0, min(v, hard_cap))
+
+
+def parse_size_to_bytes(size_str: str) -> int | None:
+    """Parse ripgrep --max-filesize strings like '10M', '200K' to bytes."""
+    if not size_str:
+        return None
+    s = size_str.strip().upper()
+    try:
+        if s.endswith("K"):
+            return int(float(s[:-1]) * 1024)
+        if s.endswith("M"):
+            return int(float(s[:-1]) * 1024 * 1024)
+        if s.endswith("G"):
+            return int(float(s[:-1]) * 1024 * 1024 * 1024)
+        return int(s)
+    except ValueError:
+        return None
+
+
+async def run_command_capture(
+    cmd: list[str],
+    input_data: bytes | None = None,
+    timeout_ms: int | None = None,
+) -> tuple[int, bytes, bytes]:
+    """Run a subprocess and capture output.
+
+    Returns (returncode, stdout, stderr). On timeout, kills process and returns 124.
+    Separated into a util for easy monkeypatching in tests.
+    """
+    # Create process
+    proc = await asyncio.create_subprocess_exec(
+        *cmd,
+        stdin=asyncio.subprocess.PIPE if input_data is not None else None,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE,
+    )
+
+    # Compute timeout seconds
+    timeout_s: float | None = None
+    if timeout_ms and timeout_ms > 0:
+        timeout_s = timeout_ms / 1000.0
+
+    try:
+        stdout, stderr = await asyncio.wait_for(
+            proc.communicate(input=input_data), timeout=timeout_s
+        )
+        return proc.returncode, stdout, stderr
+    except asyncio.TimeoutError:
+        try:
+            proc.kill()
+        finally:
+            with contextlib.suppress(Exception):
+                await proc.wait()
+        return 124, b"", f"Timeout after {timeout_ms} ms".encode()
+
+
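run_command_capture is the single choke point where the fd/rg tools spawn subprocesses, which is why it is kept easy to monkeypatch. A minimal sketch of calling it directly (module path per the file list above; "echo" is only a stand-in binary assumed to be on PATH):

    import asyncio
    from tree_sitter_analyzer.mcp.tools.fd_rg_utils import run_command_capture

    # Returns (returncode, stdout, stderr); on timeout the helper kills the
    # process and reports returncode 124 instead of raising.
    rc, out, err = asyncio.run(run_command_capture(["echo", "hello"], timeout_ms=2000))
    print(rc, out.decode().strip(), err.decode())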
+def build_fd_command(
+    *,
+    pattern: str | None,
+    glob: bool,
+    types: list[str] | None,
+    extensions: list[str] | None,
+    exclude: list[str] | None,
+    depth: int | None,
+    follow_symlinks: bool,
+    hidden: bool,
+    no_ignore: bool,
+    size: list[str] | None,
+    changed_within: str | None,
+    changed_before: str | None,
+    full_path_match: bool,
+    absolute: bool,
+    limit: int | None,
+    roots: list[str],
+) -> list[str]:
+    """Build an fd command with appropriate flags."""
+    cmd: list[str] = ["fd", "--color", "never"]
+    if glob:
+        cmd.append("--glob")
+    if full_path_match:
+        cmd.append("-p")
+    if absolute:
+        cmd.append("-a")
+    if follow_symlinks:
+        cmd.append("-L")
+    if hidden:
+        cmd.append("-H")
+    if no_ignore:
+        cmd.append("-I")
+    if depth is not None:
+        cmd += ["-d", str(depth)]
+    if types:
+        for t in types:
+            cmd += ["-t", str(t)]
+    if extensions:
+        for ext in extensions:
+            if ext.startswith("."):
+                ext = ext[1:]
+            cmd += ["-e", ext]
+    if exclude:
+        for ex in exclude:
+            cmd += ["-E", ex]
+    if size:
+        for s in size:
+            cmd += ["-S", s]
+    if changed_within:
+        cmd += ["--changed-within", str(changed_within)]
+    if changed_before:
+        cmd += ["--changed-before", str(changed_before)]
+    if limit is not None:
+        cmd += ["--max-results", str(limit)]
+
+    # Pattern goes before roots if present
+    # If no pattern is specified, use '.' to match all files
+    if pattern:
+        cmd.append(pattern)
+    else:
+        cmd.append(".")
+
+    # Append roots - these are search directories, not patterns
+    if roots:
+        cmd += roots
+
+    return cmd
+
+
+def normalize_max_filesize(user_value: str | None) -> str:
+    if not user_value:
+        return DEFAULT_RG_MAX_FILESIZE
+    bytes_val = parse_size_to_bytes(user_value)
+    if bytes_val is None:
+        return DEFAULT_RG_MAX_FILESIZE
+    if bytes_val > RG_MAX_FILESIZE_HARD_CAP_BYTES:
+        return "200M"
+    return user_value
+
+
+def build_rg_command(
+    *,
+    query: str,
+    case: str | None,
+    fixed_strings: bool,
+    word: bool,
+    multiline: bool,
+    include_globs: list[str] | None,
+    exclude_globs: list[str] | None,
+    follow_symlinks: bool,
+    hidden: bool,
+    no_ignore: bool,
+    max_filesize: str | None,
+    context_before: int | None,
+    context_after: int | None,
+    encoding: str | None,
+    max_count: int | None,
+    timeout_ms: int | None,
+    roots: list[str] | None,
+    files_from: str | None,
+    count_only_matches: bool = False,
+) -> list[str]:
+    """Build ripgrep command with JSON output and options."""
+    if count_only_matches:
+        # Use --count-matches for count-only mode (no JSON output)
+        cmd: list[str] = [
+            "rg",
+            "--count-matches",
+            "--no-heading",
+            "--color",
+            "never",
+        ]
+    else:
+        # Use --json for full match details
+        cmd: list[str] = [
+            "rg",
+            "--json",
+            "--no-heading",
+            "--color",
+            "never",
+        ]
+
+    # Case sensitivity
+    if case == "smart":
+        cmd.append("-S")
+    elif case == "insensitive":
+        cmd.append("-i")
+    elif case == "sensitive":
+        cmd.append("-s")
+
+    if fixed_strings:
+        cmd.append("-F")
+    if word:
+        cmd.append("-w")
+    if multiline:
+        # Prefer --multiline (does not imply binary)
+        cmd.append("--multiline")
+
+    if follow_symlinks:
+        cmd.append("-L")
+    if hidden:
+        cmd.append("-H")
+    if no_ignore:
+        # Use -u (respect ignore but include hidden); do not escalate to -uu automatically
+        cmd.append("-u")
+
+    if include_globs:
+        for g in include_globs:
+            cmd += ["-g", g]
+    if exclude_globs:
+        for g in exclude_globs:
+            # ripgrep exclusion via !pattern
+            if not g.startswith("!"):
+                cmd += ["-g", f"!{g}"]
+            else:
+                cmd += ["-g", g]
+
+    if context_before is not None:
+        cmd += ["-B", str(context_before)]
+    if context_after is not None:
+        cmd += ["-A", str(context_after)]
+    if encoding:
+        cmd += ["--encoding", encoding]
+    if max_count is not None:
+        cmd += ["-m", str(max_count)]
+
+    # Normalize filesize
+    cmd += ["--max-filesize", normalize_max_filesize(max_filesize)]
+
+    # Only add timeout if supported (check if timeout_ms is provided and > 0)
+    # Note: --timeout flag may not be available in all ripgrep versions
+    # For now, we'll skip the timeout flag to ensure compatibility
+    # effective_timeout = clamp_int(timeout_ms, DEFAULT_RG_TIMEOUT_MS, RG_TIMEOUT_HARD_CAP_MS)
+    # cmd += ["--timeout", str(effective_timeout)]
+
+    # Query must be last before roots/files
+    cmd.append(query)
+
+    # Skip --files-from flag as it's not supported in this ripgrep version
+    # Use roots instead for compatibility
+    if roots:
+        cmd += roots
+    # Note: files_from functionality is disabled for compatibility
+
+    return cmd
+
+
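Both builders are pure functions over keyword-only arguments (only count_only_matches has a default), so the argv they produce can be inspected without fd or rg installed. A sketch of what the code above yields for a typical call:

    from tree_sitter_analyzer.mcp.tools.fd_rg_utils import build_fd_command, build_rg_command

    fd_cmd = build_fd_command(
        pattern=None, glob=False, types=["f"], extensions=["py"], exclude=["node_modules"],
        depth=None, follow_symlinks=False, hidden=False, no_ignore=False, size=None,
        changed_within=None, changed_before=None, full_path_match=False, absolute=False,
        limit=100, roots=["src"],
    )
    print(fd_cmd)
    # ['fd', '--color', 'never', '-t', 'f', '-e', 'py', '-E', 'node_modules',
    #  '--max-results', '100', '.', 'src']

    rg_cmd = build_rg_command(
        query="TODO", case="smart", fixed_strings=False, word=False, multiline=False,
        include_globs=["*.py"], exclude_globs=["tests/*"], follow_symlinks=False,
        hidden=False, no_ignore=False, max_filesize=None, context_before=None,
        context_after=None, encoding=None, max_count=50, timeout_ms=None,
        roots=["src"], files_from=None,
    )
    print(rg_cmd)
    # ['rg', '--json', '--no-heading', '--color', 'never', '-S', '-g', '*.py',
    #  '-g', '!tests/*', '-m', '50', '--max-filesize', '10M', 'TODO', 'src']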
+def parse_rg_json_lines_to_matches(stdout_bytes: bytes) -> list[dict[str, Any]]:
+    """Parse ripgrep JSON event stream and keep only match events."""
+    results: list[dict[str, Any]] = []
+    for raw_line in stdout_bytes.splitlines():
+        if not raw_line.strip():
+            continue
+        try:
+            evt = json.loads(raw_line.decode("utf-8", errors="replace"))
+        except (json.JSONDecodeError, UnicodeDecodeError):  # nosec B112
+            continue
+        if evt.get("type") != "match":
+            continue
+        data = evt.get("data", {})
+        path_text = (data.get("path", {}) or {}).get("text")
+        line_number = data.get("line_number")
+        line_text = (data.get("lines", {}) or {}).get("text")
+        submatches_raw = data.get("submatches", []) or []
+        # Normalize line content to reduce token usage
+        normalized_line = " ".join(line_text.split()) if line_text else ""
+
+        # Simplify submatches - remove redundant match text, keep only positions
+        simplified_matches = []
+        for sm in submatches_raw:
+            start = sm.get("start")
+            end = sm.get("end")
+            if start is not None and end is not None:
+                simplified_matches.append([start, end])
+
+        results.append(
+            {
+                "file": path_text,
+                "line": line_number,  # Shortened field name
+                "text": normalized_line,  # Normalized content
+                "matches": simplified_matches,  # Simplified match positions
+            }
+        )
+    return results
+
+
+def group_matches_by_file(matches: list[dict[str, Any]]) -> dict[str, Any]:
+    """Group matches by file to eliminate file path duplication."""
+    if not matches:
+        return {"success": True, "count": 0, "files": []}
+
+    # Group matches by file
+    file_groups: dict[str, list[dict[str, Any]]] = {}
+    total_matches = 0
+
+    for match in matches:
+        file_path = match.get("file", "unknown")
+        if file_path not in file_groups:
+            file_groups[file_path] = []
+
+        # Create match entry without file path
+        match_entry = {
+            "line": match.get("line", match.get("line_number", "?")),
+            "text": match.get("text", match.get("line", "")),
+            "positions": match.get("matches", match.get("submatches", [])),
+        }
+        file_groups[file_path].append(match_entry)
+        total_matches += 1
+
+    # Convert to grouped structure
+    files = []
+    for file_path, file_matches in file_groups.items():
+        files.append({"file": file_path, "matches": file_matches})
+
+    return {"success": True, "count": total_matches, "files": files}
+
+
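The parser consumes ripgrep's JSON event stream line by line, keeps only "match" events, and shrinks each to file, line, normalized text, and [start, end] positions; group_matches_by_file then folds those into one entry per file. A sketch with a single hand-written event shaped like rg --json output (illustrative, not captured output):

    import json
    from tree_sitter_analyzer.mcp.tools.fd_rg_utils import (
        group_matches_by_file,
        parse_rg_json_lines_to_matches,
    )

    event = {
        "type": "match",
        "data": {
            "path": {"text": "src/app.py"},
            "line_number": 12,
            "lines": {"text": "    # TODO: handle errors\n"},
            "submatches": [{"match": {"text": "TODO"}, "start": 6, "end": 10}],
        },
    }
    matches = parse_rg_json_lines_to_matches(json.dumps(event).encode())
    print(matches)
    # [{'file': 'src/app.py', 'line': 12, 'text': '# TODO: handle errors', 'matches': [[6, 10]]}]
    print(group_matches_by_file(matches))
    # {'success': True, 'count': 1, 'files': [{'file': 'src/app.py', 'matches':
    #   [{'line': 12, 'text': '# TODO: handle errors', 'positions': [[6, 10]]}]}]}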
+def optimize_match_paths(matches: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Optimize file paths in match results to reduce token consumption."""
+    if not matches:
+        return matches
+
+    # Find common prefix among all file paths
+    file_paths = [match.get("file", "") for match in matches if match.get("file")]
+    common_prefix = ""
+    if len(file_paths) > 1:
+        import os
+
+        try:
+            common_prefix = os.path.commonpath(file_paths)
+        except (ValueError, TypeError):
+            common_prefix = ""
+
+    # Optimize each match
+    optimized_matches = []
+    for match in matches:
+        optimized_match = match.copy()
+        file_path = match.get("file")
+        if file_path:
+            optimized_match["file"] = _optimize_file_path(file_path, common_prefix)
+        optimized_matches.append(optimized_match)
+
+    return optimized_matches
+
+
+def _optimize_file_path(file_path: str, common_prefix: str = "") -> str:
+    """Optimize file path for token efficiency by removing common prefixes and shortening."""
+    if not file_path:
+        return file_path
+
+    # Remove common prefix if provided
+    if common_prefix and file_path.startswith(common_prefix):
+        optimized = file_path[len(common_prefix) :].lstrip("/\\")
+        if optimized:
+            return optimized
+
+    # For very long paths, show only the last few components
+    from pathlib import Path
+
+    path_obj = Path(file_path)
+    parts = path_obj.parts
+
+    if len(parts) > 4:
+        # Show first part + ... + last 3 parts
+        return str(Path(parts[0]) / "..." / Path(*parts[-3:]))
+
+    return file_path
+
+
+def summarize_search_results(
+    matches: list[dict[str, Any]], max_files: int = 10, max_total_lines: int = 50
+) -> dict[str, Any]:
+    """Summarize search results to reduce context size while preserving key information."""
+    if not matches:
+        return {
+            "total_matches": 0,
+            "total_files": 0,
+            "summary": "No matches found",
+            "top_files": [],
+        }
+
+    # Group matches by file and find common prefix for optimization
+    file_groups: dict[str, list[dict[str, Any]]] = {}
+    all_file_paths = []
+    for match in matches:
+        file_path = match.get("file", "unknown")
+        all_file_paths.append(file_path)
+        if file_path not in file_groups:
+            file_groups[file_path] = []
+        file_groups[file_path].append(match)
+
+    # Find common prefix to optimize paths
+    common_prefix = ""
+    if len(all_file_paths) > 1:
+        import os
+
+        common_prefix = os.path.commonpath(all_file_paths) if all_file_paths else ""
+
+    # Sort files by match count (descending)
+    sorted_files = sorted(file_groups.items(), key=lambda x: len(x[1]), reverse=True)
+
+    # Create summary
+    total_matches = len(matches)
+    total_files = len(file_groups)
+
+    # Top files with match counts
+    top_files = []
+    remaining_lines = max_total_lines
+
+    for file_path, file_matches in sorted_files[:max_files]:
+        match_count = len(file_matches)
+
+        # Include a few sample lines from this file
+        sample_lines = []
+        lines_to_include = min(3, remaining_lines, len(file_matches))
+
+        for _i, match in enumerate(file_matches[:lines_to_include]):
+            line_num = match.get(
+                "line", match.get("line_number", "?")
+            )  # Support both old and new format
+            line_text = match.get(
+                "text", match.get("line", "")
+            ).strip()  # Support both old and new format
+            if line_text:
+                # Truncate long lines and remove extra whitespace to save tokens
+                truncated_line = " ".join(line_text.split())[:60]
+                if len(line_text) > 60:
+                    truncated_line += "..."
+                sample_lines.append(f"L{line_num}: {truncated_line}")
+                remaining_lines -= 1
+
+        # Optimize file path for token efficiency
+        optimized_path = _optimize_file_path(file_path, common_prefix)
+
+        top_files.append(
+            {
+                "file": optimized_path,
+                "match_count": match_count,
+                "sample_lines": sample_lines,
+            }
+        )
+
+        if remaining_lines <= 0:
+            break
+
+    # Create summary text
+    if total_files <= max_files:
+        summary = f"Found {total_matches} matches in {total_files} files"
+    else:
+        summary = f"Found {total_matches} matches in {total_files} files (showing top {len(top_files)})"
+
+    return {
+        "total_matches": total_matches,
+        "total_files": total_files,
+        "summary": summary,
+        "top_files": top_files,
+        "truncated": total_files > max_files,
+    }
+
+
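summarize_search_results trades detail for size: files are ranked by match count, sample lines are capped and truncated to 60 characters, and file names are shortened via _optimize_file_path. A sketch on three already-parsed matches (note how the shared "src" prefix is stripped from the reported paths):

    from tree_sitter_analyzer.mcp.tools.fd_rg_utils import summarize_search_results

    matches = [
        {"file": "src/app.py", "line": 12, "text": "# TODO: handle errors", "matches": [[2, 6]]},
        {"file": "src/app.py", "line": 40, "text": "# TODO: add tests", "matches": [[2, 6]]},
        {"file": "src/cli.py", "line": 7, "text": "# TODO: parse args", "matches": [[2, 6]]},
    ]
    summary = summarize_search_results(matches)
    print(summary["summary"])      # Found 3 matches in 2 files
    print(summary["top_files"][0])
    # {'file': 'app.py', 'match_count': 2,
    #  'sample_lines': ['L12: # TODO: handle errors', 'L40: # TODO: add tests']}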
+def parse_rg_count_output(stdout_bytes: bytes) -> dict[str, int]:
+    """Parse ripgrep --count-matches output and return file->count mapping."""
+    results: dict[str, int] = {}
+    total_matches = 0
+
+    for line in stdout_bytes.decode("utf-8", errors="replace").splitlines():
+        line = line.strip()
+        if not line:
+            continue
+
+        # Format: "file_path:count"
+        if ":" in line:
+            file_path, count_str = line.rsplit(":", 1)
+            try:
+                count = int(count_str)
+                results[file_path] = count
+                total_matches += count
+            except ValueError:
+                # Skip lines that don't have valid count format
+                continue
+
+    # Add total count as special key
+    results["__total__"] = total_matches
+    return results
+
+
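The count-only path skips JSON entirely and parses ripgrep's "path:count" lines, stashing the grand total under the reserved "__total__" key. A sketch:

    from tree_sitter_analyzer.mcp.tools.fd_rg_utils import parse_rg_count_output

    print(parse_rg_count_output(b"src/app.py:3\nsrc/cli.py:1\n"))
    # {'src/app.py': 3, 'src/cli.py': 1, '__total__': 4}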
+@dataclass
+class TempFileList:
+    path: str
+
+    def __enter__(self) -> TempFileList:
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        with contextlib.suppress(Exception):
+            Path(self.path).unlink(missing_ok=True)
+
+
+class contextlib:  # minimal shim for suppress without importing globally
+    class suppress:
+        def __init__(self, *exceptions: type[BaseException]) -> None:
+            self.exceptions = exceptions
+
+        def __enter__(self) -> None:  # noqa: D401
+            return None
+
+        def __exit__(self, exc_type, exc, tb) -> bool:
+            return exc_type is not None and issubclass(exc_type, self.exceptions)
+
+
+def write_files_to_temp(files: list[str]) -> TempFileList:
+    fd, temp_path = tempfile.mkstemp(prefix="rg-files-", suffix=".lst")
+    os.close(fd)
+    content = "\n".join(files)
+    Path(temp_path).write_text(content, encoding="utf-8")
+    return TempFileList(path=temp_path)
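Taken together, the helpers compose into the find-files-then-grep flow that the new list_files, search_content, and find_and_grep tools expose. A hedged end-to-end sketch, assuming fd and rg are on PATH and a ./src directory exists; the actual tool classes layer argument validation, result caps, and error handling on top of essentially this sequence:

    import asyncio
    from tree_sitter_analyzer.mcp.tools.fd_rg_utils import (
        build_fd_command,
        build_rg_command,
        group_matches_by_file,
        parse_rg_json_lines_to_matches,
        run_command_capture,
        summarize_search_results,
    )

    async def find_and_grep(query: str, root: str) -> dict:
        # Step 1: enumerate candidate files with fd
        fd_cmd = build_fd_command(
            pattern=None, glob=False, types=["f"], extensions=None, exclude=None,
            depth=None, follow_symlinks=False, hidden=False, no_ignore=False,
            size=None, changed_within=None, changed_before=None,
            full_path_match=False, absolute=False, limit=1000, roots=[root],
        )
        rc, out, _ = await run_command_capture(fd_cmd, timeout_ms=4000)
        files = [f for f in out.decode("utf-8", errors="replace").splitlines() if f]
        if rc != 0 or not files:
            return {"summary": "No files found", "grouped": {"count": 0, "files": []}}

        # Step 2: search those files with ripgrep (file paths passed as roots,
        # matching the compatibility note in build_rg_command)
        rg_cmd = build_rg_command(
            query=query, case="smart", fixed_strings=False, word=False, multiline=False,
            include_globs=None, exclude_globs=None, follow_symlinks=False, hidden=False,
            no_ignore=False, max_filesize=None, context_before=None, context_after=None,
            encoding=None, max_count=None, timeout_ms=None, roots=files, files_from=None,
        )
        _, out, _ = await run_command_capture(rg_cmd, timeout_ms=4000)

        # Step 3: parse, group, and summarize the matches
        matches = parse_rg_json_lines_to_matches(out)
        return {
            "grouped": group_matches_by_file(matches),
            "summary": summarize_search_results(matches),
        }

    print(asyncio.run(find_and_grep("TODO", "src")))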