tree-sitter-analyzer 1.9.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. tree_sitter_analyzer/__init__.py +132 -0
  2. tree_sitter_analyzer/__main__.py +11 -0
  3. tree_sitter_analyzer/api.py +853 -0
  4. tree_sitter_analyzer/cli/__init__.py +39 -0
  5. tree_sitter_analyzer/cli/__main__.py +12 -0
  6. tree_sitter_analyzer/cli/argument_validator.py +89 -0
  7. tree_sitter_analyzer/cli/commands/__init__.py +26 -0
  8. tree_sitter_analyzer/cli/commands/advanced_command.py +226 -0
  9. tree_sitter_analyzer/cli/commands/base_command.py +181 -0
  10. tree_sitter_analyzer/cli/commands/default_command.py +18 -0
  11. tree_sitter_analyzer/cli/commands/find_and_grep_cli.py +188 -0
  12. tree_sitter_analyzer/cli/commands/list_files_cli.py +133 -0
  13. tree_sitter_analyzer/cli/commands/partial_read_command.py +139 -0
  14. tree_sitter_analyzer/cli/commands/query_command.py +109 -0
  15. tree_sitter_analyzer/cli/commands/search_content_cli.py +161 -0
  16. tree_sitter_analyzer/cli/commands/structure_command.py +156 -0
  17. tree_sitter_analyzer/cli/commands/summary_command.py +116 -0
  18. tree_sitter_analyzer/cli/commands/table_command.py +414 -0
  19. tree_sitter_analyzer/cli/info_commands.py +124 -0
  20. tree_sitter_analyzer/cli_main.py +472 -0
  21. tree_sitter_analyzer/constants.py +85 -0
  22. tree_sitter_analyzer/core/__init__.py +15 -0
  23. tree_sitter_analyzer/core/analysis_engine.py +580 -0
  24. tree_sitter_analyzer/core/cache_service.py +333 -0
  25. tree_sitter_analyzer/core/engine.py +585 -0
  26. tree_sitter_analyzer/core/parser.py +293 -0
  27. tree_sitter_analyzer/core/query.py +605 -0
  28. tree_sitter_analyzer/core/query_filter.py +200 -0
  29. tree_sitter_analyzer/core/query_service.py +340 -0
  30. tree_sitter_analyzer/encoding_utils.py +530 -0
  31. tree_sitter_analyzer/exceptions.py +747 -0
  32. tree_sitter_analyzer/file_handler.py +246 -0
  33. tree_sitter_analyzer/formatters/__init__.py +1 -0
  34. tree_sitter_analyzer/formatters/base_formatter.py +201 -0
  35. tree_sitter_analyzer/formatters/csharp_formatter.py +367 -0
  36. tree_sitter_analyzer/formatters/formatter_config.py +197 -0
  37. tree_sitter_analyzer/formatters/formatter_factory.py +84 -0
  38. tree_sitter_analyzer/formatters/formatter_registry.py +377 -0
  39. tree_sitter_analyzer/formatters/formatter_selector.py +96 -0
  40. tree_sitter_analyzer/formatters/go_formatter.py +368 -0
  41. tree_sitter_analyzer/formatters/html_formatter.py +498 -0
  42. tree_sitter_analyzer/formatters/java_formatter.py +423 -0
  43. tree_sitter_analyzer/formatters/javascript_formatter.py +611 -0
  44. tree_sitter_analyzer/formatters/kotlin_formatter.py +268 -0
  45. tree_sitter_analyzer/formatters/language_formatter_factory.py +123 -0
  46. tree_sitter_analyzer/formatters/legacy_formatter_adapters.py +228 -0
  47. tree_sitter_analyzer/formatters/markdown_formatter.py +725 -0
  48. tree_sitter_analyzer/formatters/php_formatter.py +301 -0
  49. tree_sitter_analyzer/formatters/python_formatter.py +830 -0
  50. tree_sitter_analyzer/formatters/ruby_formatter.py +278 -0
  51. tree_sitter_analyzer/formatters/rust_formatter.py +233 -0
  52. tree_sitter_analyzer/formatters/sql_formatter_wrapper.py +689 -0
  53. tree_sitter_analyzer/formatters/sql_formatters.py +536 -0
  54. tree_sitter_analyzer/formatters/typescript_formatter.py +543 -0
  55. tree_sitter_analyzer/formatters/yaml_formatter.py +462 -0
  56. tree_sitter_analyzer/interfaces/__init__.py +9 -0
  57. tree_sitter_analyzer/interfaces/cli.py +535 -0
  58. tree_sitter_analyzer/interfaces/cli_adapter.py +359 -0
  59. tree_sitter_analyzer/interfaces/mcp_adapter.py +224 -0
  60. tree_sitter_analyzer/interfaces/mcp_server.py +428 -0
  61. tree_sitter_analyzer/language_detector.py +553 -0
  62. tree_sitter_analyzer/language_loader.py +271 -0
  63. tree_sitter_analyzer/languages/__init__.py +10 -0
  64. tree_sitter_analyzer/languages/csharp_plugin.py +1076 -0
  65. tree_sitter_analyzer/languages/css_plugin.py +449 -0
  66. tree_sitter_analyzer/languages/go_plugin.py +836 -0
  67. tree_sitter_analyzer/languages/html_plugin.py +496 -0
  68. tree_sitter_analyzer/languages/java_plugin.py +1299 -0
  69. tree_sitter_analyzer/languages/javascript_plugin.py +1622 -0
  70. tree_sitter_analyzer/languages/kotlin_plugin.py +656 -0
  71. tree_sitter_analyzer/languages/markdown_plugin.py +1928 -0
  72. tree_sitter_analyzer/languages/php_plugin.py +862 -0
  73. tree_sitter_analyzer/languages/python_plugin.py +1636 -0
  74. tree_sitter_analyzer/languages/ruby_plugin.py +757 -0
  75. tree_sitter_analyzer/languages/rust_plugin.py +673 -0
  76. tree_sitter_analyzer/languages/sql_plugin.py +2444 -0
  77. tree_sitter_analyzer/languages/typescript_plugin.py +1892 -0
  78. tree_sitter_analyzer/languages/yaml_plugin.py +695 -0
  79. tree_sitter_analyzer/legacy_table_formatter.py +860 -0
  80. tree_sitter_analyzer/mcp/__init__.py +34 -0
  81. tree_sitter_analyzer/mcp/resources/__init__.py +43 -0
  82. tree_sitter_analyzer/mcp/resources/code_file_resource.py +208 -0
  83. tree_sitter_analyzer/mcp/resources/project_stats_resource.py +586 -0
  84. tree_sitter_analyzer/mcp/server.py +869 -0
  85. tree_sitter_analyzer/mcp/tools/__init__.py +28 -0
  86. tree_sitter_analyzer/mcp/tools/analyze_scale_tool.py +779 -0
  87. tree_sitter_analyzer/mcp/tools/analyze_scale_tool_cli_compatible.py +291 -0
  88. tree_sitter_analyzer/mcp/tools/base_tool.py +139 -0
  89. tree_sitter_analyzer/mcp/tools/fd_rg_utils.py +816 -0
  90. tree_sitter_analyzer/mcp/tools/find_and_grep_tool.py +686 -0
  91. tree_sitter_analyzer/mcp/tools/list_files_tool.py +413 -0
  92. tree_sitter_analyzer/mcp/tools/output_format_validator.py +148 -0
  93. tree_sitter_analyzer/mcp/tools/query_tool.py +443 -0
  94. tree_sitter_analyzer/mcp/tools/read_partial_tool.py +464 -0
  95. tree_sitter_analyzer/mcp/tools/search_content_tool.py +836 -0
  96. tree_sitter_analyzer/mcp/tools/table_format_tool.py +572 -0
  97. tree_sitter_analyzer/mcp/tools/universal_analyze_tool.py +653 -0
  98. tree_sitter_analyzer/mcp/utils/__init__.py +113 -0
  99. tree_sitter_analyzer/mcp/utils/error_handler.py +569 -0
  100. tree_sitter_analyzer/mcp/utils/file_output_factory.py +217 -0
  101. tree_sitter_analyzer/mcp/utils/file_output_manager.py +322 -0
  102. tree_sitter_analyzer/mcp/utils/gitignore_detector.py +358 -0
  103. tree_sitter_analyzer/mcp/utils/path_resolver.py +414 -0
  104. tree_sitter_analyzer/mcp/utils/search_cache.py +343 -0
  105. tree_sitter_analyzer/models.py +840 -0
  106. tree_sitter_analyzer/mypy_current_errors.txt +2 -0
  107. tree_sitter_analyzer/output_manager.py +255 -0
  108. tree_sitter_analyzer/platform_compat/__init__.py +3 -0
  109. tree_sitter_analyzer/platform_compat/adapter.py +324 -0
  110. tree_sitter_analyzer/platform_compat/compare.py +224 -0
  111. tree_sitter_analyzer/platform_compat/detector.py +67 -0
  112. tree_sitter_analyzer/platform_compat/fixtures.py +228 -0
  113. tree_sitter_analyzer/platform_compat/profiles.py +217 -0
  114. tree_sitter_analyzer/platform_compat/record.py +55 -0
  115. tree_sitter_analyzer/platform_compat/recorder.py +155 -0
  116. tree_sitter_analyzer/platform_compat/report.py +92 -0
  117. tree_sitter_analyzer/plugins/__init__.py +280 -0
  118. tree_sitter_analyzer/plugins/base.py +647 -0
  119. tree_sitter_analyzer/plugins/manager.py +384 -0
  120. tree_sitter_analyzer/project_detector.py +328 -0
  121. tree_sitter_analyzer/queries/__init__.py +27 -0
  122. tree_sitter_analyzer/queries/csharp.py +216 -0
  123. tree_sitter_analyzer/queries/css.py +615 -0
  124. tree_sitter_analyzer/queries/go.py +275 -0
  125. tree_sitter_analyzer/queries/html.py +543 -0
  126. tree_sitter_analyzer/queries/java.py +402 -0
  127. tree_sitter_analyzer/queries/javascript.py +724 -0
  128. tree_sitter_analyzer/queries/kotlin.py +192 -0
  129. tree_sitter_analyzer/queries/markdown.py +258 -0
  130. tree_sitter_analyzer/queries/php.py +95 -0
  131. tree_sitter_analyzer/queries/python.py +859 -0
  132. tree_sitter_analyzer/queries/ruby.py +92 -0
  133. tree_sitter_analyzer/queries/rust.py +223 -0
  134. tree_sitter_analyzer/queries/sql.py +555 -0
  135. tree_sitter_analyzer/queries/typescript.py +871 -0
  136. tree_sitter_analyzer/queries/yaml.py +236 -0
  137. tree_sitter_analyzer/query_loader.py +272 -0
  138. tree_sitter_analyzer/security/__init__.py +22 -0
  139. tree_sitter_analyzer/security/boundary_manager.py +277 -0
  140. tree_sitter_analyzer/security/regex_checker.py +297 -0
  141. tree_sitter_analyzer/security/validator.py +599 -0
  142. tree_sitter_analyzer/table_formatter.py +782 -0
  143. tree_sitter_analyzer/utils/__init__.py +53 -0
  144. tree_sitter_analyzer/utils/logging.py +433 -0
  145. tree_sitter_analyzer/utils/tree_sitter_compat.py +289 -0
  146. tree_sitter_analyzer-1.9.17.1.dist-info/METADATA +485 -0
  147. tree_sitter_analyzer-1.9.17.1.dist-info/RECORD +149 -0
  148. tree_sitter_analyzer-1.9.17.1.dist-info/WHEEL +4 -0
  149. tree_sitter_analyzer-1.9.17.1.dist-info/entry_points.txt +25 -0
tree_sitter_analyzer/mcp/tools/fd_rg_utils.py
@@ -0,0 +1,816 @@
+ #!/usr/bin/env python3
+ """
+ Shared utilities for fd/ripgrep based MCP tools.
+
+ This module centralizes subprocess execution, command building, result caps,
+ and JSON line parsing for ripgrep.
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import os
+ import shutil
+ import tempfile
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any
+
+ # Safety caps (hard limits)
+ MAX_RESULTS_HARD_CAP = 10000
+ DEFAULT_RESULTS_LIMIT = 2000
+
+ DEFAULT_RG_MAX_FILESIZE = "10M"
+ RG_MAX_FILESIZE_HARD_CAP_BYTES = 200 * 1024 * 1024  # 200M
+
+ DEFAULT_RG_TIMEOUT_MS = 4000
+ RG_TIMEOUT_HARD_CAP_MS = 30000
+
+
+ def check_external_command(command: str) -> bool:
+     """Check if an external command is available in the system PATH."""
+     return shutil.which(command) is not None
+
+
+ def get_missing_commands() -> list[str]:
+     """Get list of missing external commands required by fd/rg tools."""
+     missing = []
+     if not check_external_command("fd"):
+         missing.append("fd")
+     if not check_external_command("rg"):
+         missing.append("rg")
+     return missing
+
+
+ def clamp_int(value: int | None, default_value: int, hard_cap: int) -> int:
+     if value is None:
+         return default_value
+     try:
+         v = int(value)
+     except (TypeError, ValueError):
+         return default_value
+     return max(0, min(v, hard_cap))
+
+
+ def parse_size_to_bytes(size_str: str) -> int | None:
+     """Parse ripgrep --max-filesize strings like '10M', '200K' to bytes."""
+     if not size_str:
+         return None
+     s = size_str.strip().upper()
+     try:
+         if s.endswith("K"):
+             return int(float(s[:-1]) * 1024)
+         if s.endswith("M"):
+             return int(float(s[:-1]) * 1024 * 1024)
+         if s.endswith("G"):
+             return int(float(s[:-1]) * 1024 * 1024 * 1024)
+         return int(s)
+     except ValueError:
+         return None
+
+
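For orientation, a small illustrative sketch (not part of the packaged file) of how the clamping and size parsing helpers above behave:

    from tree_sitter_analyzer.mcp.tools.fd_rg_utils import clamp_int, parse_size_to_bytes

    # None falls back to the default; values above the hard cap are clamped down.
    assert clamp_int(None, 2000, 10000) == 2000
    assert clamp_int(50000, 2000, 10000) == 10000

    # ripgrep-style size strings become byte counts; unparseable input yields None.
    assert parse_size_to_bytes("10M") == 10 * 1024 * 1024
    assert parse_size_to_bytes("bogus") is None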
+ async def run_command_capture(
+     cmd: list[str],
+     input_data: bytes | None = None,
+     timeout_ms: int | None = None,
+ ) -> tuple[int, bytes, bytes]:
+     """Run a subprocess and capture output.
+
+     Returns (returncode, stdout, stderr). On timeout, kills process and returns 124.
+     Separated into a util for easy monkeypatching in tests.
+     """
+     # Check if command exists before attempting to run
+     if cmd and not check_external_command(cmd[0]):
+         error_msg = f"Command '{cmd[0]}' not found in PATH. Please install {cmd[0]} to use this functionality."
+         return 127, b"", error_msg.encode()
+
+     try:
+         # Create process
+         proc = await asyncio.create_subprocess_exec(
+             *cmd,
+             stdin=asyncio.subprocess.PIPE if input_data is not None else None,
+             stdout=asyncio.subprocess.PIPE,
+             stderr=asyncio.subprocess.PIPE,
+         )
+     except FileNotFoundError as e:
+         error_msg = f"Command '{cmd[0]}' not found: {e}"
+         return 127, b"", error_msg.encode()
+
+     # Compute timeout seconds
+     timeout_s: float | None = None
+     if timeout_ms and timeout_ms > 0:
+         timeout_s = timeout_ms / 1000.0
+
+     try:
+         stdout, stderr = await asyncio.wait_for(
+             proc.communicate(input=input_data), timeout=timeout_s
+         )
+         return proc.returncode or 0, stdout, stderr
+     except asyncio.TimeoutError:
+         try:
+             proc.kill()
+         finally:
+             with contextlib.suppress(Exception):
+                 await proc.wait()
+         return 124, b"", f"Timeout after {timeout_ms} ms".encode()
+
+
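A minimal usage sketch for run_command_capture, assuming ripgrep is installed and the package import path from the file list above (illustrative only):

    import asyncio
    from tree_sitter_analyzer.mcp.tools.fd_rg_utils import run_command_capture

    async def main() -> None:
        # Returns (returncode, stdout, stderr); 127 if the binary is missing, 124 on timeout.
        rc, out, err = await run_command_capture(["rg", "--version"], timeout_ms=2000)
        print(rc, out.decode(errors="replace").splitlines()[:1])

    asyncio.run(main())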
+ def build_fd_command(
+     *,
+     pattern: str | None,
+     glob: bool,
+     types: list[str] | None,
+     extensions: list[str] | None,
+     exclude: list[str] | None,
+     depth: int | None,
+     follow_symlinks: bool,
+     hidden: bool,
+     no_ignore: bool,
+     size: list[str] | None,
+     changed_within: str | None,
+     changed_before: str | None,
+     full_path_match: bool,
+     absolute: bool,
+     limit: int | None,
+     roots: list[str],
+ ) -> list[str]:
+     """Build an fd command with appropriate flags."""
+     cmd: list[str] = ["fd", "--color", "never"]
+     if glob:
+         cmd.append("--glob")
+     if full_path_match:
+         cmd.append("-p")
+     if absolute:
+         cmd.append("-a")
+     if follow_symlinks:
+         cmd.append("-L")
+     if hidden:
+         cmd.append("-H")
+     if no_ignore:
+         cmd.append("-I")
+     if depth is not None:
+         cmd += ["-d", str(depth)]
+     if types:
+         for t in types:
+             cmd += ["-t", str(t)]
+     if extensions:
+         for ext in extensions:
+             if ext.startswith("."):
+                 ext = ext[1:]
+             cmd += ["-e", ext]
+     if exclude:
+         for ex in exclude:
+             cmd += ["-E", ex]
+     if size:
+         for s in size:
+             cmd += ["-S", s]
+     if changed_within:
+         cmd += ["--changed-within", str(changed_within)]
+     if changed_before:
+         cmd += ["--changed-before", str(changed_before)]
+     if limit is not None:
+         cmd += ["--max-results", str(limit)]
+
+     # Pattern goes before roots if present
+     # If no pattern is specified, use '.' to match all files (required to prevent roots being interpreted as pattern)
+     if pattern:
+         cmd.append(pattern)
+     else:
+         cmd.append(".")
+
+     # Append roots - these are search directories, not patterns
+     if roots:
+         cmd += roots
+
+     return cmd
+
+
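As an illustration of the flag mapping above, this sketch shows roughly the argv the builder produces for a simple query (hypothetical inputs, not part of the diff):

    from tree_sitter_analyzer.mcp.tools.fd_rg_utils import build_fd_command

    cmd = build_fd_command(
        pattern=None, glob=False, types=["f"], extensions=[".py"], exclude=["node_modules"],
        depth=3, follow_symlinks=False, hidden=False, no_ignore=False, size=None,
        changed_within=None, changed_before=None, full_path_match=False, absolute=False,
        limit=100, roots=["src"],
    )
    # ['fd', '--color', 'never', '-d', '3', '-t', 'f', '-e', 'py',
    #  '-E', 'node_modules', '--max-results', '100', '.', 'src']
    print(cmd)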
+ def normalize_max_filesize(user_value: str | None) -> str:
+     if not user_value:
+         return DEFAULT_RG_MAX_FILESIZE
+     bytes_val = parse_size_to_bytes(user_value)
+     if bytes_val is None:
+         return DEFAULT_RG_MAX_FILESIZE
+     if bytes_val > RG_MAX_FILESIZE_HARD_CAP_BYTES:
+         return "200M"
+     return user_value
+
+
+ def build_rg_command(
+     *,
+     query: str,
+     case: str | None,
+     fixed_strings: bool,
+     word: bool,
+     multiline: bool,
+     include_globs: list[str] | None,
+     exclude_globs: list[str] | None,
+     follow_symlinks: bool,
+     hidden: bool,
+     no_ignore: bool,
+     max_filesize: str | None,
+     context_before: int | None,
+     context_after: int | None,
+     encoding: str | None,
+     max_count: int | None,
+     timeout_ms: int | None,
+     roots: list[str] | None,
+     files_from: str | None,
+     count_only_matches: bool = False,
+ ) -> list[str]:
+     """Build ripgrep command with JSON output and options."""
+     if count_only_matches:
+         # Use --count-matches for count-only mode (no JSON output)
+         cmd = [
+             "rg",
+             "--count-matches",
+             "--no-heading",
+             "--color",
+             "never",
+         ]
+     else:
+         # Use --json for full match details
+         cmd = [
+             "rg",
+             "--json",
+             "--no-heading",
+             "--color",
+             "never",
+         ]
+
+     # Case sensitivity
+     if case == "smart":
+         cmd.append("-S")
+     elif case == "insensitive":
+         cmd.append("-i")
+     elif case == "sensitive":
+         cmd.append("-s")
+
+     if fixed_strings:
+         cmd.append("-F")
+     if word:
+         cmd.append("-w")
+     if multiline:
+         # Prefer --multiline (does not imply binary)
+         cmd.append("--multiline")
+
+     if follow_symlinks:
+         cmd.append("-L")
+     if hidden:
+         cmd.append("-H")
+     if no_ignore:
+         # Use -u (respect ignore but include hidden); do not escalate to -uu automatically
+         cmd.append("-u")
+
+     if include_globs:
+         for g in include_globs:
+             cmd += ["-g", g]
+     if exclude_globs:
+         for g in exclude_globs:
+             # ripgrep exclusion via !pattern
+             if not g.startswith("!"):
+                 cmd += ["-g", f"!{g}"]
+             else:
+                 cmd += ["-g", g]
+
+     if context_before is not None:
+         cmd += ["-B", str(context_before)]
+     if context_after is not None:
+         cmd += ["-A", str(context_after)]
+     if encoding:
+         cmd += ["--encoding", encoding]
+     if max_count is not None:
+         cmd += ["-m", str(max_count)]
+
+     # Normalize filesize
+     cmd += ["--max-filesize", normalize_max_filesize(max_filesize)]
+
+     # Add timeout if provided and > 0 (enable timeout for performance optimization)
+     if timeout_ms is not None and timeout_ms > 0:
+         # effective_timeout = clamp_int(
+         #     timeout_ms, DEFAULT_RG_TIMEOUT_MS, RG_TIMEOUT_HARD_CAP_MS
+         # )  # Commented out as not used yet
+         # Use timeout in milliseconds for better control
+         # Note: We'll handle timeout at the process level instead of ripgrep flag
+         # to ensure compatibility across ripgrep versions
+         pass
+
+     # Query must be last before roots/files
+     cmd.append(query)
+
+     # Skip --files-from flag as it's not supported in this ripgrep version
+     # Use roots instead for compatibility
+     if roots:
+         cmd += roots
+     # Note: files_from functionality is disabled for compatibility
+
+     return cmd
+
+
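Likewise, a sketch of the ripgrep argv produced for a typical JSON-mode search (hypothetical inputs; flag order follows the builder above):

    from tree_sitter_analyzer.mcp.tools.fd_rg_utils import build_rg_command

    cmd = build_rg_command(
        query="TODO", case="smart", fixed_strings=False, word=True, multiline=False,
        include_globs=["*.py"], exclude_globs=["*_test.py"], follow_symlinks=False,
        hidden=False, no_ignore=False, max_filesize=None, context_before=None,
        context_after=None, encoding=None, max_count=50, timeout_ms=None,
        roots=["src"], files_from=None,
    )
    # ['rg', '--json', '--no-heading', '--color', 'never', '-S', '-w',
    #  '-g', '*.py', '-g', '!*_test.py', '-m', '50',
    #  '--max-filesize', '10M', 'TODO', 'src']
    print(cmd)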
+ def parse_rg_json_lines_to_matches(stdout_bytes: bytes) -> list[dict[str, Any]]:
+     """Parse ripgrep JSON event stream and keep only match events."""
+     results: list[dict[str, Any]] = []
+     lines = stdout_bytes.splitlines()
+
+     # Batch process lines for better performance
+     for raw_line in lines:
+         if not raw_line.strip():
+             continue
+         try:
+             # Decode once and parse JSON
+             line_str = raw_line.decode("utf-8", errors="replace")
+             evt = json.loads(line_str)
+         except (json.JSONDecodeError, UnicodeDecodeError):  # nosec B112
+             continue
+
+         # Quick type check to skip non-match events
+         if evt.get("type") != "match":
+             continue
+
+         data = evt.get("data", {})
+         if not data:
+             continue
+
+         # Extract data with safe defaults
+         path_data = data.get("path", {})
+         path_text = path_data.get("text") if path_data else None
+         if not path_text:
+             continue
+
+         line_number = data.get("line_number")
+         lines_data = data.get("lines", {})
+         line_text = lines_data.get("text") if lines_data else ""
+
+         # Normalize line content to reduce token usage (optimized)
+         normalized_line = " ".join(line_text.split()) if line_text else ""
+
+         # Simplify submatches - keep only essential position data
+         submatches_raw = data.get("submatches", [])
+         simplified_matches = []
+         if submatches_raw:
+             for sm in submatches_raw:
+                 start = sm.get("start")
+                 end = sm.get("end")
+                 if start is not None and end is not None:
+                     simplified_matches.append([start, end])
+
+         results.append(
+             {
+                 "file": path_text,
+                 "line": line_number,
+                 "text": normalized_line,
+                 "matches": simplified_matches,
+             }
+         )
+
+         # Early exit if we have too many results to prevent memory issues
+         if len(results) >= MAX_RESULTS_HARD_CAP:
+             break
+
+     return results
+
+
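For reference, a sketch of what this parser does to a single ripgrep JSON "match" event; the event below is trimmed to the fields the parser reads (illustrative only):

    from tree_sitter_analyzer.mcp.tools.fd_rg_utils import parse_rg_json_lines_to_matches

    event = (
        b'{"type":"match","data":{"path":{"text":"src/app.py"},'
        b'"line_number":12,"lines":{"text":"  # TODO: fix\\n"},'
        b'"submatches":[{"match":{"text":"TODO"},"start":4,"end":8}]}}'
    )
    print(parse_rg_json_lines_to_matches(event))
    # [{'file': 'src/app.py', 'line': 12, 'text': '# TODO: fix', 'matches': [[4, 8]]}]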
+ def group_matches_by_file(matches: list[dict[str, Any]]) -> dict[str, Any]:
+     """Group matches by file to eliminate file path duplication."""
+     if not matches:
+         return {"success": True, "count": 0, "files": []}
+
+     # Group matches by file
+     file_groups: dict[str, list[dict[str, Any]]] = {}
+     total_matches = 0
+
+     for match in matches:
+         file_path = match.get("file", "unknown")
+         if file_path not in file_groups:
+             file_groups[file_path] = []
+
+         # Create match entry without file path
+         match_entry = {
+             "line": match.get("line", match.get("line_number", "?")),
+             "text": match.get("text", match.get("line", "")),
+             "positions": match.get("matches", match.get("submatches", [])),
+         }
+         file_groups[file_path].append(match_entry)
+         total_matches += 1
+
+     # Convert to grouped structure
+     files = []
+     for file_path, file_matches in file_groups.items():
+         files.append(
+             {
+                 "file": file_path,
+                 "matches": file_matches,
+                 "match_count": len(file_matches),
+             }
+         )
+
+     return {"success": True, "count": total_matches, "files": files}
+
+
+ def optimize_match_paths(matches: list[dict[str, Any]]) -> list[dict[str, Any]]:
+     """Optimize file paths in match results to reduce token consumption."""
+     if not matches:
+         return matches
+
+     # Find common prefix among all file paths
+     file_paths = [match.get("file", "") for match in matches if match.get("file")]
+     common_prefix = ""
+     if len(file_paths) > 1:
+         import os
+
+         try:
+             common_prefix = os.path.commonpath(file_paths)
+         except (ValueError, TypeError):
+             common_prefix = ""
+
+     # Optimize each match
+     optimized_matches = []
+     for match in matches:
+         optimized_match = match.copy()
+         file_path = match.get("file")
+         if file_path:
+             optimized_match["file"] = _optimize_file_path(file_path, common_prefix)
+         optimized_matches.append(optimized_match)
+
+     return optimized_matches
+
+
+ def _optimize_file_path(file_path: str, common_prefix: str = "") -> str:
+     """Optimize file path for token efficiency by removing common prefixes and shortening."""
+     if not file_path:
+         return file_path
+
+     # Remove common prefix if provided
+     if common_prefix and file_path.startswith(common_prefix):
+         optimized = file_path[len(common_prefix) :].lstrip("/\\")
+         if optimized:
+             return optimized
+
+     # For very long paths, show only the last few components
+     from pathlib import Path
+
+     path_obj = Path(file_path)
+     parts = path_obj.parts
+
+     if len(parts) > 4:
+         # Show first part + ... + last 3 parts
+         return str(Path(parts[0]) / "..." / Path(*parts[-3:]))
+
+     return file_path
+
+
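A short illustrative sketch of the path optimization and grouping helpers above, with hypothetical paths:

    from tree_sitter_analyzer.mcp.tools.fd_rg_utils import (
        group_matches_by_file,
        optimize_match_paths,
    )

    matches = [
        {"file": "/repo/src/a.py", "line": 1, "text": "x", "matches": [[0, 1]]},
        {"file": "/repo/src/b.py", "line": 2, "text": "y", "matches": [[0, 1]]},
    ]
    # The common prefix '/repo/src' is stripped, then matches are grouped per file.
    grouped = group_matches_by_file(optimize_match_paths(matches))
    print(grouped["count"], [f["file"] for f in grouped["files"]])
    # 2 ['a.py', 'b.py']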
+ def summarize_search_results(
+     matches: list[dict[str, Any]], max_files: int = 10, max_total_lines: int = 50
+ ) -> dict[str, Any]:
+     """Summarize search results to reduce context size while preserving key information."""
+     if not matches:
+         return {
+             "total_matches": 0,
+             "total_files": 0,
+             "summary": "No matches found",
+             "top_files": [],
+         }
+
+     # Group matches by file and find common prefix for optimization
+     file_groups: dict[str, list[dict[str, Any]]] = {}
+     all_file_paths = []
+     for match in matches:
+         file_path = match.get("file", "unknown")
+         all_file_paths.append(file_path)
+         if file_path not in file_groups:
+             file_groups[file_path] = []
+         file_groups[file_path].append(match)
+
+     # Find common prefix to optimize paths
+     common_prefix = ""
+     if len(all_file_paths) > 1:
+         import os
+
+         common_prefix = os.path.commonpath(all_file_paths) if all_file_paths else ""
+
+     # Sort files by match count (descending)
+     sorted_files = sorted(file_groups.items(), key=lambda x: len(x[1]), reverse=True)
+
+     # Create summary
+     total_matches = len(matches)
+     total_files = len(file_groups)
+
+     # Top files with match counts
+     top_files = []
+     remaining_lines = max_total_lines
+
+     for file_path, file_matches in sorted_files[:max_files]:
+         match_count = len(file_matches)
+
+         # Include a few sample lines from this file
+         sample_lines = []
+         lines_to_include = min(3, remaining_lines, len(file_matches))
+
+         for _i, match in enumerate(file_matches[:lines_to_include]):
+             line_num = match.get(
+                 "line", match.get("line_number", "?")
+             )  # Support both old and new format
+             line_text = match.get(
+                 "text", match.get("line", "")
+             ).strip()  # Support both old and new format
+             if line_text:
+                 # Truncate long lines and remove extra whitespace to save tokens
+                 truncated_line = " ".join(line_text.split())[:60]
+                 if len(line_text) > 60:
+                     truncated_line += "..."
+                 sample_lines.append(f"L{line_num}: {truncated_line}")
+                 remaining_lines -= 1
+
+         # Ensure we have at least some sample lines if matches exist
+         if not sample_lines and file_matches:
+             # Fallback: create a simple summary line
+             sample_lines.append(f"Found {len(file_matches)} matches")
+
+         # Optimize file path for token efficiency
+         optimized_path = _optimize_file_path(file_path, common_prefix)
+
+         top_files.append(
+             {
+                 "file": optimized_path,
+                 "match_count": match_count,
+                 "sample_lines": sample_lines,
+             }
+         )
+
+         if remaining_lines <= 0:
+             break
+
+     # Create summary text
+     if total_files <= max_files:
+         summary = f"Found {total_matches} matches in {total_files} files"
+     else:
+         summary = f"Found {total_matches} matches in {total_files} files (showing top {len(top_files)})"
+
+     return {
+         "total_matches": total_matches,
+         "total_files": total_files,
+         "summary": summary,
+         "top_files": top_files,
+         "truncated": total_files > max_files,
+     }
+
+
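The summarizer returns a compact dict instead of raw matches; a sketch of its shape with hypothetical input (illustrative only):

    from tree_sitter_analyzer.mcp.tools.fd_rg_utils import summarize_search_results

    matches = [{"file": "src/a.py", "line": n, "text": f"hit {n}"} for n in range(1, 6)]
    summary = summarize_search_results(matches, max_files=10, max_total_lines=50)
    print(summary["summary"])                          # Found 5 matches in 1 files
    print(summary["top_files"][0]["match_count"])      # 5
    print(summary["top_files"][0]["sample_lines"][0])  # L1: hit 1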
+ def parse_rg_count_output(stdout_bytes: bytes) -> dict[str, int]:
+     """Parse ripgrep --count-matches output and return file->count mapping."""
+     results: dict[str, int] = {}
+     total_matches = 0
+
+     for line in stdout_bytes.decode("utf-8", errors="replace").splitlines():
+         line = line.strip()
+         if not line:
+             continue
+
+         # Format: "file_path:count"
+         if ":" in line:
+             file_path, count_str = line.rsplit(":", 1)
+             try:
+                 count = int(count_str)
+                 results[file_path] = count
+                 total_matches += count
+             except ValueError:
+                 # Skip lines that don't have valid count format
+                 continue
+
+     # Add total count as special key
+     results["__total__"] = total_matches
+     return results
+
+
+ def extract_file_list_from_count_data(count_data: dict[str, int]) -> list[str]:
+     """Extract file list from count data, excluding the special __total__ key."""
+     return [file_path for file_path in count_data.keys() if file_path != "__total__"]
+
+
+ def create_file_summary_from_count_data(count_data: dict[str, int]) -> dict[str, Any]:
+     """Create a file summary structure from count data."""
+     file_list = extract_file_list_from_count_data(count_data)
+     total_matches = count_data.get("__total__", 0)
+
+     return {
+         "success": True,
+         "total_matches": total_matches,
+         "file_count": len(file_list),
+         "files": [
+             {"file": file_path, "match_count": count_data[file_path]}
+             for file_path in file_list
+         ],
+         "derived_from_count": True,  # marks that this summary was derived from count data
+     }
+
+
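A sketch of the count-only pipeline above, starting from raw rg --count-matches output (illustrative only):

    from tree_sitter_analyzer.mcp.tools.fd_rg_utils import (
        create_file_summary_from_count_data,
        parse_rg_count_output,
    )

    counts = parse_rg_count_output(b"src/a.py:3\nsrc/b.py:1\n")
    # {'src/a.py': 3, 'src/b.py': 1, '__total__': 4}
    summary = create_file_summary_from_count_data(counts)
    print(summary["total_matches"], summary["file_count"])  # 4 2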
+ @dataclass
+ class TempFileList:
+     path: str
+
+     def __enter__(self) -> TempFileList:
+         return self
+
+     def __exit__(
+         self, exc_type: type[BaseException] | None, exc: BaseException | None, tb: Any
+     ) -> None:
+         with contextlib.suppress(Exception):
+             Path(self.path).unlink(missing_ok=True)
+
+
+ class contextlib:  # minimal shim for suppress without importing globally
+     class suppress:
+         def __init__(self, *exceptions: type[BaseException]) -> None:
+             self.exceptions = exceptions
+
+         def __enter__(self) -> None:  # noqa: D401
+             return None
+
+         def __exit__(
+             self,
+             exc_type: type[BaseException] | None,
+             exc: BaseException | None,
+             tb: Any,
+         ) -> bool:
+             return exc_type is not None and issubclass(exc_type, self.exceptions)
+
+
+ def write_files_to_temp(files: list[str]) -> TempFileList:
+     fd, temp_path = tempfile.mkstemp(prefix="rg-files-", suffix=".lst")
+     os.close(fd)
+     content = "\n".join(files)
+     from ...encoding_utils import write_file_safe
+
+     write_file_safe(temp_path, content)
+     return TempFileList(path=temp_path)
+
+
+ async def run_parallel_rg_searches(
+     commands: list[list[str]],
+     timeout_ms: int | None = None,
+     max_concurrent: int = 4,
+ ) -> list[tuple[int, bytes, bytes]]:
+     """
+     Run multiple ripgrep commands in parallel with concurrency control.
+
+     Args:
+         commands: List of ripgrep command lists to execute
+         timeout_ms: Timeout in milliseconds for each command
+         max_concurrent: Maximum number of concurrent processes (default: 4)
+
+     Returns:
+         List of (returncode, stdout, stderr) tuples in the same order as commands
+     """
+     if not commands:
+         return []
+
+     # Create semaphore to limit concurrent processes
+     semaphore = asyncio.Semaphore(max_concurrent)
+
+     async def run_single_command(cmd: list[str]) -> tuple[int, bytes, bytes]:
+         async with semaphore:
+             return await run_command_capture(cmd, timeout_ms=timeout_ms)
+
+     # Execute all commands concurrently
+     tasks = [run_single_command(cmd) for cmd in commands]
+     results = await asyncio.gather(*tasks, return_exceptions=True)
+
+     # Handle exceptions and convert to proper format
+     processed_results: list[tuple[int, bytes, bytes]] = []
+     for _i, result in enumerate(results):
+         if isinstance(result, Exception):
+             # Convert exception to error result
+             error_msg = f"Command failed: {str(result)}"
+             processed_results.append((1, b"", error_msg.encode()))
+         elif isinstance(result, tuple) and len(result) == 3:
+             processed_results.append(result)
+         else:
+             # Fallback for unexpected result types
+             processed_results.append((1, b"", b"Unexpected result type"))
+
+     return processed_results
+
+
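A sketch of how the parallel runner is driven, assuming rg is on PATH (illustrative only):

    import asyncio
    from tree_sitter_analyzer.mcp.tools import fd_rg_utils

    async def main() -> None:
        cmds = [["rg", "--version"], ["rg", "--version"]]
        # At most max_concurrent subprocesses run at once; result order matches cmds.
        results = await fd_rg_utils.run_parallel_rg_searches(cmds, timeout_ms=2000, max_concurrent=2)
        print([rc for rc, _out, _err in results])

    asyncio.run(main())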
+ def merge_rg_results(
+     results: list[tuple[int, bytes, bytes]],
+     count_only_mode: bool = False,
+ ) -> tuple[int, bytes, bytes]:
+     """
+     Merge results from multiple ripgrep executions.
+
+     Args:
+         results: List of (returncode, stdout, stderr) tuples
+         count_only_mode: Whether the results are from count-only mode
+
+     Returns:
+         Merged (returncode, stdout, stderr) tuple
+     """
+     if not results:
+         return (1, b"", b"No results to merge")
+
+     # Check if any command failed critically (not just "no matches found")
+     critical_failures = []
+     successful_results = []
+
+     for rc, stdout, stderr in results:
+         if rc not in (0, 1):  # 0=matches found, 1=no matches, others=errors
+             critical_failures.append((rc, stdout, stderr))
+         else:
+             successful_results.append((rc, stdout, stderr))
+
+     # If all commands failed critically, return the first failure
+     if not successful_results:
+         return critical_failures[0] if critical_failures else (1, b"", b"")
+
+     # Merge successful results
+     if count_only_mode:
+         return _merge_count_results(successful_results)
+     else:
+         return _merge_json_results(successful_results)
+
+
+ def _merge_count_results(
+     results: list[tuple[int, bytes, bytes]],
+ ) -> tuple[int, bytes, bytes]:
+     """Merge count-only results from multiple ripgrep executions."""
+     merged_counts: dict[str, int] = {}
+     total_matches = 0
+
+     for rc, stdout, _stderr in results:
+         if rc in (0, 1):  # Success or no matches
+             file_counts = parse_rg_count_output(stdout)
+             # Remove the __total__ key and merge file counts
+             for file_path, count in file_counts.items():
+                 if file_path != "__total__":
+                     merged_counts[file_path] = merged_counts.get(file_path, 0) + count
+                     total_matches += count
+
+     # Format as ripgrep count output
+     output_lines = []
+     for file_path, count in merged_counts.items():
+         output_lines.append(f"{file_path}:{count}")
+
+     merged_stdout = "\n".join(output_lines).encode("utf-8")
+
+     # Return code 0 if we have matches, 1 if no matches
+     return_code = 0 if total_matches > 0 else 1
+     return (return_code, merged_stdout, b"")
+
+
+ def _merge_json_results(
+     results: list[tuple[int, bytes, bytes]],
+ ) -> tuple[int, bytes, bytes]:
+     """Merge JSON results from multiple ripgrep executions."""
+     merged_lines = []
+     has_matches = False
+
+     for rc, stdout, _stderr in results:
+         if rc in (0, 1):  # Success or no matches
+             if stdout.strip():
+                 merged_lines.extend(stdout.splitlines())
+             if rc == 0:  # Has matches
+                 has_matches = True
+
+     merged_stdout = b"\n".join(merged_lines)
+     return_code = 0 if has_matches else 1
+     return (return_code, merged_stdout, b"")
+
+
+ def split_roots_for_parallel_processing(
+     roots: list[str], max_chunks: int = 4
+ ) -> list[list[str]]:
+     """
+     Split roots into chunks for parallel processing.
+
+     Args:
+         roots: List of root directories
+         max_chunks: Maximum number of chunks to create
+
+     Returns:
+         List of root chunks for parallel processing
+     """
+     if not roots:
+         return []
+
+     if len(roots) <= max_chunks:
+         # Each root gets its own chunk
+         return [[root] for root in roots]
+
+     # Distribute roots across chunks
+     chunk_size = len(roots) // max_chunks
+     remainder = len(roots) % max_chunks
+
+     chunks = []
+     start = 0
+
+     for i in range(max_chunks):
+         # Add one extra item to first 'remainder' chunks
+         current_chunk_size = chunk_size + (1 if i < remainder else 0)
+         end = start + current_chunk_size
+
+         if start < len(roots):
+             chunks.append(roots[start:end])
+
+         start = end
+
+     return [chunk for chunk in chunks if chunk]  # Remove empty chunks
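Putting the pieces together, an illustrative sketch of how these helpers compose into a search: split roots, build one command per chunk, run in parallel, merge, parse, and group. The actual wiring lives in the MCP search tools, so treat this as an assumption-laden example rather than the package's own code:

    import asyncio
    from tree_sitter_analyzer.mcp.tools import fd_rg_utils

    async def search(query: str, roots: list[str]) -> dict:
        chunks = fd_rg_utils.split_roots_for_parallel_processing(roots, max_chunks=4)
        cmds = [
            fd_rg_utils.build_rg_command(
                query=query, case="smart", fixed_strings=False, word=False, multiline=False,
                include_globs=None, exclude_globs=None, follow_symlinks=False, hidden=False,
                no_ignore=False, max_filesize=None, context_before=None, context_after=None,
                encoding=None, max_count=None, timeout_ms=None, roots=chunk, files_from=None,
            )
            for chunk in chunks
        ]
        raw = await fd_rg_utils.run_parallel_rg_searches(cmds, timeout_ms=4000)
        rc, stdout, _stderr = fd_rg_utils.merge_rg_results(raw, count_only_mode=False)
        matches = fd_rg_utils.parse_rg_json_lines_to_matches(stdout)
        return fd_rg_utils.group_matches_by_file(matches)

    print(asyncio.run(search("TODO", ["src", "tests"])))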