sirchmunk 0.0.0__py3-none-any.whl → 0.0.1.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sirchmunk/__init__.py +8 -0
- sirchmunk/base.py +17 -0
- sirchmunk/insight/__init__.py +4 -0
- sirchmunk/insight/text_insights.py +292 -0
- sirchmunk/learnings/__init__.py +1 -0
- sirchmunk/learnings/evidence_processor.py +525 -0
- sirchmunk/learnings/knowledge_base.py +232 -0
- sirchmunk/llm/__init__.py +2 -0
- sirchmunk/llm/openai_chat.py +247 -0
- sirchmunk/llm/prompts.py +216 -0
- sirchmunk/retrieve/__init__.py +1 -0
- sirchmunk/retrieve/base.py +25 -0
- sirchmunk/retrieve/text_retriever.py +1026 -0
- sirchmunk/scan/__init__.py +1 -0
- sirchmunk/scan/base.py +18 -0
- sirchmunk/scan/file_scanner.py +373 -0
- sirchmunk/scan/web_scanner.py +18 -0
- sirchmunk/scheduler/__init__.py +0 -0
- sirchmunk/schema/__init__.py +2 -0
- sirchmunk/schema/cognition.py +106 -0
- sirchmunk/schema/context.py +25 -0
- sirchmunk/schema/knowledge.py +318 -0
- sirchmunk/schema/metadata.py +658 -0
- sirchmunk/schema/request.py +221 -0
- sirchmunk/schema/response.py +20 -0
- sirchmunk/schema/snapshot.py +346 -0
- sirchmunk/search.py +475 -0
- sirchmunk/storage/__init__.py +7 -0
- sirchmunk/storage/duckdb.py +676 -0
- sirchmunk/storage/knowledge_manager.py +720 -0
- sirchmunk/utils/__init__.py +15 -0
- sirchmunk/utils/constants.py +15 -0
- sirchmunk/utils/deps.py +23 -0
- sirchmunk/utils/file_utils.py +70 -0
- sirchmunk/utils/install_rga.py +124 -0
- sirchmunk/utils/log_utils.py +360 -0
- sirchmunk/utils/tokenizer_util.py +55 -0
- sirchmunk/utils/utils.py +108 -0
- sirchmunk/version.py +1 -1
- sirchmunk-0.0.1.post1.dist-info/METADATA +483 -0
- sirchmunk-0.0.1.post1.dist-info/RECORD +45 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/WHEEL +1 -1
- sirchmunk-0.0.0.dist-info/METADATA +0 -26
- sirchmunk-0.0.0.dist-info/RECORD +0 -8
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/entry_points.txt +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/licenses/LICENSE +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/top_level.txt +0 -0
sirchmunk/retrieve/text_retriever.py
@@ -0,0 +1,1026 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import asyncio
+import json
+import math
+import re
+import subprocess
+from pathlib import Path
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from loguru import logger
+
+from ..utils.constants import GREP_CONCURRENT_LIMIT, DEFAULT_WORK_PATH
+from ..utils.file_utils import StorageStructure
+from .base import BaseRetriever
+
+RGA_SEMAPHORE = asyncio.Semaphore(value=GREP_CONCURRENT_LIMIT)
+
+
+class GrepRetriever(BaseRetriever):
+    """A Python wrapper for ripgrep-all (rga), exposing its functionality via static methods.
+
+    All methods are static and return parsed results. JSON output is preferred where possible
+    for reliable parsing. Shell injection is mitigated by using `subprocess.run` with `shell=False`
+    and explicit argument lists.
+
+    For more information about ripgrep-all, please refer to `https://github.com/phiresky/ripgrep-all`
+    """
+
+    def __init__(self, work_path: Union[str, Path] = None, **kwargs):
+        super().__init__()
+
+        self.work_path: Path = Path(work_path or DEFAULT_WORK_PATH)
+        self.rga_cache: Path = (
+            self.work_path / StorageStructure.CACHE_DIR / StorageStructure.GREP_DIR
+        )
+        self.rga_cache.mkdir(parents=True, exist_ok=True)
+
+    async def retrieve(
+        self,
+        terms: Union[str, List[str]],
+        path: Union[str, Path, List[str], List[Path], None] = None,
+        logic: Literal["and", "or", "not"] = "or",
+        *,
+        case_sensitive: bool = False,
+        whole_word: bool = False,
+        literal: bool = False,
+        regex: bool = True,
+        max_depth: Optional[int] = None,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+        file_type: Optional[str] = None,
+        invert_match: bool = False,
+        count_only: bool = False,
+        line_number: bool = True,
+        with_filename: bool = True,
+        rank: bool = True,
+        rank_kwargs: Optional[Dict] = None,
+        rga_no_cache: bool = False,
+        rga_cache_max_blob_len: int = 10000000,
+        rga_cache_path: Optional[Union[str, Path]] = None,
+        timeout: float = 60.0,
+    ) -> List[Dict[str, Any]]:
+        """Search for terms in files using ripgrep-all, supporting AND/OR/NOT logic.
+
+        Args:
+            terms: Single pattern (str) or list of patterns (List[str]).
+            path: Single path (str/Path) or multiple paths (List[str]/List[Path]) to search in (defaults to current directory).
+            logic:
+                - "or" (default): match any term (A OR B OR C)
+                - "and": match all terms in same file (A AND B AND C)
+                - "not": match first term but NOT any of the rest (A AND NOT B AND NOT C)
+            case_sensitive: If True, enable case-sensitive search (`-s`).
+            whole_word: If True, match whole words only (`-w`).
+            literal: If True, treat patterns as literal strings (`-F`). Applies to all terms.
+            regex: If False, implies `literal=True`.
+            max_depth: Maximum directory depth to search (`--max-depth`).
+            include: List of glob patterns to include (`-g`).
+            exclude: List of glob patterns to exclude (`-g '!...'`).
+            file_type: Search only files of given type (`-t <type>`), e.g., 'py', 'md'.
+            invert_match: Invert match (`-v`). Note: conflicts with `logic="not"`; ignored in that case.
+            count_only: Only output match counts per file (`-c`).
+            line_number: Show line numbers (`-n`, default True).
+            with_filename: Show filenames (`-H`, default True).
+            rank: If True, rerank results by relevance score.
+            rank_kwargs: Additional kwargs for ranking (see `_rerank_results`).
+            rga_no_cache: If True, disable rga caching (`--rga-no-cache`).
+            rga_cache_max_blob_len: Max blob length for rga cache (`--rga-cache-max-blob-len`). Defaults to 10MB.
+            rga_cache_path: Custom path for rga cache (`--rga-cache-path`).
+                If None, then set the path to `/path/to/your_work_path/.cache/rga`
+            timeout: Maximum time in seconds to wait for the search to complete.
+
+        Returns:
+            List of match objects (from `rga --json`), or list of {'path': str, 'count': int} if `count_only=True`.
+            For "and"/"not", matches correspond to the **last term** (or first, if only one) in qualifying files/lines.
+        """
+        results: List[Dict[str, Any]] = []
+
+        # Normalize terms
+        if isinstance(terms, str):
+            terms = [terms]
+        if not terms:
+            return results
+
+        rga_cache_path = rga_cache_path or self.rga_cache
+        rga_cache_path = str(Path(rga_cache_path).resolve())
+
+        # Multi-term logic routing
+        if logic == "or":
+            results, retrieve_pattern = await self._retrieve_or(
+                terms=terms,
+                path=path,
+                case_sensitive=case_sensitive,
+                whole_word=whole_word,
+                literal=literal,
+                regex=regex,
+                max_depth=max_depth,
+                include=include,
+                exclude=exclude,
+                file_type=file_type,
+                invert_match=invert_match,
+                count_only=count_only,
+                line_number=line_number,
+                with_filename=with_filename,
+                rga_no_cache=rga_no_cache,
+                rga_cache_max_blob_len=rga_cache_max_blob_len,
+                rga_cache_path=rga_cache_path,
+                timeout=timeout,
+            )
+        elif logic == "and":
+            results = await self._retrieve_and(
+                terms=terms,
+                path=path,
+                case_sensitive=case_sensitive,
+                whole_word=whole_word,
+                literal=literal,
+                regex=regex,
+                max_depth=max_depth,
+                include=include,
+                exclude=exclude,
+                file_type=file_type,
+                count_only=count_only,
+                line_number=line_number,
+                with_filename=with_filename,
+                match_same_line=False,  # file-level AND (most useful)
+                rga_no_cache=rga_no_cache,
+                rga_cache_max_blob_len=rga_cache_max_blob_len,
+                rga_cache_path=rga_cache_path,
+                timeout=timeout,
+            )
+        elif logic == "not":
+            if len(terms) < 2:
+                raise ValueError(
+                    "logic='not' requires at least two terms: [positive, negative1, ...]"
+                )
+            results = await self._retrieve_not(
+                positive=terms[0],
+                negatives=terms[1:],
+                path=path,
+                case_sensitive=case_sensitive,
+                whole_word=whole_word,
+                literal=literal,
+                regex=regex,
+                max_depth=max_depth,
+                include=include,
+                exclude=exclude,
+                file_type=file_type,
+                count_only=count_only,
+                line_number=line_number,
+                with_filename=with_filename,
+                rga_no_cache=rga_no_cache,
+                rga_cache_max_blob_len=rga_cache_max_blob_len,
+                rga_cache_path=rga_cache_path,
+                timeout=timeout,
+            )
+        else:
+            raise ValueError(
+                f"Unsupported logic: {logic}. Choose from 'and', 'or', 'not'."
+            )
+
+        # ====== Reranking Post-Processing ======
+        if rank and not count_only and results:
+            rank_kwargs = rank_kwargs or {}
+
+            def _default_text_extractor(match: Dict) -> str:
+                try:
+                    return match["data"]["lines"]["text"]
+                except (KeyError, TypeError):
+                    return ""
+
+            score_opts = {
+                "case_sensitive": case_sensitive,
+                "whole_word": whole_word,
+                "length_norm": rank_kwargs.get("length_norm", "linear"),
+                "base_length": rank_kwargs.get("base_length", 100),
+                "exact_bonus": rank_kwargs.get("exact_bonus", 2.0),
+                "tf_weight": rank_kwargs.get("tf_weight", 1.0),
+                "term_weights": rank_kwargs.get("term_weights", None),
+            }
+
+            # Reconstruct file groups: [begin, match*, end]*
+            grouped: List[List[Dict]] = []
+            current_group: List[Dict] = []
+
+            for item in results:
+                item_type = item.get("type")
+                if item_type == "begin":
+                    # Start new group
+                    if current_group:
+                        grouped.append(current_group)
+                    current_group = [item]
+                elif item_type == "end":
+                    # Close current group
+                    current_group.append(item)
+                    grouped.append(current_group)
+                    current_group = []
+                elif item_type == "match":
+                    # Accumulate in current group
+                    if current_group:  # defensive: should always be inside begin/end
+                        current_group.append(item)
+                    else:
+                        # Orphan match? Append to new dummy group (should not happen)
+                        current_group = [
+                            {"type": "begin", "data": {"path": {"text": "<unknown>"}}},
+                            item,
+                        ]
+                else:
+                    # e.g., "summary" — append to current or new group
+                    if current_group:
+                        current_group.append(item)
+                    else:
+                        grouped.append([item])
+
+            # If unclosed group remains (e.g., no final 'end'), flush it
+            if current_group:
+                grouped.append(current_group)
+
+            # Process each group: rerank only the 'match' items inside
+            new_results: List[Dict] = []
+
+            for group in grouped:
+                if not group:
+                    continue
+
+                # Identify begin / match / end segments
+                match_items = [g for g in group if g.get("type") == "match"]
+
+                # Rerank match items
+                scored_matches = []
+                for m in match_items:
+                    text = _default_text_extractor(m)
+                    score = self._calculate_relevance_score(
+                        text=text, terms=terms, **score_opts
+                    )
+                    new_m = {**m, "score": score}
+                    scored_matches.append((score, new_m))
+
+                # Sort descending by score
+                scored_matches.sort(key=lambda x: x[0], reverse=True)
+                reranked_matches = [item for _, item in scored_matches]
+
+                # Rebuild group in correct order:
+                # [begin] + [other non-match items in original order] + [reranked matches] + [end]
+                # But preserve *relative order* of non-match items (e.g., context lines)
+                # Simpler: walk original group, replace match list with reranked one
+                rebuilt_group = []
+                match_iter = iter(reranked_matches)
+                for item in group:
+                    if item.get("type") == "match":
+                        # Pull next from reranked list (should be same length)
+                        try:
+                            rebuilt_group.append(next(match_iter))
+                        except StopIteration:
+                            pass  # fallback: skip (should not happen)
+                    else:
+                        rebuilt_group.append(item)
+
+                new_results.extend(rebuilt_group)
+
+            results = new_results
+
+        return results
+
+    @staticmethod
+    def _run_rga(
+        args: List[str], json_output: bool = True
+    ) -> subprocess.CompletedProcess:
+        """Run ripgrep-all with given arguments.
+
+        Args:
+            args: List of ripgrep-all CLI arguments.
+            json_output: If True, forces `--json` and parses stdout as JSON Lines.
+
+        Returns:
+            CompletedProcess object with parsed stdout (as list of dicts if json_output=True).
+        """
+        cmd = ["rga", "--no-config"]  # disable user config for reproducibility
+        if json_output:
+            cmd.append("--json")
+        cmd.extend(args)
+
+        try:
+            result = subprocess.run(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=False,  # we handle non-zero exit codes manually
+            )
+
+            if result.returncode != 0:
+                if "ripgrep" in result.stderr.lower() or " rg " in result.stderr:
+                    raise RuntimeError(
+                        f"ripgrep-all depends on 'ripgrep' (rg), but it's missing: {result.stderr.strip()}"
+                    )
+                elif result.returncode > 1:
+                    raise RuntimeError(f"rga execution failed: {result.stderr.strip()}")
+
+            # Parse JSON Lines if requested
+            if json_output and result.returncode in (0, 1) and result.stdout.strip():
+                lines = result.stdout.strip().splitlines()
+                result.stdout = [json.loads(line) for line in lines if line]
+            return result
+        except FileNotFoundError:
+            raise RuntimeError(
+                "ripgrep-all ('rga') not found. Please install ripgrep-all first."
+            )
+        except json.JSONDecodeError as e:
+            raise RuntimeError(
+                f"Failed to parse ripgrep-all JSON output: {e}\nRaw output: {result.stdout}"
+            )
+
+    @staticmethod
+    async def _run_rga_async(
+        args: List[str], json_output: bool = True, timeout: float = 60.0
+    ) -> Dict[str, Any]:
+        cmd = ["rga", "--no-config"]
+        if json_output:
+            cmd.append("--json")
+        cmd.extend(args)
+
+        try:
+            await asyncio.wait_for(RGA_SEMAPHORE.acquire(), timeout=timeout)
+        except asyncio.TimeoutError:
+            raise RuntimeError(
+                f"rga search timed out while waiting for a queue slot ({timeout}s)."
+            )
+
+        try:
+            try:
+                process = await asyncio.create_subprocess_exec(
+                    *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+                )
+            except FileNotFoundError:
+                raise RuntimeError("ripgrep-all ('rga') not found. Please install it first.")
+
+            try:
+                stdout, stderr = await asyncio.wait_for(
+                    process.communicate(), timeout=timeout
+                )
+
+                stdout_str = stdout.decode().strip()
+                stderr_str = stderr.decode().strip()
+                returncode = process.returncode
+
+                if returncode != 0:
+                    if "ripgrep" in stderr_str.lower() or " rg " in stderr_str:
+                        raise RuntimeError(
+                            f"ripgrep-all depends on 'ripgrep' (rg), but it's missing or failed: {stderr_str}"
+                        )
+                    elif returncode > 1:
+                        raise RuntimeError(f"rga execution failed with code {returncode}: {stderr_str}")
+                # 4. Parse JSON Lines
+                parsed_stdout = stdout_str
+                if json_output and returncode in (0, 1) and stdout_str:
+                    try:
+                        parsed_stdout = [
+                            json.loads(line) for line in stdout_str.splitlines() if line
+                        ]
+                    except json.JSONDecodeError as e:
+                        raise RuntimeError(f"Failed to parse rga JSON output: {e}")
+
+                return {
+                    "returncode": returncode,
+                    "stdout": parsed_stdout,
+                    "stderr": stderr_str,
+                }
+            except asyncio.TimeoutError:
+                try:
+                    process.kill()
+                except ProcessLookupError:
+                    pass
+                raise RuntimeError(f"rga process execution timed out ({timeout}s).")
+
+        finally:
+            RGA_SEMAPHORE.release()
+
+    @staticmethod
+    async def _retrieve_single(**kwargs) -> List[Dict[str, Any]]:
+        """Wrapper for original single-pattern search (extracted for reuse)."""
+        pattern = kwargs.pop("pattern")
+        args = []
+
+        # Basic ripgrep-all args
+        regex = kwargs.get("regex", True)
+        literal = kwargs.get("literal", False)
+        case_sensitive = kwargs.get("case_sensitive", False)
+        whole_word = kwargs.get("whole_word", False)
+        invert_match = kwargs.get("invert_match", False)
+        count_only = kwargs.get("count_only", False)
+        line_number = kwargs.get("line_number", True)
+        with_filename = kwargs.get("with_filename", True)
+        max_depth = kwargs.get("max_depth")
+        include = kwargs.get("include")
+        exclude = kwargs.get("exclude")
+        file_type = kwargs.get("file_type")
+        path = kwargs.get("path")
+        timeout = kwargs.get("timeout", 60.0)
+
+        # Additional ripgrep-all args
+        rga_no_cache = kwargs.get("rga_no_cache", False)
+        rga_cache_max_blob_len = kwargs.get(
+            "rga_cache_max_blob_len", 10000000
+        )  # Default 10MB
+        rga_cache_path = kwargs.get("rga_cache_path")
+
+        # Build argument list
+        if not regex:
+            literal = True
+        if literal:
+            args.append("-F")
+        if case_sensitive:
+            args.append("-s")
+        else:
+            args.append("-i")
+        if whole_word:
+            args.append("-w")
+        if invert_match:
+            args.append("-v")
+        if count_only:
+            args.append("-c")
+        if not line_number:
+            args.append("--no-line-number")
+        if not with_filename:
+            args.append("--no-filename")
+        if max_depth is not None:
+            args.extend(["--max-depth", str(max_depth)])
+        if include:
+            for inc in include:
+                args.extend(["-g", inc])
+        if exclude:
+            for exc in exclude:
+                args.extend(["-g", f"!{exc}"])
+        if file_type:
+            args.extend(["-t", file_type])
+
+        if rga_no_cache:
+            args.append("--rga-no-cache")
+
+        args.extend([f"--rga-cache-max-blob-len={str(rga_cache_max_blob_len)}"])
+
+        if rga_cache_path:
+            args.extend([f"--rga-cache-path={str(rga_cache_path)}"])
+
+        args.append(pattern)
+
+        if path is not None:
+            if isinstance(path, (str, Path)):
+                args.append(str(path))
+            elif isinstance(path, list):
+                for p in path:
+                    args.append(str(p))
+            else:
+                raise TypeError(f"Unsupported type for 'path': {type(path)}")
+
+        # keys: returncode, stdout, stderr
+        result: Dict[str, Any] = await GrepRetriever._run_rga_async(
+            args=args,
+            json_output=not count_only,
+            timeout=timeout,
+        )
+
+        if result["returncode"] == 0:
+            if count_only:
+                counts = []
+                for line in result["stdout"].strip().splitlines():
+                    if ":" in line:
+                        p, c = line.rsplit(":", 1)
+                        counts.append({"path": p, "count": int(c)})
+                return counts
+            else:
+                return result["stdout"]
+        elif result["returncode"] == 1:
+            return []
+        else:
+            raise RuntimeError(
+                f"ripgrep-all failed (exit {result['returncode']}): {result['stderr'].strip()}"
+            )
+
+    @staticmethod
+    async def _retrieve_or(
+        terms: List[str],
+        **kwargs,
+    ) -> (List[Dict[str, Any]], str):
+        """OR: Match any term — simply concatenate with | (ripgrep-all supports alternation)."""
+        # Escape terms if literal mode
+        literal = kwargs.get("literal", False)
+        if literal:
+            escaped_terms = [
+                term.replace("\\", "\\\\").replace("|", "\\|") for term in terms
+            ]
+            pattern = "|".join(escaped_terms)
+            kwargs["literal"] = True  # still use -F to avoid regex meta
+        else:
+            # Wrap each term in (?:...) to avoid precedence issues
+            pattern = "|".join(f"(?:{term})" for term in terms)
+            kwargs["literal"] = False
+
+        result = await GrepRetriever._retrieve_single(pattern=pattern, **kwargs)
+
+        return result, pattern
+
+    @staticmethod
+    async def _retrieve_and(
+        terms: List[str],
+        match_same_line: bool = False,
+        **kwargs,
+    ) -> List[Dict[str, Any]]:
+        """AND: All terms in same file (default) or same line."""
+        count_only = kwargs.get("count_only", False)
+
+        # Step 1: Get files containing first term
+        first_matches = await GrepRetriever._retrieve_single(pattern=terms[0], **kwargs)
+        if not first_matches:
+            return []
+
+        if match_same_line:
+            # Line-level AND: filter lines containing all terms
+            def line_contains_all(
+                line: str, others: List[str], case_sensitive: bool
+            ) -> bool:
+                if not case_sensitive:
+                    line = line.lower()
+                    others = [t.lower() for t in others]
+                return all(term in line for term in others)
+
+            case_sensitive = kwargs.get("case_sensitive", False)
+            others = terms[1:]
+            return [
+                m
+                for m in first_matches
+                if m["type"] == "match"
+                and line_contains_all(
+                    m["data"]["lines"]["text"], others, case_sensitive
+                )
+            ]
+        else:
+            # File-level AND (default)
+            files_with_first = {
+                m["data"]["path"]["text"] for m in first_matches if m["type"] == "match"
+            }
+            qualifying_files = set()
+
+            for f in files_with_first:
+                valid = True
+                for term in terms[1:]:
+                    res = await GrepRetriever._retrieve_single(
+                        pattern=term,
+                        path=f,
+                        count_only=count_only,
+                        **{
+                            k: v
+                            for k, v in kwargs.items()
+                            if k not in ["path", "count_only"]
+                        },
+                    )
+                    if not res:  # no match
+                        valid = False
+                        break
+                if valid:
+                    qualifying_files.add(f)
+
+            # Collect matches for last term (or first) in qualifying files
+            target_term = terms[-1]
+            all_matches = []
+            kwargs["pattern"] = target_term
+            for f in qualifying_files:
+                kwargs["path"] = f
+                matches = await GrepRetriever._retrieve_single(**kwargs)
+                all_matches.extend(matches)
+            return all_matches
+
+    @staticmethod
+    async def _retrieve_not(
+        positive: str,
+        negatives: List[str],
+        **kwargs,
+    ) -> List[Dict[str, Any]]:
+        """NOT: Match positive term, but exclude files/lines containing any negative term."""
+        # count_only = kwargs.get("count_only", False)
+
+        # Step 1: Get matches for positive term
+        pos_matches = await GrepRetriever._retrieve_single(pattern=positive, **kwargs)
+        if not pos_matches:
+            return []
+
+        # Decide: file-level NOT (default) vs line-level NOT
+        # We use file-level for efficiency (avoid per-line Python filtering on large outputs)
+        files_with_positive = {
+            m["data"]["path"]["text"] for m in pos_matches if m["type"] == "match"
+        }
+        excluded_files = set()
+
+        for f in files_with_positive:
+            for neg in negatives:
+                res = await GrepRetriever._retrieve_single(
+                    pattern=neg,
+                    path=f,
+                    count_only=True,
+                    **{
+                        k: v
+                        for k, v in kwargs.items()
+                        if k not in ["path", "count_only"]
+                    },
+                )
+                if res:  # found negative → exclude this file
+                    excluded_files.add(f)
+                    break
+
+        # Keep only matches from non-excluded files
+        kept_matches = [
+            m
+            for m in pos_matches
+            if m["type"] == "match" and m["data"]["path"]["text"] not in excluded_files
+        ]
+
+        return kept_matches
+
+    async def list_files(
+        self,
+        path: Optional[str] = None,
+        *,
+        max_depth: Optional[int] = None,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+        file_type: Optional[str] = None,
+        hidden: bool = False,
+        follow_symlinks: bool = False,
+    ) -> List[str]:
+        """List files that would be searched by ripgrep-all (like `rga --files`).
+
+        Args:
+            path: Path to list files in.
+            max_depth: Maximum directory depth.
+            include: Glob patterns to include.
+            exclude: Glob patterns to exclude.
+            file_type: Restrict to file type (e.g., 'py').
+            hidden: Include hidden files/dirs (`--hidden`).
+            follow_symlinks: Follow symbolic links (`--follow`).
+
+        Returns:
+            List of relative file paths (strings).
+        """
+        args = ["--files"]
+        if max_depth is not None:
+            args.extend(["--max-depth", str(max_depth)])
+        if include:
+            for inc in include:
+                args.extend(["-g", inc])
+        if exclude:
+            for exc in exclude:
+                args.extend(["-g", f"!{exc}"])
+        if file_type:
+            args.extend(["-t", file_type])
+        if hidden:
+            args.append("--hidden")
+        if follow_symlinks:
+            args.append("--follow")
+        if path:
+            args.append(path)
+
+        result: Dict[str, Any] = await GrepRetriever._run_rga_async(
+            args, json_output=False
+        )
+        if result["returncode"] not in (0, 1):
+            raise RuntimeError(
+                f"ripgrep-all --files failed: {result['stderr'].strip()}"
+            )
+
+        return result["stdout"].strip().splitlines() if result["stdout"].strip() else []
+
+    def file_types(self) -> Dict[str, List[str]]:
+        """List supported file types and their associated globs/extensions.
+
+        Returns:
+            Dict mapping type names (e.g., 'python') to list of globs (e.g., ['*.py', '*.pyi']).
+        """
+        result = subprocess.run(
+            ["rga", "--type-list"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            check=True,
+        )
+        types = {}
+        for line in result.stdout.strip().splitlines():
+            if ":" in line:
+                name, globs = line.split(":", 1)
+                types[name.strip()] = [g.strip() for g in globs.split(",") if g.strip()]
+        return types
+
+    async def replace(
+        self,
+        pattern: str,
+        replacement: str,
+        path: Optional[str] = None,
+        *,
+        dry_run: bool = False,
+        case_sensitive: bool = False,
+        literal: bool = False,
+        whole_word: bool = False,
+        max_depth: Optional[int] = None,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+    ) -> List[Dict[str, Any]]:
+        """Perform search-and-replace using ripgrep-all (via `--replace`).
+
+        Caution: This modifies files in-place if dry_run=False.
+
+        Args:
+            pattern: Regex pattern to search for.
+            replacement: Replacement string (supports $1, $2, etc.).
+            path: Path to operate on.
+            dry_run: If True, only show matches/replacements, don't modify files.
+            case_sensitive: Enable case-sensitive matching.
+            literal: Treat pattern as literal string.
+            whole_word: Match whole words only.
+            max_depth: Max search depth.
+            include/exclude: Globs to include/exclude.
+
+        Returns:
+            List of replacement events (from `rga --json` output).
+        """
+        args = ["--replace", replacement]
+        if dry_run:
+            args.append("--dry-run")
+        else:
+            args.append("--passthru")  # needed for in-place replace with --replace
+        if case_sensitive:
+            args.append("-s")
+        else:
+            args.append("-i")
+        if literal:
+            args.append("-F")
+        if whole_word:
+            args.append("-w")
+        if max_depth is not None:
+            args.extend(["--max-depth", str(max_depth)])
+        if include:
+            for inc in include:
+                args.extend(["-g", inc])
+        if exclude:
+            for exc in exclude:
+                args.extend(["-g", f"!{exc}"])
+
+        args.append(pattern)
+        if path:
+            args.append(path)
+
+        result = await GrepRetriever._run_rga_async(args)
+        if result["returncode"] not in (0, 1):
+            raise RuntimeError(
+                f"ripgrep-all replace failed: {result['stderr'].strip()}"
+            )
+
+        return result["stdout"]
+
+    def version(self) -> str:
+        """Get ripgrep-all version string.
+
+        Returns:
+            Version string.
+        """
+        result = subprocess.run(
+            ["rga", "--version"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            check=True,
+        )
+        return result.stdout.strip().split("\n")[0]
+
+    def supports_feature(self, feature: str) -> bool:
+        """
+        Check if ripgrep-all supports a given feature (e.g., 'pcre2', 'json').
+
+        Args:
+            feature: Feature name (e.g., 'json', 'pcre2', 'lz4').
+
+        Returns:
+            True if feature is available in this rga build.
+        """
+        result = subprocess.run(
+            ["rga", "--help"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            check=True,
+        )
+        return f"--{feature}" in result.stdout
+
+    @staticmethod
+    def _calculate_relevance_score(
+        text: str,
+        terms: List[str],
+        *,
+        case_sensitive: bool = False,
+        whole_word: bool = False,
+        length_norm: Literal["linear", "log", "none"] = "linear",
+        base_length: int = 100,
+        exact_bonus: float = 2.0,
+        tf_weight: float = 1.0,
+        term_weights: Optional[List[float]] = None,
+        tf_saturation: Literal["log", "sigmoid", "none"] = "sigmoid",
+        saturation_k: float = 1.0,
+        idf_simulate: bool = True,
+    ) -> float:
+        """
+        Compute a relevance score for a text w.r.t. a list of query terms.
+
+        Scoring formula (per term, then summed):
+            term_score = (TF_term ** tf_weight) * bonus_term * length_factor
+
+        Where:
+            - TF_term = number of matches for the term
+            - bonus_term = exact_bonus if at least one match is *isolated*, else 1.0
+            - length_factor = global penalty for long texts (shared across terms)
+
+        An *isolated* match means the term is surrounded by non-alphanumeric characters
+        or string boundaries (i.e., standalone token — higher relevance).
+
+        Args:
+            text: Text to score (e.g., matching line or surrounding context).
+            terms: List of query terms (e.g., ["TODO", "fix"]).
+            case_sensitive: Whether matching is case-sensitive.
+            whole_word: If True, only matches bounded by non-word chars are counted.
+            length_norm: Length penalty strategy ('linear' / 'log' / 'none').
+            base_length: Scaling for linear norm (default 100 chars).
+            exact_bonus: Bonus multiplier for isolated matches (e.g., 2.0).
+            tf_weight: Exponent for term frequency (e.g., 0.5 for sqrt(TF)).
+            term_weights: Optional weights for each term (default: uniform 1.0).
+                Must have same length as `terms` if provided.
+            tf_saturation: How to saturate term frequency:
+                - 'log': tf_adj = 1 + log(tf) (smooth, BM25-like)
+                - 'sigmoid': tf_adj = tf / (tf + k) (bounded [0,1))
+                - 'none': tf_adj = tf (original)
+            saturation_k: Parameter for sigmoid (default 1.0); larger → slower saturation.
+            idf_simulate: If True, penalize short/common terms heuristically:
+                idf_factor = max(0.1, min(1.0, len(term) / 5))  # e.g., "a" → 0.2, "error" → 1.0
+
+        Returns:
+            Non-negative float relevance score (sum of term scores).
+        """
+        if not text or not terms:
+            return 0.0
+
+        if term_weights is not None:
+            if len(term_weights) != len(terms):
+                raise ValueError("term_weights must have same length as terms")
+        else:
+            term_weights = [1.0] * len(terms)
+
+        flags = 0 if case_sensitive else re.IGNORECASE
+
+        # Precompute length factor (shared)
+        n = len(text)
+        if length_norm == "linear":
+            length_factor = 1.0 / (n / base_length + 1)
+        elif length_norm == "log":
+            length_factor = 1.0 / (math.log(n + 1) + 1)
+        else:  # "none"
+            length_factor = 1.0
+
+        total_score = 0.0
+
+        for term, weight in zip(terms, term_weights):
+            if not term:
+                continue
+
+            if idf_simulate:
+                # Normalize: len=1→0.2, len=5+→1.0 (clamped)
+                idf_factor = max(0.2, min(1.0, len(term) / 5.0))
+            else:
+                idf_factor = 1.0
+
+            escaped = re.escape(term)
+            if whole_word:
+                regex = rf"(?<!\w){escaped}(?!\w)"
+            else:
+                regex = escaped
+
+            # Find matches
+            try:
+                matches_iter = re.finditer(regex, text, flags=flags)
+                match_positions: List[int] = [m.start() for m in matches_iter]
+                tf = len(match_positions)
+            except re.error as e:
+                logger.warning(f"Regex failed for term {term!r}: {e}")
+                tf = 0
+
+            if tf == 0:
+                continue
+
+            # Isolation bonus
+            has_isolated_match = any(
+                (pos == 0 or not text[pos - 1].isalnum())
+                and (
+                    pos + len(term) == len(text) or not text[pos + len(term)].isalnum()
+                )
+                for pos in match_positions
+            )
+            bonus = exact_bonus if has_isolated_match else 1.0
+
+            if tf_saturation == "log":
+                tf_adj = 1.0 + math.log(tf)  # log(1)=0 → tf=1 → tf_adj=1.0
+            elif tf_saturation == "sigmoid":
+                tf_adj = tf / (tf + saturation_k)  # e.g., k=1: tf=1→0.5, tf=10→0.91
+            elif tf_saturation == "none":
+                tf_adj = float(tf)
+            else:
+                raise ValueError(f"Unknown tf_saturation: {tf_saturation}")
+
+            # Term score with saturation + IDF + bonus + weight
+            term_score = (tf_adj**tf_weight) * bonus * weight * idf_factor
+            total_score += term_score
+
+        score = total_score * length_factor
+        return max(0.0, score)
+
+    @staticmethod
+    def merge_results(
+        raw_results: List[Dict[str, Any]], limit: int = 50
+    ) -> List[Dict[str, Any]]:
+        """
+        Merge ripgrep-all --json output into a structured per-file result list.
+
+        This function:
+        - Groups 'match' entries by file path (bounded by 'begin' and 'end' events).
+        - For each file, collects 'match' items and sorts them by score (desc).
+        - Takes top-`limit` matches per file (default: 50).
+        - Combines all lines.text from selected matches into a list (in original match order).
+        - Returns a list of unified file results.
+
+        Args:
+            raw_results: List of parsed JSON objects from `rga --json` output.
+            limit: Maximum number of match items to keep per file (default: 50).
+
+        Returns:
+            A list of dictionaries, each representing one file with:
+            - "path": str
+            - "matches": List[Dict]  # top `limit` match items (sorted by score desc)
+            - "lines": List[str]  # lines.text from those matches, in match order
+            - "total_matches": int  # total matches found in this file (before limit)
+        """
+        if not raw_results:
+            return []
+
+        # State tracking
+        current_path: Optional[str] = None
+        file_matches: List[Dict[str, Any]] = []  # matches for current file
+        all_files: List[Dict[str, Any]] = []  # final result accumulator
+
+        for item in raw_results:
+            item_type = item.get("type")
+            data = item.get("data", {})
+
+            if item_type == "begin":
+                # Start a new file context
+                path_obj = data.get("path", {})
+                current_path = path_obj.get("text")
+                file_matches = []  # reset match buffer
+
+            elif item_type == "match" and current_path is not None:
+                # Accumulate match; retain full item for sorting & line extraction
+                # Note: 'score' is top-level in your example (not in 'data')
+                file_matches.append(item)
+
+            elif item_type == "end":
+                # Finalize current file
+                if current_path is not None:
+                    # Sort matches by score (descending); assume score exists
+                    file_matches.sort(key=lambda x: x.get("score", 0.0), reverse=True)
+
+                    total_count = len(file_matches)
+                    top_matches = file_matches[:limit]
+
+                    # Extract lines.text in match order (not sorted order if stable sort not guaranteed)
+                    # But since we sort, we preserve sorted order → lines follow score order
+                    lines = [
+                        match["data"]["lines"]["text"]
+                        for match in top_matches
+                        if "data" in match and "lines" in match["data"]
+                    ]
+
+                    all_files.append(
+                        {
+                            "path": current_path,
+                            "matches": top_matches,  # full match objects
+                            "lines": lines,  # list of line strings
+                            "total_matches": total_count,  # before limiting
+                            "total_score": 0.0,
+                        }
+                    )
+
+                    # Reset
+                    current_path = None
+                    file_matches = []
+
+            # Ignore "summary" and unknown types
+
+        return all_files
+
+
+class TextRetriever(GrepRetriever):
+    """Alias for GrepRetriever for backward compatibility."""
+
+    pass