sirchmunk 0.0.0__py3-none-any.whl → 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. sirchmunk/__init__.py +8 -0
  2. sirchmunk/base.py +17 -0
  3. sirchmunk/insight/__init__.py +4 -0
  4. sirchmunk/insight/text_insights.py +292 -0
  5. sirchmunk/learnings/__init__.py +1 -0
  6. sirchmunk/learnings/evidence_processor.py +525 -0
  7. sirchmunk/learnings/knowledge_base.py +232 -0
  8. sirchmunk/llm/__init__.py +2 -0
  9. sirchmunk/llm/openai_chat.py +247 -0
  10. sirchmunk/llm/prompts.py +216 -0
  11. sirchmunk/retrieve/__init__.py +1 -0
  12. sirchmunk/retrieve/base.py +25 -0
  13. sirchmunk/retrieve/text_retriever.py +1026 -0
  14. sirchmunk/scan/__init__.py +1 -0
  15. sirchmunk/scan/base.py +18 -0
  16. sirchmunk/scan/file_scanner.py +373 -0
  17. sirchmunk/scan/web_scanner.py +18 -0
  18. sirchmunk/scheduler/__init__.py +0 -0
  19. sirchmunk/schema/__init__.py +2 -0
  20. sirchmunk/schema/cognition.py +106 -0
  21. sirchmunk/schema/context.py +25 -0
  22. sirchmunk/schema/knowledge.py +318 -0
  23. sirchmunk/schema/metadata.py +658 -0
  24. sirchmunk/schema/request.py +221 -0
  25. sirchmunk/schema/response.py +20 -0
  26. sirchmunk/schema/snapshot.py +346 -0
  27. sirchmunk/search.py +475 -0
  28. sirchmunk/storage/__init__.py +7 -0
  29. sirchmunk/storage/duckdb.py +676 -0
  30. sirchmunk/storage/knowledge_manager.py +720 -0
  31. sirchmunk/utils/__init__.py +15 -0
  32. sirchmunk/utils/constants.py +15 -0
  33. sirchmunk/utils/deps.py +23 -0
  34. sirchmunk/utils/file_utils.py +70 -0
  35. sirchmunk/utils/install_rga.py +124 -0
  36. sirchmunk/utils/log_utils.py +360 -0
  37. sirchmunk/utils/tokenizer_util.py +55 -0
  38. sirchmunk/utils/utils.py +108 -0
  39. sirchmunk/version.py +1 -1
  40. sirchmunk-0.0.1.dist-info/METADATA +416 -0
  41. sirchmunk-0.0.1.dist-info/RECORD +45 -0
  42. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/WHEEL +1 -1
  43. sirchmunk-0.0.0.dist-info/METADATA +0 -26
  44. sirchmunk-0.0.0.dist-info/RECORD +0 -8
  45. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/entry_points.txt +0 -0
  46. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/licenses/LICENSE +0 -0
  47. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/top_level.txt +0 -0
sirchmunk/retrieve/text_retriever.py
@@ -0,0 +1,1026 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.
+ import asyncio
+ import json
+ import math
+ import re
+ import subprocess
+ from pathlib import Path
+ from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+
+ from loguru import logger
+
+ from ..utils.constants import GREP_CONCURRENT_LIMIT, DEFAULT_WORK_PATH
+ from ..utils.file_utils import StorageStructure
+ from .base import BaseRetriever
+
+ RGA_SEMAPHORE = asyncio.Semaphore(value=GREP_CONCURRENT_LIMIT)
+
+
+ class GrepRetriever(BaseRetriever):
+     """A Python wrapper for ripgrep-all (rga), exposing most of its functionality via static helpers.
+
+     The core search helpers are static and return parsed results. JSON output is preferred
+     where possible for reliable parsing. Shell injection is mitigated by invoking `rga`
+     without a shell (`shell=False` / `create_subprocess_exec`) and with explicit argument lists.
+
+     For more information about ripgrep-all, please refer to `https://github.com/phiresky/ripgrep-all`
+     """
+
+     def __init__(self, work_path: Optional[Union[str, Path]] = None, **kwargs):
+         super().__init__()
+
+         self.work_path: Path = Path(work_path or DEFAULT_WORK_PATH)
+         self.rga_cache: Path = (
+             self.work_path / StorageStructure.CACHE_DIR / StorageStructure.GREP_DIR
+         )
+         self.rga_cache.mkdir(parents=True, exist_ok=True)
+
+     async def retrieve(
+         self,
+         terms: Union[str, List[str]],
+         path: Union[str, Path, List[str], List[Path], None] = None,
+         logic: Literal["and", "or", "not"] = "or",
+         *,
+         case_sensitive: bool = False,
+         whole_word: bool = False,
+         literal: bool = False,
+         regex: bool = True,
+         max_depth: Optional[int] = None,
+         include: Optional[List[str]] = None,
+         exclude: Optional[List[str]] = None,
+         file_type: Optional[str] = None,
+         invert_match: bool = False,
+         count_only: bool = False,
+         line_number: bool = True,
+         with_filename: bool = True,
+         rank: bool = True,
+         rank_kwargs: Optional[Dict] = None,
+         rga_no_cache: bool = False,
+         rga_cache_max_blob_len: int = 10000000,
+         rga_cache_path: Optional[Union[str, Path]] = None,
+         timeout: float = 60.0,
+     ) -> List[Dict[str, Any]]:
+         """Search for terms in files using ripgrep-all, supporting AND/OR/NOT logic.
+
+         Args:
+             terms: Single pattern (str) or list of patterns (List[str]).
+             path: Single path (str/Path) or multiple paths (List[str]/List[Path]) to search in
+                 (defaults to the current directory).
+             logic:
+                 - "or" (default): match any term (A OR B OR C)
+                 - "and": match all terms in the same file (A AND B AND C)
+                 - "not": match the first term but NOT any of the rest (A AND NOT B AND NOT C)
+             case_sensitive: If True, enable case-sensitive search (`-s`).
+             whole_word: If True, match whole words only (`-w`).
+             literal: If True, treat patterns as literal strings (`-F`). Applies to all terms.
+             regex: If False, implies `literal=True`.
+             max_depth: Maximum directory depth to search (`--max-depth`).
+             include: List of glob patterns to include (`-g`).
+             exclude: List of glob patterns to exclude (`-g '!...'`).
+             file_type: Search only files of the given type (`-t <type>`), e.g., 'py', 'md'.
+             invert_match: Invert match (`-v`). Note: conflicts with `logic="not"`; ignored in that case.
+             count_only: Only output match counts per file (`-c`).
+             line_number: Show line numbers (`-n`, default True).
+             with_filename: Show filenames (`-H`, default True).
+             rank: If True, rerank results by relevance score.
+             rank_kwargs: Additional kwargs for ranking (see `_calculate_relevance_score`).
+             rga_no_cache: If True, disable rga caching (`--rga-no-cache`).
+             rga_cache_max_blob_len: Max blob length for the rga cache (`--rga-cache-max-blob-len`).
+                 Defaults to 10 MB.
+             rga_cache_path: Custom path for the rga cache (`--rga-cache-path`).
+                 If None, defaults to `<work_path>/.cache/rga`.
+             timeout: Maximum time in seconds to wait for the search to complete.
+
+         Returns:
+             List of match objects (from `rga --json`), or a list of {'path': str, 'count': int}
+             if `count_only=True`. For "and"/"not", matches correspond to the **last term**
+             (or first, if only one) in qualifying files/lines.
+         """
+         results: List[Dict[str, Any]] = []
+
+         # Normalize terms
+         if isinstance(terms, str):
+             terms = [terms]
+         if not terms:
+             return results
+
+         rga_cache_path = rga_cache_path or self.rga_cache
+         rga_cache_path = str(Path(rga_cache_path).resolve())
+
+         # Multi-term logic routing
+         if logic == "or":
+             results, retrieve_pattern = await self._retrieve_or(
+                 terms=terms,
+                 path=path,
+                 case_sensitive=case_sensitive,
+                 whole_word=whole_word,
+                 literal=literal,
+                 regex=regex,
+                 max_depth=max_depth,
+                 include=include,
+                 exclude=exclude,
+                 file_type=file_type,
+                 invert_match=invert_match,
+                 count_only=count_only,
+                 line_number=line_number,
+                 with_filename=with_filename,
+                 rga_no_cache=rga_no_cache,
+                 rga_cache_max_blob_len=rga_cache_max_blob_len,
+                 rga_cache_path=rga_cache_path,
+                 timeout=timeout,
+             )
+         elif logic == "and":
+             results = await self._retrieve_and(
+                 terms=terms,
+                 path=path,
+                 case_sensitive=case_sensitive,
+                 whole_word=whole_word,
+                 literal=literal,
+                 regex=regex,
+                 max_depth=max_depth,
+                 include=include,
+                 exclude=exclude,
+                 file_type=file_type,
+                 count_only=count_only,
+                 line_number=line_number,
+                 with_filename=with_filename,
+                 match_same_line=False,  # file-level AND (most useful)
+                 rga_no_cache=rga_no_cache,
+                 rga_cache_max_blob_len=rga_cache_max_blob_len,
+                 rga_cache_path=rga_cache_path,
+                 timeout=timeout,
+             )
+         elif logic == "not":
+             if len(terms) < 2:
+                 raise ValueError(
+                     "logic='not' requires at least two terms: [positive, negative1, ...]"
+                 )
+             results = await self._retrieve_not(
+                 positive=terms[0],
+                 negatives=terms[1:],
+                 path=path,
+                 case_sensitive=case_sensitive,
+                 whole_word=whole_word,
+                 literal=literal,
+                 regex=regex,
+                 max_depth=max_depth,
+                 include=include,
+                 exclude=exclude,
+                 file_type=file_type,
+                 count_only=count_only,
+                 line_number=line_number,
+                 with_filename=with_filename,
+                 rga_no_cache=rga_no_cache,
+                 rga_cache_max_blob_len=rga_cache_max_blob_len,
+                 rga_cache_path=rga_cache_path,
+                 timeout=timeout,
+             )
+         else:
+             raise ValueError(
+                 f"Unsupported logic: {logic}. Choose from 'and', 'or', 'not'."
+             )
+
+         # ====== Reranking Post-Processing ======
+         if rank and not count_only and results:
+             rank_kwargs = rank_kwargs or {}
+
+             def _default_text_extractor(match: Dict) -> str:
+                 try:
+                     return match["data"]["lines"]["text"]
+                 except (KeyError, TypeError):
+                     return ""
+
+             score_opts = {
+                 "case_sensitive": case_sensitive,
+                 "whole_word": whole_word,
+                 "length_norm": rank_kwargs.get("length_norm", "linear"),
+                 "base_length": rank_kwargs.get("base_length", 100),
+                 "exact_bonus": rank_kwargs.get("exact_bonus", 2.0),
+                 "tf_weight": rank_kwargs.get("tf_weight", 1.0),
+                 "term_weights": rank_kwargs.get("term_weights", None),
+             }
+
+             # Reconstruct file groups: [begin, match*, end]*
+             grouped: List[List[Dict]] = []
+             current_group: List[Dict] = []
+
+             for item in results:
+                 item_type = item.get("type")
+                 if item_type == "begin":
+                     # Start new group
+                     if current_group:
+                         grouped.append(current_group)
+                     current_group = [item]
+                 elif item_type == "end":
+                     # Close current group
+                     current_group.append(item)
+                     grouped.append(current_group)
+                     current_group = []
+                 elif item_type == "match":
+                     # Accumulate in current group
+                     if current_group:  # defensive: should always be inside begin/end
+                         current_group.append(item)
+                     else:
+                         # Orphan match? Append to new dummy group (should not happen)
+                         current_group = [
+                             {"type": "begin", "data": {"path": {"text": "<unknown>"}}},
+                             item,
+                         ]
+                 else:
+                     # e.g., "summary" — append to current or new group
+                     if current_group:
+                         current_group.append(item)
+                     else:
+                         grouped.append([item])
+
+             # If an unclosed group remains (e.g., no final 'end'), flush it
+             if current_group:
+                 grouped.append(current_group)
+
+             # Process each group: rerank only the 'match' items inside
+             new_results: List[Dict] = []
+
+             for group in grouped:
+                 if not group:
+                     continue
+
+                 # Identify begin / match / end segments
+                 match_items = [g for g in group if g.get("type") == "match"]
+
+                 # Rerank match items
+                 scored_matches = []
+                 for m in match_items:
+                     text = _default_text_extractor(m)
+                     score = self._calculate_relevance_score(
+                         text=text, terms=terms, **score_opts
+                     )
+                     new_m = {**m, "score": score}
+                     scored_matches.append((score, new_m))
+
+                 # Sort descending by score
+                 scored_matches.sort(key=lambda x: x[0], reverse=True)
+                 reranked_matches = [item for _, item in scored_matches]
+
+                 # Rebuild group in correct order:
+                 # [begin] + [other non-match items in original order] + [reranked matches] + [end]
+                 # But preserve *relative order* of non-match items (e.g., context lines)
+                 # Simpler: walk original group, replace match list with reranked one
+                 rebuilt_group = []
+                 match_iter = iter(reranked_matches)
+                 for item in group:
+                     if item.get("type") == "match":
+                         # Pull next from reranked list (should be same length)
+                         try:
+                             rebuilt_group.append(next(match_iter))
+                         except StopIteration:
+                             pass  # fallback: skip (should not happen)
+                     else:
+                         rebuilt_group.append(item)
+
+                 new_results.extend(rebuilt_group)
+
+             results = new_results
+
+         return results
+
+     @staticmethod
+     def _run_rga(
+         args: List[str], json_output: bool = True
+     ) -> subprocess.CompletedProcess:
+         """Run ripgrep-all with given arguments.
+
+         Args:
+             args: List of ripgrep-all CLI arguments.
+             json_output: If True, forces `--json` and parses stdout as JSON Lines.
+
+         Returns:
+             CompletedProcess object with parsed stdout (as a list of dicts if json_output=True).
+         """
+         cmd = ["rga", "--no-config"]  # disable user config for reproducibility
+         if json_output:
+             cmd.append("--json")
+         cmd.extend(args)
+
+         try:
+             result = subprocess.run(
+                 cmd,
+                 stdout=subprocess.PIPE,
+                 stderr=subprocess.PIPE,
+                 text=True,
+                 check=False,  # we handle non-zero exit codes manually
+             )
+
+             if result.returncode != 0:
+                 if "ripgrep" in result.stderr.lower() or " rg " in result.stderr:
+                     raise RuntimeError(
+                         f"ripgrep-all depends on 'ripgrep' (rg), but it's missing: {result.stderr.strip()}"
+                     )
+                 elif result.returncode > 1:
+                     raise RuntimeError(f"rga execution failed: {result.stderr.strip()}")
+
+             # Parse JSON Lines if requested
+             if json_output and result.returncode in (0, 1) and result.stdout.strip():
+                 lines = result.stdout.strip().splitlines()
+                 result.stdout = [json.loads(line) for line in lines if line]
+             return result
+         except FileNotFoundError:
+             raise RuntimeError(
+                 "ripgrep-all ('rga') not found. Please install ripgrep-all first."
+             )
+         except json.JSONDecodeError as e:
+             raise RuntimeError(
+                 f"Failed to parse ripgrep-all JSON output: {e}\nRaw output: {result.stdout}"
+             )
+
+     @staticmethod
+     async def _run_rga_async(
+         args: List[str], json_output: bool = True, timeout: float = 60.0
+     ) -> Dict[str, Any]:
+         """Async variant of `_run_rga`; returns {'returncode', 'stdout', 'stderr'}."""
+         cmd = ["rga", "--no-config"]
+         if json_output:
+             cmd.append("--json")
+         cmd.extend(args)
+
+         # Bound global rga concurrency with the module-level semaphore
+         try:
+             await asyncio.wait_for(RGA_SEMAPHORE.acquire(), timeout=timeout)
+         except asyncio.TimeoutError:
+             raise RuntimeError(
+                 f"rga search timed out while waiting for a queue slot ({timeout}s)."
+             )
+
+         try:
+             try:
+                 process = await asyncio.create_subprocess_exec(
+                     *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+                 )
+             except FileNotFoundError:
+                 raise RuntimeError(
+                     "ripgrep-all ('rga') not found. Please install it first."
+                 )
+
+             try:
+                 stdout, stderr = await asyncio.wait_for(
+                     process.communicate(), timeout=timeout
+                 )
+
+                 stdout_str = stdout.decode().strip()
+                 stderr_str = stderr.decode().strip()
+                 returncode = process.returncode
+
+                 if returncode != 0:
+                     if "ripgrep" in stderr_str.lower() or " rg " in stderr_str:
+                         raise RuntimeError(
+                             f"ripgrep-all depends on 'ripgrep' (rg), but it's missing or failed: {stderr_str}"
+                         )
+                     elif returncode > 1:
+                         raise RuntimeError(
+                             f"rga execution failed with code {returncode}: {stderr_str}"
+                         )
+
+                 # Parse JSON Lines
+                 parsed_stdout = stdout_str
+                 if json_output and returncode in (0, 1) and stdout_str:
+                     try:
+                         parsed_stdout = [
+                             json.loads(line) for line in stdout_str.splitlines() if line
+                         ]
+                     except json.JSONDecodeError as e:
+                         raise RuntimeError(f"Failed to parse rga JSON output: {e}")
+
+                 return {
+                     "returncode": returncode,
+                     "stdout": parsed_stdout,
+                     "stderr": stderr_str,
+                 }
+             except asyncio.TimeoutError:
+                 try:
+                     process.kill()
+                 except ProcessLookupError:
+                     pass
+                 raise RuntimeError(f"rga process execution timed out ({timeout}s).")
+
+         finally:
+             RGA_SEMAPHORE.release()
+
+     @staticmethod
+     async def _retrieve_single(**kwargs) -> List[Dict[str, Any]]:
+         """Wrapper for the original single-pattern search (extracted for reuse)."""
+         pattern = kwargs.pop("pattern")
+         args = []
+
+         # Basic ripgrep-all args
+         regex = kwargs.get("regex", True)
+         literal = kwargs.get("literal", False)
+         case_sensitive = kwargs.get("case_sensitive", False)
+         whole_word = kwargs.get("whole_word", False)
+         invert_match = kwargs.get("invert_match", False)
+         count_only = kwargs.get("count_only", False)
+         line_number = kwargs.get("line_number", True)
+         with_filename = kwargs.get("with_filename", True)
+         max_depth = kwargs.get("max_depth")
+         include = kwargs.get("include")
+         exclude = kwargs.get("exclude")
+         file_type = kwargs.get("file_type")
+         path = kwargs.get("path")
+         timeout = kwargs.get("timeout", 60.0)
+
+         # Additional ripgrep-all args
+         rga_no_cache = kwargs.get("rga_no_cache", False)
+         rga_cache_max_blob_len = kwargs.get(
+             "rga_cache_max_blob_len", 10000000
+         )  # Default 10MB
+         rga_cache_path = kwargs.get("rga_cache_path")
+
+         # Build argument list
+         if not regex:
+             literal = True
+         if literal:
+             args.append("-F")
+         if case_sensitive:
+             args.append("-s")
+         else:
+             args.append("-i")
+         if whole_word:
+             args.append("-w")
+         if invert_match:
+             args.append("-v")
+         if count_only:
+             args.append("-c")
+         if not line_number:
+             args.append("--no-line-number")
+         if not with_filename:
+             args.append("--no-filename")
+         if max_depth is not None:
+             args.extend(["--max-depth", str(max_depth)])
+         if include:
+             for inc in include:
+                 args.extend(["-g", inc])
+         if exclude:
+             for exc in exclude:
+                 args.extend(["-g", f"!{exc}"])
+         if file_type:
+             args.extend(["-t", file_type])
+
+         if rga_no_cache:
+             args.append("--rga-no-cache")
+
+         args.append(f"--rga-cache-max-blob-len={rga_cache_max_blob_len}")
+
+         if rga_cache_path:
+             args.append(f"--rga-cache-path={rga_cache_path}")
+
+         args.append(pattern)
+
+         if path is not None:
+             if isinstance(path, (str, Path)):
+                 args.append(str(path))
+             elif isinstance(path, list):
+                 for p in path:
+                     args.append(str(p))
+             else:
+                 raise TypeError(f"Unsupported type for 'path': {type(path)}")
+
+         # keys: returncode, stdout, stderr
+         result: Dict[str, Any] = await GrepRetriever._run_rga_async(
+             args=args,
+             json_output=not count_only,
+             timeout=timeout,
+         )
+
+         if result["returncode"] == 0:
+             if count_only:
+                 # With `-c`, stdout is plain text: one "path:count" line per file
+                 counts = []
+                 for line in result["stdout"].strip().splitlines():
+                     if ":" in line:
+                         p, c = line.rsplit(":", 1)
+                         counts.append({"path": p, "count": int(c)})
+                 return counts
+             else:
+                 return result["stdout"]
+         elif result["returncode"] == 1:
+             return []
+         else:
+             raise RuntimeError(
+                 f"ripgrep-all failed (exit {result['returncode']}): {result['stderr'].strip()}"
+             )
+
+     @staticmethod
+     async def _retrieve_or(
+         terms: List[str],
+         **kwargs,
+     ) -> Tuple[List[Dict[str, Any]], str]:
+         """OR: Match any term by joining the patterns with regex alternation (`|`)."""
+         # Escape terms if in literal mode
+         literal = kwargs.get("literal", False)
+         if literal:
+             # `-F` cannot express alternation, so escape each term and run the
+             # search in regex mode; the escaped terms still match literally.
+             pattern = "|".join(re.escape(term) for term in terms)
+         else:
+             # Wrap each term in (?:...) to avoid precedence issues
+             pattern = "|".join(f"(?:{term})" for term in terms)
+         kwargs["literal"] = False
+         kwargs["regex"] = True  # alternation requires regex mode
+
+         result = await GrepRetriever._retrieve_single(pattern=pattern, **kwargs)
+
+         return result, pattern
+
+     @staticmethod
+     async def _retrieve_and(
+         terms: List[str],
+         match_same_line: bool = False,
+         **kwargs,
+     ) -> List[Dict[str, Any]]:
+         """AND: All terms in the same file (default) or on the same line."""
+         count_only = kwargs.get("count_only", False)
+
+         # Step 1: Get files containing the first term
+         first_matches = await GrepRetriever._retrieve_single(pattern=terms[0], **kwargs)
+         if not first_matches:
+             return []
+
+         if match_same_line:
+             # Line-level AND: filter lines containing all terms
+             def line_contains_all(
+                 line: str, others: List[str], case_sensitive: bool
+             ) -> bool:
+                 if not case_sensitive:
+                     line = line.lower()
+                     others = [t.lower() for t in others]
+                 return all(term in line for term in others)
+
+             case_sensitive = kwargs.get("case_sensitive", False)
+             others = terms[1:]
+             return [
+                 m
+                 for m in first_matches
+                 if m["type"] == "match"
+                 and line_contains_all(
+                     m["data"]["lines"]["text"], others, case_sensitive
+                 )
+             ]
+         else:
+             # File-level AND (default)
+             files_with_first = {
+                 m["data"]["path"]["text"] for m in first_matches if m["type"] == "match"
+             }
+             qualifying_files = set()
+
+             for f in files_with_first:
+                 valid = True
+                 for term in terms[1:]:
+                     res = await GrepRetriever._retrieve_single(
+                         pattern=term,
+                         path=f,
+                         count_only=count_only,
+                         **{
+                             k: v
+                             for k, v in kwargs.items()
+                             if k not in ["path", "count_only"]
+                         },
+                     )
+                     if not res:  # no match
+                         valid = False
+                         break
+                 if valid:
+                     qualifying_files.add(f)
+
+             # Collect matches for the last term (or first) in qualifying files
+             target_term = terms[-1]
+             all_matches = []
+             kwargs["pattern"] = target_term
+             for f in qualifying_files:
+                 kwargs["path"] = f
+                 matches = await GrepRetriever._retrieve_single(**kwargs)
+                 all_matches.extend(matches)
+             return all_matches
+
+     @staticmethod
+     async def _retrieve_not(
+         positive: str,
+         negatives: List[str],
+         **kwargs,
+     ) -> List[Dict[str, Any]]:
+         """NOT: Match the positive term, excluding files that contain any negative term."""
+         # Step 1: Get matches for the positive term
+         pos_matches = await GrepRetriever._retrieve_single(pattern=positive, **kwargs)
+         if not pos_matches:
+             return []
+
+         # Decide: file-level NOT (default) vs line-level NOT.
+         # File-level is used for efficiency (avoids per-line Python filtering on large outputs).
+         files_with_positive = {
+             m["data"]["path"]["text"] for m in pos_matches if m["type"] == "match"
+         }
+         excluded_files = set()
+
+         for f in files_with_positive:
+             for neg in negatives:
+                 res = await GrepRetriever._retrieve_single(
+                     pattern=neg,
+                     path=f,
+                     count_only=True,
+                     **{
+                         k: v
+                         for k, v in kwargs.items()
+                         if k not in ["path", "count_only"]
+                     },
+                 )
+                 if res:  # found a negative match → exclude this file
+                     excluded_files.add(f)
+                     break
+
+         # Keep only matches from non-excluded files
+         kept_matches = [
+             m
+             for m in pos_matches
+             if m["type"] == "match" and m["data"]["path"]["text"] not in excluded_files
+         ]
+
+         return kept_matches
+
+     async def list_files(
+         self,
+         path: Optional[str] = None,
+         *,
+         max_depth: Optional[int] = None,
+         include: Optional[List[str]] = None,
+         exclude: Optional[List[str]] = None,
+         file_type: Optional[str] = None,
+         hidden: bool = False,
+         follow_symlinks: bool = False,
+     ) -> List[str]:
+         """List files that would be searched by ripgrep-all (like `rga --files`).
+
+         Args:
+             path: Path to list files in.
+             max_depth: Maximum directory depth.
+             include: Glob patterns to include.
+             exclude: Glob patterns to exclude.
+             file_type: Restrict to file type (e.g., 'py').
+             hidden: Include hidden files/dirs (`--hidden`).
+             follow_symlinks: Follow symbolic links (`--follow`).
+
+         Returns:
+             List of relative file paths (strings).
+         """
+         args = ["--files"]
+         if max_depth is not None:
+             args.extend(["--max-depth", str(max_depth)])
+         if include:
+             for inc in include:
+                 args.extend(["-g", inc])
+         if exclude:
+             for exc in exclude:
+                 args.extend(["-g", f"!{exc}"])
+         if file_type:
+             args.extend(["-t", file_type])
+         if hidden:
+             args.append("--hidden")
+         if follow_symlinks:
+             args.append("--follow")
+         if path:
+             args.append(path)
+
+         result: Dict[str, Any] = await GrepRetriever._run_rga_async(
+             args, json_output=False
+         )
+         if result["returncode"] not in (0, 1):
+             raise RuntimeError(
+                 f"ripgrep-all --files failed: {result['stderr'].strip()}"
+             )
+
+         return result["stdout"].strip().splitlines() if result["stdout"].strip() else []
+
+     def file_types(self) -> Dict[str, List[str]]:
+         """List supported file types and their associated globs/extensions.
+
+         Returns:
+             Dict mapping type names (e.g., 'python') to lists of globs (e.g., ['*.py', '*.pyi']).
+         """
+         result = subprocess.run(
+             ["rga", "--type-list"],
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             text=True,
+             check=True,
+         )
+         types = {}
+         for line in result.stdout.strip().splitlines():
+             if ":" in line:
+                 name, globs = line.split(":", 1)
+                 types[name.strip()] = [g.strip() for g in globs.split(",") if g.strip()]
+         return types
+
+     async def replace(
+         self,
+         pattern: str,
+         replacement: str,
+         path: Optional[str] = None,
+         *,
+         dry_run: bool = False,
+         case_sensitive: bool = False,
+         literal: bool = False,
+         whole_word: bool = False,
+         max_depth: Optional[int] = None,
+         include: Optional[List[str]] = None,
+         exclude: Optional[List[str]] = None,
+     ) -> List[Dict[str, Any]]:
+         """Preview search-and-replace using ripgrep-all (via `--replace`).
+
+         Caution: rg/rga's `--replace` rewrites matched text in the *printed output*
+         only; it never modifies files on disk.
+
+         Args:
+             pattern: Regex pattern to search for.
+             replacement: Replacement string (supports $1, $2, etc.).
+             path: Path to operate on.
+             dry_run: If True, print only matching lines; otherwise add `--passthru`
+                 so the output also includes non-matching lines (the full rewritten content).
+             case_sensitive: Enable case-sensitive matching.
+             literal: Treat pattern as a literal string.
+             whole_word: Match whole words only.
+             max_depth: Max search depth.
+             include/exclude: Globs to include/exclude.
+
+         Returns:
+             List of replacement events (from `rga --json` output).
+         """
+         args = ["--replace", replacement]
+         if not dry_run:
+             args.append("--passthru")  # also print non-matching lines
+         if case_sensitive:
+             args.append("-s")
+         else:
+             args.append("-i")
+         if literal:
+             args.append("-F")
+         if whole_word:
+             args.append("-w")
+         if max_depth is not None:
+             args.extend(["--max-depth", str(max_depth)])
+         if include:
+             for inc in include:
+                 args.extend(["-g", inc])
+         if exclude:
+             for exc in exclude:
+                 args.extend(["-g", f"!{exc}"])
+
+         args.append(pattern)
+         if path:
+             args.append(path)
+
+         result = await GrepRetriever._run_rga_async(args)
+         if result["returncode"] not in (0, 1):
+             raise RuntimeError(
+                 f"ripgrep-all replace failed: {result['stderr'].strip()}"
+             )
+
+         return result["stdout"]
+
+     def version(self) -> str:
+         """Get ripgrep-all version string.
+
+         Returns:
+             Version string.
+         """
+         result = subprocess.run(
+             ["rga", "--version"],
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             text=True,
+             check=True,
+         )
+         return result.stdout.strip().split("\n")[0]
+
+     def supports_feature(self, feature: str) -> bool:
+         """
+         Check if ripgrep-all supports a given feature (e.g., 'pcre2', 'json').
+
+         Args:
+             feature: Feature name (e.g., 'json', 'pcre2', 'lz4').
+
+         Returns:
+             True if the feature is available in this rga build.
+         """
+         result = subprocess.run(
+             ["rga", "--help"],
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             text=True,
+             check=True,
+         )
+         return f"--{feature}" in result.stdout
+
+     @staticmethod
+     def _calculate_relevance_score(
+         text: str,
+         terms: List[str],
+         *,
+         case_sensitive: bool = False,
+         whole_word: bool = False,
+         length_norm: Literal["linear", "log", "none"] = "linear",
+         base_length: int = 100,
+         exact_bonus: float = 2.0,
+         tf_weight: float = 1.0,
+         term_weights: Optional[List[float]] = None,
+         tf_saturation: Literal["log", "sigmoid", "none"] = "sigmoid",
+         saturation_k: float = 1.0,
+         idf_simulate: bool = True,
+     ) -> float:
+         """
+         Compute a relevance score for a text w.r.t. a list of query terms.
+
+         Scoring formula (per term, then summed and length-normalized):
+             term_score = (tf_adj ** tf_weight) * bonus_term * weight_term * idf_factor
+             score      = sum(term_scores) * length_factor
+
+         Where:
+             - tf_adj = saturated term frequency (see `tf_saturation`)
+             - bonus_term = exact_bonus if at least one match is *isolated*, else 1.0
+             - length_factor = global penalty for long texts (shared across terms)
+
+         An *isolated* match means the term is surrounded by non-alphanumeric characters
+         or string boundaries (i.e., a standalone token — higher relevance).
+
+         Args:
+             text: Text to score (e.g., matching line or surrounding context).
+             terms: List of query terms (e.g., ["TODO", "fix"]).
+             case_sensitive: Whether matching is case-sensitive.
+             whole_word: If True, only matches bounded by non-word chars are counted.
+             length_norm: Length penalty strategy ('linear' / 'log' / 'none').
+             base_length: Scaling for linear norm (default 100 chars).
+             exact_bonus: Bonus multiplier for isolated matches (e.g., 2.0).
+             tf_weight: Exponent for term frequency (e.g., 0.5 for sqrt(TF)).
+             term_weights: Optional weights for each term (default: uniform 1.0).
+                 Must have the same length as `terms` if provided.
+             tf_saturation: How to saturate term frequency:
+                 - 'log':     tf_adj = 1 + log(tf)    (smooth, BM25-like)
+                 - 'sigmoid': tf_adj = tf / (tf + k)  (bounded [0, 1))
+                 - 'none':    tf_adj = tf             (original)
+             saturation_k: Parameter for sigmoid (default 1.0); larger → slower saturation.
+             idf_simulate: If True, penalize short/common terms heuristically:
+                 idf_factor = max(0.2, min(1.0, len(term) / 5))  # e.g., "a" → 0.2, "error" → 1.0
+
+         Returns:
+             Non-negative float relevance score (sum of term scores).
+         """
+         if not text or not terms:
+             return 0.0
+
+         if term_weights is not None:
+             if len(term_weights) != len(terms):
+                 raise ValueError("term_weights must have same length as terms")
+         else:
+             term_weights = [1.0] * len(terms)
+
+         flags = 0 if case_sensitive else re.IGNORECASE
+
+         # Precompute length factor (shared)
+         n = len(text)
+         if length_norm == "linear":
+             length_factor = 1.0 / (n / base_length + 1)
+         elif length_norm == "log":
+             length_factor = 1.0 / (math.log(n + 1) + 1)
+         else:  # "none"
+             length_factor = 1.0
+
+         total_score = 0.0
+
+         for term, weight in zip(terms, term_weights):
+             if not term:
+                 continue
+
+             if idf_simulate:
+                 # Normalize: len=1 → 0.2, len=5+ → 1.0 (clamped)
+                 idf_factor = max(0.2, min(1.0, len(term) / 5.0))
+             else:
+                 idf_factor = 1.0
+
+             escaped = re.escape(term)
+             if whole_word:
+                 regex = rf"(?<!\w){escaped}(?!\w)"
+             else:
+                 regex = escaped
+
+             # Find matches
+             try:
+                 matches_iter = re.finditer(regex, text, flags=flags)
+                 match_positions: List[int] = [m.start() for m in matches_iter]
+                 tf = len(match_positions)
+             except re.error as e:
+                 logger.warning(f"Regex failed for term {term!r}: {e}")
+                 tf = 0
+
+             if tf == 0:
+                 continue
+
+             # Isolation bonus
+             has_isolated_match = any(
+                 (pos == 0 or not text[pos - 1].isalnum())
+                 and (
+                     pos + len(term) == len(text) or not text[pos + len(term)].isalnum()
+                 )
+                 for pos in match_positions
+             )
+             bonus = exact_bonus if has_isolated_match else 1.0
+
+             if tf_saturation == "log":
+                 tf_adj = 1.0 + math.log(tf)  # log(1)=0 → tf=1 → tf_adj=1.0
+             elif tf_saturation == "sigmoid":
+                 tf_adj = tf / (tf + saturation_k)  # e.g., k=1: tf=1→0.5, tf=10→0.91
+             elif tf_saturation == "none":
+                 tf_adj = float(tf)
+             else:
+                 raise ValueError(f"Unknown tf_saturation: {tf_saturation}")
+
+             # Term score with saturation + IDF + bonus + weight
+             term_score = (tf_adj**tf_weight) * bonus * weight * idf_factor
+             total_score += term_score
+
+         score = total_score * length_factor
+         return max(0.0, score)
+
+     @staticmethod
+     def merge_results(
+         raw_results: List[Dict[str, Any]], limit: int = 50
+     ) -> List[Dict[str, Any]]:
+         """
+         Merge ripgrep-all --json output into a structured per-file result list.
+
+         This function:
+         - Groups 'match' entries by file path (bounded by 'begin' and 'end' events).
+         - For each file, collects 'match' items and sorts them by score (desc).
+         - Takes the top-`limit` matches per file (default: 50).
+         - Combines lines.text from the selected matches into a list (in score order).
+         - Returns a list of unified file results.
+
+         Args:
+             raw_results: List of parsed JSON objects from `rga --json` output.
+             limit: Maximum number of match items to keep per file (default: 50).
+
+         Returns:
+             A list of dictionaries, each representing one file with:
+             - "path": str
+             - "matches": List[Dict]   # top `limit` match items (sorted by score desc)
+             - "lines": List[str]      # lines.text from those matches, in score order
+             - "total_matches": int    # total matches found in this file (before limit)
+             - "total_score": float    # reserved; currently always 0.0
+         """
+         if not raw_results:
+             return []
+
+         # State tracking
+         current_path: Optional[str] = None
+         file_matches: List[Dict[str, Any]] = []  # matches for current file
+         all_files: List[Dict[str, Any]] = []  # final result accumulator
+
+         for item in raw_results:
+             item_type = item.get("type")
+             data = item.get("data", {})
+
+             if item_type == "begin":
+                 # Start a new file context
+                 path_obj = data.get("path", {})
+                 current_path = path_obj.get("text")
+                 file_matches = []  # reset match buffer
+
+             elif item_type == "match" and current_path is not None:
+                 # Accumulate match; retain the full item for sorting & line extraction.
+                 # Note: 'score' is attached at the top level of the item by `retrieve`
+                 # (not inside 'data').
+                 file_matches.append(item)
+
+             elif item_type == "end":
+                 # Finalize current file
+                 if current_path is not None:
+                     # Sort matches by score (descending); missing scores count as 0.0
+                     file_matches.sort(key=lambda x: x.get("score", 0.0), reverse=True)
+
+                     total_count = len(file_matches)
+                     top_matches = file_matches[:limit]
+
+                     # Extract lines.text in sorted order, so lines follow score order
+                     lines = [
+                         match["data"]["lines"]["text"]
+                         for match in top_matches
+                         if "data" in match and "lines" in match["data"]
+                     ]
+
+                     all_files.append(
+                         {
+                             "path": current_path,
+                             "matches": top_matches,  # full match objects
+                             "lines": lines,  # list of line strings
+                             "total_matches": total_count,  # before limiting
+                             "total_score": 0.0,  # placeholder; not computed here
+                         }
+                     )
+
+                 # Reset
+                 current_path = None
+                 file_matches = []
+
+             # Ignore "summary" and unknown types
+
+         return all_files
+
+
+ class TextRetriever(GrepRetriever):
+     """Alias for GrepRetriever for backward compatibility."""
+
+     pass
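
For orientation, a minimal usage sketch of the `GrepRetriever` API added in this release. It assumes `rga` (ripgrep-all) and its `rg` dependency are installed and on `PATH`; the work path, search path, and search terms below are hypothetical.

import asyncio

from sirchmunk.retrieve.text_retriever import GrepRetriever


async def main():
    retriever = GrepRetriever(work_path="/tmp/sirchmunk-demo")  # hypothetical work path

    # OR search: lines matching "TODO" or "FIXME" in Python files under ./src,
    # reranked by the built-in relevance score.
    raw = await retriever.retrieve(
        terms=["TODO", "FIXME"],
        path="./src",
        logic="or",
        include=["*.py"],
        rank=True,
    )

    # Collapse the rga --json event stream into per-file results,
    # keeping the top 10 matches per file.
    for file_result in GrepRetriever.merge_results(raw, limit=10):
        print(file_result["path"], file_result["total_matches"], file_result["lines"][:2])


asyncio.run(main())

As a rough worked example of the scorer with its defaults (`tf_saturation="sigmoid"`, `saturation_k=1.0`, `length_norm="linear"`, `base_length=100`): a 50-character line containing the isolated term "error" twice gives `tf_adj = 2/(2+1) = 2/3`, `bonus = 2.0`, `idf_factor = 1.0`, and `length_factor = 1/(50/100 + 1) = 2/3`, so the final score is `(2/3) * 2.0 * (2/3) ≈ 0.89`.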