ragrep 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ragrep/__init__.py ADDED
@@ -0,0 +1,41 @@
1
+ """RAGrep public package API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ __version__ = "0.2.1"
6
+ __author__ = "RAGrep Team"
7
+
8
+
9
+ def __getattr__(name: str):
10
+ if name in {"RAGrep", "RAGSystem"}:
11
+ from .core.rag_system import RAGrep, RAGSystem
12
+
13
+ return RAGrep if name == "RAGrep" else RAGSystem
14
+
15
+ if name == "DocumentProcessor":
16
+ from .core.document_processor import DocumentProcessor
17
+
18
+ return DocumentProcessor
19
+
20
+ if name == "VectorStore":
21
+ from .retrieval.vector_store import VectorStore
22
+
23
+ return VectorStore
24
+
25
+ if name in {"LocalEmbedder", "OllamaEmbedder"}:
26
+ # Keep OllamaEmbedder alias for backward compatibility.
27
+ from .retrieval.embeddings import LocalEmbedder
28
+
29
+ return LocalEmbedder
30
+
31
+ raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
32
+
33
+
34
# Public API surface; every name here is materialized lazily by __getattr__.
__all__ = [
    "RAGrep",
    "RAGSystem",
    "DocumentProcessor",
    "VectorStore",
    "LocalEmbedder",
    "OllamaEmbedder",
]
ragrep/cli.py ADDED
@@ -0,0 +1,270 @@
1
+ #!/usr/bin/env python3
2
+ """Command-line interface for RAGrep."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import logging
9
+ import os
10
+ import sys
11
+ from typing import List
12
+
13
+ from .core.rag_system import RAGrep
14
+ from .retrieval.embeddings import get_runtime_device_info
15
+
16
+
17
def setup_logging(verbose: bool) -> None:
    """Configure root logging: DEBUG when *verbose*, WARNING otherwise."""
    chosen_level = logging.DEBUG if verbose else logging.WARNING
    logging.basicConfig(
        level=chosen_level,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )
22
+
23
+
24
+ def _build_common_parser(description: str) -> argparse.ArgumentParser:
25
+ parser = argparse.ArgumentParser(description=description)
26
+ parser.add_argument(
27
+ "--db-path",
28
+ default=os.getenv("RAGREP_DB_PATH", "./.ragrep.db"),
29
+ help="Path to local SQLite database",
30
+ )
31
+ parser.add_argument(
32
+ "--chunk-size",
33
+ type=int,
34
+ default=int(os.getenv("CHUNK_SIZE", "1000")),
35
+ help="Chunk size",
36
+ )
37
+ parser.add_argument(
38
+ "--chunk-overlap",
39
+ type=int,
40
+ default=int(os.getenv("CHUNK_OVERLAP", "200")),
41
+ help="Chunk overlap",
42
+ )
43
+ parser.add_argument(
44
+ "--model",
45
+ default=os.getenv("EMBEDDING_MODEL", "mxbai-embed-large"),
46
+ help="Embedding model name",
47
+ )
48
+ parser.add_argument(
49
+ "--model-dir",
50
+ default=os.getenv("RAGREP_MODEL_DIR"),
51
+ help="Optional directory for downloaded embedding models",
52
+ )
53
+ parser.add_argument(
54
+ "--device",
55
+ default=os.getenv("RAGREP_DEVICE", "auto"),
56
+ help="Embedding device: auto, cpu, cuda, mps, or explicit device (e.g. cuda:0)",
57
+ )
58
+ parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging")
59
+ return parser
60
+
61
+
62
def _build_recall_parser(prog: str = "ragrep") -> argparse.ArgumentParser:
    """Build the parser for the default ``recall`` command."""
    p = _build_common_parser(
        "Recall relevant chunks (auto-indexes when files changed)"
    )
    p.prog = prog
    p.add_argument("query", nargs="+", help="Semantic query")
    p.add_argument(
        "--path",
        default=None,
        help=(
            "Directory or file to index when needed "
            "(defaults to existing indexed root, else current dir)"
        ),
    )
    p.add_argument(
        "--limit", type=int, default=20, help="Maximum number of results"
    )
    p.add_argument(
        "--no-auto-index",
        action="store_true",
        help="Disable automatic index updates",
    )
    p.add_argument("--json", action="store_true", help="Output JSON")
    return p
75
+
76
+
77
def _build_index_parser() -> argparse.ArgumentParser:
    """Build the parser for the explicit ``index`` subcommand."""
    p = _build_common_parser("Index a directory or file")
    p.add_argument("path", nargs="?", default=".", help="Path to index")
    p.add_argument(
        "--force", action="store_true", help="Force a full re-index"
    )
    p.add_argument("--json", action="store_true", help="Output JSON")
    return p
83
+
84
+
85
def _build_stats_parser() -> argparse.ArgumentParser:
    """Build the parser for the ``stats`` subcommand."""
    p = _build_common_parser("Show index statistics")
    p.add_argument("--json", action="store_true", help="Output JSON")
    return p
89
+
90
+
91
+ def _build_gpu_parser() -> argparse.ArgumentParser:
92
+ parser = argparse.ArgumentParser(description="Show GPU/device support for embeddings")
93
+ parser.add_argument(
94
+ "--device",
95
+ default=os.getenv("RAGREP_DEVICE", "auto"),
96
+ help="Requested embedding device: auto, cpu, cuda, mps, cuda:0, etc.",
97
+ )
98
+ parser.add_argument("--json", action="store_true", help="Output JSON")
99
+ return parser
100
+
101
+
102
def _run_gpu_info(args: argparse.Namespace) -> int:
    """Report embedding device support as text or JSON; always returns 0."""
    info = get_runtime_device_info(args.device)

    if args.json:
        print(json.dumps(info, indent=2))
        return 0

    # Simple scalar fields share one label/key print loop.
    for label, key in (
        ("Requested", "requested_device"),
        ("Resolved", "resolved_device"),
        ("PyTorch available", "torch_available"),
        ("CUDA available", "cuda_available"),
        ("CUDA device count", "cuda_device_count"),
    ):
        print(f"{label}: {info[key]}")

    devices = info["cuda_devices"]
    if devices:
        print("CUDA devices:")
        for position, device_name in enumerate(devices):
            print(f"  {position}: {device_name}")

    print(f"MPS available: {info['mps_available']}")
    return 0
118
+
119
+
120
+ def _print_new_file_paths(index_result: dict) -> None:
121
+ new_files = index_result.get("new_files") or []
122
+ if not new_files:
123
+ return
124
+ print("New files indexed:")
125
+ for path in new_files:
126
+ print(path)
127
+
128
+
129
def _run_recall(args: argparse.Namespace) -> int:
    """Run a semantic recall query and print the matches.

    Always returns 0 as the process exit code.
    """
    setup_logging(args.verbose)
    query = " ".join(args.query).strip()

    # Open the RAG system only for the duration of the query.
    system_options = {
        "db_path": args.db_path,
        "chunk_size": args.chunk_size,
        "chunk_overlap": args.chunk_overlap,
        "embedding_model": args.model,
        "model_dir": args.model_dir,
        "embedding_device": args.device,
    }
    with RAGrep(**system_options) as rag:
        result = rag.recall(
            query,
            limit=args.limit,
            path=args.path,
            auto_index=not args.no_auto_index,
        )

    if args.json:
        print(json.dumps(result, indent=2))
        return 0

    auto_index = result.get("auto_index")
    if auto_index and auto_index.get("indexed"):
        _print_new_file_paths(auto_index)
        print(
            f"Indexed {auto_index['indexed_files']} changed files "
            f"({auto_index['chunks_indexed']} chunks updated, {auto_index['chunks']} total): "
            f"{auto_index['reason']}"
        )

    matches = result.get("matches", [])
    print(f"Results: {len(matches)}")
    rank = 0
    for match in matches:
        rank += 1
        source = match.get("metadata", {}).get("source", "unknown")
        print(f"{rank}. score={match['score']:.4f} source={source}")
        print(match.get("text", "").rstrip())

    return 0
169
+
170
+
171
def _run_index(args: argparse.Namespace) -> int:
    """Index the requested path and report what changed; returns 0."""
    setup_logging(args.verbose)

    system_options = {
        "db_path": args.db_path,
        "chunk_size": args.chunk_size,
        "chunk_overlap": args.chunk_overlap,
        "embedding_model": args.model,
        "model_dir": args.model_dir,
        "embedding_device": args.device,
    }
    with RAGrep(**system_options) as rag:
        result = rag.index(path=args.path, force=args.force)

    if args.json:
        print(json.dumps(result, indent=2))
        return 0

    # Guard clause: nothing changed, report why and stop.
    if not result["indexed"]:
        print(f"Index unchanged: {result['reason']}")
        return 0

    _print_new_file_paths(result)
    print(
        f"Indexed {result['indexed_files']} changed files "
        f"({result['chunks_indexed']} chunks updated, {result['chunks']} total)"
    )
    return 0
198
+
199
+
200
def _run_stats(args: argparse.Namespace) -> int:
    """Print index statistics in text or JSON form; returns 0."""
    setup_logging(args.verbose)

    system_options = {
        "db_path": args.db_path,
        "chunk_size": args.chunk_size,
        "chunk_overlap": args.chunk_overlap,
        "embedding_model": args.model,
        "model_dir": args.model_dir,
        "embedding_device": args.device,
    }
    with RAGrep(**system_options) as rag:
        result = rag.stats()

    if args.json:
        print(json.dumps(result, indent=2))
        return 0

    # Keys accessed with [] are required; .get() keys may be absent.
    print(f"Database: {result['persist_path']}")
    print(f"Indexed root: {result.get('indexed_root')}")
    print(f"Embedding model: {result.get('embedding_model')}")
    print(f"Files: {result['total_files']}")
    print(f"Chunks: {result['total_chunks']}")
    print(f"Indexed at: {result.get('indexed_at')}")
    return 0
224
+
225
+
226
def main(argv: List[str] | None = None) -> int:
    """CLI entry point: dispatch to recall / index / stats / GPU-info.

    With no arguments, prints the recall help text and exits 0. Any
    unrecognized leading token is treated as the start of a recall query.
    """
    args_list = sys.argv[1:] if argv is None else list(argv)

    if not args_list:
        _build_recall_parser().print_help()
        return 0

    try:
        first, rest = args_list[0], args_list[1:]

        if first in ("--check-gpu", "--gpu-info"):
            return _run_gpu_info(_build_gpu_parser().parse_args(rest))

        # "--stats"/"-s" and the "stats" subcommand share one handler.
        if first in ("--stats", "-s", "stats"):
            return _run_stats(_build_stats_parser().parse_args(rest))

        if first == "index":
            return _run_index(_build_index_parser().parse_args(rest))

        if first == "recall":
            return _run_recall(_build_recall_parser("ragrep recall").parse_args(rest))

        # Default command: the whole argument list is a recall invocation.
        return _run_recall(_build_recall_parser().parse_args(args_list))
    except Exception as exc:
        # Top-level CLI boundary: report and translate to a failure code.
        print(f"Error: {exc}", file=sys.stderr)
        return 1
267
+
268
+
269
# Support direct execution (e.g. `python -m ragrep.cli`); the return value
# of main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,8 @@
1
+ """Core RAGrep functionality."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .document_processor import DocumentProcessor
6
+ from .rag_system import RAGrep, RAGSystem
7
+
8
+ __all__ = ["RAGrep", "RAGSystem", "DocumentProcessor"]
@@ -0,0 +1,242 @@
1
+ """Document loading and chunking utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import fnmatch
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Iterable, List
8
+
9
+
10
# File suffixes indexed during directory scans; files with any other suffix
# are skipped. An explicitly named single file bypasses this filter
# (see DocumentProcessor.discover_path).
_DEFAULT_EXTENSIONS = {
    ".c",
    ".cc",
    ".cpp",
    ".css",
    ".go",
    ".h",
    ".hpp",
    ".html",
    ".java",
    ".js",
    ".json",
    ".md",
    ".py",
    ".rb",
    ".rs",
    ".sql",
    ".toml",
    ".ts",
    ".txt",
    ".xml",
    ".yaml",
    ".yml",
}

# Ignore patterns always applied on top of any .gitignore entries. Entries
# ending in "/" match a directory prefix; the rest are fnmatch globs
# (see DocumentProcessor._should_ignore).
_DEFAULT_IGNORE_PATTERNS = {
    ".git/",
    "__pycache__/",
    "*.pyc",
    "*.sqlite",
    "*.sqlite3",
    "*.db",
    ".ragrep.db",
    ".ragrep.db.legacy/",
    "venv/",
    ".venv/",
    "node_modules/",
    "dist/",
    "build/",
}
50
+
51
+
52
class DocumentProcessor:
    """Read text files from disk and split them into overlapping chunks.

    ``chunk_size`` is the maximum number of characters per chunk;
    ``chunk_overlap`` is the number of trailing characters repeated at the
    start of the next chunk.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200) -> None:
        """Validate and store the chunking parameters.

        Raises:
            ValueError: if ``chunk_size <= 0``, ``chunk_overlap < 0``, or
                ``chunk_overlap >= chunk_size``.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be > 0")
        if chunk_overlap < 0:
            raise ValueError("chunk_overlap must be >= 0")
        # Overlap strictly smaller than chunk size guarantees forward
        # progress in _chunk_text (start = end - overlap always advances).
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def process_path(
        self,
        path: str,
        *,
        extra_ignore_paths: Iterable[Path] | None = None,
    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], Path]:
        """Discover files under *path* and chunk them.

        Returns a ``(chunks, file_records, scan_root)`` tuple.
        """
        files, file_records, scan_root = self.discover_path(
            path,
            extra_ignore_paths=extra_ignore_paths,
        )
        chunks = self.process_files(files, scan_root)
        return chunks, file_records, scan_root

    def discover_path(
        self,
        path: str,
        *,
        extra_ignore_paths: Iterable[Path] | None = None,
    ) -> tuple[List[Path], List[Dict[str, Any]], Path]:
        """Resolve *path* and list the files to index.

        A single file is accepted as-is (its parent becomes the scan root);
        a directory is scanned recursively with ignore rules applied.

        Raises:
            ValueError: if *path* does not exist.
        """
        root = Path(path).resolve()
        if not root.exists():
            raise ValueError(f"Path does not exist: {path}")

        if root.is_file():
            # An explicitly named file skips the extension/ignore filters.
            files = [root]
            scan_root = root.parent
        else:
            scan_root = root
            files = self.scan_files(scan_root, extra_ignore_paths=extra_ignore_paths)

        file_records = self.collect_file_records(files, scan_root)
        return files, file_records, scan_root

    def process_files(self, files: Iterable[Path], scan_root: Path) -> List[Dict[str, Any]]:
        """Load every file and return the concatenated list of chunk dicts."""
        chunks: List[Dict[str, Any]] = []
        for file_path in files:
            # Chunk metadata stores paths relative to the scan root,
            # POSIX-style, so the index is portable across machines.
            relative_path = file_path.relative_to(scan_root).as_posix()
            text = self._load_text(file_path)
            chunks.extend(self._chunk_text(text, relative_path))
        return chunks

    def scan_files(
        self,
        root: Path,
        *,
        extra_ignore_paths: Iterable[Path] | None = None,
    ) -> List[Path]:
        """Recursively collect indexable files under *root*, sorted.

        Filters applied in order: extra ignore paths, the default
        extension allow-list, then default + .gitignore ignore patterns.
        """
        ignore_patterns = self._load_ignore_patterns(root)
        resolved_ignores = [path.expanduser().resolve() for path in (extra_ignore_paths or [])]
        files: List[Path] = []

        for file_path in root.rglob("*"):
            if not file_path.is_file():
                continue
            if self._matches_extra_ignore(file_path.resolve(), resolved_ignores):
                continue
            if file_path.suffix.lower() not in _DEFAULT_EXTENSIONS:
                continue

            relative = file_path.relative_to(root).as_posix()
            if self._should_ignore(relative, ignore_patterns):
                continue

            files.append(file_path)

        files.sort()
        return files

    def collect_file_records(self, files: Iterable[Path], root: Path) -> List[Dict[str, Any]]:
        """Return per-file metadata (path, size, mtime_ns) sorted by path."""
        records: List[Dict[str, Any]] = []

        for file_path in files:
            stat = file_path.stat()
            records.append(
                {
                    "path": file_path.relative_to(root).as_posix(),
                    "size": int(stat.st_size),
                    # Nanosecond mtime for precise change detection.
                    "mtime_ns": int(stat.st_mtime_ns),
                }
            )

        records.sort(key=lambda item: item["path"])
        return records

    def _chunk_text(self, text: str, relative_path: str) -> List[Dict[str, Any]]:
        """Split *text* into fixed-size character windows with overlap.

        Empty input yields no chunks. Each chunk dict carries an id, the
        source path, its ordinal index, character offsets, the text, and a
        metadata sub-dict repeating the positional fields.
        """
        chunks: List[Dict[str, Any]] = []
        start = 0

        while start < len(text):
            end = min(start + self.chunk_size, len(text))
            chunk_text = text[start:end]
            chunk_index = len(chunks)
            # Id encodes provenance: path, chunk ordinal, character span.
            chunk_id = f"{relative_path}:{chunk_index}:{start}:{end}"

            metadata = {
                "source": relative_path,
                "chunk_index": chunk_index,
                "start_char": start,
                "end_char": end,
            }
            chunks.append(
                {
                    "id": chunk_id,
                    "file_path": relative_path,
                    "chunk_index": chunk_index,
                    "start_char": start,
                    "end_char": end,
                    "text": chunk_text,
                    "metadata": metadata,
                }
            )

            if end >= len(text):
                break
            # Step back by the overlap; __init__ guarantees end - overlap
            # is still past the previous start.
            start = end - self.chunk_overlap

        return chunks

    @staticmethod
    def _load_text(file_path: Path) -> str:
        """Read a file as UTF-8, silently dropping undecodable bytes."""
        return file_path.read_text(encoding="utf-8", errors="ignore")

    @staticmethod
    def _load_ignore_patterns(root: Path) -> List[str]:
        """Merge default ignore patterns with .gitignore entries.

        Walks from *root* up to the filesystem root so parent-directory
        .gitignore files also contribute (their patterns are applied
        relative to *root*, not to the directory that declared them).
        Blank and comment lines are skipped.
        """
        patterns = set(_DEFAULT_IGNORE_PATTERNS)

        current = root
        while True:
            gitignore = current / ".gitignore"
            if gitignore.exists():
                for line in gitignore.read_text(encoding="utf-8", errors="ignore").splitlines():
                    stripped = line.strip()
                    if not stripped or stripped.startswith("#"):
                        continue
                    patterns.add(stripped)
            if current.parent == current:
                break
            current = current.parent

        return sorted(patterns)

    @staticmethod
    def _should_ignore(relative_path: str, patterns: Iterable[str]) -> bool:
        """Return True when *relative_path* matches any ignore pattern.

        Negation ("!") patterns are not supported and are skipped.
        Patterns ending in "/" match the directory itself or anything
        beneath it; all other patterns are fnmatch globs tested against
        both the full relative path and the bare file name.
        """
        path = relative_path

        for pattern in patterns:
            if pattern.startswith("!"):
                continue

            normalized = pattern.strip()
            if not normalized:
                continue

            if normalized.endswith("/"):
                directory = normalized.rstrip("/")
                if path == directory or path.startswith(directory + "/"):
                    return True
                continue

            if fnmatch.fnmatch(path, normalized):
                return True
            if fnmatch.fnmatch(Path(path).name, normalized):
                return True

        return False

    @staticmethod
    def _matches_extra_ignore(file_path: Path, ignored_paths: Iterable[Path]) -> bool:
        """Return True when *file_path* equals or lies under an ignored path."""
        for ignored in ignored_paths:
            if file_path == ignored:
                return True
            try:
                # relative_to succeeds only when file_path is inside ignored.
                file_path.relative_to(ignored)
                return True
            except ValueError:
                continue
        return False