ragrep-0.2.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragrep/__init__.py +41 -0
- ragrep/cli.py +270 -0
- ragrep/core/__init__.py +8 -0
- ragrep/core/document_processor.py +242 -0
- ragrep/core/rag_system.py +198 -0
- ragrep/retrieval/__init__.py +8 -0
- ragrep/retrieval/embeddings.py +174 -0
- ragrep/retrieval/vector_store.py +461 -0
- ragrep-0.2.1.dist-info/METADATA +161 -0
- ragrep-0.2.1.dist-info/RECORD +14 -0
- ragrep-0.2.1.dist-info/WHEEL +5 -0
- ragrep-0.2.1.dist-info/entry_points.txt +2 -0
- ragrep-0.2.1.dist-info/licenses/LICENSE +201 -0
- ragrep-0.2.1.dist-info/top_level.txt +1 -0
ragrep/__init__.py
ADDED
@@ -0,0 +1,41 @@
"""RAGrep public package API."""

from __future__ import annotations

__version__ = "0.2.1"
__author__ = "RAGrep Team"


def __getattr__(name: str):
    if name in {"RAGrep", "RAGSystem"}:
        from .core.rag_system import RAGrep, RAGSystem

        return RAGrep if name == "RAGrep" else RAGSystem

    if name == "DocumentProcessor":
        from .core.document_processor import DocumentProcessor

        return DocumentProcessor

    if name == "VectorStore":
        from .retrieval.vector_store import VectorStore

        return VectorStore

    if name in {"LocalEmbedder", "OllamaEmbedder"}:
        # Keep OllamaEmbedder alias for backward compatibility.
        from .retrieval.embeddings import LocalEmbedder

        return LocalEmbedder

    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")


__all__ = [
    "RAGrep",
    "RAGSystem",
    "DocumentProcessor",
    "VectorStore",
    "LocalEmbedder",
    "OllamaEmbedder",
]
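The `__getattr__` hook above keeps `import ragrep` cheap: the heavy submodules are only imported when one of the exported names is first touched. A minimal usage sketch (illustrative only; the keyword arguments mirror what `cli.py` passes to `RAGrep`, and the database path and query text here are made-up examples):

import ragrep

# First attribute access lazily imports ragrep.core.rag_system.
with ragrep.RAGrep(db_path="./.ragrep.db", embedding_model="mxbai-embed-large") as rag:
    result = rag.recall("where are embeddings computed", limit=5)
    for match in result.get("matches", []):
        print(match["score"], match["metadata"]["source"])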
ragrep/cli.py
ADDED
@@ -0,0 +1,270 @@
#!/usr/bin/env python3
"""Command-line interface for RAGrep."""

from __future__ import annotations

import argparse
import json
import logging
import os
import sys
from typing import List

from .core.rag_system import RAGrep
from .retrieval.embeddings import get_runtime_device_info


def setup_logging(verbose: bool) -> None:
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.WARNING,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )


def _build_common_parser(description: str) -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        "--db-path",
        default=os.getenv("RAGREP_DB_PATH", "./.ragrep.db"),
        help="Path to local SQLite database",
    )
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=int(os.getenv("CHUNK_SIZE", "1000")),
        help="Chunk size",
    )
    parser.add_argument(
        "--chunk-overlap",
        type=int,
        default=int(os.getenv("CHUNK_OVERLAP", "200")),
        help="Chunk overlap",
    )
    parser.add_argument(
        "--model",
        default=os.getenv("EMBEDDING_MODEL", "mxbai-embed-large"),
        help="Embedding model name",
    )
    parser.add_argument(
        "--model-dir",
        default=os.getenv("RAGREP_MODEL_DIR"),
        help="Optional directory for downloaded embedding models",
    )
    parser.add_argument(
        "--device",
        default=os.getenv("RAGREP_DEVICE", "auto"),
        help="Embedding device: auto, cpu, cuda, mps, or explicit device (e.g. cuda:0)",
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging")
    return parser


def _build_recall_parser(prog: str = "ragrep") -> argparse.ArgumentParser:
    parser = _build_common_parser("Recall relevant chunks (auto-indexes when files changed)")
    parser.prog = prog
    parser.add_argument("query", nargs="+", help="Semantic query")
    parser.add_argument(
        "--path",
        default=None,
        help="Directory or file to index when needed (defaults to existing indexed root, else current dir)",
    )
    parser.add_argument("--limit", type=int, default=20, help="Maximum number of results")
    parser.add_argument("--no-auto-index", action="store_true", help="Disable automatic index updates")
    parser.add_argument("--json", action="store_true", help="Output JSON")
    return parser


def _build_index_parser() -> argparse.ArgumentParser:
    parser = _build_common_parser("Index a directory or file")
    parser.add_argument("path", nargs="?", default=".", help="Path to index")
    parser.add_argument("--force", action="store_true", help="Force a full re-index")
    parser.add_argument("--json", action="store_true", help="Output JSON")
    return parser


def _build_stats_parser() -> argparse.ArgumentParser:
    parser = _build_common_parser("Show index statistics")
    parser.add_argument("--json", action="store_true", help="Output JSON")
    return parser


def _build_gpu_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Show GPU/device support for embeddings")
    parser.add_argument(
        "--device",
        default=os.getenv("RAGREP_DEVICE", "auto"),
        help="Requested embedding device: auto, cpu, cuda, mps, cuda:0, etc.",
    )
    parser.add_argument("--json", action="store_true", help="Output JSON")
    return parser


def _run_gpu_info(args: argparse.Namespace) -> int:
    info = get_runtime_device_info(args.device)
    if args.json:
        print(json.dumps(info, indent=2))
    else:
        print(f"Requested: {info['requested_device']}")
        print(f"Resolved: {info['resolved_device']}")
        print(f"PyTorch available: {info['torch_available']}")
        print(f"CUDA available: {info['cuda_available']}")
        print(f"CUDA device count: {info['cuda_device_count']}")
        if info["cuda_devices"]:
            print("CUDA devices:")
            for index, name in enumerate(info["cuda_devices"]):
                print(f"  {index}: {name}")
        print(f"MPS available: {info['mps_available']}")
    return 0


def _print_new_file_paths(index_result: dict) -> None:
    new_files = index_result.get("new_files") or []
    if not new_files:
        return
    print("New files indexed:")
    for path in new_files:
        print(path)


def _run_recall(args: argparse.Namespace) -> int:
    setup_logging(args.verbose)
    query = " ".join(args.query).strip()

    with RAGrep(
        db_path=args.db_path,
        chunk_size=args.chunk_size,
        chunk_overlap=args.chunk_overlap,
        embedding_model=args.model,
        model_dir=args.model_dir,
        embedding_device=args.device,
    ) as rag:
        result = rag.recall(
            query,
            limit=args.limit,
            path=args.path,
            auto_index=not args.no_auto_index,
        )

    if args.json:
        print(json.dumps(result, indent=2))
        return 0

    index_info = result.get("auto_index")
    if index_info and index_info.get("indexed"):
        _print_new_file_paths(index_info)
        print(
            f"Indexed {index_info['indexed_files']} changed files "
            f"({index_info['chunks_indexed']} chunks updated, {index_info['chunks']} total): "
            f"{index_info['reason']}"
        )

    matches = result.get("matches", [])
    print(f"Results: {len(matches)}")
    for position, match in enumerate(matches, start=1):
        source = match.get("metadata", {}).get("source", "unknown")
        print(f"{position}. score={match['score']:.4f} source={source}")
        print(match.get("text", "").rstrip())

    return 0


def _run_index(args: argparse.Namespace) -> int:
    setup_logging(args.verbose)

    with RAGrep(
        db_path=args.db_path,
        chunk_size=args.chunk_size,
        chunk_overlap=args.chunk_overlap,
        embedding_model=args.model,
        model_dir=args.model_dir,
        embedding_device=args.device,
    ) as rag:
        result = rag.index(path=args.path, force=args.force)

    if args.json:
        print(json.dumps(result, indent=2))
        return 0

    if result["indexed"]:
        _print_new_file_paths(result)
        print(
            f"Indexed {result['indexed_files']} changed files "
            f"({result['chunks_indexed']} chunks updated, {result['chunks']} total)"
        )
    else:
        print(f"Index unchanged: {result['reason']}")

    return 0


def _run_stats(args: argparse.Namespace) -> int:
    setup_logging(args.verbose)

    with RAGrep(
        db_path=args.db_path,
        chunk_size=args.chunk_size,
        chunk_overlap=args.chunk_overlap,
        embedding_model=args.model,
        model_dir=args.model_dir,
        embedding_device=args.device,
    ) as rag:
        result = rag.stats()

    if args.json:
        print(json.dumps(result, indent=2))
    else:
        print(f"Database: {result['persist_path']}")
        print(f"Indexed root: {result.get('indexed_root')}")
        print(f"Embedding model: {result.get('embedding_model')}")
        print(f"Files: {result['total_files']}")
        print(f"Chunks: {result['total_chunks']}")
        print(f"Indexed at: {result.get('indexed_at')}")

    return 0


def main(argv: List[str] | None = None) -> int:
    args_list = list(argv) if argv is not None else sys.argv[1:]

    if not args_list:
        parser = _build_recall_parser()
        parser.print_help()
        return 0

    try:
        first = args_list[0]
        if first in {"--check-gpu", "--gpu-info"}:
            parser = _build_gpu_parser()
            args = parser.parse_args(args_list[1:])
            return _run_gpu_info(args)

        if first in {"--stats", "-s"}:
            parser = _build_stats_parser()
            args = parser.parse_args(args_list[1:])
            return _run_stats(args)

        if first == "index":
            parser = _build_index_parser()
            args = parser.parse_args(args_list[1:])
            return _run_index(args)

        if first == "stats":
            parser = _build_stats_parser()
            args = parser.parse_args(args_list[1:])
            return _run_stats(args)

        if first == "recall":
            parser = _build_recall_parser("ragrep recall")
            args = parser.parse_args(args_list[1:])
            return _run_recall(args)

        parser = _build_recall_parser()
        args = parser.parse_args(args_list)
        return _run_recall(args)
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1


if __name__ == "__main__":
    raise SystemExit(main())
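For reference, `main()` dispatches on the first token before argparse runs, so the same entry point covers the default recall flow and the explicit subcommand-style spellings. A few illustrative invocations through the Python API (paths and queries are made up; the equivalent shell commands would go through the console script registered in entry_points.txt):

from ragrep.cli import main

main(["where is the retry logic", "--limit", "5"])  # bare arguments fall through to recall
main(["index", "./src", "--force"])                 # force a full re-index of ./src
main(["--stats", "--json"])                         # print index statistics as JSON
main(["--check-gpu"])                               # report embedding device support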
ragrep/core/document_processor.py
ADDED
@@ -0,0 +1,242 @@
"""Document loading and chunking utilities."""

from __future__ import annotations

import fnmatch
from pathlib import Path
from typing import Any, Dict, Iterable, List


_DEFAULT_EXTENSIONS = {
    ".c",
    ".cc",
    ".cpp",
    ".css",
    ".go",
    ".h",
    ".hpp",
    ".html",
    ".java",
    ".js",
    ".json",
    ".md",
    ".py",
    ".rb",
    ".rs",
    ".sql",
    ".toml",
    ".ts",
    ".txt",
    ".xml",
    ".yaml",
    ".yml",
}

_DEFAULT_IGNORE_PATTERNS = {
    ".git/",
    "__pycache__/",
    "*.pyc",
    "*.sqlite",
    "*.sqlite3",
    "*.db",
    ".ragrep.db",
    ".ragrep.db.legacy/",
    "venv/",
    ".venv/",
    "node_modules/",
    "dist/",
    "build/",
}


class DocumentProcessor:
    """Read text files from disk and split them into overlapping chunks."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200) -> None:
        if chunk_size <= 0:
            raise ValueError("chunk_size must be > 0")
        if chunk_overlap < 0:
            raise ValueError("chunk_overlap must be >= 0")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def process_path(
        self,
        path: str,
        *,
        extra_ignore_paths: Iterable[Path] | None = None,
    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], Path]:
        files, file_records, scan_root = self.discover_path(
            path,
            extra_ignore_paths=extra_ignore_paths,
        )
        chunks = self.process_files(files, scan_root)
        return chunks, file_records, scan_root

    def discover_path(
        self,
        path: str,
        *,
        extra_ignore_paths: Iterable[Path] | None = None,
    ) -> tuple[List[Path], List[Dict[str, Any]], Path]:
        root = Path(path).resolve()
        if not root.exists():
            raise ValueError(f"Path does not exist: {path}")

        if root.is_file():
            files = [root]
            scan_root = root.parent
        else:
            scan_root = root
            files = self.scan_files(scan_root, extra_ignore_paths=extra_ignore_paths)

        file_records = self.collect_file_records(files, scan_root)
        return files, file_records, scan_root

    def process_files(self, files: Iterable[Path], scan_root: Path) -> List[Dict[str, Any]]:
        chunks: List[Dict[str, Any]] = []
        for file_path in files:
            relative_path = file_path.relative_to(scan_root).as_posix()
            text = self._load_text(file_path)
            chunks.extend(self._chunk_text(text, relative_path))
        return chunks

    def scan_files(
        self,
        root: Path,
        *,
        extra_ignore_paths: Iterable[Path] | None = None,
    ) -> List[Path]:
        ignore_patterns = self._load_ignore_patterns(root)
        resolved_ignores = [path.expanduser().resolve() for path in (extra_ignore_paths or [])]
        files: List[Path] = []

        for file_path in root.rglob("*"):
            if not file_path.is_file():
                continue
            if self._matches_extra_ignore(file_path.resolve(), resolved_ignores):
                continue
            if file_path.suffix.lower() not in _DEFAULT_EXTENSIONS:
                continue

            relative = file_path.relative_to(root).as_posix()
            if self._should_ignore(relative, ignore_patterns):
                continue

            files.append(file_path)

        files.sort()
        return files

    def collect_file_records(self, files: Iterable[Path], root: Path) -> List[Dict[str, Any]]:
        records: List[Dict[str, Any]] = []

        for file_path in files:
            stat = file_path.stat()
            records.append(
                {
                    "path": file_path.relative_to(root).as_posix(),
                    "size": int(stat.st_size),
                    "mtime_ns": int(stat.st_mtime_ns),
                }
            )

        records.sort(key=lambda item: item["path"])
        return records

    def _chunk_text(self, text: str, relative_path: str) -> List[Dict[str, Any]]:
        chunks: List[Dict[str, Any]] = []
        start = 0

        while start < len(text):
            end = min(start + self.chunk_size, len(text))
            chunk_text = text[start:end]
            chunk_index = len(chunks)
            chunk_id = f"{relative_path}:{chunk_index}:{start}:{end}"

            metadata = {
                "source": relative_path,
                "chunk_index": chunk_index,
                "start_char": start,
                "end_char": end,
            }
            chunks.append(
                {
                    "id": chunk_id,
                    "file_path": relative_path,
                    "chunk_index": chunk_index,
                    "start_char": start,
                    "end_char": end,
                    "text": chunk_text,
                    "metadata": metadata,
                }
            )

            if end >= len(text):
                break
            start = end - self.chunk_overlap

        return chunks

    @staticmethod
    def _load_text(file_path: Path) -> str:
        return file_path.read_text(encoding="utf-8", errors="ignore")

    @staticmethod
    def _load_ignore_patterns(root: Path) -> List[str]:
        patterns = set(_DEFAULT_IGNORE_PATTERNS)

        current = root
        while True:
            gitignore = current / ".gitignore"
            if gitignore.exists():
                for line in gitignore.read_text(encoding="utf-8", errors="ignore").splitlines():
                    stripped = line.strip()
                    if not stripped or stripped.startswith("#"):
                        continue
                    patterns.add(stripped)
            if current.parent == current:
                break
            current = current.parent

        return sorted(patterns)

    @staticmethod
    def _should_ignore(relative_path: str, patterns: Iterable[str]) -> bool:
        path = relative_path

        for pattern in patterns:
            if pattern.startswith("!"):
                continue

            normalized = pattern.strip()
            if not normalized:
                continue

            if normalized.endswith("/"):
                directory = normalized.rstrip("/")
                if path == directory or path.startswith(directory + "/"):
                    return True
                continue

            if fnmatch.fnmatch(path, normalized):
                return True
            if fnmatch.fnmatch(Path(path).name, normalized):
                return True

        return False

    @staticmethod
    def _matches_extra_ignore(file_path: Path, ignored_paths: Iterable[Path]) -> bool:
        for ignored in ignored_paths:
            if file_path == ignored:
                return True
            try:
                file_path.relative_to(ignored)
                return True
            except ValueError:
                continue
        return False
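A short sketch of the chunking behaviour above (illustrative only; run from a directory containing at least one file with a supported extension). Consecutive chunks overlap by `chunk_overlap` characters, and each chunk id encodes the source path and character span:

from ragrep.core.document_processor import DocumentProcessor

processor = DocumentProcessor(chunk_size=1000, chunk_overlap=200)
chunks, file_records, scan_root = processor.process_path(".")

print(f"{len(file_records)} files, {len(chunks)} chunks under {scan_root}")
if chunks:
    # id format: "<relative path>:<chunk_index>:<start_char>:<end_char>"
    print(chunks[0]["id"])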