ragrep 0.2.1__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ragrep-0.2.1/src/ragrep.egg-info → ragrep-0.2.2}/PKG-INFO +1 -1
- {ragrep-0.2.1 → ragrep-0.2.2}/pyproject.toml +1 -1
- {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep/__init__.py +1 -1
- {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep/cli.py +35 -20
- {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep/core/document_processor.py +13 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep/retrieval/embeddings.py +60 -8
- {ragrep-0.2.1 → ragrep-0.2.2/src/ragrep.egg-info}/PKG-INFO +1 -1
- ragrep-0.2.2/tests/test_cli.py +140 -0
- ragrep-0.2.2/tests/test_embeddings.py +179 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/tests/test_ragrep.py +54 -0
- ragrep-0.2.1/tests/test_cli.py +0 -76
- ragrep-0.2.1/tests/test_embeddings.py +0 -80
- {ragrep-0.2.1 → ragrep-0.2.2}/LICENSE +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/MANIFEST.in +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/README.md +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/docs/README.md +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/env.example +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/examples/basic_usage.py +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/requirements.txt +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/setup.cfg +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/setup.py +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep/core/__init__.py +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep/core/rag_system.py +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep/retrieval/__init__.py +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep/retrieval/vector_store.py +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep.egg-info/SOURCES.txt +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep.egg-info/dependency_links.txt +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep.egg-info/entry_points.txt +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep.egg-info/requires.txt +0 -0
- {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep.egg-info/top_level.txt +0 -0
|
@@ -117,15 +117,42 @@ def _run_gpu_info(args: argparse.Namespace) -> int:
|
|
|
117
117
|
return 0
|
|
118
118
|
|
|
119
119
|
|
|
120
|
-
def
|
|
121
|
-
|
|
122
|
-
if not new_files:
|
|
120
|
+
def _print_file_paths(title: str, paths: list[str]) -> None:
|
|
121
|
+
if not paths:
|
|
123
122
|
return
|
|
124
|
-
print("
|
|
125
|
-
for path in
|
|
123
|
+
print(f"{title}:")
|
|
124
|
+
for path in paths:
|
|
126
125
|
print(path)
|
|
127
126
|
|
|
128
127
|
|
|
128
|
+
def _print_index_status(index_result: dict) -> None:
|
|
129
|
+
root = index_result.get("root") or index_result.get("indexed_root") or "."
|
|
130
|
+
new_files = index_result.get("new_files") or []
|
|
131
|
+
updated_files = index_result.get("updated_files") or []
|
|
132
|
+
removed_files = index_result.get("removed_files") or []
|
|
133
|
+
|
|
134
|
+
if index_result.get("indexed"):
|
|
135
|
+
print(
|
|
136
|
+
f"Index updated for {root}: "
|
|
137
|
+
f"{len(new_files)} added, {len(updated_files)} modified, {len(removed_files)} removed."
|
|
138
|
+
)
|
|
139
|
+
_print_file_paths("Added files", new_files)
|
|
140
|
+
_print_file_paths("Modified files", updated_files)
|
|
141
|
+
_print_file_paths("Removed files", removed_files)
|
|
142
|
+
print(
|
|
143
|
+
f"Indexed {index_result['indexed_files']} changed files "
|
|
144
|
+
f"({index_result['chunks_indexed']} chunks updated, {index_result['chunks']} total): "
|
|
145
|
+
f"{index_result['reason']}"
|
|
146
|
+
)
|
|
147
|
+
return
|
|
148
|
+
|
|
149
|
+
print(
|
|
150
|
+
f"Index is already up to date for {root} "
|
|
151
|
+
f"({index_result['files']} files, {index_result['chunks']} chunks): "
|
|
152
|
+
f"{index_result['reason']}"
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
|
|
129
156
|
def _run_recall(args: argparse.Namespace) -> int:
|
|
130
157
|
setup_logging(args.verbose)
|
|
131
158
|
query = " ".join(args.query).strip()
|
|
@@ -150,13 +177,8 @@ def _run_recall(args: argparse.Namespace) -> int:
|
|
|
150
177
|
return 0
|
|
151
178
|
|
|
152
179
|
index_info = result.get("auto_index")
|
|
153
|
-
if index_info
|
|
154
|
-
|
|
155
|
-
print(
|
|
156
|
-
f"Indexed {index_info['indexed_files']} changed files "
|
|
157
|
-
f"({index_info['chunks_indexed']} chunks updated, {index_info['chunks']} total): "
|
|
158
|
-
f"{index_info['reason']}"
|
|
159
|
-
)
|
|
180
|
+
if index_info:
|
|
181
|
+
_print_index_status(index_info)
|
|
160
182
|
|
|
161
183
|
matches = result.get("matches", [])
|
|
162
184
|
print(f"Results: {len(matches)}")
|
|
@@ -185,14 +207,7 @@ def _run_index(args: argparse.Namespace) -> int:
|
|
|
185
207
|
print(json.dumps(result, indent=2))
|
|
186
208
|
return 0
|
|
187
209
|
|
|
188
|
-
|
|
189
|
-
_print_new_file_paths(result)
|
|
190
|
-
print(
|
|
191
|
-
f"Indexed {result['indexed_files']} changed files "
|
|
192
|
-
f"({result['chunks_indexed']} chunks updated, {result['chunks']} total)"
|
|
193
|
-
)
|
|
194
|
-
else:
|
|
195
|
-
print(f"Index unchanged: {result['reason']}")
|
|
210
|
+
_print_index_status(result)
|
|
196
211
|
|
|
197
212
|
return 0
|
|
198
213
|
|
|
@@ -8,26 +8,39 @@ from typing import Any, Dict, Iterable, List
|
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
_DEFAULT_EXTENSIONS = {
|
|
11
|
+
".avsc",
|
|
11
12
|
".c",
|
|
12
13
|
".cc",
|
|
13
14
|
".cpp",
|
|
14
15
|
".css",
|
|
16
|
+
".gql",
|
|
15
17
|
".go",
|
|
18
|
+
".graphql",
|
|
19
|
+
".graphqls",
|
|
16
20
|
".h",
|
|
17
21
|
".hpp",
|
|
18
22
|
".html",
|
|
19
23
|
".java",
|
|
20
24
|
".js",
|
|
21
25
|
".json",
|
|
26
|
+
".markdown",
|
|
22
27
|
".md",
|
|
28
|
+
".mdown",
|
|
29
|
+
".mdx",
|
|
30
|
+
".mkd",
|
|
31
|
+
".mkdn",
|
|
23
32
|
".py",
|
|
24
33
|
".rb",
|
|
34
|
+
".raml",
|
|
25
35
|
".rs",
|
|
26
36
|
".sql",
|
|
27
37
|
".toml",
|
|
28
38
|
".ts",
|
|
29
39
|
".txt",
|
|
40
|
+
".proto",
|
|
41
|
+
".wsdl",
|
|
30
42
|
".xml",
|
|
43
|
+
".xsd",
|
|
31
44
|
".yaml",
|
|
32
45
|
".yml",
|
|
33
46
|
}
|
|
@@ -115,6 +115,29 @@ def default_model_dir() -> Path:
|
|
|
115
115
|
return base / "models"
|
|
116
116
|
|
|
117
117
|
|
|
118
|
+
def _has_local_model_snapshot(model: str, model_dir: Path) -> bool:
|
|
119
|
+
"""Return whether the requested model already exists in the local cache."""
|
|
120
|
+
model_path = Path(model).expanduser()
|
|
121
|
+
if model_path.exists():
|
|
122
|
+
return True
|
|
123
|
+
|
|
124
|
+
try:
|
|
125
|
+
from huggingface_hub import _CACHED_NO_EXIST, try_to_load_from_cache
|
|
126
|
+
except Exception:
|
|
127
|
+
return False
|
|
128
|
+
|
|
129
|
+
for filename in ("modules.json", "config.json", "tokenizer_config.json"):
|
|
130
|
+
cached = try_to_load_from_cache(
|
|
131
|
+
repo_id=model,
|
|
132
|
+
filename=filename,
|
|
133
|
+
cache_dir=str(model_dir),
|
|
134
|
+
)
|
|
135
|
+
if cached is not None and cached != _CACHED_NO_EXIST:
|
|
136
|
+
return True
|
|
137
|
+
|
|
138
|
+
return False
|
|
139
|
+
|
|
140
|
+
|
|
118
141
|
class LocalEmbedder:
|
|
119
142
|
"""Generate embeddings in-process using sentence-transformers."""
|
|
120
143
|
|
|
@@ -139,17 +162,46 @@ class LocalEmbedder:
|
|
|
139
162
|
"Install with: pip install sentence-transformers"
|
|
140
163
|
) from exc
|
|
141
164
|
|
|
165
|
+
prefer_local_only = _has_local_model_snapshot(self.resolved_model, self.model_dir)
|
|
166
|
+
|
|
142
167
|
try:
|
|
143
|
-
self._model =
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
device=self.device,
|
|
168
|
+
self._model = self._load_model(
|
|
169
|
+
SentenceTransformer,
|
|
170
|
+
local_files_only=prefer_local_only,
|
|
147
171
|
)
|
|
148
172
|
except Exception as exc: # pragma: no cover - model download/load depends on environment
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
173
|
+
if prefer_local_only:
|
|
174
|
+
try:
|
|
175
|
+
self._model = self._load_model(
|
|
176
|
+
SentenceTransformer,
|
|
177
|
+
local_files_only=False,
|
|
178
|
+
)
|
|
179
|
+
except Exception as retry_exc:
|
|
180
|
+
raise EmbeddingError(
|
|
181
|
+
f"Failed to load embedding model '{self.resolved_model}'. "
|
|
182
|
+
f"Model directory: {self.model_dir}."
|
|
183
|
+
) from retry_exc
|
|
184
|
+
else:
|
|
185
|
+
raise EmbeddingError(
|
|
186
|
+
f"Failed to load embedding model '{self.resolved_model}'. "
|
|
187
|
+
f"Model directory: {self.model_dir}."
|
|
188
|
+
) from exc
|
|
189
|
+
|
|
190
|
+
def _load_model(self, sentence_transformer_cls: Any, *, local_files_only: bool) -> Any:
|
|
191
|
+
kwargs: Dict[str, Any] = {
|
|
192
|
+
"cache_folder": str(self.model_dir),
|
|
193
|
+
"device": self.device,
|
|
194
|
+
"local_files_only": local_files_only,
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
try:
|
|
198
|
+
return sentence_transformer_cls(self.resolved_model, **kwargs)
|
|
199
|
+
except TypeError as exc:
|
|
200
|
+
if "local_files_only" not in str(exc):
|
|
201
|
+
raise
|
|
202
|
+
|
|
203
|
+
kwargs.pop("local_files_only", None)
|
|
204
|
+
return sentence_transformer_cls(self.resolved_model, **kwargs)
|
|
153
205
|
|
|
154
206
|
def embed_texts(self, texts: Iterable[str], batch_size: int = 32) -> List[List[float]]:
|
|
155
207
|
items = list(texts)
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import tempfile
|
|
5
|
+
import unittest
|
|
6
|
+
from contextlib import redirect_stdout
|
|
7
|
+
from io import StringIO
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from unittest.mock import patch
|
|
10
|
+
|
|
11
|
+
from ragrep.cli import main
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class CLITests(unittest.TestCase):
|
|
15
|
+
def test_stats_flag_alias(self):
|
|
16
|
+
with tempfile.TemporaryDirectory() as temp_dir:
|
|
17
|
+
db_path = Path(temp_dir) / ".ragrep.db"
|
|
18
|
+
|
|
19
|
+
output = StringIO()
|
|
20
|
+
with redirect_stdout(output):
|
|
21
|
+
exit_code = main(["--stats", "--json", "--db-path", str(db_path)])
|
|
22
|
+
|
|
23
|
+
self.assertEqual(exit_code, 0)
|
|
24
|
+
payload = json.loads(output.getvalue())
|
|
25
|
+
self.assertEqual(payload["backend"], "sqlite")
|
|
26
|
+
self.assertEqual(payload["total_chunks"], 0)
|
|
27
|
+
|
|
28
|
+
def test_check_gpu_flag_alias(self):
|
|
29
|
+
output = StringIO()
|
|
30
|
+
with redirect_stdout(output):
|
|
31
|
+
exit_code = main(["--check-gpu", "--json"])
|
|
32
|
+
|
|
33
|
+
self.assertEqual(exit_code, 0)
|
|
34
|
+
payload = json.loads(output.getvalue())
|
|
35
|
+
self.assertIn("resolved_device", payload)
|
|
36
|
+
self.assertIn("torch_available", payload)
|
|
37
|
+
|
|
38
|
+
def test_index_prints_added_modified_and_removed_file_paths(self):
|
|
39
|
+
class DummyRAG:
|
|
40
|
+
def __init__(self, *args, **kwargs):
|
|
41
|
+
pass
|
|
42
|
+
|
|
43
|
+
def __enter__(self):
|
|
44
|
+
return self
|
|
45
|
+
|
|
46
|
+
def __exit__(self, exc_type, exc, tb):
|
|
47
|
+
return None
|
|
48
|
+
|
|
49
|
+
def index(self, path=".", force=False):
|
|
50
|
+
return {
|
|
51
|
+
"indexed": True,
|
|
52
|
+
"reason": "new files detected, updated files detected, files removed",
|
|
53
|
+
"root": "/tmp/work",
|
|
54
|
+
"files": 3,
|
|
55
|
+
"chunks": 10,
|
|
56
|
+
"chunks_indexed": 4,
|
|
57
|
+
"indexed_files": 2,
|
|
58
|
+
"new_files": ["src/new_file.py"],
|
|
59
|
+
"updated_files": ["src/changed_file.py"],
|
|
60
|
+
"removed_files": ["src/removed_file.py"],
|
|
61
|
+
"full_rebuild": False,
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
output = StringIO()
|
|
65
|
+
with patch("ragrep.cli.RAGrep", DummyRAG):
|
|
66
|
+
with redirect_stdout(output):
|
|
67
|
+
exit_code = main(["index", "."])
|
|
68
|
+
|
|
69
|
+
self.assertEqual(exit_code, 0)
|
|
70
|
+
text = output.getvalue()
|
|
71
|
+
self.assertIn("Index updated for /tmp/work: 1 added, 1 modified, 1 removed.", text)
|
|
72
|
+
self.assertIn("Added files:", text)
|
|
73
|
+
self.assertIn("src/new_file.py", text)
|
|
74
|
+
self.assertIn("Modified files:", text)
|
|
75
|
+
self.assertIn("src/changed_file.py", text)
|
|
76
|
+
self.assertIn("Removed files:", text)
|
|
77
|
+
self.assertIn("src/removed_file.py", text)
|
|
78
|
+
self.assertIn(
|
|
79
|
+
"Indexed 2 changed files (4 chunks updated, 10 total): "
|
|
80
|
+
"new files detected, updated files detected, files removed",
|
|
81
|
+
text,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
def test_recall_prints_up_to_date_status_before_results(self):
|
|
85
|
+
class DummyRAG:
|
|
86
|
+
def __init__(self, *args, **kwargs):
|
|
87
|
+
pass
|
|
88
|
+
|
|
89
|
+
def __enter__(self):
|
|
90
|
+
return self
|
|
91
|
+
|
|
92
|
+
def __exit__(self, exc_type, exc, tb):
|
|
93
|
+
return None
|
|
94
|
+
|
|
95
|
+
def recall(self, query, limit=20, path=None, auto_index=True):
|
|
96
|
+
return {
|
|
97
|
+
"query": query,
|
|
98
|
+
"count": 1,
|
|
99
|
+
"matches": [
|
|
100
|
+
{
|
|
101
|
+
"score": 0.9,
|
|
102
|
+
"text": "# Schema",
|
|
103
|
+
"metadata": {"source": "docs/schema.md"},
|
|
104
|
+
}
|
|
105
|
+
],
|
|
106
|
+
"auto_index": {
|
|
107
|
+
"indexed": False,
|
|
108
|
+
"reason": "index is current",
|
|
109
|
+
"root": "/tmp/work",
|
|
110
|
+
"files": 2,
|
|
111
|
+
"chunks": 8,
|
|
112
|
+
"chunks_indexed": 0,
|
|
113
|
+
"indexed_files": 0,
|
|
114
|
+
"new_files": [],
|
|
115
|
+
"updated_files": [],
|
|
116
|
+
"removed_files": [],
|
|
117
|
+
"full_rebuild": False,
|
|
118
|
+
},
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
output = StringIO()
|
|
122
|
+
with patch("ragrep.cli.RAGrep", DummyRAG):
|
|
123
|
+
with redirect_stdout(output):
|
|
124
|
+
exit_code = main(["schema"])
|
|
125
|
+
|
|
126
|
+
self.assertEqual(exit_code, 0)
|
|
127
|
+
text = output.getvalue()
|
|
128
|
+
self.assertIn(
|
|
129
|
+
"Index is already up to date for /tmp/work (2 files, 8 chunks): index is current",
|
|
130
|
+
text,
|
|
131
|
+
)
|
|
132
|
+
self.assertIn("Results: 1", text)
|
|
133
|
+
self.assertLess(
|
|
134
|
+
text.index("Index is already up to date for /tmp/work"),
|
|
135
|
+
text.index("Results: 1"),
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
if __name__ == "__main__":
|
|
140
|
+
unittest.main()
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
import tempfile
|
|
6
|
+
import unittest
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from types import SimpleNamespace
|
|
9
|
+
from types import ModuleType
|
|
10
|
+
from unittest.mock import patch
|
|
11
|
+
|
|
12
|
+
from ragrep.retrieval.embeddings import (
|
|
13
|
+
LocalEmbedder,
|
|
14
|
+
default_model_dir,
|
|
15
|
+
get_runtime_device_info,
|
|
16
|
+
resolve_embedding_model,
|
|
17
|
+
resolve_runtime_device,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class EmbeddingConfigTests(unittest.TestCase):
|
|
22
|
+
def test_model_alias_resolution(self):
|
|
23
|
+
self.assertEqual(
|
|
24
|
+
resolve_embedding_model("mxbai-embed-large"),
|
|
25
|
+
"mixedbread-ai/mxbai-embed-large-v1",
|
|
26
|
+
)
|
|
27
|
+
self.assertEqual(resolve_embedding_model("custom/model"), "custom/model")
|
|
28
|
+
|
|
29
|
+
def test_model_dir_env_override(self):
|
|
30
|
+
with tempfile.TemporaryDirectory() as temp_dir:
|
|
31
|
+
with patch.dict(os.environ, {"RAGREP_MODEL_DIR": temp_dir}, clear=False):
|
|
32
|
+
self.assertEqual(default_model_dir(), Path(temp_dir).resolve())
|
|
33
|
+
|
|
34
|
+
def test_device_auto_without_torch(self):
|
|
35
|
+
with patch.dict(sys.modules, {"torch": None}):
|
|
36
|
+
self.assertEqual(resolve_runtime_device("auto"), "cpu")
|
|
37
|
+
|
|
38
|
+
def test_device_auto_prefers_cuda(self):
|
|
39
|
+
fake_torch = SimpleNamespace(
|
|
40
|
+
cuda=SimpleNamespace(is_available=lambda: True),
|
|
41
|
+
backends=SimpleNamespace(mps=SimpleNamespace(is_available=lambda: False)),
|
|
42
|
+
)
|
|
43
|
+
with patch.dict(sys.modules, {"torch": fake_torch}):
|
|
44
|
+
self.assertEqual(resolve_runtime_device("auto"), "cuda")
|
|
45
|
+
|
|
46
|
+
def test_device_auto_uses_mps_when_cuda_missing(self):
|
|
47
|
+
fake_torch = SimpleNamespace(
|
|
48
|
+
cuda=SimpleNamespace(is_available=lambda: False),
|
|
49
|
+
backends=SimpleNamespace(mps=SimpleNamespace(is_available=lambda: True)),
|
|
50
|
+
)
|
|
51
|
+
with patch.dict(sys.modules, {"torch": fake_torch}):
|
|
52
|
+
self.assertEqual(resolve_runtime_device("auto"), "mps")
|
|
53
|
+
|
|
54
|
+
def test_explicit_device_is_respected(self):
|
|
55
|
+
self.assertEqual(resolve_runtime_device("cpu"), "cpu")
|
|
56
|
+
self.assertEqual(resolve_runtime_device("cuda:0"), "cuda:0")
|
|
57
|
+
|
|
58
|
+
def test_runtime_device_info_without_torch(self):
|
|
59
|
+
with patch.dict(sys.modules, {"torch": None}):
|
|
60
|
+
info = get_runtime_device_info("auto")
|
|
61
|
+
self.assertFalse(info["torch_available"])
|
|
62
|
+
self.assertEqual(info["resolved_device"], "cpu")
|
|
63
|
+
|
|
64
|
+
def test_runtime_device_info_with_cuda_inventory(self):
|
|
65
|
+
fake_torch = SimpleNamespace(
|
|
66
|
+
cuda=SimpleNamespace(
|
|
67
|
+
is_available=lambda: True,
|
|
68
|
+
device_count=lambda: 2,
|
|
69
|
+
get_device_name=lambda i: f"GPU-{i}",
|
|
70
|
+
),
|
|
71
|
+
backends=SimpleNamespace(mps=SimpleNamespace(is_available=lambda: False)),
|
|
72
|
+
)
|
|
73
|
+
with patch.dict(sys.modules, {"torch": fake_torch}):
|
|
74
|
+
info = get_runtime_device_info("auto")
|
|
75
|
+
self.assertTrue(info["torch_available"])
|
|
76
|
+
self.assertTrue(info["cuda_available"])
|
|
77
|
+
self.assertEqual(info["cuda_device_count"], 2)
|
|
78
|
+
self.assertEqual(info["cuda_devices"], ["GPU-0", "GPU-1"])
|
|
79
|
+
|
|
80
|
+
def test_local_embedder_uses_local_files_only_when_model_is_cached(self):
|
|
81
|
+
calls = []
|
|
82
|
+
sentence_transformers = ModuleType("sentence_transformers")
|
|
83
|
+
huggingface_hub = ModuleType("huggingface_hub")
|
|
84
|
+
cached_marker = object()
|
|
85
|
+
|
|
86
|
+
class FakeSentenceTransformer:
|
|
87
|
+
def __init__(self, model_name, **kwargs):
|
|
88
|
+
calls.append({"model_name": model_name, **kwargs})
|
|
89
|
+
|
|
90
|
+
def try_to_load_from_cache(*, repo_id, filename, cache_dir):
|
|
91
|
+
if repo_id == "mixedbread-ai/mxbai-embed-large-v1" and filename == "modules.json":
|
|
92
|
+
return str(Path(cache_dir) / "models--cached" / "modules.json")
|
|
93
|
+
return cached_marker
|
|
94
|
+
|
|
95
|
+
sentence_transformers.SentenceTransformer = FakeSentenceTransformer
|
|
96
|
+
huggingface_hub.try_to_load_from_cache = try_to_load_from_cache
|
|
97
|
+
huggingface_hub._CACHED_NO_EXIST = cached_marker
|
|
98
|
+
|
|
99
|
+
with tempfile.TemporaryDirectory() as temp_dir:
|
|
100
|
+
with patch.dict(
|
|
101
|
+
sys.modules,
|
|
102
|
+
{
|
|
103
|
+
"sentence_transformers": sentence_transformers,
|
|
104
|
+
"huggingface_hub": huggingface_hub,
|
|
105
|
+
},
|
|
106
|
+
):
|
|
107
|
+
LocalEmbedder(model_dir=temp_dir, device="cpu")
|
|
108
|
+
|
|
109
|
+
self.assertEqual(len(calls), 1)
|
|
110
|
+
self.assertTrue(calls[0]["local_files_only"])
|
|
111
|
+
|
|
112
|
+
def test_local_embedder_allows_download_when_cache_is_missing(self):
|
|
113
|
+
calls = []
|
|
114
|
+
sentence_transformers = ModuleType("sentence_transformers")
|
|
115
|
+
huggingface_hub = ModuleType("huggingface_hub")
|
|
116
|
+
cached_marker = object()
|
|
117
|
+
|
|
118
|
+
class FakeSentenceTransformer:
|
|
119
|
+
def __init__(self, model_name, **kwargs):
|
|
120
|
+
calls.append({"model_name": model_name, **kwargs})
|
|
121
|
+
|
|
122
|
+
def try_to_load_from_cache(*, repo_id, filename, cache_dir):
|
|
123
|
+
return cached_marker
|
|
124
|
+
|
|
125
|
+
sentence_transformers.SentenceTransformer = FakeSentenceTransformer
|
|
126
|
+
huggingface_hub.try_to_load_from_cache = try_to_load_from_cache
|
|
127
|
+
huggingface_hub._CACHED_NO_EXIST = cached_marker
|
|
128
|
+
|
|
129
|
+
with tempfile.TemporaryDirectory() as temp_dir:
|
|
130
|
+
with patch.dict(
|
|
131
|
+
sys.modules,
|
|
132
|
+
{
|
|
133
|
+
"sentence_transformers": sentence_transformers,
|
|
134
|
+
"huggingface_hub": huggingface_hub,
|
|
135
|
+
},
|
|
136
|
+
):
|
|
137
|
+
LocalEmbedder(model_dir=temp_dir, device="cpu")
|
|
138
|
+
|
|
139
|
+
self.assertEqual(len(calls), 1)
|
|
140
|
+
self.assertFalse(calls[0]["local_files_only"])
|
|
141
|
+
|
|
142
|
+
def test_local_embedder_retries_without_local_only_when_cached_load_fails(self):
|
|
143
|
+
calls = []
|
|
144
|
+
sentence_transformers = ModuleType("sentence_transformers")
|
|
145
|
+
huggingface_hub = ModuleType("huggingface_hub")
|
|
146
|
+
cached_marker = object()
|
|
147
|
+
|
|
148
|
+
class FakeSentenceTransformer:
|
|
149
|
+
def __init__(self, model_name, **kwargs):
|
|
150
|
+
calls.append({"model_name": model_name, **kwargs})
|
|
151
|
+
if kwargs.get("local_files_only"):
|
|
152
|
+
raise OSError("cache incomplete")
|
|
153
|
+
|
|
154
|
+
def try_to_load_from_cache(*, repo_id, filename, cache_dir):
|
|
155
|
+
if filename == "modules.json":
|
|
156
|
+
return str(Path(cache_dir) / "models--cached" / "modules.json")
|
|
157
|
+
return cached_marker
|
|
158
|
+
|
|
159
|
+
sentence_transformers.SentenceTransformer = FakeSentenceTransformer
|
|
160
|
+
huggingface_hub.try_to_load_from_cache = try_to_load_from_cache
|
|
161
|
+
huggingface_hub._CACHED_NO_EXIST = cached_marker
|
|
162
|
+
|
|
163
|
+
with tempfile.TemporaryDirectory() as temp_dir:
|
|
164
|
+
with patch.dict(
|
|
165
|
+
sys.modules,
|
|
166
|
+
{
|
|
167
|
+
"sentence_transformers": sentence_transformers,
|
|
168
|
+
"huggingface_hub": huggingface_hub,
|
|
169
|
+
},
|
|
170
|
+
):
|
|
171
|
+
LocalEmbedder(model_dir=temp_dir, device="cpu")
|
|
172
|
+
|
|
173
|
+
self.assertEqual(len(calls), 2)
|
|
174
|
+
self.assertTrue(calls[0]["local_files_only"])
|
|
175
|
+
self.assertFalse(calls[1]["local_files_only"])
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
if __name__ == "__main__":
|
|
179
|
+
unittest.main()
|
|
@@ -20,6 +20,10 @@ class FakeEmbedder:
|
|
|
20
20
|
"payment",
|
|
21
21
|
"cache",
|
|
22
22
|
"error",
|
|
23
|
+
"schema",
|
|
24
|
+
"user",
|
|
25
|
+
"message",
|
|
26
|
+
"type",
|
|
23
27
|
]
|
|
24
28
|
|
|
25
29
|
def embed_texts(self, texts, batch_size: int = 32):
|
|
@@ -151,6 +155,56 @@ class RAGrepTests(unittest.TestCase):
|
|
|
151
155
|
finally:
|
|
152
156
|
rag.close()
|
|
153
157
|
|
|
158
|
+
def test_index_and_recall_schema_files(self):
|
|
159
|
+
schema_root = self.root / "schemas"
|
|
160
|
+
schema_root.mkdir(parents=True, exist_ok=True)
|
|
161
|
+
(schema_root / "user.graphql").write_text(
|
|
162
|
+
"type User {\n id: ID!\n email: String!\n}\n",
|
|
163
|
+
encoding="utf-8",
|
|
164
|
+
)
|
|
165
|
+
(schema_root / "user.proto").write_text(
|
|
166
|
+
"syntax = \"proto3\";\nmessage User {\n string id = 1;\n}\n",
|
|
167
|
+
encoding="utf-8",
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
rag = RAGrep(db_path=str(self.db_path), embedder=FakeEmbedder())
|
|
171
|
+
try:
|
|
172
|
+
index_result = rag.index(str(schema_root))
|
|
173
|
+
self.assertTrue(index_result["indexed"])
|
|
174
|
+
self.assertEqual(index_result["files"], 2)
|
|
175
|
+
|
|
176
|
+
recall_result = rag.recall("schema user", limit=5, auto_index=False)
|
|
177
|
+
self.assertEqual(recall_result["count"], 2)
|
|
178
|
+
sources = {match["metadata"]["source"] for match in recall_result["matches"]}
|
|
179
|
+
self.assertEqual(sources, {"user.graphql", "user.proto"})
|
|
180
|
+
finally:
|
|
181
|
+
rag.close()
|
|
182
|
+
|
|
183
|
+
def test_index_and_recall_markdown_variants(self):
|
|
184
|
+
docs_root = self.root / "docs"
|
|
185
|
+
docs_root.mkdir(parents=True, exist_ok=True)
|
|
186
|
+
(docs_root / "schema.mdx").write_text(
|
|
187
|
+
"# Schema\n\nUser field documentation.\n",
|
|
188
|
+
encoding="utf-8",
|
|
189
|
+
)
|
|
190
|
+
(docs_root / "database.markdown").write_text(
|
|
191
|
+
"# Database\n\nSchema for the user table.\n",
|
|
192
|
+
encoding="utf-8",
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
rag = RAGrep(db_path=str(self.db_path), embedder=FakeEmbedder())
|
|
196
|
+
try:
|
|
197
|
+
index_result = rag.index(str(docs_root))
|
|
198
|
+
self.assertTrue(index_result["indexed"])
|
|
199
|
+
self.assertEqual(index_result["files"], 2)
|
|
200
|
+
|
|
201
|
+
recall_result = rag.recall("schema user", limit=5, auto_index=False)
|
|
202
|
+
self.assertEqual(recall_result["count"], 2)
|
|
203
|
+
sources = {match["metadata"]["source"] for match in recall_result["matches"]}
|
|
204
|
+
self.assertEqual(sources, {"schema.mdx", "database.markdown"})
|
|
205
|
+
finally:
|
|
206
|
+
rag.close()
|
|
207
|
+
|
|
154
208
|
def test_stats(self):
|
|
155
209
|
rag = RAGrep(db_path=str(self.db_path), embedder=FakeEmbedder())
|
|
156
210
|
try:
|
ragrep-0.2.1/tests/test_cli.py
DELETED
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import json
|
|
4
|
-
import tempfile
|
|
5
|
-
import unittest
|
|
6
|
-
from contextlib import redirect_stdout
|
|
7
|
-
from io import StringIO
|
|
8
|
-
from pathlib import Path
|
|
9
|
-
from unittest.mock import patch
|
|
10
|
-
|
|
11
|
-
from ragrep.cli import main
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class CLITests(unittest.TestCase):
|
|
15
|
-
def test_stats_flag_alias(self):
|
|
16
|
-
with tempfile.TemporaryDirectory() as temp_dir:
|
|
17
|
-
db_path = Path(temp_dir) / ".ragrep.db"
|
|
18
|
-
|
|
19
|
-
output = StringIO()
|
|
20
|
-
with redirect_stdout(output):
|
|
21
|
-
exit_code = main(["--stats", "--json", "--db-path", str(db_path)])
|
|
22
|
-
|
|
23
|
-
self.assertEqual(exit_code, 0)
|
|
24
|
-
payload = json.loads(output.getvalue())
|
|
25
|
-
self.assertEqual(payload["backend"], "sqlite")
|
|
26
|
-
self.assertEqual(payload["total_chunks"], 0)
|
|
27
|
-
|
|
28
|
-
def test_check_gpu_flag_alias(self):
|
|
29
|
-
output = StringIO()
|
|
30
|
-
with redirect_stdout(output):
|
|
31
|
-
exit_code = main(["--check-gpu", "--json"])
|
|
32
|
-
|
|
33
|
-
self.assertEqual(exit_code, 0)
|
|
34
|
-
payload = json.loads(output.getvalue())
|
|
35
|
-
self.assertIn("resolved_device", payload)
|
|
36
|
-
self.assertIn("torch_available", payload)
|
|
37
|
-
|
|
38
|
-
def test_index_prints_new_file_paths(self):
|
|
39
|
-
class DummyRAG:
|
|
40
|
-
def __init__(self, *args, **kwargs):
|
|
41
|
-
pass
|
|
42
|
-
|
|
43
|
-
def __enter__(self):
|
|
44
|
-
return self
|
|
45
|
-
|
|
46
|
-
def __exit__(self, exc_type, exc, tb):
|
|
47
|
-
return None
|
|
48
|
-
|
|
49
|
-
def index(self, path=".", force=False):
|
|
50
|
-
return {
|
|
51
|
-
"indexed": True,
|
|
52
|
-
"reason": "new files detected",
|
|
53
|
-
"root": "/tmp/work",
|
|
54
|
-
"files": 3,
|
|
55
|
-
"chunks": 10,
|
|
56
|
-
"chunks_indexed": 4,
|
|
57
|
-
"indexed_files": 1,
|
|
58
|
-
"new_files": ["src/new_file.py"],
|
|
59
|
-
"updated_files": [],
|
|
60
|
-
"removed_files": [],
|
|
61
|
-
"full_rebuild": False,
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
output = StringIO()
|
|
65
|
-
with patch("ragrep.cli.RAGrep", DummyRAG):
|
|
66
|
-
with redirect_stdout(output):
|
|
67
|
-
exit_code = main(["index", "."])
|
|
68
|
-
|
|
69
|
-
self.assertEqual(exit_code, 0)
|
|
70
|
-
text = output.getvalue()
|
|
71
|
-
self.assertIn("New files indexed:", text)
|
|
72
|
-
self.assertIn("src/new_file.py", text)
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
if __name__ == "__main__":
|
|
76
|
-
unittest.main()
|
|
@@ -1,80 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import os
|
|
4
|
-
import sys
|
|
5
|
-
import tempfile
|
|
6
|
-
import unittest
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
from types import SimpleNamespace
|
|
9
|
-
from unittest.mock import patch
|
|
10
|
-
|
|
11
|
-
from ragrep.retrieval.embeddings import (
|
|
12
|
-
default_model_dir,
|
|
13
|
-
get_runtime_device_info,
|
|
14
|
-
resolve_embedding_model,
|
|
15
|
-
resolve_runtime_device,
|
|
16
|
-
)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
class EmbeddingConfigTests(unittest.TestCase):
|
|
20
|
-
def test_model_alias_resolution(self):
|
|
21
|
-
self.assertEqual(
|
|
22
|
-
resolve_embedding_model("mxbai-embed-large"),
|
|
23
|
-
"mixedbread-ai/mxbai-embed-large-v1",
|
|
24
|
-
)
|
|
25
|
-
self.assertEqual(resolve_embedding_model("custom/model"), "custom/model")
|
|
26
|
-
|
|
27
|
-
def test_model_dir_env_override(self):
|
|
28
|
-
with tempfile.TemporaryDirectory() as temp_dir:
|
|
29
|
-
with patch.dict(os.environ, {"RAGREP_MODEL_DIR": temp_dir}, clear=False):
|
|
30
|
-
self.assertEqual(default_model_dir(), Path(temp_dir).resolve())
|
|
31
|
-
|
|
32
|
-
def test_device_auto_without_torch(self):
|
|
33
|
-
with patch.dict(sys.modules, {"torch": None}):
|
|
34
|
-
self.assertEqual(resolve_runtime_device("auto"), "cpu")
|
|
35
|
-
|
|
36
|
-
def test_device_auto_prefers_cuda(self):
|
|
37
|
-
fake_torch = SimpleNamespace(
|
|
38
|
-
cuda=SimpleNamespace(is_available=lambda: True),
|
|
39
|
-
backends=SimpleNamespace(mps=SimpleNamespace(is_available=lambda: False)),
|
|
40
|
-
)
|
|
41
|
-
with patch.dict(sys.modules, {"torch": fake_torch}):
|
|
42
|
-
self.assertEqual(resolve_runtime_device("auto"), "cuda")
|
|
43
|
-
|
|
44
|
-
def test_device_auto_uses_mps_when_cuda_missing(self):
|
|
45
|
-
fake_torch = SimpleNamespace(
|
|
46
|
-
cuda=SimpleNamespace(is_available=lambda: False),
|
|
47
|
-
backends=SimpleNamespace(mps=SimpleNamespace(is_available=lambda: True)),
|
|
48
|
-
)
|
|
49
|
-
with patch.dict(sys.modules, {"torch": fake_torch}):
|
|
50
|
-
self.assertEqual(resolve_runtime_device("auto"), "mps")
|
|
51
|
-
|
|
52
|
-
def test_explicit_device_is_respected(self):
|
|
53
|
-
self.assertEqual(resolve_runtime_device("cpu"), "cpu")
|
|
54
|
-
self.assertEqual(resolve_runtime_device("cuda:0"), "cuda:0")
|
|
55
|
-
|
|
56
|
-
def test_runtime_device_info_without_torch(self):
|
|
57
|
-
with patch.dict(sys.modules, {"torch": None}):
|
|
58
|
-
info = get_runtime_device_info("auto")
|
|
59
|
-
self.assertFalse(info["torch_available"])
|
|
60
|
-
self.assertEqual(info["resolved_device"], "cpu")
|
|
61
|
-
|
|
62
|
-
def test_runtime_device_info_with_cuda_inventory(self):
|
|
63
|
-
fake_torch = SimpleNamespace(
|
|
64
|
-
cuda=SimpleNamespace(
|
|
65
|
-
is_available=lambda: True,
|
|
66
|
-
device_count=lambda: 2,
|
|
67
|
-
get_device_name=lambda i: f"GPU-{i}",
|
|
68
|
-
),
|
|
69
|
-
backends=SimpleNamespace(mps=SimpleNamespace(is_available=lambda: False)),
|
|
70
|
-
)
|
|
71
|
-
with patch.dict(sys.modules, {"torch": fake_torch}):
|
|
72
|
-
info = get_runtime_device_info("auto")
|
|
73
|
-
self.assertTrue(info["torch_available"])
|
|
74
|
-
self.assertTrue(info["cuda_available"])
|
|
75
|
-
self.assertEqual(info["cuda_device_count"], 2)
|
|
76
|
-
self.assertEqual(info["cuda_devices"], ["GPU-0", "GPU-1"])
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
if __name__ == "__main__":
|
|
80
|
-
unittest.main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|