ragrep 0.2.1__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. {ragrep-0.2.1/src/ragrep.egg-info → ragrep-0.2.2}/PKG-INFO +1 -1
  2. {ragrep-0.2.1 → ragrep-0.2.2}/pyproject.toml +1 -1
  3. {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep/__init__.py +1 -1
  4. {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep/cli.py +35 -20
  5. {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep/core/document_processor.py +13 -0
  6. {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep/retrieval/embeddings.py +60 -8
  7. {ragrep-0.2.1 → ragrep-0.2.2/src/ragrep.egg-info}/PKG-INFO +1 -1
  8. ragrep-0.2.2/tests/test_cli.py +140 -0
  9. ragrep-0.2.2/tests/test_embeddings.py +179 -0
  10. {ragrep-0.2.1 → ragrep-0.2.2}/tests/test_ragrep.py +54 -0
  11. ragrep-0.2.1/tests/test_cli.py +0 -76
  12. ragrep-0.2.1/tests/test_embeddings.py +0 -80
  13. {ragrep-0.2.1 → ragrep-0.2.2}/LICENSE +0 -0
  14. {ragrep-0.2.1 → ragrep-0.2.2}/MANIFEST.in +0 -0
  15. {ragrep-0.2.1 → ragrep-0.2.2}/README.md +0 -0
  16. {ragrep-0.2.1 → ragrep-0.2.2}/docs/README.md +0 -0
  17. {ragrep-0.2.1 → ragrep-0.2.2}/env.example +0 -0
  18. {ragrep-0.2.1 → ragrep-0.2.2}/examples/basic_usage.py +0 -0
  19. {ragrep-0.2.1 → ragrep-0.2.2}/requirements.txt +0 -0
  20. {ragrep-0.2.1 → ragrep-0.2.2}/setup.cfg +0 -0
  21. {ragrep-0.2.1 → ragrep-0.2.2}/setup.py +0 -0
  22. {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep/core/__init__.py +0 -0
  23. {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep/core/rag_system.py +0 -0
  24. {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep/retrieval/__init__.py +0 -0
  25. {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep/retrieval/vector_store.py +0 -0
  26. {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep.egg-info/SOURCES.txt +0 -0
  27. {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep.egg-info/dependency_links.txt +0 -0
  28. {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep.egg-info/entry_points.txt +0 -0
  29. {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep.egg-info/requires.txt +0 -0
  30. {ragrep-0.2.1 → ragrep-0.2.2}/src/ragrep.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ragrep
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: Local semantic code recall with mxbai embeddings and SQLite
5
5
  Author-email: RAGrep Team <ragrep@example.com>
6
6
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "ragrep"
7
- version = "0.2.1"
7
+ version = "0.2.2"
8
8
  description = "Local semantic code recall with mxbai embeddings and SQLite"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -2,7 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- __version__ = "0.2.1"
5
+ __version__ = "0.2.2"
6
6
  __author__ = "RAGrep Team"
7
7
 
8
8
 
@@ -117,15 +117,42 @@ def _run_gpu_info(args: argparse.Namespace) -> int:
117
117
  return 0
118
118
 
119
119
 
120
- def _print_new_file_paths(index_result: dict) -> None:
121
- new_files = index_result.get("new_files") or []
122
- if not new_files:
120
+ def _print_file_paths(title: str, paths: list[str]) -> None:
121
+ if not paths:
123
122
  return
124
- print("New files indexed:")
125
- for path in new_files:
123
+ print(f"{title}:")
124
+ for path in paths:
126
125
  print(path)
127
126
 
128
127
 
128
+ def _print_index_status(index_result: dict) -> None:
129
+ root = index_result.get("root") or index_result.get("indexed_root") or "."
130
+ new_files = index_result.get("new_files") or []
131
+ updated_files = index_result.get("updated_files") or []
132
+ removed_files = index_result.get("removed_files") or []
133
+
134
+ if index_result.get("indexed"):
135
+ print(
136
+ f"Index updated for {root}: "
137
+ f"{len(new_files)} added, {len(updated_files)} modified, {len(removed_files)} removed."
138
+ )
139
+ _print_file_paths("Added files", new_files)
140
+ _print_file_paths("Modified files", updated_files)
141
+ _print_file_paths("Removed files", removed_files)
142
+ print(
143
+ f"Indexed {index_result['indexed_files']} changed files "
144
+ f"({index_result['chunks_indexed']} chunks updated, {index_result['chunks']} total): "
145
+ f"{index_result['reason']}"
146
+ )
147
+ return
148
+
149
+ print(
150
+ f"Index is already up to date for {root} "
151
+ f"({index_result['files']} files, {index_result['chunks']} chunks): "
152
+ f"{index_result['reason']}"
153
+ )
154
+
155
+
129
156
  def _run_recall(args: argparse.Namespace) -> int:
130
157
  setup_logging(args.verbose)
131
158
  query = " ".join(args.query).strip()
@@ -150,13 +177,8 @@ def _run_recall(args: argparse.Namespace) -> int:
150
177
  return 0
151
178
 
152
179
  index_info = result.get("auto_index")
153
- if index_info and index_info.get("indexed"):
154
- _print_new_file_paths(index_info)
155
- print(
156
- f"Indexed {index_info['indexed_files']} changed files "
157
- f"({index_info['chunks_indexed']} chunks updated, {index_info['chunks']} total): "
158
- f"{index_info['reason']}"
159
- )
180
+ if index_info:
181
+ _print_index_status(index_info)
160
182
 
161
183
  matches = result.get("matches", [])
162
184
  print(f"Results: {len(matches)}")
@@ -185,14 +207,7 @@ def _run_index(args: argparse.Namespace) -> int:
185
207
  print(json.dumps(result, indent=2))
186
208
  return 0
187
209
 
188
- if result["indexed"]:
189
- _print_new_file_paths(result)
190
- print(
191
- f"Indexed {result['indexed_files']} changed files "
192
- f"({result['chunks_indexed']} chunks updated, {result['chunks']} total)"
193
- )
194
- else:
195
- print(f"Index unchanged: {result['reason']}")
210
+ _print_index_status(result)
196
211
 
197
212
  return 0
198
213
 
@@ -8,26 +8,39 @@ from typing import Any, Dict, Iterable, List
8
8
 
9
9
 
10
10
  _DEFAULT_EXTENSIONS = {
11
+ ".avsc",
11
12
  ".c",
12
13
  ".cc",
13
14
  ".cpp",
14
15
  ".css",
16
+ ".gql",
15
17
  ".go",
18
+ ".graphql",
19
+ ".graphqls",
16
20
  ".h",
17
21
  ".hpp",
18
22
  ".html",
19
23
  ".java",
20
24
  ".js",
21
25
  ".json",
26
+ ".markdown",
22
27
  ".md",
28
+ ".mdown",
29
+ ".mdx",
30
+ ".mkd",
31
+ ".mkdn",
23
32
  ".py",
24
33
  ".rb",
34
+ ".raml",
25
35
  ".rs",
26
36
  ".sql",
27
37
  ".toml",
28
38
  ".ts",
29
39
  ".txt",
40
+ ".proto",
41
+ ".wsdl",
30
42
  ".xml",
43
+ ".xsd",
31
44
  ".yaml",
32
45
  ".yml",
33
46
  }
@@ -115,6 +115,29 @@ def default_model_dir() -> Path:
115
115
  return base / "models"
116
116
 
117
117
 
118
+ def _has_local_model_snapshot(model: str, model_dir: Path) -> bool:
119
+ """Return whether the requested model already exists in the local cache."""
120
+ model_path = Path(model).expanduser()
121
+ if model_path.exists():
122
+ return True
123
+
124
+ try:
125
+ from huggingface_hub import _CACHED_NO_EXIST, try_to_load_from_cache
126
+ except Exception:
127
+ return False
128
+
129
+ for filename in ("modules.json", "config.json", "tokenizer_config.json"):
130
+ cached = try_to_load_from_cache(
131
+ repo_id=model,
132
+ filename=filename,
133
+ cache_dir=str(model_dir),
134
+ )
135
+ if cached is not None and cached != _CACHED_NO_EXIST:
136
+ return True
137
+
138
+ return False
139
+
140
+
118
141
  class LocalEmbedder:
119
142
  """Generate embeddings in-process using sentence-transformers."""
120
143
 
@@ -139,17 +162,46 @@ class LocalEmbedder:
139
162
  "Install with: pip install sentence-transformers"
140
163
  ) from exc
141
164
 
165
+ prefer_local_only = _has_local_model_snapshot(self.resolved_model, self.model_dir)
166
+
142
167
  try:
143
- self._model = SentenceTransformer(
144
- self.resolved_model,
145
- cache_folder=str(self.model_dir),
146
- device=self.device,
168
+ self._model = self._load_model(
169
+ SentenceTransformer,
170
+ local_files_only=prefer_local_only,
147
171
  )
148
172
  except Exception as exc: # pragma: no cover - model download/load depends on environment
149
- raise EmbeddingError(
150
- f"Failed to load embedding model '{self.resolved_model}'. "
151
- f"Model directory: {self.model_dir}."
152
- ) from exc
173
+ if prefer_local_only:
174
+ try:
175
+ self._model = self._load_model(
176
+ SentenceTransformer,
177
+ local_files_only=False,
178
+ )
179
+ except Exception as retry_exc:
180
+ raise EmbeddingError(
181
+ f"Failed to load embedding model '{self.resolved_model}'. "
182
+ f"Model directory: {self.model_dir}."
183
+ ) from retry_exc
184
+ else:
185
+ raise EmbeddingError(
186
+ f"Failed to load embedding model '{self.resolved_model}'. "
187
+ f"Model directory: {self.model_dir}."
188
+ ) from exc
189
+
190
+ def _load_model(self, sentence_transformer_cls: Any, *, local_files_only: bool) -> Any:
191
+ kwargs: Dict[str, Any] = {
192
+ "cache_folder": str(self.model_dir),
193
+ "device": self.device,
194
+ "local_files_only": local_files_only,
195
+ }
196
+
197
+ try:
198
+ return sentence_transformer_cls(self.resolved_model, **kwargs)
199
+ except TypeError as exc:
200
+ if "local_files_only" not in str(exc):
201
+ raise
202
+
203
+ kwargs.pop("local_files_only", None)
204
+ return sentence_transformer_cls(self.resolved_model, **kwargs)
153
205
 
154
206
  def embed_texts(self, texts: Iterable[str], batch_size: int = 32) -> List[List[float]]:
155
207
  items = list(texts)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ragrep
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: Local semantic code recall with mxbai embeddings and SQLite
5
5
  Author-email: RAGrep Team <ragrep@example.com>
6
6
  License-Expression: MIT
@@ -0,0 +1,140 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import tempfile
5
+ import unittest
6
+ from contextlib import redirect_stdout
7
+ from io import StringIO
8
+ from pathlib import Path
9
+ from unittest.mock import patch
10
+
11
+ from ragrep.cli import main
12
+
13
+
14
+ class CLITests(unittest.TestCase):
15
+ def test_stats_flag_alias(self):
16
+ with tempfile.TemporaryDirectory() as temp_dir:
17
+ db_path = Path(temp_dir) / ".ragrep.db"
18
+
19
+ output = StringIO()
20
+ with redirect_stdout(output):
21
+ exit_code = main(["--stats", "--json", "--db-path", str(db_path)])
22
+
23
+ self.assertEqual(exit_code, 0)
24
+ payload = json.loads(output.getvalue())
25
+ self.assertEqual(payload["backend"], "sqlite")
26
+ self.assertEqual(payload["total_chunks"], 0)
27
+
28
+ def test_check_gpu_flag_alias(self):
29
+ output = StringIO()
30
+ with redirect_stdout(output):
31
+ exit_code = main(["--check-gpu", "--json"])
32
+
33
+ self.assertEqual(exit_code, 0)
34
+ payload = json.loads(output.getvalue())
35
+ self.assertIn("resolved_device", payload)
36
+ self.assertIn("torch_available", payload)
37
+
38
+ def test_index_prints_added_modified_and_removed_file_paths(self):
39
+ class DummyRAG:
40
+ def __init__(self, *args, **kwargs):
41
+ pass
42
+
43
+ def __enter__(self):
44
+ return self
45
+
46
+ def __exit__(self, exc_type, exc, tb):
47
+ return None
48
+
49
+ def index(self, path=".", force=False):
50
+ return {
51
+ "indexed": True,
52
+ "reason": "new files detected, updated files detected, files removed",
53
+ "root": "/tmp/work",
54
+ "files": 3,
55
+ "chunks": 10,
56
+ "chunks_indexed": 4,
57
+ "indexed_files": 2,
58
+ "new_files": ["src/new_file.py"],
59
+ "updated_files": ["src/changed_file.py"],
60
+ "removed_files": ["src/removed_file.py"],
61
+ "full_rebuild": False,
62
+ }
63
+
64
+ output = StringIO()
65
+ with patch("ragrep.cli.RAGrep", DummyRAG):
66
+ with redirect_stdout(output):
67
+ exit_code = main(["index", "."])
68
+
69
+ self.assertEqual(exit_code, 0)
70
+ text = output.getvalue()
71
+ self.assertIn("Index updated for /tmp/work: 1 added, 1 modified, 1 removed.", text)
72
+ self.assertIn("Added files:", text)
73
+ self.assertIn("src/new_file.py", text)
74
+ self.assertIn("Modified files:", text)
75
+ self.assertIn("src/changed_file.py", text)
76
+ self.assertIn("Removed files:", text)
77
+ self.assertIn("src/removed_file.py", text)
78
+ self.assertIn(
79
+ "Indexed 2 changed files (4 chunks updated, 10 total): "
80
+ "new files detected, updated files detected, files removed",
81
+ text,
82
+ )
83
+
84
+ def test_recall_prints_up_to_date_status_before_results(self):
85
+ class DummyRAG:
86
+ def __init__(self, *args, **kwargs):
87
+ pass
88
+
89
+ def __enter__(self):
90
+ return self
91
+
92
+ def __exit__(self, exc_type, exc, tb):
93
+ return None
94
+
95
+ def recall(self, query, limit=20, path=None, auto_index=True):
96
+ return {
97
+ "query": query,
98
+ "count": 1,
99
+ "matches": [
100
+ {
101
+ "score": 0.9,
102
+ "text": "# Schema",
103
+ "metadata": {"source": "docs/schema.md"},
104
+ }
105
+ ],
106
+ "auto_index": {
107
+ "indexed": False,
108
+ "reason": "index is current",
109
+ "root": "/tmp/work",
110
+ "files": 2,
111
+ "chunks": 8,
112
+ "chunks_indexed": 0,
113
+ "indexed_files": 0,
114
+ "new_files": [],
115
+ "updated_files": [],
116
+ "removed_files": [],
117
+ "full_rebuild": False,
118
+ },
119
+ }
120
+
121
+ output = StringIO()
122
+ with patch("ragrep.cli.RAGrep", DummyRAG):
123
+ with redirect_stdout(output):
124
+ exit_code = main(["schema"])
125
+
126
+ self.assertEqual(exit_code, 0)
127
+ text = output.getvalue()
128
+ self.assertIn(
129
+ "Index is already up to date for /tmp/work (2 files, 8 chunks): index is current",
130
+ text,
131
+ )
132
+ self.assertIn("Results: 1", text)
133
+ self.assertLess(
134
+ text.index("Index is already up to date for /tmp/work"),
135
+ text.index("Results: 1"),
136
+ )
137
+
138
+
139
+ if __name__ == "__main__":
140
+ unittest.main()
@@ -0,0 +1,179 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import sys
5
+ import tempfile
6
+ import unittest
7
+ from pathlib import Path
8
+ from types import SimpleNamespace
9
+ from types import ModuleType
10
+ from unittest.mock import patch
11
+
12
+ from ragrep.retrieval.embeddings import (
13
+ LocalEmbedder,
14
+ default_model_dir,
15
+ get_runtime_device_info,
16
+ resolve_embedding_model,
17
+ resolve_runtime_device,
18
+ )
19
+
20
+
21
+ class EmbeddingConfigTests(unittest.TestCase):
22
+ def test_model_alias_resolution(self):
23
+ self.assertEqual(
24
+ resolve_embedding_model("mxbai-embed-large"),
25
+ "mixedbread-ai/mxbai-embed-large-v1",
26
+ )
27
+ self.assertEqual(resolve_embedding_model("custom/model"), "custom/model")
28
+
29
+ def test_model_dir_env_override(self):
30
+ with tempfile.TemporaryDirectory() as temp_dir:
31
+ with patch.dict(os.environ, {"RAGREP_MODEL_DIR": temp_dir}, clear=False):
32
+ self.assertEqual(default_model_dir(), Path(temp_dir).resolve())
33
+
34
+ def test_device_auto_without_torch(self):
35
+ with patch.dict(sys.modules, {"torch": None}):
36
+ self.assertEqual(resolve_runtime_device("auto"), "cpu")
37
+
38
+ def test_device_auto_prefers_cuda(self):
39
+ fake_torch = SimpleNamespace(
40
+ cuda=SimpleNamespace(is_available=lambda: True),
41
+ backends=SimpleNamespace(mps=SimpleNamespace(is_available=lambda: False)),
42
+ )
43
+ with patch.dict(sys.modules, {"torch": fake_torch}):
44
+ self.assertEqual(resolve_runtime_device("auto"), "cuda")
45
+
46
+ def test_device_auto_uses_mps_when_cuda_missing(self):
47
+ fake_torch = SimpleNamespace(
48
+ cuda=SimpleNamespace(is_available=lambda: False),
49
+ backends=SimpleNamespace(mps=SimpleNamespace(is_available=lambda: True)),
50
+ )
51
+ with patch.dict(sys.modules, {"torch": fake_torch}):
52
+ self.assertEqual(resolve_runtime_device("auto"), "mps")
53
+
54
+ def test_explicit_device_is_respected(self):
55
+ self.assertEqual(resolve_runtime_device("cpu"), "cpu")
56
+ self.assertEqual(resolve_runtime_device("cuda:0"), "cuda:0")
57
+
58
+ def test_runtime_device_info_without_torch(self):
59
+ with patch.dict(sys.modules, {"torch": None}):
60
+ info = get_runtime_device_info("auto")
61
+ self.assertFalse(info["torch_available"])
62
+ self.assertEqual(info["resolved_device"], "cpu")
63
+
64
+ def test_runtime_device_info_with_cuda_inventory(self):
65
+ fake_torch = SimpleNamespace(
66
+ cuda=SimpleNamespace(
67
+ is_available=lambda: True,
68
+ device_count=lambda: 2,
69
+ get_device_name=lambda i: f"GPU-{i}",
70
+ ),
71
+ backends=SimpleNamespace(mps=SimpleNamespace(is_available=lambda: False)),
72
+ )
73
+ with patch.dict(sys.modules, {"torch": fake_torch}):
74
+ info = get_runtime_device_info("auto")
75
+ self.assertTrue(info["torch_available"])
76
+ self.assertTrue(info["cuda_available"])
77
+ self.assertEqual(info["cuda_device_count"], 2)
78
+ self.assertEqual(info["cuda_devices"], ["GPU-0", "GPU-1"])
79
+
80
+ def test_local_embedder_uses_local_files_only_when_model_is_cached(self):
81
+ calls = []
82
+ sentence_transformers = ModuleType("sentence_transformers")
83
+ huggingface_hub = ModuleType("huggingface_hub")
84
+ cached_marker = object()
85
+
86
+ class FakeSentenceTransformer:
87
+ def __init__(self, model_name, **kwargs):
88
+ calls.append({"model_name": model_name, **kwargs})
89
+
90
+ def try_to_load_from_cache(*, repo_id, filename, cache_dir):
91
+ if repo_id == "mixedbread-ai/mxbai-embed-large-v1" and filename == "modules.json":
92
+ return str(Path(cache_dir) / "models--cached" / "modules.json")
93
+ return cached_marker
94
+
95
+ sentence_transformers.SentenceTransformer = FakeSentenceTransformer
96
+ huggingface_hub.try_to_load_from_cache = try_to_load_from_cache
97
+ huggingface_hub._CACHED_NO_EXIST = cached_marker
98
+
99
+ with tempfile.TemporaryDirectory() as temp_dir:
100
+ with patch.dict(
101
+ sys.modules,
102
+ {
103
+ "sentence_transformers": sentence_transformers,
104
+ "huggingface_hub": huggingface_hub,
105
+ },
106
+ ):
107
+ LocalEmbedder(model_dir=temp_dir, device="cpu")
108
+
109
+ self.assertEqual(len(calls), 1)
110
+ self.assertTrue(calls[0]["local_files_only"])
111
+
112
+ def test_local_embedder_allows_download_when_cache_is_missing(self):
113
+ calls = []
114
+ sentence_transformers = ModuleType("sentence_transformers")
115
+ huggingface_hub = ModuleType("huggingface_hub")
116
+ cached_marker = object()
117
+
118
+ class FakeSentenceTransformer:
119
+ def __init__(self, model_name, **kwargs):
120
+ calls.append({"model_name": model_name, **kwargs})
121
+
122
+ def try_to_load_from_cache(*, repo_id, filename, cache_dir):
123
+ return cached_marker
124
+
125
+ sentence_transformers.SentenceTransformer = FakeSentenceTransformer
126
+ huggingface_hub.try_to_load_from_cache = try_to_load_from_cache
127
+ huggingface_hub._CACHED_NO_EXIST = cached_marker
128
+
129
+ with tempfile.TemporaryDirectory() as temp_dir:
130
+ with patch.dict(
131
+ sys.modules,
132
+ {
133
+ "sentence_transformers": sentence_transformers,
134
+ "huggingface_hub": huggingface_hub,
135
+ },
136
+ ):
137
+ LocalEmbedder(model_dir=temp_dir, device="cpu")
138
+
139
+ self.assertEqual(len(calls), 1)
140
+ self.assertFalse(calls[0]["local_files_only"])
141
+
142
+ def test_local_embedder_retries_without_local_only_when_cached_load_fails(self):
143
+ calls = []
144
+ sentence_transformers = ModuleType("sentence_transformers")
145
+ huggingface_hub = ModuleType("huggingface_hub")
146
+ cached_marker = object()
147
+
148
+ class FakeSentenceTransformer:
149
+ def __init__(self, model_name, **kwargs):
150
+ calls.append({"model_name": model_name, **kwargs})
151
+ if kwargs.get("local_files_only"):
152
+ raise OSError("cache incomplete")
153
+
154
+ def try_to_load_from_cache(*, repo_id, filename, cache_dir):
155
+ if filename == "modules.json":
156
+ return str(Path(cache_dir) / "models--cached" / "modules.json")
157
+ return cached_marker
158
+
159
+ sentence_transformers.SentenceTransformer = FakeSentenceTransformer
160
+ huggingface_hub.try_to_load_from_cache = try_to_load_from_cache
161
+ huggingface_hub._CACHED_NO_EXIST = cached_marker
162
+
163
+ with tempfile.TemporaryDirectory() as temp_dir:
164
+ with patch.dict(
165
+ sys.modules,
166
+ {
167
+ "sentence_transformers": sentence_transformers,
168
+ "huggingface_hub": huggingface_hub,
169
+ },
170
+ ):
171
+ LocalEmbedder(model_dir=temp_dir, device="cpu")
172
+
173
+ self.assertEqual(len(calls), 2)
174
+ self.assertTrue(calls[0]["local_files_only"])
175
+ self.assertFalse(calls[1]["local_files_only"])
176
+
177
+
178
+ if __name__ == "__main__":
179
+ unittest.main()
@@ -20,6 +20,10 @@ class FakeEmbedder:
20
20
  "payment",
21
21
  "cache",
22
22
  "error",
23
+ "schema",
24
+ "user",
25
+ "message",
26
+ "type",
23
27
  ]
24
28
 
25
29
  def embed_texts(self, texts, batch_size: int = 32):
@@ -151,6 +155,56 @@ class RAGrepTests(unittest.TestCase):
151
155
  finally:
152
156
  rag.close()
153
157
 
158
+ def test_index_and_recall_schema_files(self):
159
+ schema_root = self.root / "schemas"
160
+ schema_root.mkdir(parents=True, exist_ok=True)
161
+ (schema_root / "user.graphql").write_text(
162
+ "type User {\n id: ID!\n email: String!\n}\n",
163
+ encoding="utf-8",
164
+ )
165
+ (schema_root / "user.proto").write_text(
166
+ "syntax = \"proto3\";\nmessage User {\n string id = 1;\n}\n",
167
+ encoding="utf-8",
168
+ )
169
+
170
+ rag = RAGrep(db_path=str(self.db_path), embedder=FakeEmbedder())
171
+ try:
172
+ index_result = rag.index(str(schema_root))
173
+ self.assertTrue(index_result["indexed"])
174
+ self.assertEqual(index_result["files"], 2)
175
+
176
+ recall_result = rag.recall("schema user", limit=5, auto_index=False)
177
+ self.assertEqual(recall_result["count"], 2)
178
+ sources = {match["metadata"]["source"] for match in recall_result["matches"]}
179
+ self.assertEqual(sources, {"user.graphql", "user.proto"})
180
+ finally:
181
+ rag.close()
182
+
183
+ def test_index_and_recall_markdown_variants(self):
184
+ docs_root = self.root / "docs"
185
+ docs_root.mkdir(parents=True, exist_ok=True)
186
+ (docs_root / "schema.mdx").write_text(
187
+ "# Schema\n\nUser field documentation.\n",
188
+ encoding="utf-8",
189
+ )
190
+ (docs_root / "database.markdown").write_text(
191
+ "# Database\n\nSchema for the user table.\n",
192
+ encoding="utf-8",
193
+ )
194
+
195
+ rag = RAGrep(db_path=str(self.db_path), embedder=FakeEmbedder())
196
+ try:
197
+ index_result = rag.index(str(docs_root))
198
+ self.assertTrue(index_result["indexed"])
199
+ self.assertEqual(index_result["files"], 2)
200
+
201
+ recall_result = rag.recall("schema user", limit=5, auto_index=False)
202
+ self.assertEqual(recall_result["count"], 2)
203
+ sources = {match["metadata"]["source"] for match in recall_result["matches"]}
204
+ self.assertEqual(sources, {"schema.mdx", "database.markdown"})
205
+ finally:
206
+ rag.close()
207
+
154
208
  def test_stats(self):
155
209
  rag = RAGrep(db_path=str(self.db_path), embedder=FakeEmbedder())
156
210
  try:
@@ -1,76 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import json
4
- import tempfile
5
- import unittest
6
- from contextlib import redirect_stdout
7
- from io import StringIO
8
- from pathlib import Path
9
- from unittest.mock import patch
10
-
11
- from ragrep.cli import main
12
-
13
-
14
- class CLITests(unittest.TestCase):
15
- def test_stats_flag_alias(self):
16
- with tempfile.TemporaryDirectory() as temp_dir:
17
- db_path = Path(temp_dir) / ".ragrep.db"
18
-
19
- output = StringIO()
20
- with redirect_stdout(output):
21
- exit_code = main(["--stats", "--json", "--db-path", str(db_path)])
22
-
23
- self.assertEqual(exit_code, 0)
24
- payload = json.loads(output.getvalue())
25
- self.assertEqual(payload["backend"], "sqlite")
26
- self.assertEqual(payload["total_chunks"], 0)
27
-
28
- def test_check_gpu_flag_alias(self):
29
- output = StringIO()
30
- with redirect_stdout(output):
31
- exit_code = main(["--check-gpu", "--json"])
32
-
33
- self.assertEqual(exit_code, 0)
34
- payload = json.loads(output.getvalue())
35
- self.assertIn("resolved_device", payload)
36
- self.assertIn("torch_available", payload)
37
-
38
- def test_index_prints_new_file_paths(self):
39
- class DummyRAG:
40
- def __init__(self, *args, **kwargs):
41
- pass
42
-
43
- def __enter__(self):
44
- return self
45
-
46
- def __exit__(self, exc_type, exc, tb):
47
- return None
48
-
49
- def index(self, path=".", force=False):
50
- return {
51
- "indexed": True,
52
- "reason": "new files detected",
53
- "root": "/tmp/work",
54
- "files": 3,
55
- "chunks": 10,
56
- "chunks_indexed": 4,
57
- "indexed_files": 1,
58
- "new_files": ["src/new_file.py"],
59
- "updated_files": [],
60
- "removed_files": [],
61
- "full_rebuild": False,
62
- }
63
-
64
- output = StringIO()
65
- with patch("ragrep.cli.RAGrep", DummyRAG):
66
- with redirect_stdout(output):
67
- exit_code = main(["index", "."])
68
-
69
- self.assertEqual(exit_code, 0)
70
- text = output.getvalue()
71
- self.assertIn("New files indexed:", text)
72
- self.assertIn("src/new_file.py", text)
73
-
74
-
75
- if __name__ == "__main__":
76
- unittest.main()
@@ -1,80 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import os
4
- import sys
5
- import tempfile
6
- import unittest
7
- from pathlib import Path
8
- from types import SimpleNamespace
9
- from unittest.mock import patch
10
-
11
- from ragrep.retrieval.embeddings import (
12
- default_model_dir,
13
- get_runtime_device_info,
14
- resolve_embedding_model,
15
- resolve_runtime_device,
16
- )
17
-
18
-
19
- class EmbeddingConfigTests(unittest.TestCase):
20
- def test_model_alias_resolution(self):
21
- self.assertEqual(
22
- resolve_embedding_model("mxbai-embed-large"),
23
- "mixedbread-ai/mxbai-embed-large-v1",
24
- )
25
- self.assertEqual(resolve_embedding_model("custom/model"), "custom/model")
26
-
27
- def test_model_dir_env_override(self):
28
- with tempfile.TemporaryDirectory() as temp_dir:
29
- with patch.dict(os.environ, {"RAGREP_MODEL_DIR": temp_dir}, clear=False):
30
- self.assertEqual(default_model_dir(), Path(temp_dir).resolve())
31
-
32
- def test_device_auto_without_torch(self):
33
- with patch.dict(sys.modules, {"torch": None}):
34
- self.assertEqual(resolve_runtime_device("auto"), "cpu")
35
-
36
- def test_device_auto_prefers_cuda(self):
37
- fake_torch = SimpleNamespace(
38
- cuda=SimpleNamespace(is_available=lambda: True),
39
- backends=SimpleNamespace(mps=SimpleNamespace(is_available=lambda: False)),
40
- )
41
- with patch.dict(sys.modules, {"torch": fake_torch}):
42
- self.assertEqual(resolve_runtime_device("auto"), "cuda")
43
-
44
- def test_device_auto_uses_mps_when_cuda_missing(self):
45
- fake_torch = SimpleNamespace(
46
- cuda=SimpleNamespace(is_available=lambda: False),
47
- backends=SimpleNamespace(mps=SimpleNamespace(is_available=lambda: True)),
48
- )
49
- with patch.dict(sys.modules, {"torch": fake_torch}):
50
- self.assertEqual(resolve_runtime_device("auto"), "mps")
51
-
52
- def test_explicit_device_is_respected(self):
53
- self.assertEqual(resolve_runtime_device("cpu"), "cpu")
54
- self.assertEqual(resolve_runtime_device("cuda:0"), "cuda:0")
55
-
56
- def test_runtime_device_info_without_torch(self):
57
- with patch.dict(sys.modules, {"torch": None}):
58
- info = get_runtime_device_info("auto")
59
- self.assertFalse(info["torch_available"])
60
- self.assertEqual(info["resolved_device"], "cpu")
61
-
62
- def test_runtime_device_info_with_cuda_inventory(self):
63
- fake_torch = SimpleNamespace(
64
- cuda=SimpleNamespace(
65
- is_available=lambda: True,
66
- device_count=lambda: 2,
67
- get_device_name=lambda i: f"GPU-{i}",
68
- ),
69
- backends=SimpleNamespace(mps=SimpleNamespace(is_available=lambda: False)),
70
- )
71
- with patch.dict(sys.modules, {"torch": fake_torch}):
72
- info = get_runtime_device_info("auto")
73
- self.assertTrue(info["torch_available"])
74
- self.assertTrue(info["cuda_available"])
75
- self.assertEqual(info["cuda_device_count"], 2)
76
- self.assertEqual(info["cuda_devices"], ["GPU-0", "GPU-1"])
77
-
78
-
79
- if __name__ == "__main__":
80
- unittest.main()
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes