semfind 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- semfind-0.1.0/LICENSE +21 -0
- semfind-0.1.0/PKG-INFO +79 -0
- semfind-0.1.0/README.md +53 -0
- semfind-0.1.0/pyproject.toml +39 -0
- semfind-0.1.0/setup.cfg +4 -0
- semfind-0.1.0/src/semfind.egg-info/PKG-INFO +79 -0
- semfind-0.1.0/src/semfind.egg-info/SOURCES.txt +13 -0
- semfind-0.1.0/src/semfind.egg-info/dependency_links.txt +1 -0
- semfind-0.1.0/src/semfind.egg-info/entry_points.txt +2 -0
- semfind-0.1.0/src/semfind.egg-info/requires.txt +3 -0
- semfind-0.1.0/src/semfind.egg-info/top_level.txt +1 -0
- semfind-0.1.0/src/semsearch/__init__.py +1 -0
- semfind-0.1.0/src/semsearch/cli.py +95 -0
- semfind-0.1.0/src/semsearch/index.py +98 -0
- semfind-0.1.0/src/semsearch/search.py +63 -0
semfind-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 puri
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
semfind-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: semfind
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Semantic grep for the terminal — search files by meaning, not pattern
|
|
5
|
+
Author: puri
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/puri/semsearch
|
|
8
|
+
Keywords: semantic-search,grep,embeddings,cli
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Environment :: Console
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Text Processing :: General
|
|
18
|
+
Classifier: Topic :: Utilities
|
|
19
|
+
Requires-Python: >=3.9
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: fastembed>=0.7.0
|
|
23
|
+
Requires-Dist: faiss-cpu>=1.7.0
|
|
24
|
+
Requires-Dist: numpy>=1.24.0
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# semsearch
|
|
28
|
+
|
|
29
|
+
Semantic grep for the terminal. Search files by meaning, not pattern.
|
|
30
|
+
|
|
31
|
+
Uses [fastembed](https://github.com/qdrant/fastembed) (BAAI/bge-small-en-v1.5) + FAISS for fast local vector search. No API keys needed — everything runs locally.
|
|
32
|
+
|
|
33
|
+
## Install
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install semfind
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Usage
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
# Search a file
|
|
43
|
+
semsearch "deployment issue" logs.md
|
|
44
|
+
|
|
45
|
+
# Search multiple files, top 3 results
|
|
46
|
+
semsearch "permission error" memory/*.md -k 3
|
|
47
|
+
|
|
48
|
+
# Show 2 lines of context around each match
|
|
49
|
+
semsearch "database migration" notes.md -n 2
|
|
50
|
+
|
|
51
|
+
# Force re-index (ignore cache)
|
|
52
|
+
semsearch "query" file.md --reindex
|
|
53
|
+
|
|
54
|
+
# Set minimum similarity threshold
|
|
55
|
+
semsearch "auth bug" *.md -m 0.5
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## How it works
|
|
59
|
+
|
|
60
|
+
1. On first search, each file's non-empty lines are embedded and cached in `~/.cache/semsearch/`
|
|
61
|
+
2. Cache is keyed by file content hash — changes auto-invalidate
|
|
62
|
+
3. Your query is embedded and compared via FAISS inner-product search
|
|
63
|
+
4. Results are printed grep-style with similarity scores
|
|
64
|
+
|
|
65
|
+
## Options
|
|
66
|
+
|
|
67
|
+
| Flag | Description | Default |
|
|
68
|
+
|------|-------------|---------|
|
|
69
|
+
| `-k, --top-k` | Number of results | 5 |
|
|
70
|
+
| `-n, --context` | Context lines before/after | 0 |
|
|
71
|
+
| `-m, --max-distance` | Minimum similarity score | none |
|
|
72
|
+
| `--reindex` | Force re-embed | false |
|
|
73
|
+
| `--model` | Embedding model | BAAI/bge-small-en-v1.5 |
|
|
74
|
+
| `--no-cache` | Skip cache | false |
|
|
75
|
+
| `--version` | Print version | |
|
|
76
|
+
|
|
77
|
+
## License
|
|
78
|
+
|
|
79
|
+
MIT
|
semfind-0.1.0/README.md
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# semsearch
|
|
2
|
+
|
|
3
|
+
Semantic grep for the terminal. Search files by meaning, not pattern.
|
|
4
|
+
|
|
5
|
+
Uses [fastembed](https://github.com/qdrant/fastembed) (BAAI/bge-small-en-v1.5) + FAISS for fast local vector search. No API keys needed — everything runs locally.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install semfind
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Usage
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# Search a file
|
|
17
|
+
semsearch "deployment issue" logs.md
|
|
18
|
+
|
|
19
|
+
# Search multiple files, top 3 results
|
|
20
|
+
semsearch "permission error" memory/*.md -k 3
|
|
21
|
+
|
|
22
|
+
# Show 2 lines of context around each match
|
|
23
|
+
semsearch "database migration" notes.md -n 2
|
|
24
|
+
|
|
25
|
+
# Force re-index (ignore cache)
|
|
26
|
+
semsearch "query" file.md --reindex
|
|
27
|
+
|
|
28
|
+
# Set minimum similarity threshold
|
|
29
|
+
semsearch "auth bug" *.md -m 0.5
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## How it works
|
|
33
|
+
|
|
34
|
+
1. On first search, each file's non-empty lines are embedded and cached in `~/.cache/semsearch/`
|
|
35
|
+
2. Cache is keyed by file content hash — changes auto-invalidate
|
|
36
|
+
3. Your query is embedded and compared via FAISS inner-product search
|
|
37
|
+
4. Results are printed grep-style with similarity scores
|
|
38
|
+
|
|
39
|
+
## Options
|
|
40
|
+
|
|
41
|
+
| Flag | Description | Default |
|
|
42
|
+
|------|-------------|---------|
|
|
43
|
+
| `-k, --top-k` | Number of results | 5 |
|
|
44
|
+
| `-n, --context` | Context lines before/after | 0 |
|
|
45
|
+
| `-m, --max-distance` | Minimum similarity score | none |
|
|
46
|
+
| `--reindex` | Force re-embed | false |
|
|
47
|
+
| `--model` | Embedding model | BAAI/bge-small-en-v1.5 |
|
|
48
|
+
| `--no-cache` | Skip cache | false |
|
|
49
|
+
| `--version` | Print version | |
|
|
50
|
+
|
|
51
|
+
## License
|
|
52
|
+
|
|
53
|
+
MIT
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "semfind"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Semantic grep for the terminal — search files by meaning, not pattern"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [{ name = "puri" }]
|
|
13
|
+
keywords = ["semantic-search", "grep", "embeddings", "cli"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Environment :: Console",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.9",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Text Processing :: General",
|
|
24
|
+
"Topic :: Utilities",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"fastembed>=0.7.0",
|
|
28
|
+
"faiss-cpu>=1.7.0",
|
|
29
|
+
"numpy>=1.24.0",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.scripts]
|
|
33
|
+
semsearch = "semsearch.cli:main"
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
Homepage = "https://github.com/puri/semsearch"
|
|
37
|
+
|
|
38
|
+
[tool.setuptools.packages.find]
|
|
39
|
+
where = ["src"]
|
semfind-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: semfind
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Semantic grep for the terminal — search files by meaning, not pattern
|
|
5
|
+
Author: puri
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/puri/semsearch
|
|
8
|
+
Keywords: semantic-search,grep,embeddings,cli
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Environment :: Console
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Text Processing :: General
|
|
18
|
+
Classifier: Topic :: Utilities
|
|
19
|
+
Requires-Python: >=3.9
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: fastembed>=0.7.0
|
|
23
|
+
Requires-Dist: faiss-cpu>=1.7.0
|
|
24
|
+
Requires-Dist: numpy>=1.24.0
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# semsearch
|
|
28
|
+
|
|
29
|
+
Semantic grep for the terminal. Search files by meaning, not pattern.
|
|
30
|
+
|
|
31
|
+
Uses [fastembed](https://github.com/qdrant/fastembed) (BAAI/bge-small-en-v1.5) + FAISS for fast local vector search. No API keys needed — everything runs locally.
|
|
32
|
+
|
|
33
|
+
## Install
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install semfind
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Usage
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
# Search a file
|
|
43
|
+
semsearch "deployment issue" logs.md
|
|
44
|
+
|
|
45
|
+
# Search multiple files, top 3 results
|
|
46
|
+
semsearch "permission error" memory/*.md -k 3
|
|
47
|
+
|
|
48
|
+
# Show 2 lines of context around each match
|
|
49
|
+
semsearch "database migration" notes.md -n 2
|
|
50
|
+
|
|
51
|
+
# Force re-index (ignore cache)
|
|
52
|
+
semsearch "query" file.md --reindex
|
|
53
|
+
|
|
54
|
+
# Set minimum similarity threshold
|
|
55
|
+
semsearch "auth bug" *.md -m 0.5
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## How it works
|
|
59
|
+
|
|
60
|
+
1. On first search, each file's non-empty lines are embedded and cached in `~/.cache/semsearch/`
|
|
61
|
+
2. Cache is keyed by file content hash — changes auto-invalidate
|
|
62
|
+
3. Your query is embedded and compared via FAISS inner-product search
|
|
63
|
+
4. Results are printed grep-style with similarity scores
|
|
64
|
+
|
|
65
|
+
## Options
|
|
66
|
+
|
|
67
|
+
| Flag | Description | Default |
|
|
68
|
+
|------|-------------|---------|
|
|
69
|
+
| `-k, --top-k` | Number of results | 5 |
|
|
70
|
+
| `-n, --context` | Context lines before/after | 0 |
|
|
71
|
+
| `-m, --max-distance` | Minimum similarity score | none |
|
|
72
|
+
| `--reindex` | Force re-embed | false |
|
|
73
|
+
| `--model` | Embedding model | BAAI/bge-small-en-v1.5 |
|
|
74
|
+
| `--no-cache` | Skip cache | false |
|
|
75
|
+
| `--version` | Print version | |
|
|
76
|
+
|
|
77
|
+
## License
|
|
78
|
+
|
|
79
|
+
MIT
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/semfind.egg-info/PKG-INFO
|
|
5
|
+
src/semfind.egg-info/SOURCES.txt
|
|
6
|
+
src/semfind.egg-info/dependency_links.txt
|
|
7
|
+
src/semfind.egg-info/entry_points.txt
|
|
8
|
+
src/semfind.egg-info/requires.txt
|
|
9
|
+
src/semfind.egg-info/top_level.txt
|
|
10
|
+
src/semsearch/__init__.py
|
|
11
|
+
src/semsearch/cli.py
|
|
12
|
+
src/semsearch/index.py
|
|
13
|
+
src/semsearch/search.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
semsearch
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""CLI entry point for semsearch."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
from . import __version__
|
|
7
|
+
from .index import DEFAULT_MODEL
|
|
8
|
+
from .search import search
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# ANSI color helpers
|
|
12
|
+
def _cyan(s: str) -> str:
    """Return *s* wrapped in the ANSI codes for cyan text, followed by a reset."""
    return "\033[36m{}\033[0m".format(s)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _green(s: str) -> str:
    """Return *s* wrapped in the ANSI codes for green text, followed by a reset."""
    return "\033[32m{}\033[0m".format(s)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _dim(s: str) -> str:
    """Return *s* wrapped in the ANSI codes for dim (faint) text, followed by a reset."""
    return "\033[2m{}\033[0m".format(s)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _read_context_lines(filepath: str) -> list[str]:
    """Return every line of *filepath*, line endings included, for context display.

    The file is decoded as UTF-8 regardless of the platform locale, and
    undecodable bytes are replaced with U+FFFD so that printing context
    around a match never aborts on a stray byte.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(filepath, encoding="utf-8", errors="replace") as f:
        return f.readlines()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def main(argv: "list[str] | None" = None) -> int:
    """Run the semsearch command-line interface.

    Args:
        argv: Arguments to parse. ``None`` lets argparse read ``sys.argv[1:]``.

    Returns:
        ``1`` if any given path is not an existing file (each one is reported
        on stderr), otherwise ``0`` — including when the search found nothing.

    Note:
        argparse itself exits the process for ``--help``, ``--version`` and
        usage errors.
    """
    # The annotation above is quoted because ``X | None`` is only valid at
    # runtime from Python 3.10, while the package declares support for 3.9.

    # Function-scope import keeps this block self-contained; the module does
    # not import ``os`` at the top.
    import os

    parser = argparse.ArgumentParser(
        prog="semsearch",
        description="Semantic grep — search files by meaning, not pattern.",
    )
    parser.add_argument("query", help="Search query")
    parser.add_argument("files", nargs="+", help="Files to search")
    parser.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5)")
    parser.add_argument("-n", "--context", type=int, default=0, help="Lines of context before/after match")
    parser.add_argument("-m", "--max-distance", type=float, default=None, help="Minimum similarity threshold")
    parser.add_argument("--reindex", action="store_true", help="Force re-embed even if cache exists")
    parser.add_argument("--model", default=DEFAULT_MODEL, help=f"Embedding model (default: {DEFAULT_MODEL})")
    parser.add_argument("--no-cache", action="store_true", help="Don't save/load embeddings cache")
    parser.add_argument("--version", action="version", version=f"semsearch {__version__}")

    args = parser.parse_args(argv)

    # Validate files exist before doing any embedding work.
    missing = [f for f in args.files if not os.path.isfile(f)]
    if missing:
        for f in missing:
            print(f"semsearch: {f}: No such file", file=sys.stderr)
        return 1

    results = search(
        query=args.query,
        filepaths=args.files,
        top_k=args.top_k,
        max_distance=args.max_distance,
        model_name=args.model,
        reindex=args.reindex,
        no_cache=args.no_cache,
    )

    if not results:
        print("No results found.", file=sys.stderr)
        return 0

    ctx = args.context

    # Without context, each hit is a single grep-style line.
    if ctx <= 0:
        for r in results:
            print(f"{_cyan(r.file)}:{_green(str(r.line_num))}: {r.text} {_dim(f'({r.score:.3f})')}")
        return 0

    # Cache of file lines so each file is read once, however many hits it has.
    file_lines: dict[str, list[str]] = {}
    last_pos = len(results) - 1

    for pos, r in enumerate(results):
        if r.file not in file_lines:
            file_lines[r.file] = _read_context_lines(r.file)
        lines = file_lines[r.file]

        # line_num is 1-based; clamp the window to the file boundaries.
        start = max(0, r.line_num - 1 - ctx)
        end = min(len(lines), r.line_num + ctx)
        for i in range(start, end):
            ln = i + 1
            text = lines[i].rstrip("\n")
            if ln == r.line_num:
                print(f"{_cyan(r.file)}:{_green(str(ln))}: {text} {_dim(f'({r.score:.3f})')}")
            else:
                print(_dim(f"{r.file}:{ln}: {text}"))

        # Separate groups by position, not by value: comparing against
        # results[-1] would drop the separator after any hit that merely
        # equals the last one (e.g. the same file passed twice).
        if pos != last_pos:
            print("--")

    return 0
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Indexing: embed file lines, save/load from cache."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import faiss
|
|
9
|
+
import numpy as np
|
|
10
|
+
from fastembed import TextEmbedding
|
|
11
|
+
|
|
12
|
+
DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"
|
|
13
|
+
CACHE_DIR = Path.home() / ".cache" / "semsearch"
|
|
14
|
+
|
|
15
|
+
# Module-level model cache to avoid re-loading across calls
|
|
16
|
+
_model_cache: dict[str, TextEmbedding] = {}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _get_model(model_name: str) -> TextEmbedding:
    """Return the embedding model for *model_name*, constructing it on first use.

    Instances are kept in the module-level ``_model_cache`` so repeated calls
    with the same name reuse one ``TextEmbedding`` object.
    """
    model = _model_cache.get(model_name)
    if model is None:
        model = TextEmbedding(model_name=model_name)
        _model_cache[model_name] = model
    return model
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _content_hash(filepath: str) -> str:
    """Return the hexadecimal SHA-256 digest of the raw bytes of *filepath*.

    The file is consumed in 8192-byte reads rather than loaded whole.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        while True:
            block = stream.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _cache_key(filepath: str, model_name: str, content_hash: str) -> str:
    """Derive the cache key for one (file, content, model) combination.

    The key is the hex SHA-256 of ``<absolute path>|<content hash>|<model name>``,
    so a change to any of the three yields a different key.
    """
    absolute = os.path.abspath(filepath)
    raw = "{}|{}|{}".format(absolute, content_hash, model_name)
    digest = hashlib.sha256()
    digest.update(raw.encode())
    return digest.hexdigest()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _cache_paths(key: str) -> tuple[Path, Path]:
    """Return the ``(embeddings .npy, metadata .json)`` paths for *key* under CACHE_DIR."""
    embeddings_file = CACHE_DIR / "{}.npy".format(key)
    metadata_file = CACHE_DIR / "{}.json".format(key)
    return embeddings_file, metadata_file
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _load_cache(filepath: str, model_name: str) -> "tuple[np.ndarray, list[dict]] | None":
    """Load cached embeddings and metadata for *filepath*, if a usable entry exists.

    The entry is looked up by a key derived from the file's absolute path,
    its current content hash and the model name, so an edited file simply
    misses the cache.

    Returns:
        ``(embeddings, metadata)`` on a hit, or ``None`` when either cache
        file is absent, unreadable, or inconsistent with the other.
    """
    # The return annotation is quoted because ``X | None`` is evaluated at
    # definition time and is only valid from Python 3.10; the package
    # declares support for 3.9.
    ch = _content_hash(filepath)
    key = _cache_key(filepath, model_name, ch)
    npy_path, json_path = _cache_paths(key)
    if not (npy_path.exists() and json_path.exists()):
        return None

    try:
        embeddings = np.load(npy_path)
        with open(json_path) as f:
            metadata = json.load(f)
    except (OSError, ValueError, EOFError):
        # A truncated or corrupted entry (e.g. an interrupted write) is a
        # cache miss, not a fatal error: the caller re-embeds and overwrites.
        return None

    # Reject entries whose two halves disagree; using them would pair rows
    # with the wrong lines or index past the end of the metadata.
    if (
        not isinstance(metadata, list)
        or getattr(embeddings, "ndim", 0) != 2
        or embeddings.shape[0] != len(metadata)
    ):
        return None

    return embeddings, metadata
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _save_cache(filepath: str, model_name: str, embeddings: np.ndarray, metadata: list[dict]) -> None:
    """Write *embeddings* and *metadata* to the cache entry for *filepath*.

    The entry is keyed on the file's absolute path, its current content hash
    and the model name. CACHE_DIR is created if it does not exist yet.
    """
    digest = _content_hash(filepath)
    entry_key = _cache_key(filepath, model_name, digest)
    vectors_file, meta_file = _cache_paths(entry_key)

    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    np.save(vectors_file, embeddings)
    with open(meta_file, "w") as out:
        json.dump(metadata, out)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def build_index(
    filepath: str,
    model_name: str = DEFAULT_MODEL,
    reindex: bool = False,
    no_cache: bool = False,
) -> tuple[np.ndarray, list[dict]]:
    """Embed all non-empty lines of a file. Returns (embeddings, metadata).

    Args:
        filepath: File to index.
        model_name: Name of the embedding model to use.
        reindex: Skip the cache lookup and re-embed (the result is still
            saved unless *no_cache* is set).
        no_cache: Neither read nor write the on-disk cache.

    Returns:
        ``embeddings`` is a float32 array with one L2-normalised row per
        non-blank line; ``metadata`` is a parallel list of
        ``{"file", "line_num", "text"}`` dicts, with ``line_num`` 1-based.
        A file with no non-blank lines yields an empty ``(0, 0)`` array and
        an empty list.
    """
    if not reindex and not no_cache:
        cached = _load_cache(filepath, model_name)
        if cached is not None:
            embeddings, metadata = cached
            # The cache key uses the absolute path, but the stored metadata
            # holds whatever path spelling was used when it was written.
            # Re-stamp it so callers get the path they passed in, and can
            # reopen the file from the current working directory.
            metadata = [{**entry, "file": filepath} for entry in metadata]
            return embeddings, metadata

    # Decode as UTF-8 regardless of the platform locale; undecodable bytes
    # become U+FFFD instead of aborting the whole search.
    with open(filepath, encoding="utf-8", errors="replace") as f:
        raw_lines = f.readlines()

    lines: list[str] = []
    metadata: list[dict] = []
    for line_num, line in enumerate(raw_lines, start=1):
        stripped = line.rstrip("\n")
        # Whitespace-only lines are skipped, but numbering still counts them.
        if stripped.strip():
            lines.append(stripped)
            metadata.append({"file": filepath, "line_num": line_num, "text": stripped})

    if not lines:
        empty = np.empty((0, 0), dtype=np.float32)
        return empty, []

    model = _get_model(model_name)
    embeddings = np.array(list(model.embed(lines)), dtype=np.float32)
    # Normalise rows in place so inner-product search behaves as cosine similarity.
    faiss.normalize_L2(embeddings)

    if not no_cache:
        _save_cache(filepath, model_name, embeddings, metadata)

    return embeddings, metadata
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Search: embed query, FAISS lookup, return ranked results."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
import faiss
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
from .index import DEFAULT_MODEL, build_index, _get_model
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class Result:
    """A single line matched by a semantic search."""

    # Path of the file the line came from, taken from the index metadata.
    file: str
    # Line number within that file, taken from the index metadata.
    line_num: int
    # Text of the matched line as stored in the index metadata.
    text: str
    # Similarity score reported by the index search, converted to float.
    score: float
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def search(
    query: str,
    filepaths: list[str],
    top_k: int = 5,
    max_distance: "float | None" = None,
    model_name: str = DEFAULT_MODEL,
    reindex: bool = False,
    no_cache: bool = False,
) -> list[Result]:
    """Search files for lines semantically similar to query.

    Args:
        query: Free-text query to embed and compare against the indexed lines.
        filepaths: Files to search; each is indexed via ``build_index``.
        top_k: Maximum number of results. A non-positive value yields ``[]``.
        max_distance: Despite the name, a *minimum* similarity score: hits
            scoring below it are dropped. ``None`` disables the filter.
        model_name: Name of the embedding model to use.
        reindex: Forwarded to ``build_index``.
        no_cache: Forwarded to ``build_index``.

    Returns:
        Matching lines in the order returned by the index search, at most
        ``top_k`` of them. Empty when no file has any non-blank line.
    """
    # ``max_distance`` is annotated with a string because ``X | None`` is
    # evaluated at definition time and only valid from Python 3.10; the
    # package declares support for 3.9.

    # Nothing was asked for; also avoids handing a non-positive k to the
    # index search.
    if top_k <= 0:
        return []

    all_embeddings: list[np.ndarray] = []
    all_metadata: list[dict] = []

    for fp in filepaths:
        embeddings, metadata = build_index(fp, model_name, reindex=reindex, no_cache=no_cache)
        # Files with no non-blank lines contribute nothing.
        if embeddings.size == 0:
            continue
        all_embeddings.append(embeddings)
        all_metadata.extend(metadata)

    if not all_embeddings:
        return []

    # One flat inner-product index over every line of every file; row i of
    # the stacked matrix corresponds to all_metadata[i].
    combined = np.vstack(all_embeddings)
    index = faiss.IndexFlatIP(combined.shape[1])
    index.add(combined)

    model = _get_model(model_name)
    query_vec = np.array(list(model.embed([query])), dtype=np.float32)
    faiss.normalize_L2(query_vec)

    k = min(top_k, len(all_metadata))
    distances, indices = index.search(query_vec, k)

    results: list[Result] = []
    for score, idx in zip(distances[0], indices[0]):
        # -1 marks an unfilled result slot.
        if idx == -1:
            continue
        if max_distance is not None and score < max_distance:
            continue
        m = all_metadata[idx]
        results.append(Result(file=m["file"], line_num=m["line_num"], text=m["text"], score=float(score)))

    return results
|