semfind 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
semfind-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 puri
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
semfind-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,79 @@
1
+ Metadata-Version: 2.4
2
+ Name: semfind
3
+ Version: 0.1.0
4
+ Summary: Semantic grep for the terminal — search files by meaning, not pattern
5
+ Author: puri
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/puri/semsearch
8
+ Keywords: semantic-search,grep,embeddings,cli
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Environment :: Console
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Text Processing :: General
18
+ Classifier: Topic :: Utilities
19
+ Requires-Python: >=3.9
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: fastembed>=0.7.0
23
+ Requires-Dist: faiss-cpu>=1.7.0
24
+ Requires-Dist: numpy>=1.24.0
25
+ Dynamic: license-file
26
+
27
+ # semsearch
28
+
29
+ Semantic grep for the terminal. Search files by meaning, not pattern.
30
+
31
+ Uses [fastembed](https://github.com/qdrant/fastembed) (BAAI/bge-small-en-v1.5) + FAISS for fast local vector search. No API keys needed — everything runs locally.
32
+
33
+ ## Install
34
+
35
+ ```bash
36
+ pip install semfind
37
+ ```
38
+
39
+ ## Usage
40
+
41
+ ```bash
42
+ # Search a file
43
+ semsearch "deployment issue" logs.md
44
+
45
+ # Search multiple files, top 3 results
46
+ semsearch "permission error" memory/*.md -k 3
47
+
48
+ # Show 2 lines of context around each match
49
+ semsearch "database migration" notes.md -n 2
50
+
51
+ # Force re-index (ignore cache)
52
+ semsearch "query" file.md --reindex
53
+
54
+ # Set minimum similarity threshold
55
+ semsearch "auth bug" *.md -m 0.5
56
+ ```
57
+
58
+ ## How it works
59
+
60
+ 1. On first search, each file's non-empty lines are embedded and cached in `~/.cache/semsearch/`
61
+ 2. Cache is keyed by file content hash — changes auto-invalidate
62
+ 3. Your query is embedded and compared via FAISS inner-product search
63
+ 4. Results are printed grep-style with similarity scores
64
+
65
+ ## Options
66
+
67
+ | Flag | Description | Default |
68
+ |------|-------------|---------|
69
+ | `-k, --top-k` | Number of results | 5 |
70
+ | `-n, --context` | Context lines before/after | 0 |
71
+ | `-m, --max-distance` | Minimum similarity score | none |
72
+ | `--reindex` | Force re-embed | false |
73
+ | `--model` | Embedding model | BAAI/bge-small-en-v1.5 |
74
+ | `--no-cache` | Skip cache | false |
75
+ | `--version` | Print version | |
76
+
77
+ ## License
78
+
79
+ MIT
@@ -0,0 +1,53 @@
1
+ # semsearch
2
+
3
+ Semantic grep for the terminal. Search files by meaning, not pattern.
4
+
5
+ Uses [fastembed](https://github.com/qdrant/fastembed) (BAAI/bge-small-en-v1.5) + FAISS for fast local vector search. No API keys needed — everything runs locally.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install semfind
11
+ ```
12
+
13
+ ## Usage
14
+
15
+ ```bash
16
+ # Search a file
17
+ semsearch "deployment issue" logs.md
18
+
19
+ # Search multiple files, top 3 results
20
+ semsearch "permission error" memory/*.md -k 3
21
+
22
+ # Show 2 lines of context around each match
23
+ semsearch "database migration" notes.md -n 2
24
+
25
+ # Force re-index (ignore cache)
26
+ semsearch "query" file.md --reindex
27
+
28
+ # Set minimum similarity threshold
29
+ semsearch "auth bug" *.md -m 0.5
30
+ ```
31
+
32
+ ## How it works
33
+
34
+ 1. On first search, each file's non-empty lines are embedded and cached in `~/.cache/semsearch/`
35
+ 2. Cache is keyed by file content hash — changes auto-invalidate
36
+ 3. Your query is embedded and compared via FAISS inner-product search
37
+ 4. Results are printed grep-style with similarity scores
38
+
39
+ ## Options
40
+
41
+ | Flag | Description | Default |
42
+ |------|-------------|---------|
43
+ | `-k, --top-k` | Number of results | 5 |
44
+ | `-n, --context` | Context lines before/after | 0 |
45
+ | `-m, --max-distance` | Minimum similarity score | none |
46
+ | `--reindex` | Force re-embed | false |
47
+ | `--model` | Embedding model | BAAI/bge-small-en-v1.5 |
48
+ | `--no-cache` | Skip cache | false |
49
+ | `--version` | Print version | |
50
+
51
+ ## License
52
+
53
+ MIT
@@ -0,0 +1,39 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "semfind"
7
+ version = "0.1.0"
8
+ description = "Semantic grep for the terminal — search files by meaning, not pattern"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.9"
12
+ authors = [{ name = "puri" }]
13
+ keywords = ["semantic-search", "grep", "embeddings", "cli"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Environment :: Console",
17
+ "Intended Audience :: Developers",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.9",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Topic :: Text Processing :: General",
24
+ "Topic :: Utilities",
25
+ ]
26
+ dependencies = [
27
+ "fastembed>=0.7.0",
28
+ "faiss-cpu>=1.7.0",
29
+ "numpy>=1.24.0",
30
+ ]
31
+
32
+ [project.scripts]
33
+ semsearch = "semsearch.cli:main"
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/puri/semsearch"
37
+
38
+ [tool.setuptools.packages.find]
39
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,79 @@
1
+ Metadata-Version: 2.4
2
+ Name: semfind
3
+ Version: 0.1.0
4
+ Summary: Semantic grep for the terminal — search files by meaning, not pattern
5
+ Author: puri
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/puri/semsearch
8
+ Keywords: semantic-search,grep,embeddings,cli
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Environment :: Console
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Text Processing :: General
18
+ Classifier: Topic :: Utilities
19
+ Requires-Python: >=3.9
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: fastembed>=0.7.0
23
+ Requires-Dist: faiss-cpu>=1.7.0
24
+ Requires-Dist: numpy>=1.24.0
25
+ Dynamic: license-file
26
+
27
+ # semsearch
28
+
29
+ Semantic grep for the terminal. Search files by meaning, not pattern.
30
+
31
+ Uses [fastembed](https://github.com/qdrant/fastembed) (BAAI/bge-small-en-v1.5) + FAISS for fast local vector search. No API keys needed — everything runs locally.
32
+
33
+ ## Install
34
+
35
+ ```bash
36
+ pip install semfind
37
+ ```
38
+
39
+ ## Usage
40
+
41
+ ```bash
42
+ # Search a file
43
+ semsearch "deployment issue" logs.md
44
+
45
+ # Search multiple files, top 3 results
46
+ semsearch "permission error" memory/*.md -k 3
47
+
48
+ # Show 2 lines of context around each match
49
+ semsearch "database migration" notes.md -n 2
50
+
51
+ # Force re-index (ignore cache)
52
+ semsearch "query" file.md --reindex
53
+
54
+ # Set minimum similarity threshold
55
+ semsearch "auth bug" *.md -m 0.5
56
+ ```
57
+
58
+ ## How it works
59
+
60
+ 1. On first search, each file's non-empty lines are embedded and cached in `~/.cache/semsearch/`
61
+ 2. Cache is keyed by file content hash — changes auto-invalidate
62
+ 3. Your query is embedded and compared via FAISS inner-product search
63
+ 4. Results are printed grep-style with similarity scores
64
+
65
+ ## Options
66
+
67
+ | Flag | Description | Default |
68
+ |------|-------------|---------|
69
+ | `-k, --top-k` | Number of results | 5 |
70
+ | `-n, --context` | Context lines before/after | 0 |
71
+ | `-m, --max-distance` | Minimum similarity score | none |
72
+ | `--reindex` | Force re-embed | false |
73
+ | `--model` | Embedding model | BAAI/bge-small-en-v1.5 |
74
+ | `--no-cache` | Skip cache | false |
75
+ | `--version` | Print version | |
76
+
77
+ ## License
78
+
79
+ MIT
@@ -0,0 +1,13 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/semfind.egg-info/PKG-INFO
5
+ src/semfind.egg-info/SOURCES.txt
6
+ src/semfind.egg-info/dependency_links.txt
7
+ src/semfind.egg-info/entry_points.txt
8
+ src/semfind.egg-info/requires.txt
9
+ src/semfind.egg-info/top_level.txt
10
+ src/semsearch/__init__.py
11
+ src/semsearch/cli.py
12
+ src/semsearch/index.py
13
+ src/semsearch/search.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ semsearch = semsearch.cli:main
@@ -0,0 +1,3 @@
1
+ fastembed>=0.7.0
2
+ faiss-cpu>=1.7.0
3
+ numpy>=1.24.0
@@ -0,0 +1 @@
1
+ semsearch
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,95 @@
1
+ """CLI entry point for semsearch."""
2
+
3
from __future__ import annotations

import argparse
import sys

from . import __version__
from .index import DEFAULT_MODEL
from .search import search
9
+
10
+
11
+ # ANSI color helpers
12
+ def _cyan(s: str) -> str:
13
+ return f"\033[36m{s}\033[0m"
14
+
15
+
16
+ def _green(s: str) -> str:
17
+ return f"\033[32m{s}\033[0m"
18
+
19
+
20
+ def _dim(s: str) -> str:
21
+ return f"\033[2m{s}\033[0m"
22
+
23
+
24
+ def _read_context_lines(filepath: str) -> list[str]:
25
+ with open(filepath) as f:
26
+ return f.readlines()
27
+
28
+
29
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, run the search, print results.

    Args:
        argv: Argument list to parse, or None to use ``sys.argv[1:]``.

    Returns:
        Process exit code: 0 on success (including no matches), 1 if any
        input file does not exist.
    """
    # Proper local import instead of the original __import__("os") hack.
    import os

    parser = argparse.ArgumentParser(
        prog="semsearch",
        description="Semantic grep — search files by meaning, not pattern.",
    )
    parser.add_argument("query", help="Search query")
    parser.add_argument("files", nargs="+", help="Files to search")
    parser.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5)")
    parser.add_argument("-n", "--context", type=int, default=0, help="Lines of context before/after match")
    parser.add_argument("-m", "--max-distance", type=float, default=None, help="Minimum similarity threshold")
    parser.add_argument("--reindex", action="store_true", help="Force re-embed even if cache exists")
    parser.add_argument("--model", default=DEFAULT_MODEL, help=f"Embedding model (default: {DEFAULT_MODEL})")
    parser.add_argument("--no-cache", action="store_true", help="Don't save/load embeddings cache")
    parser.add_argument("--version", action="version", version=f"semsearch {__version__}")

    args = parser.parse_args(argv)

    # Validate files up front, before paying the cost of loading the model.
    missing = [f for f in args.files if not os.path.isfile(f)]
    if missing:
        for f in missing:
            print(f"semsearch: {f}: No such file", file=sys.stderr)
        return 1

    results = search(
        query=args.query,
        filepaths=args.files,
        top_k=args.top_k,
        max_distance=args.max_distance,
        model_name=args.model,
        reindex=args.reindex,
        no_cache=args.no_cache,
    )

    if not results:
        print("No results found.", file=sys.stderr)
        return 0

    # Cache of file lines for context display (read each file at most once).
    file_lines: dict[str, list[str]] = {}
    ctx = args.context

    for pos, r in enumerate(results):
        if ctx > 0:
            if r.file not in file_lines:
                file_lines[r.file] = _read_context_lines(r.file)
            lines = file_lines[r.file]
            start = max(0, r.line_num - 1 - ctx)
            end = min(len(lines), r.line_num + ctx)
            for i in range(start, end):
                ln = i + 1
                text = lines[i].rstrip("\n")
                if ln == r.line_num:
                    print(f"{_cyan(r.file)}:{_green(str(ln))}: {text} {_dim(f'({r.score:.3f})')}")
                else:
                    print(f"{_dim(f'{r.file}:{ln}: {text}')}")
            # Compare by position, not value: `r != results[-1]` used dataclass
            # equality, which wrongly suppressed the separator whenever two
            # distinct results happened to compare equal.
            if pos != len(results) - 1:
                print("--")
        else:
            print(f"{_cyan(r.file)}:{_green(str(r.line_num))}: {r.text} {_dim(f'({r.score:.3f})')}")

    return 0
92
+
93
+
94
if __name__ == "__main__":
    # Allow direct invocation, e.g. `python -m semsearch.cli ...`.
    raise SystemExit(main())
@@ -0,0 +1,98 @@
1
+ """Indexing: embed file lines, save/load from cache."""
2
+
3
+ import hashlib
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+
8
+ import faiss
9
+ import numpy as np
10
+ from fastembed import TextEmbedding
11
+
12
+ DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"
13
+ CACHE_DIR = Path.home() / ".cache" / "semsearch"
14
+
15
+ # Module-level model cache to avoid re-loading across calls
16
+ _model_cache: dict[str, TextEmbedding] = {}
17
+
18
+
19
def _get_model(model_name: str) -> TextEmbedding:
    """Return a TextEmbedding for *model_name*, reusing a module-level cache."""
    model = _model_cache.get(model_name)
    if model is None:
        model = TextEmbedding(model_name=model_name)
        _model_cache[model_name] = model
    return model
23
+
24
+
25
+ def _content_hash(filepath: str) -> str:
26
+ h = hashlib.sha256()
27
+ with open(filepath, "rb") as f:
28
+ for chunk in iter(lambda: f.read(8192), b""):
29
+ h.update(chunk)
30
+ return h.hexdigest()
31
+
32
+
33
+ def _cache_key(filepath: str, model_name: str, content_hash: str) -> str:
34
+ raw = f"{os.path.abspath(filepath)}|{content_hash}|{model_name}"
35
+ return hashlib.sha256(raw.encode()).hexdigest()
36
+
37
+
38
def _cache_paths(key: str) -> tuple[Path, Path]:
    """Return the (embeddings .npy, metadata .json) cache paths for *key*."""
    base = CACHE_DIR / key
    return base.with_suffix(".npy"), base.with_suffix(".json")
40
+
41
+
42
def _load_cache(filepath: str, model_name: str) -> tuple[np.ndarray, list[dict]] | None:
    """Load cached (embeddings, metadata) for *filepath*, or None on a miss.

    The cache key incorporates the file's current content hash, so any edit
    to the file automatically invalidates its cached embeddings.
    """
    key = _cache_key(filepath, model_name, _content_hash(filepath))
    npy_path, json_path = _cache_paths(key)
    if not (npy_path.exists() and json_path.exists()):
        return None
    with open(json_path) as fh:
        metadata = json.load(fh)
    return np.load(npy_path), metadata
52
+
53
+
54
def _save_cache(filepath: str, model_name: str, embeddings: np.ndarray, metadata: list[dict]) -> None:
    """Persist embeddings (.npy) and line metadata (.json) to the cache dir."""
    key = _cache_key(filepath, model_name, _content_hash(filepath))
    npy_path, json_path = _cache_paths(key)
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    np.save(npy_path, embeddings)
    with open(json_path, "w") as fh:
        json.dump(metadata, fh)
62
+
63
+
64
def build_index(
    filepath: str,
    model_name: str = DEFAULT_MODEL,
    reindex: bool = False,
    no_cache: bool = False,
) -> tuple[np.ndarray, list[dict]]:
    """Embed every non-empty line of *filepath*.

    Returns (embeddings, metadata): embeddings is a float32 array,
    L2-normalized so inner-product search behaves as cosine similarity;
    metadata holds one {"file", "line_num", "text"} dict per embedded line
    (line numbers are 1-based). For a file with no non-blank lines an empty
    (0, 0) array and an empty list are returned. Unless *no_cache* is set,
    results are loaded from / saved to the on-disk cache; *reindex* skips
    the cache lookup but still saves the fresh result.
    """
    use_cache = not no_cache
    if use_cache and not reindex:
        cached = _load_cache(filepath, model_name)
        if cached is not None:
            return cached

    with open(filepath) as fh:
        raw_lines = fh.readlines()

    lines: list[str] = []
    metadata: list[dict] = []
    for line_num, raw in enumerate(raw_lines, start=1):
        text = raw.rstrip("\n")
        if not text.strip():
            continue
        lines.append(text)
        metadata.append({"file": filepath, "line_num": line_num, "text": text})

    if not lines:
        return np.empty((0, 0), dtype=np.float32), []

    embeddings = np.array(list(_get_model(model_name).embed(lines)), dtype=np.float32)
    faiss.normalize_L2(embeddings)

    if use_cache:
        _save_cache(filepath, model_name, embeddings, metadata)

    return embeddings, metadata
@@ -0,0 +1,63 @@
1
+ """Search: embed query, FAISS lookup, return ranked results."""
2
+
3
+ from dataclasses import dataclass
4
+
5
+ import faiss
6
+ import numpy as np
7
+
8
+ from .index import DEFAULT_MODEL, build_index, _get_model
9
+
10
+
11
@dataclass
class Result:
    """One search hit: a matched file line plus its similarity score."""

    file: str  # path of the file containing the match (as passed by caller)
    line_num: int  # 1-based line number within the file
    text: str  # the matched line, trailing newline stripped
    score: float  # FAISS inner-product score; embeddings are L2-normalized
17
+
18
+
19
def search(
    query: str,
    filepaths: list[str],
    top_k: int = 5,
    max_distance: float | None = None,
    model_name: str = DEFAULT_MODEL,
    reindex: bool = False,
    no_cache: bool = False,
) -> list[Result]:
    """Rank lines across *filepaths* by semantic similarity to *query*.

    Per-file line embeddings are built (or loaded from cache) via
    build_index, stacked into one FAISS inner-product index, and the
    embedded query is searched against it. Returns at most *top_k*
    Result objects. NOTE: despite its name, *max_distance* acts as a
    *minimum* similarity threshold — hits scoring below it are dropped.
    """
    embeddings_per_file: list[np.ndarray] = []
    metadata: list[dict] = []

    for path in filepaths:
        vectors, meta = build_index(path, model_name, reindex=reindex, no_cache=no_cache)
        if vectors.size:
            embeddings_per_file.append(vectors)
            metadata.extend(meta)

    if not embeddings_per_file:
        return []

    matrix = np.vstack(embeddings_per_file)
    index = faiss.IndexFlatIP(matrix.shape[1])
    index.add(matrix)

    query_vec = np.array(list(_get_model(model_name).embed([query])), dtype=np.float32)
    faiss.normalize_L2(query_vec)

    # Never ask FAISS for more neighbors than there are indexed lines.
    scores, ids = index.search(query_vec, min(top_k, len(metadata)))

    hits: list[Result] = []
    for score, idx in zip(scores[0], ids[0]):
        if idx == -1:
            continue  # FAISS pads with -1 when fewer than k neighbors exist
        if max_distance is not None and score < max_distance:
            continue
        entry = metadata[idx]
        hits.append(
            Result(
                file=entry["file"],
                line_num=entry["line_num"],
                text=entry["text"],
                score=float(score),
            )
        )
    return hits