embgrep 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- embgrep/__init__.py +79 -0
- embgrep/__main__.py +141 -0
- embgrep/chunker.py +205 -0
- embgrep/db.py +159 -0
- embgrep/embedder.py +60 -0
- embgrep/indexer.py +237 -0
- embgrep/mcp_server.py +119 -0
- embgrep-0.1.0.dist-info/METADATA +194 -0
- embgrep-0.1.0.dist-info/RECORD +12 -0
- embgrep-0.1.0.dist-info/WHEEL +4 -0
- embgrep-0.1.0.dist-info/entry_points.txt +3 -0
- embgrep-0.1.0.dist-info/licenses/LICENSE +21 -0
embgrep/__init__.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""embgrep — Local semantic search, embedding-powered grep for files."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from embgrep.indexer import EmbGrep, IndexStatus, SearchResult
|
|
6
|
+
|
|
7
|
+
# Public API of the package: the three classes imported from embgrep.indexer
# plus the module-level convenience functions defined in this file.
__all__ = ["EmbGrep", "IndexStatus", "SearchResult", "index", "search", "status", "update"]
# Version string of this package release.
__version__ = "0.1.0"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def index(directory: str, patterns: list[str] | None = None, db_path: str | None = None) -> dict:
    """Index the files under a directory using a short-lived EmbGrep instance.

    The instance is created for this call only and is closed before the
    function returns, whether or not indexing succeeded.

    Args:
        directory: Path to the directory to index.
        patterns: Optional list of glob patterns to filter files.
        db_path: Optional path to the SQLite database. A falsy value lets
            EmbGrep pick its own default location.

    Returns:
        The dictionary produced by ``EmbGrep.index`` (files_indexed,
        chunks_created, index_size_mb).
    """
    # Only forward db_path when the caller actually supplied one.
    engine_kwargs = {"db_path": db_path} if db_path else {}
    engine = EmbGrep(**engine_kwargs)
    try:
        outcome = engine.index(directory, patterns=patterns)
    finally:
        engine.close()
    return outcome
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def search(
    query: str, top_k: int = 5, path_filter: str | None = None, db_path: str | None = None
) -> list[SearchResult]:
    """Run a semantic search using a short-lived EmbGrep instance.

    The instance is created for this call only and is closed before the
    function returns, whether or not the search succeeded.

    Args:
        query: Natural language search query.
        top_k: Number of results to return.
        path_filter: Optional LIKE pattern to filter by file path.
        db_path: Optional path to the SQLite database. A falsy value lets
            EmbGrep pick its own default location.

    Returns:
        The list of SearchResult objects produced by ``EmbGrep.search``.
    """
    # Only forward db_path when the caller actually supplied one.
    engine_kwargs = {"db_path": db_path} if db_path else {}
    engine = EmbGrep(**engine_kwargs)
    try:
        hits = engine.search(query, top_k=top_k, path_filter=path_filter)
    finally:
        engine.close()
    return hits
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def status(db_path: str | None = None) -> IndexStatus:
    """Report index statistics using a short-lived EmbGrep instance.

    The instance is created for this call only and is closed before the
    function returns.

    Args:
        db_path: Optional path to the SQLite database. A falsy value lets
            EmbGrep pick its own default location.

    Returns:
        The IndexStatus produced by ``EmbGrep.status``.
    """
    # Only forward db_path when the caller actually supplied one.
    engine_kwargs = {"db_path": db_path} if db_path else {}
    engine = EmbGrep(**engine_kwargs)
    try:
        snapshot = engine.status()
    finally:
        engine.close()
    return snapshot
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def update(db_path: str | None = None) -> dict:
    """Incrementally refresh the index using a short-lived EmbGrep instance.

    The instance is created for this call only and is closed before the
    function returns.

    Args:
        db_path: Optional path to the SQLite database. A falsy value lets
            EmbGrep pick its own default location.

    Returns:
        The dictionary produced by ``EmbGrep.update`` (updated_files,
        new_chunks, removed_files).
    """
    # Only forward db_path when the caller actually supplied one.
    engine_kwargs = {"db_path": db_path} if db_path else {}
    engine = EmbGrep(**engine_kwargs)
    try:
        summary = engine.update()
    finally:
        engine.close()
    return summary
|
embgrep/__main__.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""CLI entry point for embgrep — embedding-powered grep for files."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def main() -> None:
    """Run the embgrep command-line interface.

    click and rich are imported lazily. If either is missing, an installation
    hint is written to stderr and the process exits with status 1. Otherwise a
    click group with the ``index``, ``search``, ``status`` and ``update``
    subcommands is built and invoked.
    """
    try:
        import click
        from rich.console import Console
        from rich.markup import escape
        from rich.table import Table
    except ImportError:
        # Diagnostics belong on stderr so they do not pollute piped output.
        print("CLI requires extra dependencies: pip install embgrep[cli]", file=sys.stderr)
        sys.exit(1)

    console = Console()

    @click.group()
    @click.version_option(package_name="embgrep")
    def cli() -> None:
        """embgrep — Local semantic search, embedding-powered grep for files."""

    @cli.command()
    @click.argument("path", type=click.Path(exists=True))
    @click.option("--patterns", "-p", default=None, help="Comma-separated glob patterns (e.g., '*.md,*.py').")
    @click.option("--db-path", default=None, help="Path to SQLite database.")
    @click.option("--model", default="BAAI/bge-small-en-v1.5", help="Embedding model name.")
    def index(path: str, patterns: str | None, db_path: str | None, model: str) -> None:
        """Index files in PATH for semantic search."""
        from embgrep.indexer import EmbGrep

        # Drop empty entries (e.g. from a trailing comma) so they are never
        # handed on as glob patterns.
        pattern_list = [p.strip() for p in patterns.split(",") if p.strip()] if patterns else None

        kwargs: dict = {"model": model}
        if db_path:
            kwargs["db_path"] = db_path

        eg = EmbGrep(**kwargs)
        try:
            with console.status("[bold green]Indexing files..."):
                result = eg.index(path, patterns=pattern_list)
            console.print(f"[green]Indexed {result['files_indexed']} files, {result['chunks_created']} chunks[/green]")
            console.print(f"Index size: {result['index_size_mb']} MB")
        finally:
            eg.close()

    @cli.command()
    @click.argument("query")
    @click.option("--top-k", "-k", default=5, help="Number of results to return.")
    @click.option("--path-filter", "-f", default=None, help="SQL LIKE pattern for file path filter.")
    @click.option("--db-path", default=None, help="Path to SQLite database.")
    @click.option("--model", default="BAAI/bge-small-en-v1.5", help="Embedding model name.")
    def search(query: str, top_k: int, path_filter: str | None, db_path: str | None, model: str) -> None:
        """Semantic search across indexed files."""
        from embgrep.indexer import EmbGrep

        kwargs: dict = {"model": model}
        if db_path:
            kwargs["db_path"] = db_path

        eg = EmbGrep(**kwargs)
        try:
            with console.status("[bold green]Searching..."):
                results = eg.search(query, top_k=top_k, path_filter=path_filter)

            if not results:
                console.print("[yellow]No results found.[/yellow]")
                return

            # The query, file paths and chunk text are arbitrary user/file
            # content. Escape them so square brackets (ubiquitous in code) are
            # shown literally instead of being parsed as Rich markup tags.
            table = Table(title=f"Search: {escape(repr(query))}", show_lines=True)
            table.add_column("#", style="dim", width=3)
            table.add_column("Score", style="cyan", width=7)
            table.add_column("File", style="green")
            table.add_column("Lines", style="yellow", width=10)
            table.add_column("Preview", max_width=60)

            for i, r in enumerate(results, 1):
                preview = r.chunk_text[:120].replace("\n", " ").strip()
                if len(r.chunk_text) > 120:
                    preview += "..."
                table.add_row(
                    str(i),
                    f"{r.score:.4f}",
                    escape(r.file_path),
                    f"{r.line_start}-{r.line_end}",
                    escape(preview),
                )

            console.print(table)
        finally:
            eg.close()

    @cli.command()
    @click.option("--db-path", default=None, help="Path to SQLite database.")
    def status(db_path: str | None) -> None:
        """Show index statistics."""
        from embgrep.indexer import EmbGrep

        kwargs: dict = {}
        if db_path:
            kwargs["db_path"] = db_path

        eg = EmbGrep(**kwargs)
        try:
            st = eg.status()
            console.print("[bold]embgrep Index Status[/bold]")
            console.print(f"  Files:         {st.total_files}")
            console.print(f"  Chunks:        {st.total_chunks}")
            console.print(f"  Last updated:  {st.last_updated}")
            console.print(f"  Index size:    {st.index_size_mb} MB")
        finally:
            eg.close()

    @cli.command()
    @click.option("--db-path", default=None, help="Path to SQLite database.")
    @click.option("--model", default="BAAI/bge-small-en-v1.5", help="Embedding model name.")
    def update(db_path: str | None, model: str) -> None:
        """Incremental update — re-index changed files only."""
        from embgrep.indexer import EmbGrep

        kwargs: dict = {"model": model}
        if db_path:
            kwargs["db_path"] = db_path

        eg = EmbGrep(**kwargs)
        try:
            with console.status("[bold green]Updating index..."):
                result = eg.update()
            console.print(f"[green]Updated {result['updated_files']} files, {result['new_chunks']} new chunks[/green]")
            if result["removed_files"]:
                console.print(f"[yellow]Removed {result['removed_files']} deleted files[/yellow]")
        finally:
            eg.close()

    cli()
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
if __name__ == "__main__":
|
|
141
|
+
main()
|
embgrep/chunker.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""File chunking for embgrep — split files into semantically meaningful chunks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
# File extensions grouped by chunking strategy
# Source files: chunk_file splits these at definition boundaries.
_CODE_EXTENSIONS = {".py", ".js", ".ts", ".java", ".go", ".rs"}
# Document files: chunk_file splits these by headings or blank-line groups.
_DOC_EXTENSIONS = {".md", ".txt"}
# Config and shell files get no dedicated branch in chunk_file, so they are
# split by size only.
_CONFIG_EXTENSIONS = {".yaml", ".yml", ".json", ".toml", ".cfg", ".ini"}
_SHELL_EXTENSIONS = {".sh", ".bash"}

# Union of every extension the chunker recognises.
SUPPORTED_EXTENSIONS = _CODE_EXTENSIONS | _DOC_EXTENSIONS | _CONFIG_EXTENSIONS | _SHELL_EXTENSIONS

# Regex patterns for detecting function/class boundaries per language
# Every pattern is anchored at the start of a line, so only definitions that
# begin in column 0 are treated as boundaries; indented ones are not.
_CODE_BOUNDARY_PATTERNS: dict[str, re.Pattern[str]] = {
    ".py": re.compile(r"^(def |class |async def )", re.MULTILINE),
    ".js": re.compile(r"^(function |class |export |const \w+ = )", re.MULTILINE),
    ".ts": re.compile(r"^(function |class |export |const \w+ = )", re.MULTILINE),
    ".java": re.compile(r"^(public |private |class )", re.MULTILINE),
    ".go": re.compile(r"^(func |type )", re.MULTILINE),
    ".rs": re.compile(r"^(fn |pub fn |pub struct |struct |impl |pub impl )", re.MULTILINE),
}

# Markdown heading pattern
# One to six '#' characters followed by whitespace, at the start of a line.
_MD_HEADING = re.compile(r"^#{1,6}\s", re.MULTILINE)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def chunk_file(file_path: str, max_chunk_size: int = 1000) -> list[dict]:
    """Read a file and break it into chunks suitable for embedding.

    The file is decoded as UTF-8, falling back to latin-1 when that fails.
    A missing file, a file that cannot be read in the fallback encoding, or a
    file containing only whitespace produces an empty list.

    Args:
        file_path: Path to the file to chunk.
        max_chunk_size: Character budget passed on to the chunking helpers.

    Returns:
        List of dicts with keys ``text``, ``line_start`` and ``line_end``.
        Chunks whose text is only whitespace are dropped.

    Strategy (chosen by lower-cased file extension):
        - code extensions: split at function/class definitions
        - document extensions: split at headings or blank-line groups
        - anything else: split by size
    """
    source = Path(file_path)

    content = None
    try:
        content = source.read_text(encoding="utf-8")
    except FileNotFoundError:
        return []
    except UnicodeDecodeError:
        # Not valid UTF-8; retried below with latin-1.
        pass

    if content is None:
        try:
            content = source.read_text(encoding="latin-1")
        except Exception:
            return []

    if not content.strip():
        return []

    lines = content.splitlines(keepends=True)
    ext = source.suffix.lower()

    if ext in _CODE_EXTENSIONS or ext in _DOC_EXTENSIONS:
        # Code takes precedence over docs, mirroring an if/elif chain.
        splitter = _chunk_code if ext in _CODE_EXTENSIONS else _chunk_docs
        pieces = splitter(lines, ext, max_chunk_size)
    else:
        pieces = _chunk_fixed(lines, max_chunk_size)

    # Discard whitespace-only chunks.
    return [piece for piece in pieces if piece["text"].strip()]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _chunk_code(lines: list[str], suffix: str, max_chunk_size: int) -> list[dict]:
    """Split source code into one chunk per top-level definition.

    Lines matching the boundary regex registered for ``suffix`` start a new
    chunk. Any non-blank text before the first boundary becomes its own
    leading chunk. When the suffix has no registered regex, or no line
    matches, the file is chunked by size instead.

    Args:
        lines: File content as lines with their line endings kept.
        suffix: Lower-cased file extension, including the dot.
        max_chunk_size: Character budget used only by the size-based fallback.

    Returns:
        List of dicts with ``text``, ``line_start`` and ``line_end``
        (1-based, inclusive line numbers).
    """
    boundary_re = _CODE_BOUNDARY_PATTERNS.get(suffix)
    starts: list[int] = (
        [] if boundary_re is None else [idx for idx, text in enumerate(lines) if boundary_re.match(text)]
    )
    if not starts:
        return _chunk_fixed(lines, max_chunk_size)

    pieces: list[dict] = []

    # Imports, module docstring, etc. that precede the first definition.
    preamble = "".join(lines[: starts[0]])
    if preamble.strip():
        pieces.append({"text": preamble, "line_start": 1, "line_end": starts[0]})

    # Each definition runs up to the next boundary, or to the end of the file.
    stops = starts[1:] + [len(lines)]
    for begin, stop in zip(starts, stops):
        body = "".join(lines[begin:stop])
        if body.strip():
            pieces.append({"text": body, "line_start": begin + 1, "line_end": stop})

    return pieces
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _chunk_docs(lines: list[str], suffix: str, max_chunk_size: int) -> list[dict]:
    """Chunk a document file: markdown by headings, everything else by blank lines.

    Args:
        lines: File content as lines with their line endings kept.
        suffix: Lower-cased file extension, including the dot.
        max_chunk_size: Character budget passed on to the chosen splitter.

    Returns:
        List of chunk dicts produced by the selected splitter.
    """
    splitter = _chunk_markdown if suffix == ".md" else _chunk_by_blank_lines
    return splitter(lines, max_chunk_size)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _chunk_markdown(lines: list[str], max_chunk_size: int) -> list[dict]:
    """Split markdown by heading boundaries.

    A line matching ``_MD_HEADING`` starts a new chunk, unless it lies inside
    a fenced code block (``` or ~~~). Inside a fence, lines such as
    ``# install dependencies`` are code comments, not headings, and splitting
    there would tear code samples apart. Non-blank text before the first
    heading becomes its own leading chunk. When no heading is found the text
    is split by blank-line groups instead.

    Args:
        lines: File content as lines with their line endings kept.
        max_chunk_size: Character budget used only by the blank-line fallback.

    Returns:
        List of dicts with ``text``, ``line_start`` and ``line_end``
        (1-based, inclusive line numbers).
    """
    # Fence opener/closer: up to three spaces of indent, then 3+ backticks or tildes.
    fence_re = re.compile(r"^ {0,3}(`{3,}|~{3,})")

    boundaries: list[int] = []
    open_fence: str | None = None  # marker of the currently open fence, if any
    for i, line in enumerate(lines):
        fence = fence_re.match(line)
        if fence is not None:
            marker = fence.group(1)
            rest = line[fence.end():]
            if open_fence is None:
                # A backtick fence may not carry backticks in its info string;
                # such a line is inline code, not a fence opener.
                if not (marker[0] == "`" and "`" in rest):
                    open_fence = marker
                    continue
            elif marker[0] == open_fence[0] and len(marker) >= len(open_fence) and not rest.strip():
                # Closing fence: same character, at least as long, nothing after it.
                open_fence = None
                continue
        if open_fence is None and _MD_HEADING.match(line):
            boundaries.append(i)

    if not boundaries:
        return _chunk_by_blank_lines(lines, max_chunk_size)

    chunks: list[dict] = []

    # Lines before the first heading
    if boundaries[0] > 0:
        text = "".join(lines[: boundaries[0]])
        if text.strip():
            chunks.append({"text": text, "line_start": 1, "line_end": boundaries[0]})

    # Each heading owns the lines up to the next heading (or end of file).
    for i, start in enumerate(boundaries):
        end = boundaries[i + 1] if i + 1 < len(boundaries) else len(lines)
        text = "".join(lines[start:end])
        if text.strip():
            chunks.append({"text": text, "line_start": start + 1, "line_end": end})

    return chunks
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _chunk_by_blank_lines(lines: list[str], max_chunk_size: int) -> list[dict]:
|
|
142
|
+
"""Split text by groups of blank lines."""
|
|
143
|
+
chunks: list[dict] = []
|
|
144
|
+
current_lines: list[str] = []
|
|
145
|
+
start_line = 0
|
|
146
|
+
|
|
147
|
+
for i, line in enumerate(lines):
|
|
148
|
+
if not line.strip() and current_lines:
|
|
149
|
+
# End of a paragraph
|
|
150
|
+
text = "".join(current_lines)
|
|
151
|
+
if len(text) > max_chunk_size:
|
|
152
|
+
# Split large paragraphs
|
|
153
|
+
sub_chunks = _split_text_fixed(current_lines, start_line, max_chunk_size)
|
|
154
|
+
chunks.extend(sub_chunks)
|
|
155
|
+
else:
|
|
156
|
+
chunks.append({"text": text, "line_start": start_line + 1, "line_end": i})
|
|
157
|
+
current_lines = []
|
|
158
|
+
start_line = i + 1
|
|
159
|
+
else:
|
|
160
|
+
if not current_lines:
|
|
161
|
+
start_line = i
|
|
162
|
+
current_lines.append(line)
|
|
163
|
+
|
|
164
|
+
# Remaining lines
|
|
165
|
+
if current_lines:
|
|
166
|
+
text = "".join(current_lines)
|
|
167
|
+
if text.strip():
|
|
168
|
+
chunks.append({"text": text, "line_start": start_line + 1, "line_end": len(lines)})
|
|
169
|
+
|
|
170
|
+
return chunks
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _chunk_fixed(lines: list[str], max_chunk_size: int) -> list[dict]:
    """Chunk a whole file by size, numbering lines from the top of the file.

    Args:
        lines: File content as lines with their line endings kept.
        max_chunk_size: Approximate maximum number of characters per chunk.

    Returns:
        List of chunk dicts from ``_split_text_fixed`` with a zero line offset.
    """
    return _split_text_fixed(lines, offset=0, max_chunk_size=max_chunk_size)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _split_text_fixed(lines: list[str], offset: int, max_chunk_size: int) -> list[dict]:
|
|
179
|
+
"""Split a list of lines into chunks of approximately max_chunk_size characters."""
|
|
180
|
+
chunks: list[dict] = []
|
|
181
|
+
current_lines: list[str] = []
|
|
182
|
+
current_size = 0
|
|
183
|
+
start_line = offset
|
|
184
|
+
|
|
185
|
+
for i, line in enumerate(lines):
|
|
186
|
+
if current_size + len(line) > max_chunk_size and current_lines:
|
|
187
|
+
text = "".join(current_lines)
|
|
188
|
+
chunks.append({"text": text, "line_start": start_line + 1, "line_end": offset + i})
|
|
189
|
+
current_lines = [line]
|
|
190
|
+
current_size = len(line)
|
|
191
|
+
start_line = offset + i
|
|
192
|
+
else:
|
|
193
|
+
current_lines.append(line)
|
|
194
|
+
current_size += len(line)
|
|
195
|
+
|
|
196
|
+
if current_lines:
|
|
197
|
+
text = "".join(current_lines)
|
|
198
|
+
if text.strip():
|
|
199
|
+
chunks.append({
|
|
200
|
+
"text": text,
|
|
201
|
+
"line_start": start_line + 1,
|
|
202
|
+
"line_end": offset + len(lines),
|
|
203
|
+
})
|
|
204
|
+
|
|
205
|
+
return chunks
|
embgrep/db.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""SQLite storage for embgrep — files and chunks tables."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import sqlite3
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
# Schema applied on every connection: one row per indexed file and one row per
# embedded chunk. Chunks reference their file with ON DELETE CASCADE.
_SCHEMA = """
CREATE TABLE IF NOT EXISTS indexed_files (
    id INTEGER PRIMARY KEY,
    file_path TEXT UNIQUE NOT NULL,
    file_hash TEXT NOT NULL,
    indexed_at TEXT DEFAULT CURRENT_TIMESTAMP,
    chunk_count INTEGER DEFAULT 0
);

CREATE TABLE IF NOT EXISTS chunks (
    id INTEGER PRIMARY KEY,
    file_id INTEGER NOT NULL,
    chunk_text TEXT NOT NULL,
    line_start INTEGER NOT NULL,
    line_end INTEGER NOT NULL,
    embedding BLOB NOT NULL,
    FOREIGN KEY (file_id) REFERENCES indexed_files(id) ON DELETE CASCADE
);

CREATE INDEX IF NOT EXISTS idx_chunks_file_id ON chunks(file_id);
CREATE INDEX IF NOT EXISTS idx_files_path ON indexed_files(file_path);
"""

# Default database location; "~" is expanded when the Database is opened.
DEFAULT_DB_PATH = "~/.local/share/embgrep/embgrep.db"


class Database:
    """SQLite database wrapper for embgrep index storage.

    Opening a Database creates the parent directory if needed, connects,
    switches the connection to WAL journaling, enables foreign-key
    enforcement (required for the chunk cascade) and applies the schema.
    Every mutating method commits immediately.
    """

    def __init__(self, db_path: str = DEFAULT_DB_PATH) -> None:
        """Open (and if necessary create) the database at ``db_path``.

        Args:
            db_path: Path to the SQLite file; a leading ``~`` is expanded.
        """
        self.db_path = os.path.expanduser(db_path)
        Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
        self._conn = sqlite3.connect(self.db_path)
        self._conn.execute("PRAGMA journal_mode=WAL")
        # Foreign keys are off by default in SQLite; ON DELETE CASCADE needs them.
        self._conn.execute("PRAGMA foreign_keys=ON")
        self._conn.executescript(_SCHEMA)
        self._conn.commit()

    def close(self) -> None:
        """Close the database connection."""
        self._conn.close()

    # --- indexed_files operations ---

    def insert_file(self, file_path: str, file_hash: str) -> int:
        """Insert a file record with a chunk count of 0 and return its id."""
        cur = self._conn.execute(
            "INSERT INTO indexed_files (file_path, file_hash, chunk_count) VALUES (?, ?, 0)",
            (file_path, file_hash),
        )
        self._conn.commit()
        return cur.lastrowid  # type: ignore[return-value]

    def get_file(self, file_path: str) -> tuple[int, str, str, str, int] | None:
        """Get file record by path. Returns (id, file_path, file_hash, indexed_at, chunk_count) or None."""
        cur = self._conn.execute("SELECT id, file_path, file_hash, indexed_at, chunk_count FROM indexed_files WHERE file_path = ?", (file_path,))
        return cur.fetchone()

    def get_all_files(self) -> list[tuple[int, str, str, str, int]]:
        """Get all file records as (id, file_path, file_hash, indexed_at, chunk_count) tuples."""
        cur = self._conn.execute("SELECT id, file_path, file_hash, indexed_at, chunk_count FROM indexed_files")
        return cur.fetchall()

    def update_file_hash(self, file_id: int, file_hash: str) -> None:
        """Update file hash and reset indexed_at to the current timestamp."""
        self._conn.execute(
            "UPDATE indexed_files SET file_hash = ?, indexed_at = CURRENT_TIMESTAMP WHERE id = ?",
            (file_hash, file_id),
        )
        self._conn.commit()

    def update_chunk_count(self, file_id: int, count: int) -> None:
        """Update the chunk count for a file."""
        self._conn.execute("UPDATE indexed_files SET chunk_count = ? WHERE id = ?", (count, file_id))
        self._conn.commit()

    def delete_file(self, file_id: int) -> None:
        """Delete a file and its chunks (cascading)."""
        self._conn.execute("DELETE FROM indexed_files WHERE id = ?", (file_id,))
        self._conn.commit()

    def file_count(self) -> int:
        """Get total number of indexed files."""
        cur = self._conn.execute("SELECT COUNT(*) FROM indexed_files")
        return cur.fetchone()[0]

    # --- chunks operations ---

    def insert_chunks(self, file_id: int, chunks: list[tuple[str, int, int, bytes]]) -> int:
        """Insert multiple chunks for a file. Each chunk is (text, line_start, line_end, embedding_blob).

        Returns the number of chunks inserted.
        """
        self._conn.executemany(
            "INSERT INTO chunks (file_id, chunk_text, line_start, line_end, embedding) VALUES (?, ?, ?, ?, ?)",
            [(file_id, text, ls, le, emb) for text, ls, le, emb in chunks],
        )
        self._conn.commit()
        return len(chunks)

    def delete_chunks_for_file(self, file_id: int) -> None:
        """Delete all chunks for a file."""
        self._conn.execute("DELETE FROM chunks WHERE file_id = ?", (file_id,))
        self._conn.commit()

    def get_chunks(
        self, path_filter: str | None = None
    ) -> list[tuple[int, str, str, int, int, bytes]]:
        """Get chunks with optional path filter.

        Args:
            path_filter: SQL LIKE pattern matched against the file path; a
                falsy value returns every chunk.

        Returns list of (chunk_id, file_path, chunk_text, line_start, line_end, embedding_blob).
        """
        if path_filter:
            cur = self._conn.execute(
                """
                SELECT c.id, f.file_path, c.chunk_text, c.line_start, c.line_end, c.embedding
                FROM chunks c
                JOIN indexed_files f ON c.file_id = f.id
                WHERE f.file_path LIKE ?
                """,
                (path_filter,),
            )
        else:
            cur = self._conn.execute(
                """
                SELECT c.id, f.file_path, c.chunk_text, c.line_start, c.line_end, c.embedding
                FROM chunks c
                JOIN indexed_files f ON c.file_id = f.id
                """
            )
        return cur.fetchall()

    def chunk_count(self) -> int:
        """Get total number of chunks."""
        cur = self._conn.execute("SELECT COUNT(*) FROM chunks")
        return cur.fetchone()[0]

    # --- statistics ---

    def last_updated(self) -> str | None:
        """Get the most recent indexed_at timestamp, or None when no file is indexed."""
        cur = self._conn.execute("SELECT MAX(indexed_at) FROM indexed_files")
        row = cur.fetchone()
        return row[0] if row else None

    def db_size_mb(self) -> float:
        """Get the on-disk size of the database in MB.

        The connection uses WAL journaling, so committed data can still live
        in the ``-wal`` sidecar file until it is checkpointed into the main
        file. Both files are counted; otherwise the size reported right after
        indexing would miss most of the freshly written data. Files that do
        not exist (including non-file databases) contribute 0.
        """
        total_bytes = 0
        for suffix in ("", "-wal"):
            try:
                total_bytes += os.path.getsize(self.db_path + suffix)
            except OSError:
                continue
        return total_bytes / (1024 * 1024)
|
embgrep/embedder.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""fastembed wrapper for embgrep — lazy-loading embedding model."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from fastembed import TextEmbedding
|
|
11
|
+
|
|
12
|
+
DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"
EMBEDDING_DIM = 384


class Embedder:
    """Thin, lazily initialised wrapper around fastembed's ``TextEmbedding``.

    fastembed is imported and the model object is constructed only when the
    first embedding is requested, so creating an Embedder is cheap.
    """

    def __init__(self, model: str = DEFAULT_MODEL) -> None:
        """Remember the model name; no model is loaded yet."""
        self._model_name = model
        self._model: TextEmbedding | None = None

    def _ensure_model(self) -> TextEmbedding:
        """Return the fastembed model, constructing it on the first call."""
        if self._model is not None:
            return self._model

        from fastembed import TextEmbedding

        self._model = TextEmbedding(self._model_name)
        return self._model

    def embed(self, texts: list[str]) -> list[np.ndarray]:
        """Embed a batch of texts.

        Args:
            texts: Strings to embed. An empty list short-circuits without
                touching the model.

        Returns:
            One float32 numpy array per input text, in input order.
        """
        if not texts:
            return []
        backend = self._ensure_model()
        vectors = list(backend.embed(texts))
        return [vector.astype(np.float32) for vector in vectors]

    def embed_query(self, query: str) -> np.ndarray:
        """Embed a single query string.

        Args:
            query: The query text to embed.

        Returns:
            The float32 numpy array for the query.
        """
        backend = self._ensure_model()
        vectors = list(backend.embed([query]))
        first = vectors[0]
        return first.astype(np.float32)
|
embgrep/indexer.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
"""EmbGrep — main orchestrator for indexing and semantic search."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import os
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
from embgrep.chunker import SUPPORTED_EXTENSIONS, chunk_file
|
|
13
|
+
from embgrep.db import DEFAULT_DB_PATH, Database
|
|
14
|
+
from embgrep.embedder import EMBEDDING_DIM, Embedder
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class SearchResult:
    """A single search result with file location and similarity score."""

    # Path of the indexed file the chunk came from.
    file_path: str
    # Full text of the matching chunk.
    chunk_text: str
    # Similarity between the query embedding and the chunk embedding.
    score: float
    # First line of the chunk in the file, as recorded at indexing time.
    line_start: int
    # Last line of the chunk in the file, as recorded at indexing time.
    line_end: int
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class IndexStatus:
    """Statistics about the current embgrep index."""

    # Number of files recorded in the index.
    total_files: int
    # Number of chunks recorded in the index.
    total_chunks: int
    # Timestamp of the most recent indexing activity.
    # NOTE(review): Database.last_updated() can return None for an empty
    # index — confirm how the producer of this value handles that case.
    last_updated: str
    # Size of the index database in megabytes.
    index_size_mb: float
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _file_hash(file_path: str) -> str:
|
|
39
|
+
"""Compute SHA-256 hash of file contents."""
|
|
40
|
+
h = hashlib.sha256()
|
|
41
|
+
with open(file_path, "rb") as f:
|
|
42
|
+
for chunk in iter(lambda: f.read(8192), b""):
|
|
43
|
+
h.update(chunk)
|
|
44
|
+
return h.hexdigest()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _collect_files(directory: str, patterns: list[str] | None = None) -> list[str]:
|
|
48
|
+
"""Collect files from directory matching patterns or supported extensions."""
|
|
49
|
+
root = Path(directory).resolve()
|
|
50
|
+
if not root.is_dir():
|
|
51
|
+
msg = f"Directory not found: {directory}"
|
|
52
|
+
raise FileNotFoundError(msg)
|
|
53
|
+
|
|
54
|
+
if patterns:
|
|
55
|
+
files: list[str] = []
|
|
56
|
+
for pattern in patterns:
|
|
57
|
+
files.extend(str(p) for p in root.rglob(pattern) if p.is_file())
|
|
58
|
+
return sorted(set(files))
|
|
59
|
+
|
|
60
|
+
# Default: collect files with supported extensions
|
|
61
|
+
files = []
|
|
62
|
+
for p in root.rglob("*"):
|
|
63
|
+
if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS:
|
|
64
|
+
files.append(str(p))
|
|
65
|
+
return sorted(files)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class EmbGrep:
|
|
69
|
+
"""Main orchestrator for embedding-powered file search.
|
|
70
|
+
|
|
71
|
+
Args:
|
|
72
|
+
db_path: Path to the SQLite database file.
|
|
73
|
+
model: Name of the fastembed model to use.
|
|
74
|
+
"""
|
|
75
|
+
|
|
76
|
+
def __init__(
    self,
    db_path: str = DEFAULT_DB_PATH,
    model: str = "BAAI/bge-small-en-v1.5",
) -> None:
    """Open the index database and prepare the embedder.

    Args:
        db_path: Path to the SQLite database file.
        model: Name of the fastembed model to use.
    """
    # Storage backend for file and chunk records.
    self._db = Database(db_path)
    # Embedding wrapper for the requested model name.
    self._embedder = Embedder(model)
|
|
83
|
+
|
|
84
|
+
def index(self, directory: str, patterns: list[str] | None = None) -> dict:
    """Index every matching file under a directory.

    Each collected file is hashed. Files whose stored hash is unchanged are
    skipped; files whose hash differs have their old record and chunks removed
    before being indexed again.

    Args:
        directory: Path to the directory to index.
        patterns: Optional list of glob patterns to filter files (e.g., ["*.py", "*.md"]).

    Returns:
        Dict with keys: files_indexed, chunks_created, index_size_mb. Only
        files that produced at least one chunk are counted.
    """
    store = self._db
    # Number of chunks created for each file that was actually (re)indexed.
    per_file_counts: list[int] = []

    for path in _collect_files(directory, patterns):
        digest = _file_hash(path)
        record = store.get_file(path)

        if record:
            stale_id, _path, stored_digest = record[:3]
            if stored_digest == digest:
                # Content unchanged since the last run.
                continue
            # Content changed: drop the outdated chunks and file record.
            store.delete_chunks_for_file(stale_id)
            store.delete_file(stale_id)

        created = self._index_single_file(path, digest)
        if created > 0:
            per_file_counts.append(created)

    return {
        "files_indexed": len(per_file_counts),
        "chunks_created": sum(per_file_counts),
        "index_size_mb": round(store.db_size_mb(), 2),
    }
|
|
123
|
+
|
|
124
|
+
def _index_single_file(self, file_path: str, fhash: str) -> int:
    """Chunk, embed and store one file.

    Nothing is written to the database when the file yields no chunks.

    Args:
        file_path: Path of the file to index.
        fhash: Content hash to store with the file record.

    Returns:
        Number of chunks stored for the file (0 when it produced none).
    """
    pieces = chunk_file(file_path)
    if not pieces:
        return 0

    vectors = self._embedder.embed([piece["text"] for piece in pieces])

    file_id = self._db.insert_file(file_path, fhash)

    # strict=True raises if the embedder returned a different number of
    # vectors than there are chunks.
    rows = [
        (piece["text"], piece["line_start"], piece["line_end"], vector.astype(np.float32).tobytes())
        for piece, vector in zip(pieces, vectors, strict=True)
    ]

    self._db.insert_chunks(file_id, rows)
    stored = len(rows)
    self._db.update_chunk_count(file_id, stored)
    return stored
|
|
144
|
+
|
|
145
|
+
def search(self, query: str, top_k: int = 5, path_filter: str | None = None) -> list[SearchResult]:
    """Rank indexed chunks by cosine similarity to a query.

    Args:
        query: Natural language search query.
        top_k: Number of top results to return.
        path_filter: Optional SQL LIKE pattern to filter by file path (e.g., "%.py").

    Returns:
        List of SearchResult sorted by descending similarity score.
    """
    query_vec = self._embedder.embed_query(query)
    rows = self._db.get_chunks(path_filter=path_filter)
    if not rows:
        return []

    hits: list[SearchResult] = []
    for _row_id, path, text, first_line, last_line, blob in rows:
        vec = np.frombuffer(blob, dtype=np.float32)
        # Only vectors of the expected dimensionality are scored; others are ignored.
        if vec.shape[0] == EMBEDDING_DIM:
            similarity = _cosine_similarity(query_vec, vec)
            hits.append(
                SearchResult(
                    file_path=path,
                    chunk_text=text,
                    score=float(similarity),
                    line_start=first_line,
                    line_end=last_line,
                )
            )

    ranked = sorted(hits, key=lambda hit: hit.score, reverse=True)
    return ranked[:top_k]
|
|
179
|
+
|
|
180
|
+
def update(self) -> dict:
    """Incremental update — re-index changed files, remove deleted files.

    Walks every file row known to the database. Rows whose path is no longer
    a file on disk are dropped together with their chunks. Rows whose current
    hash differs from the stored hash are deleted and re-indexed.

    Returns:
        Dict with keys: updated_files, new_chunks, removed_files.
    """
    updated_files = 0
    new_chunks = 0
    removed_files = 0

    for file_id, file_path, stored_hash, _indexed_at, _chunk_count in self._db.get_all_files():
        if not os.path.isfile(file_path):
            # File was deleted: remove its chunks as well as the file row,
            # the same way the changed-file branch below does, so no chunks
            # of a vanished file are left behind.
            self._db.delete_chunks_for_file(file_id)
            self._db.delete_file(file_id)
            removed_files += 1
            continue

        current_hash = _file_hash(file_path)
        if current_hash == stored_hash:
            # File unchanged, nothing to do
            continue

        # File changed: discard the old rows, then rebuild from current content
        self._db.delete_chunks_for_file(file_id)
        self._db.delete_file(file_id)
        new_chunks += self._index_single_file(file_path, current_hash)
        updated_files += 1

    return {
        "updated_files": updated_files,
        "new_chunks": new_chunks,
        "removed_files": removed_files,
    }
|
|
212
|
+
|
|
213
|
+
def status(self) -> IndexStatus:
    """Summarise the current state of the index.

    Returns:
        IndexStatus populated from the database: file count, chunk count,
        last-updated value (or "never" when the database reports none) and
        database size in MB rounded to two decimals.
    """
    db = self._db
    files = db.file_count()
    chunks = db.chunk_count()
    stamp = db.last_updated()
    size_mb = round(db.db_size_mb(), 2)
    return IndexStatus(
        total_files=files,
        total_chunks=chunks,
        last_updated=stamp or "never",
        index_size_mb=size_mb,
    )
|
|
225
|
+
|
|
226
|
+
def close(self) -> None:
    """Close the database handle held by this instance."""
    db = self._db
    db.close()
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
|
|
232
|
+
"""Compute cosine similarity between two vectors."""
|
|
233
|
+
norm_a = np.linalg.norm(a)
|
|
234
|
+
norm_b = np.linalg.norm(b)
|
|
235
|
+
if norm_a == 0 or norm_b == 0:
|
|
236
|
+
return 0.0
|
|
237
|
+
return float(np.dot(a, b) / (norm_a * norm_b))
|
embgrep/mcp_server.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""FastMCP server for embgrep — 4 semantic search tools."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _create_server():
|
|
9
|
+
"""Create and configure the FastMCP server."""
|
|
10
|
+
try:
|
|
11
|
+
from fastmcp import FastMCP
|
|
12
|
+
except ImportError:
|
|
13
|
+
msg = "MCP server requires extra dependencies: pip install embgrep[mcp]"
|
|
14
|
+
raise ImportError(msg) # noqa: B904
|
|
15
|
+
|
|
16
|
+
from embgrep.indexer import EmbGrep
|
|
17
|
+
|
|
18
|
+
mcp = FastMCP("embgrep", instructions="Local semantic search — embedding-powered grep for files.")
|
|
19
|
+
|
|
20
|
+
def _get_embgrep() -> EmbGrep:
|
|
21
|
+
return EmbGrep()
|
|
22
|
+
|
|
23
|
+
@mcp.tool()
|
|
24
|
+
def index_directory(path: str, patterns: str = "*.md,*.py,*.txt") -> str:
|
|
25
|
+
"""Index files in a directory for semantic search.
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
path: Directory path to index.
|
|
29
|
+
patterns: Comma-separated glob patterns (default: "*.md,*.py,*.txt").
|
|
30
|
+
|
|
31
|
+
Returns:
|
|
32
|
+
JSON string with indexing results.
|
|
33
|
+
"""
|
|
34
|
+
pattern_list = [p.strip() for p in patterns.split(",")]
|
|
35
|
+
eg = _get_embgrep()
|
|
36
|
+
try:
|
|
37
|
+
result = eg.index(path, patterns=pattern_list)
|
|
38
|
+
return json.dumps(result, indent=2)
|
|
39
|
+
finally:
|
|
40
|
+
eg.close()
|
|
41
|
+
|
|
42
|
+
@mcp.tool()
|
|
43
|
+
def semantic_search(query: str, top_k: int = 5, path_filter: str | None = None) -> str:
|
|
44
|
+
"""Search indexed files using natural language.
|
|
45
|
+
|
|
46
|
+
Args:
|
|
47
|
+
query: Natural language search query.
|
|
48
|
+
top_k: Number of results to return (default: 5).
|
|
49
|
+
path_filter: Optional SQL LIKE pattern to filter by file path.
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
JSON string with search results.
|
|
53
|
+
"""
|
|
54
|
+
eg = _get_embgrep()
|
|
55
|
+
try:
|
|
56
|
+
results = eg.search(query, top_k=top_k, path_filter=path_filter)
|
|
57
|
+
return json.dumps(
|
|
58
|
+
[
|
|
59
|
+
{
|
|
60
|
+
"file_path": r.file_path,
|
|
61
|
+
"score": round(r.score, 4),
|
|
62
|
+
"line_start": r.line_start,
|
|
63
|
+
"line_end": r.line_end,
|
|
64
|
+
"chunk_text": r.chunk_text[:500],
|
|
65
|
+
}
|
|
66
|
+
for r in results
|
|
67
|
+
],
|
|
68
|
+
indent=2,
|
|
69
|
+
)
|
|
70
|
+
finally:
|
|
71
|
+
eg.close()
|
|
72
|
+
|
|
73
|
+
@mcp.tool()
|
|
74
|
+
def index_status() -> str:
|
|
75
|
+
"""Get current index statistics.
|
|
76
|
+
|
|
77
|
+
Returns:
|
|
78
|
+
JSON string with index status information.
|
|
79
|
+
"""
|
|
80
|
+
eg = _get_embgrep()
|
|
81
|
+
try:
|
|
82
|
+
st = eg.status()
|
|
83
|
+
return json.dumps(
|
|
84
|
+
{
|
|
85
|
+
"total_files": st.total_files,
|
|
86
|
+
"total_chunks": st.total_chunks,
|
|
87
|
+
"last_updated": st.last_updated,
|
|
88
|
+
"index_size_mb": st.index_size_mb,
|
|
89
|
+
},
|
|
90
|
+
indent=2,
|
|
91
|
+
)
|
|
92
|
+
finally:
|
|
93
|
+
eg.close()
|
|
94
|
+
|
|
95
|
+
@mcp.tool()
|
|
96
|
+
def update_index() -> str:
|
|
97
|
+
"""Incremental update — re-index changed files only (hash comparison).
|
|
98
|
+
|
|
99
|
+
Returns:
|
|
100
|
+
JSON string with update results.
|
|
101
|
+
"""
|
|
102
|
+
eg = _get_embgrep()
|
|
103
|
+
try:
|
|
104
|
+
result = eg.update()
|
|
105
|
+
return json.dumps(result, indent=2)
|
|
106
|
+
finally:
|
|
107
|
+
eg.close()
|
|
108
|
+
|
|
109
|
+
return mcp
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def main() -> None:
    """Build the embgrep MCP server and run it."""
    _create_server().run()


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: embgrep
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Local semantic search — embedding-powered grep for files, zero external services.
|
|
5
|
+
Project-URL: Homepage, https://github.com/QuartzUnit/embgrep
|
|
6
|
+
Project-URL: Repository, https://github.com/QuartzUnit/embgrep
|
|
7
|
+
Author: QuartzUnit
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: embeddings,grep,local,mcp,rag,semantic-search
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Typing :: Typed
|
|
18
|
+
Requires-Python: >=3.11
|
|
19
|
+
Requires-Dist: fastembed>=0.4
|
|
20
|
+
Requires-Dist: numpy>=1.24
|
|
21
|
+
Provides-Extra: all
|
|
22
|
+
Requires-Dist: click>=8.0; extra == 'all'
|
|
23
|
+
Requires-Dist: fastmcp>=2.0; extra == 'all'
|
|
24
|
+
Requires-Dist: rich>=13.0; extra == 'all'
|
|
25
|
+
Provides-Extra: cli
|
|
26
|
+
Requires-Dist: click>=8.0; extra == 'cli'
|
|
27
|
+
Requires-Dist: rich>=13.0; extra == 'cli'
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff>=0.8; extra == 'dev'
|
|
31
|
+
Provides-Extra: mcp
|
|
32
|
+
Requires-Dist: fastmcp>=2.0; extra == 'mcp'
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
# embgrep
|
|
36
|
+
|
|
37
|
+
**Local semantic search — embedding-powered grep for files, zero external services.**
|
|
38
|
+
|
|
39
|
+
[PyPI version](https://pypi.org/project/embgrep/)
|
|
40
|
+
[Python versions](https://pypi.org/project/embgrep/)
|
|
41
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
42
|
+
|
|
43
|
+
Search your codebase and documentation by *meaning*, not just keywords. embgrep indexes files into local embeddings and lets you run semantic queries — no API keys, no cloud services, no vector database servers.
|
|
44
|
+
|
|
45
|
+
## Features
|
|
46
|
+
|
|
47
|
+
- **Local embeddings** — Uses [fastembed](https://github.com/qdrant/fastembed) (ONNX Runtime), no API keys needed
|
|
48
|
+
- **SQLite storage** — Single-file index, no external vector DB
|
|
49
|
+
- **Incremental indexing** — Only re-indexes changed files (SHA-256 hash comparison)
|
|
50
|
+
- **Smart chunking** — Function-level splitting for code, heading-level for docs
|
|
51
|
+
- **MCP native** — 4-tool FastMCP server for LLM agent integration
|
|
52
|
+
- **15+ file types** — `.py`, `.js`, `.ts`, `.java`, `.go`, `.rs`, `.md`, `.txt`, `.yaml`, `.json`, `.toml`, and more
|
|
53
|
+
|
|
54
|
+
## Install
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install embgrep # core (fastembed + numpy)
|
|
58
|
+
pip install embgrep[cli] # + click/rich CLI
|
|
59
|
+
pip install embgrep[mcp] # + FastMCP server
|
|
60
|
+
pip install embgrep[all] # everything
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Quick Start
|
|
64
|
+
|
|
65
|
+
### Python API
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from embgrep import EmbGrep
|
|
69
|
+
|
|
70
|
+
eg = EmbGrep()
|
|
71
|
+
|
|
72
|
+
# Index a directory
|
|
73
|
+
eg.index("./my-project", patterns=["*.py", "*.md"])
|
|
74
|
+
|
|
75
|
+
# Semantic search
|
|
76
|
+
results = eg.search("database connection pooling", top_k=5)
|
|
77
|
+
for r in results:
|
|
78
|
+
print(f"{r.file_path}:{r.line_start}-{r.line_end} (score: {r.score:.4f})")
|
|
79
|
+
print(f" {r.chunk_text[:80]}...")
|
|
80
|
+
|
|
81
|
+
# Incremental update (only changed files)
|
|
82
|
+
eg.update()
|
|
83
|
+
|
|
84
|
+
# Index statistics
|
|
85
|
+
status = eg.status()
|
|
86
|
+
print(f"{status.total_files} files, {status.total_chunks} chunks, {status.index_size_mb} MB")
|
|
87
|
+
|
|
88
|
+
eg.close()
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### CLI
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
# Index a project
|
|
95
|
+
embgrep index ./my-project --patterns "*.py,*.md"
|
|
96
|
+
|
|
97
|
+
# Search
|
|
98
|
+
embgrep search "error handling patterns"
|
|
99
|
+
|
|
100
|
+
# Filter by file type
|
|
101
|
+
embgrep search "async database query" --path-filter "%.py"
|
|
102
|
+
|
|
103
|
+
# Check status
|
|
104
|
+
embgrep status
|
|
105
|
+
|
|
106
|
+
# Update changed files
|
|
107
|
+
embgrep update
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Convenience functions
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
import embgrep
|
|
114
|
+
|
|
115
|
+
embgrep.index("./src")
|
|
116
|
+
results = embgrep.search("authentication middleware")
|
|
117
|
+
status = embgrep.status()
|
|
118
|
+
embgrep.update()
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## MCP Server
|
|
122
|
+
|
|
123
|
+
Add to your Claude Desktop / MCP client configuration:
|
|
124
|
+
|
|
125
|
+
```json
|
|
126
|
+
{
|
|
127
|
+
"mcpServers": {
|
|
128
|
+
"embgrep": {
|
|
129
|
+
"command": "embgrep-mcp"
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
Or with uvx:
|
|
136
|
+
|
|
137
|
+
```json
|
|
138
|
+
{
|
|
139
|
+
"mcpServers": {
|
|
140
|
+
"embgrep": {
|
|
141
|
+
"command": "uvx",
|
|
142
|
+
"args": ["--from", "embgrep[mcp]", "embgrep-mcp"]
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### MCP Tools
|
|
149
|
+
|
|
150
|
+
| Tool | Description |
|
|
151
|
+
|------|-------------|
|
|
152
|
+
| `index_directory` | Index files in a directory for semantic search |
|
|
153
|
+
| `semantic_search` | Search indexed files using natural language |
|
|
154
|
+
| `index_status` | Get current index statistics |
|
|
155
|
+
| `update_index` | Incremental update — re-index changed files only |
|
|
156
|
+
|
|
157
|
+
## How It Works
|
|
158
|
+
|
|
159
|
+
1. **Chunking** — Files are split into semantically meaningful chunks:
|
|
160
|
+
- Code files (`.py`, `.js`, `.ts`, etc.): split by function/class boundaries
|
|
161
|
+
- Documents (`.md`, `.txt`): split by headings or paragraph breaks
|
|
162
|
+
- Config files: fixed-size chunking
|
|
163
|
+
|
|
164
|
+
2. **Embedding** — Each chunk is converted to a 384-dimensional vector using [BGE-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) via ONNX Runtime (no PyTorch needed)
|
|
165
|
+
|
|
166
|
+
3. **Storage** — Embeddings are stored as BLOBs in a local SQLite database
|
|
167
|
+
|
|
168
|
+
4. **Search** — Query text is embedded and compared against all chunks using cosine similarity
|
|
169
|
+
|
|
170
|
+
## Configuration
|
|
171
|
+
|
|
172
|
+
| Parameter | Default | Description |
|
|
173
|
+
|-----------|---------|-------------|
|
|
174
|
+
| `db_path` | `~/.local/share/embgrep/embgrep.db` | SQLite database location |
|
|
175
|
+
| `model` | `BAAI/bge-small-en-v1.5` | fastembed model name |
|
|
176
|
+
| `max_chunk_size` | 1000 chars | Maximum chunk size for fixed-size splitting |
|
|
177
|
+
| `top_k` | 5 | Number of search results |
|
|
178
|
+
|
|
179
|
+
## QuartzUnit Ecosystem
|
|
180
|
+
|
|
181
|
+
| Package | Description |
|
|
182
|
+
|---------|-------------|
|
|
183
|
+
| [markgrab](https://github.com/QuartzUnit/markgrab) | HTML/YouTube/PDF/DOCX to LLM-ready markdown |
|
|
184
|
+
| [snapgrab](https://github.com/QuartzUnit/snapgrab) | URL to screenshot + metadata |
|
|
185
|
+
| [docpick](https://github.com/QuartzUnit/docpick) | OCR + LLM document structure extraction |
|
|
186
|
+
| [browsegrab](https://github.com/QuartzUnit/browsegrab) | Local LLM browser agent |
|
|
187
|
+
| [feedkit](https://github.com/QuartzUnit/feedkit) | RSS feed collection + MCP |
|
|
188
|
+
| **embgrep** | **Local semantic search for files** |
|
|
189
|
+
|
|
190
|
+
## License
|
|
191
|
+
|
|
192
|
+
MIT
|
|
193
|
+
|
|
194
|
+
<!-- mcp-name: io.github.ArkNill/embgrep -->
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
embgrep/__init__.py,sha256=U4l_kzuD2ISXldt79TwK7pAQk3VFLROcQiSMTCmRJM4,2277
|
|
2
|
+
embgrep/__main__.py,sha256=9fKlLfoa2vAib2tdwdrtcYfgI9WhDlozES8GhKq7XRA,5325
|
|
3
|
+
embgrep/chunker.py,sha256=f4iTL3oCs1nK0eMH2GrqfrzLpPanDVU8zAd5XBi5HWI,7042
|
|
4
|
+
embgrep/db.py,sha256=zX4AAjZ-3wm7QRKzuAkOVt4J3D-UPsRaSFjtaqVuYy4,5885
|
|
5
|
+
embgrep/embedder.py,sha256=NXIN7ZwodBO-gAGBD9nGvIfMMbcl0iu3r-t1WTk25lI,1659
|
|
6
|
+
embgrep/indexer.py,sha256=PIAMe8qidD-e8mjnjT7HsutF99kOwuRy5PRk5BQbJjY,7519
|
|
7
|
+
embgrep/mcp_server.py,sha256=fqYF-UDWqWugGkzMAGOG_2o3JaJZY8F3SzDklRNrtRA,3385
|
|
8
|
+
embgrep-0.1.0.dist-info/METADATA,sha256=41KnmBPsRdoG1rh0MsipEnAQqUspimm8s_WrDpXHHOs,5984
|
|
9
|
+
embgrep-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
10
|
+
embgrep-0.1.0.dist-info/entry_points.txt,sha256=zIgMaq7SJsf_tCQilxas66N9OnPuhVhJOZ02-OTMPuI,88
|
|
11
|
+
embgrep-0.1.0.dist-info/licenses/LICENSE,sha256=fFlwqojwD_YW8kxc0bAa5n0luMCtwf04FZ-Ad0aQ7RI,1067
|
|
12
|
+
embgrep-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 QuartzUnit
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|