infogrep 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- infogrep/__init__.py +3 -0
- infogrep/cli.py +217 -0
- infogrep/config.py +217 -0
- infogrep/engine.py +166 -0
- infogrep/indexer.py +362 -0
- infogrep/ingest/__init__.py +1 -0
- infogrep/ingest/chunker.py +97 -0
- infogrep/ingest/extract/__init__.py +5 -0
- infogrep/ingest/extract/registry.py +172 -0
- infogrep/ingest/graph.py +138 -0
- infogrep/ingest/types.py +30 -0
- infogrep/ingest/walker.py +47 -0
- infogrep/jvm.py +77 -0
- infogrep/manifest.py +231 -0
- infogrep/mcp_server.py +154 -0
- infogrep/retrieval/__init__.py +6 -0
- infogrep/retrieval/base.py +86 -0
- infogrep/retrieval/dense.py +234 -0
- infogrep/retrieval/embedders/__init__.py +5 -0
- infogrep/retrieval/embedders/base.py +21 -0
- infogrep/retrieval/embedders/cache.py +54 -0
- infogrep/retrieval/embedders/hashing.py +37 -0
- infogrep/retrieval/embedders/registry.py +22 -0
- infogrep/retrieval/embedders/sentence_transformer.py +115 -0
- infogrep/retrieval/fusion.py +39 -0
- infogrep/retrieval/graph.py +150 -0
- infogrep/retrieval/kb.py +179 -0
- infogrep/retrieval/sparse.py +379 -0
- infogrep/scheduler.py +100 -0
- infogrep/web.py +362 -0
- infogrep-0.0.1.dist-info/METADATA +353 -0
- infogrep-0.0.1.dist-info/RECORD +35 -0
- infogrep-0.0.1.dist-info/WHEEL +4 -0
- infogrep-0.0.1.dist-info/entry_points.txt +2 -0
- infogrep-0.0.1.dist-info/licenses/LICENSE +21 -0
infogrep/__init__.py
ADDED
infogrep/cli.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""InfoGrep command-line interface.
|
|
2
|
+
|
|
3
|
+
Thin wrapper over the core engine; also the entry point used by the daily
|
|
4
|
+
scheduled re-index. Subcommands are stubbed until their milestones land.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import time
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import typer
|
|
13
|
+
|
|
14
|
+
from . import __version__
|
|
15
|
+
from .config import Config
|
|
16
|
+
|
|
17
|
+
# Root Typer application object; the commands defined in this module are
# registered on it through its decorators.
app = typer.Typer(
    help="Local-first content search (sparse + dense + knowledge base) for coding agents.",
    add_completion=False,
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _version_callback(value: bool) -> None:
    """Handle the ``--version`` flag: print the package version, then exit.

    Args:
        value: Parsed flag value; nothing happens when it is falsy.

    Raises:
        typer.Exit: Raised after the version line has been printed.
    """
    if not value:
        return
    typer.echo(f"infogrep {__version__}")
    raise typer.Exit()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@app.callback()
def _root(
    version: bool = typer.Option(
        False,
        "--version",
        help="Show version.",
        is_eager=True,
        callback=_version_callback,
    ),
) -> None:
    """InfoGrep: index and search the content of local files."""
    # Intentionally has no body: the callback only carries the application
    # docstring above and the eager --version option.
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@app.command()
def index(
    directory: Path = typer.Argument(..., help="Directory to index."),
    full: bool = typer.Option(False, "--full", help="Force a full re-index."),
) -> None:
    """Build or incrementally update the side-car index for a directory."""
    # The indexer is imported inside the command rather than at module load.
    from .indexer import Indexer

    config = Config.load(directory)
    target = config.target_dir
    if not target.is_dir():
        # Exit status 2 signals a usage error (bad directory argument).
        typer.echo(f"[infogrep] not a directory: {target}", err=True)
        raise typer.Exit(code=2)

    typer.echo(f"[infogrep] indexing {target}")
    typer.echo(f"[infogrep] index location: {config.index_dir}")

    def _report_progress(done: int, total: int) -> None:
        # Progress goes to stderr so stdout carries only the final summary.
        typer.echo(f"[infogrep] extracted {done}/{total} files…", err=True)

    outcome = Indexer(config).reindex(full=full, on_progress=_report_progress)

    # One "name=value" pair per change counter, in a fixed order.
    counters = ("added", "modified", "deleted", "unchanged", "name_only")
    summary = " ".join(f"{name}={getattr(outcome, name)}" for name in counters)
    typer.echo("[infogrep] " + summary)
    typer.echo(f"[infogrep] index now holds {outcome.n_files} files, {outcome.n_passages} passages")

    # Errors collected in the report are listed on stderr after the summary.
    for problem in outcome.errors:
        typer.echo(f"[infogrep] error: {problem}", err=True)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@app.command()
def search(
    query: str = typer.Argument(..., help="Search query."),
    directory: Path = typer.Option(Path.cwd(), "--dir", "-d", help="Indexed directory."),
    k: int = typer.Option(10, "--k", help="Number of results."),
    mode: str = typer.Option("hybrid", "--mode", "-m", help="hybrid | sparse | dense | kb | graph."),
    prf: bool = typer.Option(False, "--prf", help="RM3 pseudo-relevance feedback (sparse)."),
) -> None:
    """Query indexed content."""
    from .engine import SearchEngine

    engine = SearchEngine(Config.load(directory))

    # Modes that map one-to-one onto a single retriever call. Each entry is a
    # thunk so that nothing runs until the mode has been selected.
    single_retriever_modes = {
        "sparse": lambda: engine.search_sparse(query, k=k, prf=prf),
        "dense": lambda: engine.search_dense(query, k=k),
        "kb": lambda: engine.search_kb(query, k=k),
        "graph": lambda: engine.search_graph(query, k=k),
    }

    try:
        if mode == "hybrid":
            fused = engine.search_hybrid(query, k=k, prf=prf)
            results = fused.results
            # Tell the user which backends fed the fusion and which did not.
            if fused.used:
                typer.echo(f"[infogrep] fused: {', '.join(fused.used)}")
            for name, reason in fused.skipped.items():
                typer.echo(f"[infogrep] skipped {name}: {reason}")
        elif mode in single_retriever_modes:
            results = single_retriever_modes[mode]()
        else:
            typer.echo(f"[infogrep] unknown mode: {mode}", err=True)
            raise typer.Exit(code=2)
    except FileNotFoundError as exc:
        # A FileNotFoundError from the backend is reported as its message
        # plus exit status 2 instead of a traceback.
        typer.echo(f"[infogrep] {exc}", err=True)
        raise typer.Exit(code=2)

    if not results:
        typer.echo("[infogrep] no results.")
        return

    for rank, hit in enumerate(results, start=1):
        # Prefer the original file path when known.
        location = hit.abs_path or hit.path
        if hit.page is not None:
            location = location + f" p.{hit.page}"
        typer.echo(f"{rank:2}. [{hit.score:.3f}] {location} ({hit.retriever})")
        # Snippets are trimmed to 160 characters to keep the listing compact.
        typer.echo(f" {hit.snippet.strip()[:160]}")
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@app.command()
def status(
    directory: Path = typer.Argument(Path.cwd(), help="Indexed directory."),
) -> None:
    """Show index status and staleness for a directory."""
    from .indexer import Indexer

    config = Config.load(directory)
    report = Indexer(config).status()

    typer.echo(f"[infogrep] target: {config.target_dir}")
    typer.echo(f"[infogrep] index location: {config.index_dir}")

    if not report.get("indexed"):
        typer.echo("[infogrep] indexed: no")
        typer.echo("[infogrep] run `infogrep index <dir>` to build the index.")
        return

    typer.echo("[infogrep] indexed: yes")
    typer.echo(f"[infogrep] files: {report['n_files']} passages: {report['n_passages']}")
    typer.echo(f"[infogrep] index version: {report['index_version']}")

    # The timestamp is converted with float() and shown in local time.
    last_indexed = report.get("last_indexed_at")
    if last_indexed:
        stamp = time.localtime(float(last_indexed))
        when = time.strftime("%Y-%m-%d %H:%M:%S", stamp)
        typer.echo(f"[infogrep] last indexed: {when}")

    # Staleness is only reported when the status dict carries a "stale" key.
    if "stale" in report:
        if report["stale"]:
            typer.echo(
                f"[infogrep] STALE: {report['pending']} pending "
                f"(+{report['pending_added']} ~{report['pending_modified']} -{report['pending_deleted']}) "
                "— run `infogrep index`"
            )
        else:
            typer.echo("[infogrep] up to date")
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
@app.command()
def mcp(
    directory: Path = typer.Option(Path.cwd(), "--dir", "-d", help="Default indexed directory."),
) -> None:
    """Run the MCP server (stdio) so coding agents can call InfoGrep's search tools."""
    from .mcp_server import main as run_mcp_server

    # The server receives the directory as an absolute, user-expanded string.
    default_dir = Path(directory).expanduser().resolve()
    run_mcp_server(directory=str(default_dir))
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@app.command()
def serve(
    directory: Path = typer.Option(Path.cwd(), "--dir", "-d", help="Indexed directory to search."),
    port: int = typer.Option(7421, "--port", "-p", help="Port (uncommon by default)."),
    host: str = typer.Option("127.0.0.1", "--host", help="Bind host (localhost by default)."),
) -> None:
    """Run a local web UI to test search in a browser."""
    # Aliased on import because this command itself is named `serve`.
    from .web import serve as launch_web_ui

    launch_web_ui(directory=directory, host=host, port=port)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# Sub-application holding the scheduling commands; it is mounted on the root
# app under the name "schedule" (i.e. `infogrep schedule ...`).
schedule_app = typer.Typer(help="Manage daily auto-reindex (macOS launchd).")
app.add_typer(schedule_app, name="schedule")
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
@schedule_app.command("install")
def schedule_install(
    directory: Path = typer.Argument(..., help="Directory to reindex daily."),
    at: str = typer.Option("03:00", "--at", help="Daily run time, HH:MM (24h)."),
) -> None:
    """Install a daily reindex agent for a directory."""
    from . import scheduler

    # Parse AND range-check the time here: without the range check a value
    # such as "25:99" or "-1:00" parses as two integers and would be handed
    # to the scheduler unchanged.
    try:
        hour_text, minute_text = at.split(":", 1)
        hour, minute = int(hour_text), int(minute_text)
        if not (0 <= hour <= 23 and 0 <= minute <= 59):
            raise ValueError(f"time out of range: {at!r}")
    except ValueError:
        # Covers a missing ":", non-numeric parts, and out-of-range values.
        typer.echo(f"[infogrep] invalid --at time: {at!r} (use HH:MM)", err=True)
        raise typer.Exit(code=2) from None

    path = scheduler.install(directory, hour=hour, minute=minute)
    # Echo the normalized time (zero-padded, same format as `schedule list`)
    # rather than the raw user input, which may be e.g. "3:5".
    typer.echo(
        f"[infogrep] scheduled daily reindex of {Path(directory).resolve()} "
        f"at {hour:02d}:{minute:02d}"
    )
    typer.echo(f"[infogrep] launchd agent: {path}")
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
@schedule_app.command("uninstall")
def schedule_uninstall(
    directory: Path = typer.Argument(..., help="Directory whose schedule to remove."),
) -> None:
    """Remove the daily reindex agent for a directory."""
    from . import scheduler

    # A falsy return from scheduler.uninstall is reported as "no schedule found".
    removed = scheduler.uninstall(directory)
    if not removed:
        typer.echo("[infogrep] no schedule found for that directory.")
        return
    typer.echo(f"[infogrep] removed reindex schedule for {Path(directory).resolve()}")
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
@schedule_app.command("list")
def schedule_list() -> None:
    """List installed daily reindex agents."""
    from . import scheduler

    installed = scheduler.list_agents()
    if installed:
        # Each entry is read as a mapping with "hour", "minute" and "directory".
        for entry in installed:
            typer.echo(f"[infogrep] {entry['hour']:02d}:{entry['minute']:02d} daily {entry['directory']}")
    else:
        typer.echo("[infogrep] no reindex schedules installed.")
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# Allow running this module directly as a script; invokes the Typer app.
if __name__ == "__main__":  # pragma: no cover
    app()
|
infogrep/config.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""Configuration model and per-directory config loading.
|
|
2
|
+
|
|
3
|
+
Indexing never writes into the indexed folder. Each directory's index lives in a
|
|
4
|
+
separate location under ``$INFOGREP_HOME`` (default ``~/.infogrep``):
|
|
5
|
+
``$INFOGREP_HOME/indexes/<name>-<hash-of-abs-path>/``. Per-directory config is read from
|
|
6
|
+
that index dir's ``config.toml`` (with an optional global ``$INFOGREP_HOME/config.toml``
|
|
7
|
+
as a base).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import hashlib
|
|
13
|
+
import os
|
|
14
|
+
import re
|
|
15
|
+
import sys
|
|
16
|
+
from dataclasses import dataclass, field, asdict
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
if sys.version_info >= (3, 11):
|
|
20
|
+
import tomllib
|
|
21
|
+
else: # pragma: no cover - exercised only on 3.10
|
|
22
|
+
import tomli as tomllib
|
|
23
|
+
|
|
24
|
+
# Legacy in-folder side-car directory name. InfoGrep no longer creates it, but
# it is still pruned during the walk so that an old (or stray) one inside a
# target directory never gets indexed.
SIDECAR_DIRNAME = ".infogrep"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def index_home() -> Path:
    """Return the root directory under which all InfoGrep indexes are stored.

    The ``INFOGREP_HOME`` environment variable overrides the default
    ``~/.infogrep``; a leading ``~`` is expanded in either case.
    """
    configured = os.environ.get("INFOGREP_HOME", "~/.infogrep")
    return Path(configured).expanduser()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def index_dir_for(target_dir: Path) -> Path:
    """Return the index location for ``target_dir`` (outside the target itself).

    The location is ``<index_home>/indexes/<name>-<digest>``, where ``<name>``
    is the target's final path component with every character outside
    ``[A-Za-z0-9._-]`` replaced by ``_`` (``root`` when that component is
    empty), and ``<digest>`` is the first 12 hex characters of the SHA-256 of
    the resolved absolute path.
    """
    resolved = Path(target_dir).expanduser().resolve()
    path_hash = hashlib.sha256(str(resolved).encode("utf-8")).hexdigest()
    safe_name = re.sub(r"[^A-Za-z0-9._-]", "_", resolved.name)
    if not safe_name:
        safe_name = "root"
    return index_home() / "indexes" / f"{safe_name}-{path_hash[:12]}"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class ChunkConfig:
    """Passage-splitting parameters for long documents."""

    # Target chunk size, in tokens/words.
    size: int = 512
    # Overlap between adjacent chunks.
    overlap: int = 64
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class IngestConfig:
    """Options that control the ingestion (extraction) side."""

    # OCR PDF pages that have little or no extractable text (needs tesseract).
    ocr: bool = False
    # Pages with fewer extracted characters than this are candidates for OCR.
    ocr_min_chars: int = 16
    # Parallel extraction processes; 0 means auto (min(8, cpu count)).
    workers: int = 0
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
class DenseConfig:
    """Settings for dense (embedding-based) retrieval.

    Disabled by default, because embedding a large corpus requires a model
    download and significant RAM/GPU. Turn it on for a directory with
    ``[dense] enabled = true`` when semantic matching is wanted.
    """

    enabled: bool = False
    # Registry key of the embedder; see infogrep.retrieval.embedders.
    embedder: str = "qwen"
    model_name: str = "Qwen/Qwen3-Embedding-0.6B"
    # "auto" selects among mps / cuda / cpu.
    device: str = "auto"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@dataclass
class SparseConfig:
    """Settings for sparse (Pyserini/BM25) retrieval."""

    enabled: bool = True

    # Pseudo-relevance feedback (query expansion); off by default.
    prf: bool = False
    # Number of feedback documents (top multi-field results) to expand from.
    prf_fb_docs: int = 10
    # Number of expansion terms added to the query.
    prf_fb_terms: int = 10

    # Analyzer language. The default "en+zh" handles English (Porter stemming)
    # and CJK (bigrams) together. Other values: "en" (English only) and
    # "zh" / "ja" / "ko" (a single CJK language). Changing it triggers a full
    # re-index.
    language: str = "en+zh"

    # Multi-field BM25 weights: passage text, file name and path.
    field_boosts: dict = field(
        default_factory=lambda: dict(contents=1.0, filename=2.0, pathtext=1.0)
    )
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@dataclass
class KnowledgeBaseConfig:
    """Settings for the Obsidian knowledge base (backed by the Obsidian CLI)."""

    enabled: bool = False
    # Obsidian vault name; None means the CLI's active vault.
    vault: str | None = None
    # Path to the Obsidian CLI binary.
    cli: str = "obsidian"
    # Graph link hops to expand (follows links and backlinks).
    hops: int = 1
    # Number of search hits used to seed the graph expansion.
    search_limit: int = 10
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@dataclass
class GraphConfig:
    """Settings for the metadata knowledge graph over folder/file structure.

    The graph is rebuilt on every reindex from file *paths and names only*
    (never content). It is materialized as an Obsidian-compatible vault of
    folder notes under the index's ``graph_vault/`` side-car directory, so it
    can be browsed by opening that directory in Obsidian. Hybrid search reads
    the graph directly (no Obsidian app required) to pull in sibling files
    from the folder(s) whose name/contents best match the query, in addition
    to files whose own content matched. Building it is pure path manipulation
    with no model involved, hence enabled by default.
    """

    enabled: bool = True
    # Folder hops to expand from a matched folder (parent/children/siblings).
    hops: int = 1
    # Top-scoring folders expanded into file candidates per query.
    max_folders: int = 5
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# File extensions indexed by content (and, where supported, OCR). Code/config
# files are not part of the defaults; set include = ["**/*"] to index everything.
DEFAULT_DOC_TYPES = [
    "pdf",
    "doc",
    "docx",
    "ppt",
    "pptx",
    "xls",
    "xlsx",
    "rtf",
    "odt",
    "ods",
    "odp",
    "txt",
    "md",
    "markdown",
    "rst",
    "tex",
    "csv",
    "tsv",
    "json",
    "jsonl",
]

# Image extensions: indexed by file name / path (content only if OCR is enabled).
DEFAULT_IMAGE_TYPES = [
    "png",
    "jpg",
    "jpeg",
    "gif",
    "bmp",
    "tif",
    "tiff",
    "webp",
    "svg",
    "heic",
    "heif",
]

# One recursive glob per extension, documents first and then images.
DEFAULT_INCLUDE = [
    f"**/*.{ext}" for ext in [*DEFAULT_DOC_TYPES, *DEFAULT_IMAGE_TYPES]
]

# Dependency / VCS / cache trees and editor/OS junk skipped during the walk.
DEFAULT_EXCLUDE = [
    ".infogrep/**",
    "**/.git/**",
    "**/node_modules/**",
    "**/.venv/**",
    "**/venv/**",
    "**/site-packages/**",
    "**/__pycache__/**",
    "**/.cache/**",
    "**/.tox/**",
    "**/.mypy_cache/**",
    "**/.pytest_cache/**",
    "**/.Trash/**",
    "**/~$*",
    "**/.dropbox.cache/**",
]
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
@dataclass
class Config:
    """Top-level InfoGrep configuration for one indexed directory.

    Holds the include/exclude glob lists plus one settings object per
    subsystem, and derives every on-disk index location from ``target_dir``.
    """

    target_dir: Path
    # Documents + images by default; set include = ["**/*"] to index every file.
    include: list[str] = field(default_factory=lambda: list(DEFAULT_INCLUDE))
    exclude: list[str] = field(default_factory=lambda: list(DEFAULT_EXCLUDE))
    chunk: ChunkConfig = field(default_factory=ChunkConfig)
    ingest: IngestConfig = field(default_factory=IngestConfig)
    sparse: SparseConfig = field(default_factory=SparseConfig)
    dense: DenseConfig = field(default_factory=DenseConfig)
    kb: KnowledgeBaseConfig = field(default_factory=KnowledgeBaseConfig)
    graph: GraphConfig = field(default_factory=GraphConfig)

    @property
    def index_dir(self) -> Path:
        """Where this directory's index lives — a separate location, not in the target."""
        return index_dir_for(self.target_dir)

    @property
    def manifest_path(self) -> Path:
        """Path of the ``manifest.sqlite`` file inside the index directory."""
        return self.index_dir / "manifest.sqlite"

    @property
    def sparse_dir(self) -> Path:
        """The ``sparse`` sub-directory of the index directory."""
        return self.index_dir / "sparse"

    @property
    def dense_dir(self) -> Path:
        """The ``dense`` sub-directory of the index directory."""
        return self.index_dir / "dense"

    @property
    def cache_dir(self) -> Path:
        """The ``cache`` sub-directory of the index directory."""
        return self.index_dir / "cache"

    @property
    def graph_vault_dir(self) -> Path:
        """Obsidian-compatible vault of folder notes (metadata graph), for browsing."""
        return self.index_dir / "graph_vault"

    @property
    def graph_json_path(self) -> Path:
        """Compact JSON form of the same graph, read directly by the graph retriever."""
        return self.index_dir / "graph.json"

    @classmethod
    def load(cls, target_dir: str | Path) -> "Config":
        """Load config for ``target_dir`` (global config.toml, then per-index override).

        Args:
            target_dir: Directory to configure; ``~`` is expanded and the path
                is resolved to an absolute one.

        Returns:
            A ``Config`` with defaults overlaid first by
            ``<index_home>/config.toml`` and then by the index directory's own
            ``config.toml``; files that do not exist are skipped.
        """
        target = Path(target_dir).expanduser().resolve()
        cfg = cls(target_dir=target)
        # Order matters: the per-index file is applied last so it wins.
        for config_file in (index_home() / "config.toml", cfg.index_dir / "config.toml"):
            if config_file.is_file():
                with config_file.open("rb") as fh:
                    cfg = cls._merge(cfg, tomllib.load(fh))
        return cfg

    @staticmethod
    def _merge(base: "Config", data: dict) -> "Config":
        """Shallow-merge a parsed TOML dict onto ``base``.

        ``base`` is updated in place and also returned. ``include`` and
        ``exclude`` replace the existing lists wholesale. Each settings table
        (``chunk``, ``ingest``, ``sparse``, ``dense``, ``kb``, ``graph``) is
        overlaid key by key on the current values. A key that the section's
        dataclass does not define is passed to its constructor, which raises
        ``TypeError``.
        """
        for key in ("include", "exclude"):
            if key in data:
                value = data[key]
                # A bare TOML string (e.g. include = "**/*") must become a
                # one-pattern list; list("**/*") would split it into single
                # characters and silently produce nonsense globs.
                patterns = [value] if isinstance(value, str) else list(value)
                setattr(base, key, patterns)

        # Section name -> dataclass used to rebuild that section.
        sections = (
            ("chunk", ChunkConfig),
            ("ingest", IngestConfig),
            ("sparse", SparseConfig),
            ("dense", DenseConfig),
            ("kb", KnowledgeBaseConfig),
            ("graph", GraphConfig),
        )
        for name, section_cls in sections:
            if name in data:
                merged = {**asdict(getattr(base, name)), **data[name]}
                setattr(base, name, section_cls(**merged))
        return base
|
infogrep/engine.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Search engine: the shared core behind both the CLI and the MCP server.
|
|
2
|
+
|
|
3
|
+
Owns the retrievers for one indexed directory, runs them individually or fused (RRF),
|
|
4
|
+
and degrades gracefully when a backend's index is missing or a backend errors.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
|
|
11
|
+
from .config import Config
|
|
12
|
+
from .retrieval.base import Result, with_file_metadata
|
|
13
|
+
from .retrieval.fusion import reciprocal_rank_fusion
|
|
14
|
+
|
|
15
|
+
# Minimum per-retriever candidate pool size for fusion; the pool actually used
# is max(k, _POOL_MIN), so it is >= k and RRF has material to work with.
_POOL_MIN = 20

# Names of every retriever the engine knows, in the order hybrid search runs them.
ALL_RETRIEVERS = ("sparse", "dense", "kb", "graph")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class HybridResults:
    """Outcome of a fused search: the results plus per-retriever bookkeeping."""

    # Fused result list.
    results: list[Result]
    # Names of the retrievers that actually contributed.
    used: list[str] = field(default_factory=list)
    # Retrievers that were skipped, mapped as retriever name -> reason.
    skipped: dict[str, str] = field(default_factory=dict)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class SearchEngine:
    """Search facade for one indexed directory.

    Owns one retriever per backend (sparse, dense, knowledge base, folder
    graph), creates each on first use, and can run them individually or fused.
    """

    def __init__(self, config: Config):
        """Store ``config``; no backend is created until it is first accessed."""
        self.config = config
        self._sparse = None
        self._dense = None
        self._kb = None
        self._graph = None

    # -- lazy backends -----------------------------------------------------

    @property
    def sparse(self):
        """The sparse index, built on first access from the sparse settings."""
        if self._sparse is None:
            from .retrieval.sparse import SparseIndex

            settings = self.config.sparse
            self._sparse = SparseIndex(
                self.config.sparse_dir,
                self.config.cache_dir,
                field_boosts=settings.field_boosts,
                language=settings.language,
                prf_fb_docs=settings.prf_fb_docs,
                prf_fb_terms=settings.prf_fb_terms,
            )
        return self._sparse

    @property
    def dense(self):
        """The dense index, built on first access from the whole config."""
        if self._dense is None:
            from .retrieval.dense import DenseIndex

            self._dense = DenseIndex(self.config)
        return self._dense

    @property
    def kb(self):
        """The knowledge-base index, built on first access from the whole config."""
        if self._kb is None:
            from .retrieval.kb import KnowledgeBaseIndex

            self._kb = KnowledgeBaseIndex(self.config)
        return self._kb

    @property
    def graph(self):
        """The folder-graph index, built on first access from the graph settings."""
        if self._graph is None:
            from .retrieval.graph import FolderGraphIndex

            self._graph = FolderGraphIndex(
                self.config.index_dir,
                hops=self.config.graph.hops,
                max_folders=self.config.graph.max_folders,
            )
        return self._graph

    # -- individual retrievers --------------------------------------------

    def _enrich(self, results: list[Result], root) -> list[Result]:
        """Attach the original file path + metadata to each result."""
        return [with_file_metadata(hit, root) for hit in results]

    def search_sparse(self, query: str, k: int = 10, prf: bool = False) -> list[Result]:
        """Run the sparse retriever; ``prf`` toggles pseudo-relevance feedback."""
        # Content-file retrievers: paths are relative to the indexed directory.
        hits = self.sparse.search(query, k=k, prf=prf)
        return self._enrich(hits, self.config.target_dir)

    def search_dense(self, query: str, k: int = 10) -> list[Result]:
        """Run the dense retriever and enrich hits against the indexed directory."""
        hits = self.dense.search(query, k=k)
        return self._enrich(hits, self.config.target_dir)

    def search_kb(self, query: str, k: int = 10) -> list[Result]:
        """Run the knowledge-base retriever."""
        # KB paths are vault-relative; we have the vault name (CLI target), not its
        # filesystem root, so set filename/ext only (root=None leaves abs_path unset).
        hits = self.kb.search(query, k=k)
        return self._enrich(hits, None)

    def search_graph(self, query: str, k: int = 10) -> list[Result]:
        """Run the folder-graph retriever."""
        # Graph paths reference real files in the indexed directory, just like sparse/dense.
        hits = self.graph.search(query, k=k)
        return self._enrich(hits, self.config.target_dir)

    def _run(self, name: str, query: str, k: int, prf: bool) -> list[Result]:
        """Dispatch to the retriever called ``name``.

        Raises:
            ValueError: If ``name`` is not a known retriever.
        """
        # Only the sparse retriever accepts the ``prf`` switch.
        if name == "sparse":
            return self.search_sparse(query, k=k, prf=prf)
        without_prf = {
            "dense": self.search_dense,
            "kb": self.search_kb,
            "graph": self.search_graph,
        }
        if name in without_prf:
            return without_prf[name](query, k=k)
        raise ValueError(f"unknown retriever: {name}")

    def _enabled(self, name: str) -> bool:
        """Return the config's ``enabled`` flag for ``name`` (False if unknown)."""
        cfg = self.config
        flags = {
            "sparse": cfg.sparse.enabled,
            "dense": cfg.dense.enabled,
            "kb": cfg.kb.enabled,
            "graph": cfg.graph.enabled,
        }
        return flags.get(name, False)

    # -- fused -------------------------------------------------------------

    def search_hybrid(
        self,
        query: str,
        k: int = 10,
        retrievers: list[str] | None = None,
        prf: bool = False,
    ) -> HybridResults:
        """Run several retrievers and fuse their rankings.

        Args:
            query: Search query.
            k: Number of fused results to return.
            retrievers: Retriever names to use; ``None`` (or an empty list)
                means every retriever enabled in the config.
            prf: Passed through to the sparse retriever.

        Returns:
            A ``HybridResults`` whose ``used`` lists the retrievers that ran
            and whose ``skipped`` maps every other requested retriever to the
            reason it did not contribute.
        """
        if retrievers:
            # Drop repeated names (keeping first-seen order): running the same
            # retriever twice would count its ranking twice in the fusion.
            names = list(dict.fromkeys(retrievers))
        else:
            names = [r for r in ALL_RETRIEVERS if self._enabled(r)]
        # Ask each backend for at least _POOL_MIN candidates so that fusion
        # has enough material even when k is small.
        pool = max(k, _POOL_MIN)

        ranked_lists: list[list[Result]] = []
        out = HybridResults(results=[])
        for name in names:
            if name not in ALL_RETRIEVERS:
                # Report a typo as such rather than as "disabled in config".
                out.skipped[name] = "unknown retriever"
                continue
            if not self._enabled(name):
                out.skipped[name] = "disabled in config"
                continue
            try:
                hits = self._run(name, query, pool, prf)
            except FileNotFoundError as exc:
                # Recorded with the bare exception message, unlike other errors.
                out.skipped[name] = str(exc)
                continue
            except Exception as exc:  # one backend failing shouldn't sink the query
                out.skipped[name] = f"error: {exc}"
                continue
            ranked_lists.append(hits)
            out.used.append(name)

        out.results = reciprocal_rank_fusion(ranked_lists, top_n=k) if ranked_lists else []
        return out

    # -- maintenance -------------------------------------------------------

    def status(self) -> dict:
        """Return the indexer's status dict for this engine's config."""
        from .indexer import Indexer

        return Indexer(self.config).status()

    def reindex(self, full: bool = False) -> dict:
        """Re-index the directory and return the indexer's report as a dict."""
        from .indexer import Indexer

        return Indexer(self.config).reindex(full=full).as_dict()
|