infogrep 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. infogrep-0.0.1/.github/workflows/ci.yml +22 -0
  2. infogrep-0.0.1/.github/workflows/python-publish.yml +70 -0
  3. infogrep-0.0.1/.gitignore +33 -0
  4. infogrep-0.0.1/LICENSE +21 -0
  5. infogrep-0.0.1/Makefile +38 -0
  6. infogrep-0.0.1/PKG-INFO +353 -0
  7. infogrep-0.0.1/PLAN.md +211 -0
  8. infogrep-0.0.1/README.md +315 -0
  9. infogrep-0.0.1/docs/.nojekyll +0 -0
  10. infogrep-0.0.1/docs/index.html +316 -0
  11. infogrep-0.0.1/docs/styles.css +418 -0
  12. infogrep-0.0.1/infogrep/__init__.py +3 -0
  13. infogrep-0.0.1/infogrep/cli.py +217 -0
  14. infogrep-0.0.1/infogrep/config.py +217 -0
  15. infogrep-0.0.1/infogrep/engine.py +166 -0
  16. infogrep-0.0.1/infogrep/indexer.py +362 -0
  17. infogrep-0.0.1/infogrep/ingest/__init__.py +1 -0
  18. infogrep-0.0.1/infogrep/ingest/chunker.py +97 -0
  19. infogrep-0.0.1/infogrep/ingest/extract/__init__.py +5 -0
  20. infogrep-0.0.1/infogrep/ingest/extract/registry.py +172 -0
  21. infogrep-0.0.1/infogrep/ingest/graph.py +138 -0
  22. infogrep-0.0.1/infogrep/ingest/types.py +30 -0
  23. infogrep-0.0.1/infogrep/ingest/walker.py +47 -0
  24. infogrep-0.0.1/infogrep/jvm.py +77 -0
  25. infogrep-0.0.1/infogrep/manifest.py +231 -0
  26. infogrep-0.0.1/infogrep/mcp_server.py +154 -0
  27. infogrep-0.0.1/infogrep/retrieval/__init__.py +6 -0
  28. infogrep-0.0.1/infogrep/retrieval/base.py +86 -0
  29. infogrep-0.0.1/infogrep/retrieval/dense.py +234 -0
  30. infogrep-0.0.1/infogrep/retrieval/embedders/__init__.py +5 -0
  31. infogrep-0.0.1/infogrep/retrieval/embedders/base.py +21 -0
  32. infogrep-0.0.1/infogrep/retrieval/embedders/cache.py +54 -0
  33. infogrep-0.0.1/infogrep/retrieval/embedders/hashing.py +37 -0
  34. infogrep-0.0.1/infogrep/retrieval/embedders/registry.py +22 -0
  35. infogrep-0.0.1/infogrep/retrieval/embedders/sentence_transformer.py +115 -0
  36. infogrep-0.0.1/infogrep/retrieval/fusion.py +39 -0
  37. infogrep-0.0.1/infogrep/retrieval/graph.py +150 -0
  38. infogrep-0.0.1/infogrep/retrieval/kb.py +179 -0
  39. infogrep-0.0.1/infogrep/retrieval/sparse.py +379 -0
  40. infogrep-0.0.1/infogrep/scheduler.py +100 -0
  41. infogrep-0.0.1/infogrep/web.py +362 -0
  42. infogrep-0.0.1/install.sh +109 -0
  43. infogrep-0.0.1/macos/README.md +48 -0
  44. infogrep-0.0.1/macos/build.sh +37 -0
  45. infogrep-0.0.1/macos/main.swift +374 -0
  46. infogrep-0.0.1/pyproject.toml +68 -0
  47. infogrep-0.0.1/tests/conftest.py +14 -0
  48. infogrep-0.0.1/tests/test_chunker.py +57 -0
  49. infogrep-0.0.1/tests/test_cjk.py +135 -0
  50. infogrep-0.0.1/tests/test_cli.py +26 -0
  51. infogrep-0.0.1/tests/test_config.py +56 -0
  52. infogrep-0.0.1/tests/test_dense.py +116 -0
  53. infogrep-0.0.1/tests/test_engine.py +83 -0
  54. infogrep-0.0.1/tests/test_filetypes.py +196 -0
  55. infogrep-0.0.1/tests/test_fusion.py +31 -0
  56. infogrep-0.0.1/tests/test_graph.py +163 -0
  57. infogrep-0.0.1/tests/test_indexer.py +168 -0
  58. infogrep-0.0.1/tests/test_integration_legal.py +130 -0
  59. infogrep-0.0.1/tests/test_kb.py +144 -0
  60. infogrep-0.0.1/tests/test_mcp.py +64 -0
  61. infogrep-0.0.1/tests/test_metadata.py +56 -0
  62. infogrep-0.0.1/tests/test_ocr.py +50 -0
  63. infogrep-0.0.1/tests/test_scheduler.py +66 -0
  64. infogrep-0.0.1/tests/test_sparse.py +252 -0
  65. infogrep-0.0.1/tests/test_staleness.py +45 -0
  66. infogrep-0.0.1/tests/test_walker.py +71 -0
  67. infogrep-0.0.1/tests/test_web.py +167 -0
  68. infogrep-0.0.1/thoughts.md +38 -0
  69. infogrep-0.0.1/uninstall.sh +51 -0
  70. infogrep-0.0.1/uv.lock +5675 -0
@@ -0,0 +1,22 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ shellcheck:
10
+ name: shellcheck (shell scripts)
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - name: Lint shell scripts
15
+ run: shellcheck install.sh uninstall.sh macos/build.sh
16
+
17
+ ruff:
18
+ name: ruff (python lint)
19
+ runs-on: ubuntu-latest
20
+ steps:
21
+ - uses: actions/checkout@v4
22
+ - uses: astral-sh/ruff-action@v3
@@ -0,0 +1,70 @@
1
+ # This workflow will upload a Python Package to PyPI when a release is published
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3
+
4
+ # This workflow uses actions that are not certified by GitHub.
5
+ # They are provided by a third party and are governed by
6
+ # separate terms of service, privacy policy, and support
7
+ # documentation.
8
+
9
+ name: Upload Python Package
10
+
11
+ on:
12
+ release:
13
+ types: [published]
14
+
15
+ permissions:
16
+ contents: read
17
+
18
+ jobs:
19
+ release-build:
20
+ runs-on: ubuntu-latest
21
+
22
+ steps:
23
+ - uses: actions/checkout@v4
24
+
25
+ - uses: actions/setup-python@v5
26
+ with:
27
+ python-version: "3.x"
28
+
29
+ - name: Build release distributions
30
+ run: |
31
+ # NOTE: put your own distribution build steps here.
32
+ python -m pip install build
33
+ python -m build
34
+
35
+ - name: Upload distributions
36
+ uses: actions/upload-artifact@v4
37
+ with:
38
+ name: release-dists
39
+ path: dist/
40
+
41
+ pypi-publish:
42
+ runs-on: ubuntu-latest
43
+ needs:
44
+ - release-build
45
+ permissions:
46
+ # IMPORTANT: this permission is mandatory for trusted publishing
47
+ id-token: write
48
+
49
+ # Dedicated environments with protections for publishing are strongly recommended.
50
+ # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules
51
+ environment:
52
+ name: pypi
53
+ # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
54
+ # url: https://pypi.org/p/YOURPROJECT
55
+ #
56
+ # ALTERNATIVE: if your GitHub Release name is the PyPI project version string
57
+ # ALTERNATIVE: exactly, uncomment the following line instead:
58
+ # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }}
59
+
60
+ steps:
61
+ - name: Retrieve release distributions
62
+ uses: actions/download-artifact@v4
63
+ with:
64
+ name: release-dists
65
+ path: dist/
66
+
67
+ - name: Publish release distributions to PyPI
68
+ uses: pypa/gh-action-pypi-publish@release/v1
69
+ with:
70
+ packages-dir: dist/
@@ -0,0 +1,33 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+
9
+ # uv / venv
10
+ .venv/
11
+ venv/
12
+
13
+ # Caches & tooling
14
+ .pytest_cache/
15
+ .ruff_cache/
16
+ .mypy_cache/
17
+
18
+ # InfoGrep side-car indices (never commit indexed data)
19
+ .infogrep/
20
+
21
+ # Models / weights
22
+ *.safetensors
23
+ *.bin
24
+ models/
25
+
26
+ # OS
27
+ .DS_Store
28
+
29
+ # Env
30
+ .env
31
+
32
+ # Built macOS launcher app
33
+ macos/InfoGrep.app/
infogrep-0.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Qingyao Ai
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,38 @@
1
+ # InfoGrep — convenience wrapper. Run `make` (or `make help`) to list targets.
2
+ .DEFAULT_GOAL := help
3
+ .PHONY: help install uninstall purge sync app test lint shellcheck build
4
+
5
+ help: ## Show this help
6
+ @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) \
7
+ | awk 'BEGIN{FS=":.*?## "}{printf " \033[36m%-12s\033[0m %s\n", $$1, $$2}'
8
+
9
+ install: ## Install InfoGrep (app + login agents + MCP) — see ./install.sh
10
+ ./install.sh
11
+
12
+ uninstall: ## Remove the app, login agents and MCP (keeps indexes)
13
+ ./uninstall.sh
14
+
15
+ purge: ## Uninstall AND delete all indexes (~/.infogrep)
16
+ ./uninstall.sh --purge
17
+
18
+ sync: ## Create/refresh the dev virtualenv (uv sync --extra dev)
19
+ uv sync --extra dev
20
+
21
+ app: ## Build the macOS menu-bar app (macos/InfoGrep.app)
22
+ cd macos && ./build.sh
23
+
24
+ test: ## Run the test suite
25
+ uv run pytest
26
+
27
+ lint: ## Lint Python (ruff) and shell scripts (shellcheck if installed)
28
+ uv run ruff check .
29
+ @command -v shellcheck >/dev/null 2>&1 \
30
+ && shellcheck install.sh uninstall.sh macos/build.sh \
31
+ || echo "shellcheck not installed — skipping (brew install shellcheck)"
32
+
33
+ shellcheck: ## Lint just the shell scripts
34
+ shellcheck install.sh uninstall.sh macos/build.sh
35
+
36
+ build: ## Build sdist + wheel into dist/ (uv build)
37
+ rm -rf dist
38
+ uv build
@@ -0,0 +1,353 @@
1
+ Metadata-Version: 2.4
2
+ Name: infogrep
3
+ Version: 0.0.1
4
+ Summary: Local-first content search engine (sparse + dense + knowledge base) for coding agents
5
+ Project-URL: Homepage, https://qingyaoai.github.io/InfoGrep/
6
+ Project-URL: Repository, https://github.com/QingyaoAi/InfoGrep
7
+ Project-URL: Issues, https://github.com/QingyaoAi/InfoGrep/issues
8
+ Author-email: Qingyao Ai <aiqingyao@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Software Development :: Libraries
21
+ Classifier: Topic :: Text Processing :: Indexing
22
+ Requires-Python: >=3.10
23
+ Requires-Dist: mcp>=1.28.0
24
+ Requires-Dist: openpyxl>=3.1.5
25
+ Requires-Dist: pathspec>=1.1.1
26
+ Requires-Dist: pymupdf>=1.27.2.3
27
+ Requires-Dist: pyserini>=1.2.0
28
+ Requires-Dist: python-docx>=1.2.0
29
+ Requires-Dist: python-pptx>=1.0.2
30
+ Requires-Dist: sentence-transformers>=5.6.0
31
+ Requires-Dist: tomli>=2.0; python_version < '3.11'
32
+ Requires-Dist: typer>=0.12
33
+ Requires-Dist: zvec>=0.5.1
34
+ Provides-Extra: dev
35
+ Requires-Dist: pytest>=8.0; extra == 'dev'
36
+ Requires-Dist: ruff>=0.5; extra == 'dev'
37
+ Description-Content-Type: text/markdown
38
+
39
+ # InfoGrep
40
+
41
+ [![CI](https://github.com/QingyaoAi/InfoGrep/actions/workflows/ci.yml/badge.svg)](https://github.com/QingyaoAi/InfoGrep/actions/workflows/ci.yml)
42
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
43
+
44
+ **InfoGrep is a local-first search engine for the *content* of your files.** Point it at a
45
+ folder — a Dropbox, a codebase, a research archive — and it indexes what's actually written
46
+ inside every PDF, Office doc, spreadsheet, note, and image caption, then makes it searchable
47
+ by keyword, meaning, or knowledge-graph — from the command line, a browser, or directly as
48
+ tools your coding agent (Claude Code, Codex, …) can call.
49
+
50
+ Everything runs on your machine. Nothing is uploaded anywhere. Your files are never modified.
51
+
52
+ 📖 **[Project website](https://QingyaoAi.github.io/InfoGrep/)** · [PLAN.md](PLAN.md) (design & milestones)
53
+
54
+ ---
55
+
56
+ ## Why InfoGrep
57
+
58
+ `grep` and Spotlight only see file names or plain text. They can't look inside a PDF, DOCX,
59
+ or PPTX, they don't rank results by relevance, and they have no idea what a coding agent
60
+ should do with the output. InfoGrep fixes all three:
61
+
62
+ - **Reads real content.** PDFs (including scanned ones, via OCR), Word/PowerPoint/Excel,
63
+ legacy `.doc`, RTF/OpenDocument, plain text and markup, and JSON — not just file names.
64
+ - **Four complementary retrieval modes, fused.** Exact-keyword (BM25), semantic
65
+ (embeddings), an Obsidian knowledge-base graph, and a folder/filename metadata graph —
66
+ combined with reciprocal rank fusion, or called independently.
67
+ - **Folder-aware, not just file-aware.** A metadata graph over your folder structure (paths
68
+ and file names only, never content) lets hybrid search also surface sibling files from the
69
+ *folder* a hit lives in — not only files whose own content matched the query.
70
+ - **Built for agents, not just humans.** An MCP server exposes each retriever as a tool with
71
+ structured, citable results (`path`, `page`, `snippet`, `score`), so Claude Code, Codex, or
72
+ any MCP-aware agent can search your files as naturally as it reads them.
73
+ - **Local-first and non-destructive.** The index lives in a side-car location outside the
74
+ folder you're searching; your files are only ever read.
75
+ - **Incremental.** Re-indexing only touches files that changed since the last run, and can
76
+ run on a daily schedule automatically.
77
+
78
+ ## How it works
79
+
80
+ ```
81
+ ┌───────────────────────────────────────────────┐
82
+ │ MCP server / CLI / browser UI │
83
+ │ search_sparse · search_dense · search_kb │
84
+ │ search_graph · search_hybrid │
85
+ │ index_status · reindex │
86
+ └───────────────────────┬─────────────────────────┘
87
+
88
+ ┌───────────────┬───────────────────┼────────────────┬───────────────┐
89
+ │ │ │ │
90
+ ┌──────▼──────┐ ┌──────▼──────┐ ┌───────▼──────┐ ┌───────▼──────┐
91
+ │ Sparse │ │ Dense │ │ Knowledge │ │ Folder │
92
+ │ (Pyserini │ │ (embeddings │ │ base │ │ metadata │
93
+ │ BM25, │ │ + Zvec ANN, │ │ (Obsidian │ │ graph │
94
+ │ bilingual) │ │ off by │ │ graph, live │ │ (paths only, │
95
+ │ │ │ default) │ │ vault) │ │ no content) │
96
+ └──────┬──────┘ └──────┬──────┘ └───────┬──────┘ └───────┬──────┘
97
+ └───────────────┴───────────────────┴────────────────┘
98
+ │ reciprocal rank fusion
99
+ ┌──────▼──────┐
100
+ │ Fusion │
101
+ └──────┬──────┘
102
+
103
+ ┌─────────────▼──────────────┐
104
+ │ Side-car index store │
105
+ │ ~/.infogrep/indexes/<dir>/ │
106
+ └─────────────▲──────────────┘
107
+
108
+ ┌───────────────────────────┴────────────────────────────┐
109
+ │ Ingestion pipeline │
110
+ │ walk (include/exclude globs) → extract (per file type) │
111
+ │ → chunk into passages → index (sparse/dense) │
112
+ │ → build folder/filename metadata graph │
113
+ │ → manifest.sqlite tracks hash/mtime for deltas │
114
+ └──────────────────────────────────────────────────────────┘
115
+ ```
116
+
117
+ 1. **Walk** the target directory, respecting include/exclude glob patterns.
118
+ 2. **Extract** text per file type (PDF via PyMuPDF, DOCX/PPTX/XLSX via python-docx/pptx/
119
+ openpyxl, legacy `.doc` via macOS `textutil`, everything else as UTF-8 text). Files with
120
+ no extractable content are still indexed by file name/path, so they're findable.
121
+ 3. **Chunk** long documents into overlapping passages (`{doc_id, passage_id, text, path,
122
+ page}`), preserving page numbers for citations.
123
+ 4. **Index** passages into a **manifest** (SQLite: path → hash/mtime/size, for change
124
+ detection) plus **sparse** (Lucene/BM25 via Pyserini) and, optionally, **dense**
125
+ (embeddings in a Zvec vector store) indexes.
126
+ 5. **Build the folder/filename metadata graph** from every indexed file's *path* (never its
127
+ content): a folder tree materialized as an Obsidian-compatible vault of linked notes
128
+ (browsable in Obsidian) plus a compact JSON form used for fast lookups.
129
+ 6. **Retrieve** via any of the four retrievers, or all of them fused with **reciprocal rank
130
+ fusion (RRF)** — no tuning required, and each retriever can be skipped gracefully if it
131
+ isn't enabled or available. The metadata graph lets a hit's *folder* pull in sibling files
132
+ too, not only files whose own content matched.
133
+ 7. **Re-index incrementally**: a manifest diff classifies files as added/modified/deleted, so
134
+ only the delta is re-extracted, re-chunked, and re-indexed, and a no-op run does nothing. (The
135
+ metadata graph rebuilds whenever files are added/removed — cheap, since it's just paths.)
136
+
137
+ The index is **never** written into the folder you're searching — it lives under
138
+ `$INFOGREP_HOME/indexes/<name>-<hash>/` (default `~/.infogrep`), so your directory's
139
+ structure and git history stay untouched.
140
+
141
+ ## What it can search
142
+
143
+ | Category | Types |
144
+ |---|---|
145
+ | Documents | `pdf` `doc` `docx` `ppt` `pptx` `xls` `xlsx` `rtf` `odt` `ods` `odp` |
146
+ | Text & markup | `txt` `md` `markdown` `rst` `tex` `csv` `tsv` `json` `jsonl` |
147
+ | Images (name/path; content with OCR) | `png` `jpg` `jpeg` `gif` `bmp` `tif` `tiff` `webp` `svg` `heic` `heif` |
148
+
149
+ This is the default; set `include = ["**/*"]` in a directory's config to index every file
150
+ (anything without a dedicated extractor is still indexed by name/path). Dependency, VCS, and
151
+ cache trees (`node_modules`, `.git`, `.venv`, `__pycache__`, …) and editor/OS junk are
152
+ excluded by default.
153
+
154
+ Sparse search is **multi-field**: queries match passage text *and* the file name/path
155
+ (tokenized, independently weighted), so you can find a file by what it's called, not only
156
+ what it says. Sparse indexing is **bilingual by default** (`en+zh`): English gets Porter
157
+ stemming, Chinese/Japanese/Korean get CJK bigram analysis, in a single index — switch to a
158
+ single language with `[sparse] language`.
159
+
160
+ ## Install
161
+
162
+ Requires [`uv`](https://docs.astral.sh/uv/), plus JDK 21 for sparse search
163
+ (`brew install openjdk@21`).
164
+
165
+ ```bash
166
+ git clone https://github.com/QingyaoAi/InfoGrep.git
167
+ cd InfoGrep
168
+ uv sync --extra dev # create venv + install deps
169
+ uv run infogrep --help # show command surface
170
+ uv run pytest # run tests
171
+ ```
172
+
173
+ ### Install as a macOS app (optional)
174
+
175
+ The installer sets up the Python backend, builds a Spotlight-style menu-bar app, starts it
176
+ (and the search backend) at login, and registers the Claude Code MCP server:
177
+
178
+ ```bash
179
+ ./install.sh # INFOGREP_SERVE_DIR=/path sets the default folder; INFOGREP_PORT changes the port
180
+ ```
181
+
182
+ Then press **⌘⇧Space** for the launcher, or open <http://127.0.0.1:7421>. Add folders to
183
+ search from the app (**Index a Folder…**) or the web UI (**+ folder**).
184
+
185
+ The macOS app additionally requires Xcode Command Line Tools (`xcode-select --install`). The app is
186
+ ad-hoc signed, so the first launch needs a right-click → **Open** (one time).
187
+
188
+ Remove everything cleanly:
189
+
190
+ ```bash
191
+ ./uninstall.sh # removes the app, login agents and MCP server (keeps indexes)
192
+ ./uninstall.sh --purge # also delete all indexes (~/.infogrep)
193
+ ```
194
+
195
+ `make install` / `make uninstall` / `make purge` are equivalent; run `make` to list all
196
+ targets (`sync`, `app`, `test`, `lint`, …).
197
+
198
+ ## Usage
199
+
200
+ ### CLI
201
+
202
+ ```bash
203
+ infogrep index <dir> # build / update the index for a directory
204
+ infogrep search <query> -d <dir> # query (modes: hybrid [default] | sparse | dense | kb | graph)
205
+ infogrep search <query> --prf # sparse query expansion (RM3)
206
+ infogrep status <dir> # index status + staleness (pending changes)
207
+ infogrep mcp --dir <dir> # run the MCP server (stdio) for coding agents
208
+ infogrep serve --dir <dir> # browser UI to test search (http://127.0.0.1:7421)
209
+ infogrep schedule install <dir> --at 03:00 # daily auto-reindex via launchd
210
+ infogrep schedule list | uninstall <dir>
211
+ ```
212
+
213
+ `infogrep status <dir>` prints the exact index location and reports **staleness** — files
214
+ added/modified/deleted since the last index — so you know when a manual `infogrep index` is
215
+ worth running.
216
+
217
+ ### MCP server (Claude Code / Codex)
218
+
219
+ Register InfoGrep as an MCP server so an agent can search your files as a tool call:
220
+
221
+ ```bash
222
+ claude mcp add infogrep -- uv run infogrep mcp --dir /path/to/your/project
223
+ ```
224
+
225
+ Tools exposed: `search_sparse`, `search_dense`, `search_kb`, `search_graph`,
226
+ `search_hybrid`, `index_status`, `reindex`. Each search tool returns `{"results": [...]}`
227
+ where every result carries `path`, `page`, `snippet`, `score`, and `retriever` for easy
228
+ citation. `search_hybrid` (recommended) fuses whichever retrievers are enabled and reports
229
+ which were `used` vs. `skipped` (and why).
230
+
231
+ ### Browser UI
232
+
233
+ ```bash
234
+ infogrep serve --dir <dir> # http://127.0.0.1:7421 by default
235
+ ```
236
+
237
+ A search box, a mode selector (hybrid/sparse/dense/kb/graph), result snippets with path/page/
238
+ score, folder management (add/switch indexed directories), and a JSON API at
239
+ `/api/search` and `/api/status`. Bound to localhost only.
240
+
241
+ ### Folder/filename metadata graph
242
+
243
+ On every reindex, InfoGrep builds a knowledge graph over your folder structure — each file's
244
+ *path and name only, never its content* — and materializes it as an Obsidian-compatible vault
245
+ of linked folder notes under the index's `graph_vault/` side-car directory (open it in
246
+ Obsidian to browse, if you like). `search_graph` matches a query against folder/file *names*,
247
+ then expands to neighboring folders (parent, children, siblings) so files that live in the
248
+ most relevant folder(s) surface too — not just files whose own name/content matched. It
249
+ participates in `search_hybrid` automatically, letting one hit pull in its co-located
250
+ siblings. It is on by default (it's cheap — just path manipulation, no model or JVM):
251
+
252
+ ```toml
253
+ [graph]
254
+ enabled = true # set false to disable
255
+ hops = 1 # folder hops to expand from a matched folder (parent/children/siblings)
256
+ max_folders = 5 # top-scoring folders to expand into file candidates per query
257
+ ```
258
+
259
+ ### Knowledge base (Obsidian vault)
260
+
261
+ `search_kb` adds graph-aware search over an Obsidian vault via the **Obsidian CLI**: it
262
+ searches the live vault, then expands along links/backlinks so notes *connected* to a match
263
+ surface too — always current, no separate index. Requires the Obsidian app running with the
264
+ vault open. Enable per directory:
265
+
266
+ ```toml
267
+ [kb]
268
+ enabled = true
269
+ vault = "My Vault" # Obsidian vault name; omit to use the CLI's active vault
270
+ hops = 1 # link hops to expand (follows links + backlinks)
271
+ # cli = "obsidian" # path to the Obsidian CLI, if not on PATH
272
+ ```
273
+
274
+ If the app isn't running, `search_kb` is skipped (in hybrid) or reports a clear error
275
+ (standalone).
276
+
277
+ ### Scanned PDFs (OCR)
278
+
279
+ PDFs with no text layer can be OCR'd at ingest time (requires `tesseract`):
280
+
281
+ ```toml
282
+ [ingest]
283
+ ocr = true # OCR pages with little/no extractable text
284
+ ocr_min_chars = 16 # threshold below which a page is OCR'd
285
+ ```
286
+
287
+ ### Daily auto-reindex
288
+
289
+ `infogrep schedule install <dir>` registers a macOS `launchd` agent that reindexes the
290
+ directory once a day (logs to the index dir's `reindex.log`).
291
+
292
+ ## Configuration reference
293
+
294
+ Config is TOML, read from (in order) a global `$INFOGREP_HOME/config.toml`, then a
295
+ per-directory override at the directory's index `config.toml` (path shown by `infogrep status`).
296
+
297
+ ```toml
298
+ include = ["**/*.pdf", "**/*.docx", "..."] # default: documents + images, see table above
299
+ exclude = ["**/node_modules/**", "..."] # default: VCS/deps/cache/OS junk
300
+
301
+ [chunk]
302
+ size = 512 # target passage size (tokens/words)
303
+ overlap = 64 # overlap between adjacent passages
304
+
305
+ [ingest]
306
+ ocr = false # OCR scanned PDF pages
307
+ ocr_min_chars = 16 # page text below this length triggers OCR
308
+ workers = 0 # parallel extraction processes; 0 = auto (min(8, cpu count))
309
+
310
+ [sparse]
311
+ enabled = true
312
+ prf = false # RM3 pseudo-relevance feedback
313
+ prf_fb_docs = 10
314
+ prf_fb_terms = 10
315
+ language = "en+zh" # "en" | "zh" | "ja" | "ko" | "en+zh" (changing re-indexes)
316
+ field_boosts = { contents = 1.0, filename = 2.0, pathtext = 1.0 }
317
+
318
+ [dense]
319
+ enabled = false # off by default: needs a model + RAM/GPU
320
+ embedder = "qwen" # registry key; see infogrep.retrieval.embedders
321
+ model_name = "Qwen/Qwen3-Embedding-0.6B"
322
+ device = "auto" # "auto" -> mps/cuda/cpu
323
+
324
+ [kb]
325
+ enabled = false
326
+ vault = "" # Obsidian vault name; empty -> the CLI's active vault
327
+ cli = "obsidian"
328
+ hops = 1
329
+ search_limit = 10
330
+
331
+ [graph]
332
+ enabled = true # folder/filename metadata graph; cheap, on by default
333
+ hops = 1 # folder hops to expand from a matched folder
334
+ max_folders = 5 # top-scoring folders to expand into file candidates per query
335
+ ```
336
+
337
+ With dense off (the default), `hybrid` simply runs sparse and the metadata graph (plus the
338
+ knowledge base, if enabled) — no model download needed until you opt in.
339
+
340
+ ## Development
341
+
342
+ ```bash
343
+ make sync # create/refresh the dev virtualenv
344
+ make test # run the test suite
345
+ make lint # ruff + shellcheck
346
+ make app # build the macOS menu-bar app
347
+ ```
348
+
349
+ See [PLAN.md](PLAN.md) for the full architecture write-up and milestone history.
350
+
351
+ ## License
352
+
353
+ [MIT](LICENSE)