infogrep 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- infogrep-0.0.1/.github/workflows/ci.yml +22 -0
- infogrep-0.0.1/.github/workflows/python-publish.yml +70 -0
- infogrep-0.0.1/.gitignore +33 -0
- infogrep-0.0.1/LICENSE +21 -0
- infogrep-0.0.1/Makefile +38 -0
- infogrep-0.0.1/PKG-INFO +353 -0
- infogrep-0.0.1/PLAN.md +211 -0
- infogrep-0.0.1/README.md +315 -0
- infogrep-0.0.1/docs/.nojekyll +0 -0
- infogrep-0.0.1/docs/index.html +316 -0
- infogrep-0.0.1/docs/styles.css +418 -0
- infogrep-0.0.1/infogrep/__init__.py +3 -0
- infogrep-0.0.1/infogrep/cli.py +217 -0
- infogrep-0.0.1/infogrep/config.py +217 -0
- infogrep-0.0.1/infogrep/engine.py +166 -0
- infogrep-0.0.1/infogrep/indexer.py +362 -0
- infogrep-0.0.1/infogrep/ingest/__init__.py +1 -0
- infogrep-0.0.1/infogrep/ingest/chunker.py +97 -0
- infogrep-0.0.1/infogrep/ingest/extract/__init__.py +5 -0
- infogrep-0.0.1/infogrep/ingest/extract/registry.py +172 -0
- infogrep-0.0.1/infogrep/ingest/graph.py +138 -0
- infogrep-0.0.1/infogrep/ingest/types.py +30 -0
- infogrep-0.0.1/infogrep/ingest/walker.py +47 -0
- infogrep-0.0.1/infogrep/jvm.py +77 -0
- infogrep-0.0.1/infogrep/manifest.py +231 -0
- infogrep-0.0.1/infogrep/mcp_server.py +154 -0
- infogrep-0.0.1/infogrep/retrieval/__init__.py +6 -0
- infogrep-0.0.1/infogrep/retrieval/base.py +86 -0
- infogrep-0.0.1/infogrep/retrieval/dense.py +234 -0
- infogrep-0.0.1/infogrep/retrieval/embedders/__init__.py +5 -0
- infogrep-0.0.1/infogrep/retrieval/embedders/base.py +21 -0
- infogrep-0.0.1/infogrep/retrieval/embedders/cache.py +54 -0
- infogrep-0.0.1/infogrep/retrieval/embedders/hashing.py +37 -0
- infogrep-0.0.1/infogrep/retrieval/embedders/registry.py +22 -0
- infogrep-0.0.1/infogrep/retrieval/embedders/sentence_transformer.py +115 -0
- infogrep-0.0.1/infogrep/retrieval/fusion.py +39 -0
- infogrep-0.0.1/infogrep/retrieval/graph.py +150 -0
- infogrep-0.0.1/infogrep/retrieval/kb.py +179 -0
- infogrep-0.0.1/infogrep/retrieval/sparse.py +379 -0
- infogrep-0.0.1/infogrep/scheduler.py +100 -0
- infogrep-0.0.1/infogrep/web.py +362 -0
- infogrep-0.0.1/install.sh +109 -0
- infogrep-0.0.1/macos/README.md +48 -0
- infogrep-0.0.1/macos/build.sh +37 -0
- infogrep-0.0.1/macos/main.swift +374 -0
- infogrep-0.0.1/pyproject.toml +68 -0
- infogrep-0.0.1/tests/conftest.py +14 -0
- infogrep-0.0.1/tests/test_chunker.py +57 -0
- infogrep-0.0.1/tests/test_cjk.py +135 -0
- infogrep-0.0.1/tests/test_cli.py +26 -0
- infogrep-0.0.1/tests/test_config.py +56 -0
- infogrep-0.0.1/tests/test_dense.py +116 -0
- infogrep-0.0.1/tests/test_engine.py +83 -0
- infogrep-0.0.1/tests/test_filetypes.py +196 -0
- infogrep-0.0.1/tests/test_fusion.py +31 -0
- infogrep-0.0.1/tests/test_graph.py +163 -0
- infogrep-0.0.1/tests/test_indexer.py +168 -0
- infogrep-0.0.1/tests/test_integration_legal.py +130 -0
- infogrep-0.0.1/tests/test_kb.py +144 -0
- infogrep-0.0.1/tests/test_mcp.py +64 -0
- infogrep-0.0.1/tests/test_metadata.py +56 -0
- infogrep-0.0.1/tests/test_ocr.py +50 -0
- infogrep-0.0.1/tests/test_scheduler.py +66 -0
- infogrep-0.0.1/tests/test_sparse.py +252 -0
- infogrep-0.0.1/tests/test_staleness.py +45 -0
- infogrep-0.0.1/tests/test_walker.py +71 -0
- infogrep-0.0.1/tests/test_web.py +167 -0
- infogrep-0.0.1/thoughts.md +38 -0
- infogrep-0.0.1/uninstall.sh +51 -0
- infogrep-0.0.1/uv.lock +5675 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
shellcheck:
|
|
10
|
+
name: shellcheck (shell scripts)
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- name: Lint shell scripts
|
|
15
|
+
run: shellcheck install.sh uninstall.sh macos/build.sh
|
|
16
|
+
|
|
17
|
+
ruff:
|
|
18
|
+
name: ruff (python lint)
|
|
19
|
+
runs-on: ubuntu-latest
|
|
20
|
+
steps:
|
|
21
|
+
- uses: actions/checkout@v4
|
|
22
|
+
- uses: astral-sh/ruff-action@v3
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# This workflow will upload a Python Package to PyPI when a release is created
|
|
2
|
+
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
|
|
3
|
+
|
|
4
|
+
# This workflow uses actions that are not certified by GitHub.
|
|
5
|
+
# They are provided by a third-party and are governed by
|
|
6
|
+
# separate terms of service, privacy policy, and support
|
|
7
|
+
# documentation.
|
|
8
|
+
|
|
9
|
+
name: Upload Python Package
|
|
10
|
+
|
|
11
|
+
on:
|
|
12
|
+
release:
|
|
13
|
+
types: [published]
|
|
14
|
+
|
|
15
|
+
permissions:
|
|
16
|
+
contents: read
|
|
17
|
+
|
|
18
|
+
jobs:
|
|
19
|
+
release-build:
|
|
20
|
+
runs-on: ubuntu-latest
|
|
21
|
+
|
|
22
|
+
steps:
|
|
23
|
+
- uses: actions/checkout@v4
|
|
24
|
+
|
|
25
|
+
- uses: actions/setup-python@v5
|
|
26
|
+
with:
|
|
27
|
+
python-version: "3.x"
|
|
28
|
+
|
|
29
|
+
- name: Build release distributions
|
|
30
|
+
run: |
|
|
31
|
+
# NOTE: put your own distribution build steps here.
|
|
32
|
+
python -m pip install build
|
|
33
|
+
python -m build
|
|
34
|
+
|
|
35
|
+
- name: Upload distributions
|
|
36
|
+
uses: actions/upload-artifact@v4
|
|
37
|
+
with:
|
|
38
|
+
name: release-dists
|
|
39
|
+
path: dist/
|
|
40
|
+
|
|
41
|
+
pypi-publish:
|
|
42
|
+
runs-on: ubuntu-latest
|
|
43
|
+
needs:
|
|
44
|
+
- release-build
|
|
45
|
+
permissions:
|
|
46
|
+
# IMPORTANT: this permission is mandatory for trusted publishing
|
|
47
|
+
id-token: write
|
|
48
|
+
|
|
49
|
+
# Dedicated environments with protections for publishing are strongly recommended.
|
|
50
|
+
# For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules
|
|
51
|
+
environment:
|
|
52
|
+
name: pypi
|
|
53
|
+
# OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
|
|
54
|
+
# url: https://pypi.org/p/infogrep
|
|
55
|
+
#
|
|
56
|
+
# ALTERNATIVE: if your GitHub Release name is the PyPI project version string
|
|
57
|
+
# ALTERNATIVE: exactly, uncomment the following line instead:
|
|
58
|
+
# url: https://pypi.org/project/infogrep/${{ github.event.release.name }}
|
|
59
|
+
|
|
60
|
+
steps:
|
|
61
|
+
- name: Retrieve release distributions
|
|
62
|
+
uses: actions/download-artifact@v4
|
|
63
|
+
with:
|
|
64
|
+
name: release-dists
|
|
65
|
+
path: dist/
|
|
66
|
+
|
|
67
|
+
- name: Publish release distributions to PyPI
|
|
68
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
69
|
+
with:
|
|
70
|
+
packages-dir: dist/
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
|
|
9
|
+
# uv / venv
|
|
10
|
+
.venv/
|
|
11
|
+
venv/
|
|
12
|
+
|
|
13
|
+
# Caches & tooling
|
|
14
|
+
.pytest_cache/
|
|
15
|
+
.ruff_cache/
|
|
16
|
+
.mypy_cache/
|
|
17
|
+
|
|
18
|
+
# InfoGrep side-car indices (never commit indexed data)
|
|
19
|
+
.infogrep/
|
|
20
|
+
|
|
21
|
+
# Models / weights
|
|
22
|
+
*.safetensors
|
|
23
|
+
*.bin
|
|
24
|
+
models/
|
|
25
|
+
|
|
26
|
+
# OS
|
|
27
|
+
.DS_Store
|
|
28
|
+
|
|
29
|
+
# Env
|
|
30
|
+
.env
|
|
31
|
+
|
|
32
|
+
# Built macOS launcher app
|
|
33
|
+
macos/InfoGrep.app/
|
infogrep-0.0.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Qingyao Ai
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
infogrep-0.0.1/Makefile
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# InfoGrep — convenience wrapper. Run `make` (or `make help`) to list targets.
|
|
2
|
+
.DEFAULT_GOAL := help
|
|
3
|
+
.PHONY: help install uninstall purge sync app test lint shellcheck build
|
|
4
|
+
|
|
5
|
+
help: ## Show this help
|
|
6
|
+
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) \
|
|
7
|
+
| awk 'BEGIN{FS=":.*?## "}{printf " \033[36m%-12s\033[0m %s\n", $$1, $$2}'
|
|
8
|
+
|
|
9
|
+
install: ## Install InfoGrep (app + login agents + MCP) — see ./install.sh
|
|
10
|
+
./install.sh
|
|
11
|
+
|
|
12
|
+
uninstall: ## Remove the app, login agents and MCP (keeps indexes)
|
|
13
|
+
./uninstall.sh
|
|
14
|
+
|
|
15
|
+
purge: ## Uninstall AND delete all indexes (~/.infogrep)
|
|
16
|
+
./uninstall.sh --purge
|
|
17
|
+
|
|
18
|
+
sync: ## Create/refresh the dev virtualenv (uv sync --extra dev)
|
|
19
|
+
uv sync --extra dev
|
|
20
|
+
|
|
21
|
+
app: ## Build the macOS menu-bar app (macos/InfoGrep.app)
|
|
22
|
+
cd macos && ./build.sh
|
|
23
|
+
|
|
24
|
+
test: ## Run the test suite
|
|
25
|
+
uv run pytest
|
|
26
|
+
|
|
27
|
+
lint: ## Lint Python (ruff) and shell scripts (shellcheck if installed)
|
|
28
|
+
uv run ruff check .
|
|
29
|
+
@command -v shellcheck >/dev/null 2>&1 \
|
|
30
|
+
&& shellcheck install.sh uninstall.sh macos/build.sh \
|
|
31
|
+
|| echo "shellcheck not installed — skipping (brew install shellcheck)"
|
|
32
|
+
|
|
33
|
+
shellcheck: ## Lint just the shell scripts
|
|
34
|
+
shellcheck install.sh uninstall.sh macos/build.sh
|
|
35
|
+
|
|
36
|
+
build: ## Build sdist + wheel into dist/ (uv build)
|
|
37
|
+
rm -rf dist
|
|
38
|
+
uv build
|
infogrep-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: infogrep
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Local-first content search engine (sparse + dense + knowledge base) for coding agents
|
|
5
|
+
Project-URL: Homepage, https://qingyaoai.github.io/InfoGrep/
|
|
6
|
+
Project-URL: Repository, https://github.com/QingyaoAi/InfoGrep
|
|
7
|
+
Project-URL: Issues, https://github.com/QingyaoAi/InfoGrep/issues
|
|
8
|
+
Author-email: Qingyao Ai <aiqingyao@gmail.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
21
|
+
Classifier: Topic :: Text Processing :: Indexing
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Requires-Dist: mcp>=1.28.0
|
|
24
|
+
Requires-Dist: openpyxl>=3.1.5
|
|
25
|
+
Requires-Dist: pathspec>=1.1.1
|
|
26
|
+
Requires-Dist: pymupdf>=1.27.2.3
|
|
27
|
+
Requires-Dist: pyserini>=1.2.0
|
|
28
|
+
Requires-Dist: python-docx>=1.2.0
|
|
29
|
+
Requires-Dist: python-pptx>=1.0.2
|
|
30
|
+
Requires-Dist: sentence-transformers>=5.6.0
|
|
31
|
+
Requires-Dist: tomli>=2.0; python_version < '3.11'
|
|
32
|
+
Requires-Dist: typer>=0.12
|
|
33
|
+
Requires-Dist: zvec>=0.5.1
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
36
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
37
|
+
Description-Content-Type: text/markdown
|
|
38
|
+
|
|
39
|
+
# InfoGrep
|
|
40
|
+
|
|
41
|
+
[![CI](https://github.com/QingyaoAi/InfoGrep/actions/workflows/ci.yml/badge.svg)](https://github.com/QingyaoAi/InfoGrep/actions/workflows/ci.yml)
|
|
42
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
|
|
43
|
+
|
|
44
|
+
**InfoGrep is a local-first search engine for the *content* of your files.** Point it at a
|
|
45
|
+
folder — a Dropbox, a codebase, a research archive — and it indexes what's actually written
|
|
46
|
+
inside every PDF, Office doc, spreadsheet, note, and image caption, then makes it searchable
|
|
47
|
+
by keyword, meaning, or knowledge-graph — from the command line, a browser, or directly as
|
|
48
|
+
tools your coding agent (Claude Code, Codex, …) can call.
|
|
49
|
+
|
|
50
|
+
Everything runs on your machine. Nothing is uploaded anywhere. Your files are never modified.
|
|
51
|
+
|
|
52
|
+
📖 **[Project website](https://QingyaoAi.github.io/InfoGrep/)** · [PLAN.md](PLAN.md) (design & milestones)
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Why InfoGrep
|
|
57
|
+
|
|
58
|
+
`grep` and Spotlight only see file names or plain text. They can't look inside a PDF, DOCX,
|
|
59
|
+
or PPTX, they don't rank results by relevance, and they have no idea what a coding agent
|
|
60
|
+
should do with the output. InfoGrep fixes all three:
|
|
61
|
+
|
|
62
|
+
- **Reads real content.** PDFs (including scanned ones, via OCR), Word/PowerPoint/Excel,
|
|
63
|
+
legacy `.doc`, RTF/OpenDocument, plain text and markup, and JSON — not just file names.
|
|
64
|
+
- **Four complementary retrieval modes, fused.** Exact-keyword (BM25), semantic
|
|
65
|
+
(embeddings), an Obsidian knowledge-base graph, and a folder/filename metadata graph —
|
|
66
|
+
combined with reciprocal rank fusion, or called independently.
|
|
67
|
+
- **Folder-aware, not just file-aware.** A metadata graph over your folder structure (paths
|
|
68
|
+
and file names only, never content) lets hybrid search also surface sibling files from the
|
|
69
|
+
*folder* a hit lives in — not only files whose own content matched the query.
|
|
70
|
+
- **Built for agents, not just humans.** An MCP server exposes each retriever as a tool with
|
|
71
|
+
structured, citable results (`path`, `page`, `snippet`, `score`), so Claude Code, Codex, or
|
|
72
|
+
any MCP-aware agent can search your files as naturally as it reads them.
|
|
73
|
+
- **Local-first and non-destructive.** The index lives in a side-car location outside the
|
|
74
|
+
folder you're searching; your files are only ever read.
|
|
75
|
+
- **Incremental.** Re-indexing only touches files that changed since the last run, and can
|
|
76
|
+
run on a daily schedule automatically.
|
|
77
|
+
|
|
78
|
+
## How it works
|
|
79
|
+
|
|
80
|
+
```
|
|
81
|
+
┌───────────────────────────────────────────────┐
|
|
82
|
+
│ MCP server / CLI / browser UI │
|
|
83
|
+
│ search_sparse · search_dense · search_kb │
|
|
84
|
+
│ search_graph · search_hybrid │
|
|
85
|
+
│ index_status · reindex │
|
|
86
|
+
└───────────────────────┬─────────────────────────┘
|
|
87
|
+
│
|
|
88
|
+
┌───────────────┬───────────────────┼────────────────┬───────────────┐
|
|
89
|
+
│ │ │ │
|
|
90
|
+
┌──────▼──────┐ ┌──────▼──────┐ ┌───────▼──────┐ ┌───────▼──────┐
|
|
91
|
+
│ Sparse │ │ Dense │ │ Knowledge │ │ Folder │
|
|
92
|
+
│ (Pyserini │ │ (embeddings │ │ base │ │ metadata │
|
|
93
|
+
│ BM25, │ │ + Zvec ANN, │ │ (Obsidian │ │ graph │
|
|
94
|
+
│ bilingual) │ │ off by │ │ graph, live │ │ (paths only, │
|
|
95
|
+
│ │ │ default) │ │ vault) │ │ no content) │
|
|
96
|
+
└──────┬──────┘ └──────┬──────┘ └───────┬──────┘ └───────┬──────┘
|
|
97
|
+
└───────────────┴───────────────────┴────────────────┘
|
|
98
|
+
│ reciprocal rank fusion
|
|
99
|
+
┌──────▼──────┐
|
|
100
|
+
│ Fusion │
|
|
101
|
+
└──────┬──────┘
|
|
102
|
+
│
|
|
103
|
+
┌─────────────▼──────────────┐
|
|
104
|
+
│ Side-car index store │
|
|
105
|
+
│ ~/.infogrep/indexes/<dir>/ │
|
|
106
|
+
└─────────────▲──────────────┘
|
|
107
|
+
│
|
|
108
|
+
┌───────────────────────────┴────────────────────────────┐
|
|
109
|
+
│ Ingestion pipeline │
|
|
110
|
+
│ walk (include/exclude globs) → extract (per file type) │
|
|
111
|
+
│ → chunk into passages → index (sparse/dense) │
|
|
112
|
+
│ → build folder/filename metadata graph │
|
|
113
|
+
│ → manifest.sqlite tracks hash/mtime for deltas │
|
|
114
|
+
└──────────────────────────────────────────────────────────┘
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
1. **Walk** the target directory, respecting include/exclude glob patterns.
|
|
118
|
+
2. **Extract** text per file type (PDF via PyMuPDF, DOCX/PPTX/XLSX via python-docx/python-pptx/
|
|
119
|
+
openpyxl, legacy `.doc` via macOS `textutil`, everything else as UTF-8 text). Files with
|
|
120
|
+
no extractable content are still indexed by file name/path, so they're findable.
|
|
121
|
+
3. **Chunk** long documents into overlapping passages (`{doc_id, passage_id, text, path,
|
|
122
|
+
page}`), preserving page numbers for citations.
|
|
123
|
+
4. **Index** passages into a **manifest** (SQLite: path → hash/mtime/size, for change
|
|
124
|
+
detection) plus **sparse** (Lucene/BM25 via Pyserini) and, optionally, **dense**
|
|
125
|
+
(embeddings in a Zvec vector store) indexes.
|
|
126
|
+
5. **Build the folder/filename metadata graph** from every indexed file's *path* (never its
|
|
127
|
+
content): a folder tree materialized as an Obsidian-compatible vault of linked notes
|
|
128
|
+
(browsable in Obsidian) plus a compact JSON form used for fast lookups.
|
|
129
|
+
6. **Retrieve** via any of the four retrievers, or all of them fused with **reciprocal rank
|
|
130
|
+
fusion (RRF)** — no tuning required, and each retriever can be skipped gracefully if it
|
|
131
|
+
isn't enabled or available. The metadata graph lets a hit's *folder* pull in sibling files
|
|
132
|
+
too, not only files whose own content matched.
|
|
133
|
+
7. **Re-index incrementally**: a manifest diff classifies files as added/modified/deleted, so
|
|
134
|
+
only the delta is re-extracted, re-chunked, and re-indexed — a no-op run does nothing (the
|
|
135
|
+
metadata graph rebuilds whenever files are added/removed — cheap, since it's just paths).
|
|
136
|
+
|
|
137
|
+
The index is **never** written into the folder you're searching — it lives under
|
|
138
|
+
`$INFOGREP_HOME/indexes/<name>-<hash>/` (default `~/.infogrep`), so your directory's
|
|
139
|
+
structure and git history stay untouched.
|
|
140
|
+
|
|
141
|
+
## What it can search
|
|
142
|
+
|
|
143
|
+
| Category | Types |
|
|
144
|
+
|---|---|
|
|
145
|
+
| Documents | `pdf` `doc` `docx` `ppt` `pptx` `xls` `xlsx` `rtf` `odt` `ods` `odp` |
|
|
146
|
+
| Text & markup | `txt` `md` `markdown` `rst` `tex` `csv` `tsv` `json` `jsonl` |
|
|
147
|
+
| Images (name/path; content with OCR) | `png` `jpg` `jpeg` `gif` `bmp` `tif` `tiff` `webp` `svg` `heic` `heif` |
|
|
148
|
+
|
|
149
|
+
This is the default; set `include = ["**/*"]` in a directory's config to index every file
|
|
150
|
+
(anything without a dedicated extractor is still indexed by name/path). Dependency, VCS, and
|
|
151
|
+
cache trees (`node_modules`, `.git`, `.venv`, `__pycache__`, …) and editor/OS junk are
|
|
152
|
+
excluded by default.
|
|
153
|
+
|
|
154
|
+
Sparse search is **multi-field**: queries match passage text *and* the file name/path
|
|
155
|
+
(tokenized, independently weighted), so you can find a file by what it's called, not only
|
|
156
|
+
what it says. Sparse indexing is **bilingual by default** (`en+zh`): English gets Porter
|
|
157
|
+
stemming, Chinese/Japanese/Korean get CJK bigram analysis, in a single index — switch to a
|
|
158
|
+
single language with `[sparse] language`.
|
|
159
|
+
|
|
160
|
+
## Install
|
|
161
|
+
|
|
162
|
+
Requires [`uv`](https://docs.astral.sh/uv/) and JDK 21 for sparse search
|
|
163
|
+
(`brew install openjdk@21`).
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
git clone https://github.com/QingyaoAi/InfoGrep.git
|
|
167
|
+
cd InfoGrep
|
|
168
|
+
uv sync --extra dev # create venv + install deps
|
|
169
|
+
uv run infogrep --help # show command surface
|
|
170
|
+
uv run pytest # run tests
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### Install as a macOS app (optional)
|
|
174
|
+
|
|
175
|
+
The installer sets up the Python backend, builds a Spotlight-style menu-bar app, starts it
|
|
176
|
+
(and the search backend) at login, and registers the Claude Code MCP server:
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
./install.sh # INFOGREP_SERVE_DIR=/path sets the default folder; INFOGREP_PORT changes the port
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
Then press **⌘⇧Space** for the launcher, or open <http://127.0.0.1:7421>. Add folders to
|
|
183
|
+
search from the app (**Index a Folder…**) or the web UI (**+ folder**).
|
|
184
|
+
|
|
185
|
+
Building the app additionally requires Xcode Command Line Tools (`xcode-select --install`). The app is
|
|
186
|
+
ad-hoc signed, so the first launch needs a right-click → **Open** (one time).
|
|
187
|
+
|
|
188
|
+
Remove everything cleanly:
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
./uninstall.sh # removes the app, login agents and MCP server (keeps indexes)
|
|
192
|
+
./uninstall.sh --purge # also delete all indexes (~/.infogrep)
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
`make install` / `make uninstall` / `make purge` are equivalent; run `make` to list all
|
|
196
|
+
targets (`sync`, `app`, `test`, `lint`, …).
|
|
197
|
+
|
|
198
|
+
## Usage
|
|
199
|
+
|
|
200
|
+
### CLI
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
infogrep index <dir> # build / update the index for a directory
|
|
204
|
+
infogrep search <query> -d <dir> # query (modes: hybrid [default] | sparse | dense | kb | graph)
|
|
205
|
+
infogrep search <query> --prf # sparse query expansion (RM3)
|
|
206
|
+
infogrep status <dir> # index status + staleness (pending changes)
|
|
207
|
+
infogrep mcp --dir <dir> # run the MCP server (stdio) for coding agents
|
|
208
|
+
infogrep serve --dir <dir> # browser UI to test search (http://127.0.0.1:7421)
|
|
209
|
+
infogrep schedule install <dir> --at 03:00 # daily auto-reindex via launchd
|
|
210
|
+
infogrep schedule list | uninstall <dir>
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
`infogrep status <dir>` prints the exact index location and reports **staleness** — files
|
|
214
|
+
added/modified/deleted since the last index — so you know when a manual `infogrep index` is
|
|
215
|
+
worth running.
|
|
216
|
+
|
|
217
|
+
### MCP server (Claude Code / Codex)
|
|
218
|
+
|
|
219
|
+
Register InfoGrep as an MCP server so an agent can search your files as a tool call:
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
claude mcp add infogrep -- uv run infogrep mcp --dir /path/to/your/project
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
Tools exposed: `search_sparse`, `search_dense`, `search_kb`, `search_graph`,
|
|
226
|
+
`search_hybrid`, `index_status`, `reindex`. Each search tool returns `{"results": [...]}`
|
|
227
|
+
where every result carries `path`, `page`, `snippet`, `score`, and `retriever` for easy
|
|
228
|
+
citation. `search_hybrid` (recommended) fuses whichever retrievers are enabled and reports
|
|
229
|
+
which were `used` vs. `skipped` (and why).
|
|
230
|
+
|
|
231
|
+
### Browser UI
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
infogrep serve --dir <dir> # http://127.0.0.1:7421 by default
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
A search box, a mode selector (hybrid/sparse/dense/kb/graph), result snippets with path/page/
|
|
238
|
+
score, folder management (add/switch indexed directories), and a JSON API at
|
|
239
|
+
`/api/search` and `/api/status`. Bound to localhost only.
|
|
240
|
+
|
|
241
|
+
### Folder/filename metadata graph
|
|
242
|
+
|
|
243
|
+
On every reindex, InfoGrep builds a knowledge graph over your folder structure — each file's
|
|
244
|
+
*path and name only, never its content* — and materializes it as an Obsidian-compatible vault
|
|
245
|
+
of linked folder notes under the index's `graph_vault/` side-car directory (open it in
|
|
246
|
+
Obsidian to browse, if you like). `search_graph` matches a query against folder/file *names*,
|
|
247
|
+
then expands to neighboring folders (parent, children, siblings) so files that live in the
|
|
248
|
+
most relevant folder(s) surface too — not just files whose own name/content matched. It
|
|
249
|
+
participates in `search_hybrid` automatically, letting one hit pull in its co-located
|
|
250
|
+
siblings. On by default (it's cheap — just path manipulation, no model or JVM):
|
|
251
|
+
|
|
252
|
+
```toml
|
|
253
|
+
[graph]
|
|
254
|
+
enabled = true # set false to disable
|
|
255
|
+
hops = 1 # folder hops to expand from a matched folder (parent/children/siblings)
|
|
256
|
+
max_folders = 5 # top-scoring folders to expand into file candidates per query
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
### Knowledge base (Obsidian vault)
|
|
260
|
+
|
|
261
|
+
`search_kb` adds graph-aware search over an Obsidian vault via the **Obsidian CLI**: it
|
|
262
|
+
searches the live vault, then expands along links/backlinks so notes *connected* to a match
|
|
263
|
+
surface too — always current, no separate index. Requires the Obsidian app running with the
|
|
264
|
+
vault open. Enable per directory:
|
|
265
|
+
|
|
266
|
+
```toml
|
|
267
|
+
[kb]
|
|
268
|
+
enabled = true
|
|
269
|
+
vault = "My Vault" # Obsidian vault name; omit to use the CLI's active vault
|
|
270
|
+
hops = 1 # link hops to expand (follows links + backlinks)
|
|
271
|
+
# cli = "obsidian" # path to the Obsidian CLI, if not on PATH
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
If the app isn't running, `search_kb` is skipped (in hybrid) or reports a clear error
|
|
275
|
+
(standalone).
|
|
276
|
+
|
|
277
|
+
### Scanned PDFs (OCR)
|
|
278
|
+
|
|
279
|
+
PDFs with no text layer can be OCR'd at ingest time (requires `tesseract`):
|
|
280
|
+
|
|
281
|
+
```toml
|
|
282
|
+
[ingest]
|
|
283
|
+
ocr = true # OCR pages with little/no extractable text
|
|
284
|
+
ocr_min_chars = 16 # threshold below which a page is OCR'd
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
### Daily auto-reindex
|
|
288
|
+
|
|
289
|
+
`infogrep schedule install <dir>` registers a macOS `launchd` agent that reindexes the
|
|
290
|
+
directory once a day (logs to the index dir's `reindex.log`).
|
|
291
|
+
|
|
292
|
+
## Configuration reference
|
|
293
|
+
|
|
294
|
+
Config is TOML, read from (in order) a global `$INFOGREP_HOME/config.toml`, then a
|
|
295
|
+
per-directory override at that index's `config.toml` (path shown by `infogrep status`).
|
|
296
|
+
|
|
297
|
+
```toml
|
|
298
|
+
include = ["**/*.pdf", "**/*.docx", "..."] # default: documents + images, see table above
|
|
299
|
+
exclude = ["**/node_modules/**", "..."] # default: VCS/deps/cache/OS junk
|
|
300
|
+
|
|
301
|
+
[chunk]
|
|
302
|
+
size = 512 # target passage size (tokens/words)
|
|
303
|
+
overlap = 64 # overlap between adjacent passages
|
|
304
|
+
|
|
305
|
+
[ingest]
|
|
306
|
+
ocr = false # OCR scanned PDF pages
|
|
307
|
+
ocr_min_chars = 16 # page text below this length triggers OCR
|
|
308
|
+
workers = 0 # parallel extraction processes; 0 = auto (min(8, cpu count))
|
|
309
|
+
|
|
310
|
+
[sparse]
|
|
311
|
+
enabled = true
|
|
312
|
+
prf = false # RM3 pseudo-relevance feedback
|
|
313
|
+
prf_fb_docs = 10
|
|
314
|
+
prf_fb_terms = 10
|
|
315
|
+
language = "en+zh" # "en" | "zh" | "ja" | "ko" | "en+zh" (changing re-indexes)
|
|
316
|
+
field_boosts = { contents = 1.0, filename = 2.0, pathtext = 1.0 }
|
|
317
|
+
|
|
318
|
+
[dense]
|
|
319
|
+
enabled = false # off by default: needs a model + RAM/GPU
|
|
320
|
+
embedder = "qwen" # registry key; see infogrep.retrieval.embedders
|
|
321
|
+
model_name = "Qwen/Qwen3-Embedding-0.6B"
|
|
322
|
+
device = "auto" # "auto" -> mps/cuda/cpu
|
|
323
|
+
|
|
324
|
+
[kb]
|
|
325
|
+
enabled = false
|
|
326
|
+
vault = "" # Obsidian vault name; empty -> the CLI's active vault
|
|
327
|
+
cli = "obsidian"
|
|
328
|
+
hops = 1
|
|
329
|
+
search_limit = 10
|
|
330
|
+
|
|
331
|
+
[graph]
|
|
332
|
+
enabled = true # folder/filename metadata graph; cheap, on by default
|
|
333
|
+
hops = 1 # folder hops to expand from a matched folder
|
|
334
|
+
max_folders = 5 # top-scoring folders to expand into file candidates per query
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
With dense off (the default), `hybrid` simply runs sparse and the metadata graph (plus the
|
|
338
|
+
knowledge base, if enabled) — no model download needed until you opt in.
|
|
339
|
+
|
|
340
|
+
## Development
|
|
341
|
+
|
|
342
|
+
```bash
|
|
343
|
+
make sync # create/refresh the dev virtualenv
|
|
344
|
+
make test # run the test suite
|
|
345
|
+
make lint # ruff + shellcheck
|
|
346
|
+
make app # build the macOS menu-bar app
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
See [PLAN.md](PLAN.md) for the full architecture write-up and milestone history.
|
|
350
|
+
|
|
351
|
+
## License
|
|
352
|
+
|
|
353
|
+
[MIT](LICENSE)
|