pyqmd-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyqmd-0.1.0/.github/workflows/docs.yml +38 -0
- pyqmd-0.1.0/.github/workflows/pypi_release.yml +44 -0
- pyqmd-0.1.0/.gitignore +5 -0
- pyqmd-0.1.0/BACKLOG.md +13 -0
- pyqmd-0.1.0/PKG-INFO +24 -0
- pyqmd-0.1.0/README.md +3 -0
- pyqmd-0.1.0/docs/IMPLEMENTATION.md +358 -0
- pyqmd-0.1.0/docs/VISION.md +82 -0
- pyqmd-0.1.0/docs/api.md +247 -0
- pyqmd-0.1.0/docs/architecture.md +86 -0
- pyqmd-0.1.0/docs/cli.md +147 -0
- pyqmd-0.1.0/docs/getting-started/installation.md +43 -0
- pyqmd-0.1.0/docs/getting-started/quickstart.md +63 -0
- pyqmd-0.1.0/docs/guide/collections.md +44 -0
- pyqmd-0.1.0/docs/guide/configuration.md +88 -0
- pyqmd-0.1.0/docs/guide/graphrag.md +153 -0
- pyqmd-0.1.0/docs/guide/indexing.md +112 -0
- pyqmd-0.1.0/docs/guide/searching.md +91 -0
- pyqmd-0.1.0/docs/index.md +117 -0
- pyqmd-0.1.0/docs/overrides/.gitkeep +0 -0
- pyqmd-0.1.0/docs/research/advanced-retrieval-techniques.md +181 -0
- pyqmd-0.1.0/docs/research/python-ecosystem.md +113 -0
- pyqmd-0.1.0/docs/research/qmd-architecture.md +109 -0
- pyqmd-0.1.0/docs/stylesheets/iris.css +256 -0
- pyqmd-0.1.0/docs/superpowers/plans/2026-04-01-pyqmd-tier1.md +3403 -0
- pyqmd-0.1.0/docs/superpowers/plans/2026-04-05-pepper-foundation.md +1298 -0
- pyqmd-0.1.0/docs/superpowers/specs/2026-04-01-pyqmd-design.md +337 -0
- pyqmd-0.1.0/docs/superpowers/specs/2026-04-05-pepper-foundation-design.md +177 -0
- pyqmd-0.1.0/mkdocs.yml +61 -0
- pyqmd-0.1.0/pyproject.toml +46 -0
- pyqmd-0.1.0/src/pyqmd/__init__.py +6 -0
- pyqmd-0.1.0/src/pyqmd/chunking/__init__.py +5 -0
- pyqmd-0.1.0/src/pyqmd/chunking/frontmatter.py +34 -0
- pyqmd-0.1.0/src/pyqmd/chunking/markdown.py +235 -0
- pyqmd-0.1.0/src/pyqmd/chunking/scoring.py +68 -0
- pyqmd-0.1.0/src/pyqmd/cli.py +363 -0
- pyqmd-0.1.0/src/pyqmd/config.py +158 -0
- pyqmd-0.1.0/src/pyqmd/core.py +309 -0
- pyqmd-0.1.0/src/pyqmd/embeddings/__init__.py +6 -0
- pyqmd-0.1.0/src/pyqmd/embeddings/base.py +14 -0
- pyqmd-0.1.0/src/pyqmd/embeddings/sentence_transformers.py +37 -0
- pyqmd-0.1.0/src/pyqmd/graph/__init__.py +1 -0
- pyqmd-0.1.0/src/pyqmd/graph/engine.py +260 -0
- pyqmd-0.1.0/src/pyqmd/indexing/__init__.py +5 -0
- pyqmd-0.1.0/src/pyqmd/indexing/contextual.py +125 -0
- pyqmd-0.1.0/src/pyqmd/indexing/hasher.py +37 -0
- pyqmd-0.1.0/src/pyqmd/indexing/pipeline.py +146 -0
- pyqmd-0.1.0/src/pyqmd/models.py +76 -0
- pyqmd-0.1.0/src/pyqmd/progress.py +94 -0
- pyqmd-0.1.0/src/pyqmd/retrieval/__init__.py +5 -0
- pyqmd-0.1.0/src/pyqmd/retrieval/fusion.py +16 -0
- pyqmd-0.1.0/src/pyqmd/retrieval/hyde.py +74 -0
- pyqmd-0.1.0/src/pyqmd/retrieval/parent.py +23 -0
- pyqmd-0.1.0/src/pyqmd/retrieval/pipeline.py +163 -0
- pyqmd-0.1.0/src/pyqmd/retrieval/rerank.py +17 -0
- pyqmd-0.1.0/src/pyqmd/storage/__init__.py +6 -0
- pyqmd-0.1.0/src/pyqmd/storage/base.py +38 -0
- pyqmd-0.1.0/src/pyqmd/storage/lancedb_backend.py +143 -0
- pyqmd-0.1.0/src/pyqmd/watch.py +200 -0
- pyqmd-0.1.0/tests/conftest.py +44 -0
- pyqmd-0.1.0/tests/fixtures/sample_markdown/large.md +69 -0
- pyqmd-0.1.0/tests/fixtures/sample_markdown/nested_headings.md +33 -0
- pyqmd-0.1.0/tests/fixtures/sample_markdown/simple.md +15 -0
- pyqmd-0.1.0/tests/fixtures/sample_markdown/with_code.md +24 -0
- pyqmd-0.1.0/tests/fixtures/sample_markdown/with_frontmatter.md +18 -0
- pyqmd-0.1.0/tests/test_chunking.py +75 -0
- pyqmd-0.1.0/tests/test_cli.py +64 -0
- pyqmd-0.1.0/tests/test_config.py +154 -0
- pyqmd-0.1.0/tests/test_core.py +45 -0
- pyqmd-0.1.0/tests/test_embeddings.py +42 -0
- pyqmd-0.1.0/tests/test_frontmatter.py +44 -0
- pyqmd-0.1.0/tests/test_fusion.py +38 -0
- pyqmd-0.1.0/tests/test_hasher.py +41 -0
- pyqmd-0.1.0/tests/test_indexing.py +51 -0
- pyqmd-0.1.0/tests/test_models.py +96 -0
- pyqmd-0.1.0/tests/test_parent.py +44 -0
- pyqmd-0.1.0/tests/test_rerank.py +33 -0
- pyqmd-0.1.0/tests/test_retrieval.py +86 -0
- pyqmd-0.1.0/tests/test_scoring.py +58 -0
- pyqmd-0.1.0/tests/test_storage.py +93 -0
- pyqmd-0.1.0/tests/test_watch.py +50 -0
- pyqmd-0.1.0/uv.lock +4237 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
name: Deploy Docs
|
|
2
|
+
on:
|
|
3
|
+
push:
|
|
4
|
+
branches: [master]
|
|
5
|
+
workflow_dispatch:
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
contents: read
|
|
9
|
+
pages: write
|
|
10
|
+
id-token: write
|
|
11
|
+
|
|
12
|
+
concurrency:
|
|
13
|
+
group: "pages"
|
|
14
|
+
cancel-in-progress: true
|
|
15
|
+
|
|
16
|
+
jobs:
|
|
17
|
+
build:
|
|
18
|
+
runs-on: ubuntu-latest
|
|
19
|
+
steps:
|
|
20
|
+
- uses: actions/checkout@v4
|
|
21
|
+
- uses: actions/setup-python@v5
|
|
22
|
+
with:
|
|
23
|
+
python-version: '3.12'
|
|
24
|
+
- run: pip install mkdocs-material
|
|
25
|
+
- run: mkdocs build --strict
|
|
26
|
+
- uses: actions/upload-pages-artifact@v3
|
|
27
|
+
with:
|
|
28
|
+
path: site
|
|
29
|
+
|
|
30
|
+
deploy:
|
|
31
|
+
needs: build
|
|
32
|
+
runs-on: ubuntu-latest
|
|
33
|
+
environment:
|
|
34
|
+
name: github-pages
|
|
35
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
|
36
|
+
steps:
|
|
37
|
+
- id: deployment
|
|
38
|
+
uses: actions/deploy-pages@v4
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
steps:
|
|
11
|
+
- uses: actions/checkout@v4
|
|
12
|
+
|
|
13
|
+
- name: Set up Python
|
|
14
|
+
uses: actions/setup-python@v5
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.12"
|
|
17
|
+
|
|
18
|
+
- name: Install build dependencies
|
|
19
|
+
run: pip install build
|
|
20
|
+
|
|
21
|
+
- name: Build package
|
|
22
|
+
run: python -m build
|
|
23
|
+
|
|
24
|
+
- name: Upload build artifacts
|
|
25
|
+
uses: actions/upload-artifact@v4
|
|
26
|
+
with:
|
|
27
|
+
name: dist
|
|
28
|
+
path: dist/
|
|
29
|
+
|
|
30
|
+
publish:
|
|
31
|
+
needs: build
|
|
32
|
+
runs-on: ubuntu-latest
|
|
33
|
+
environment: pypi
|
|
34
|
+
permissions:
|
|
35
|
+
id-token: write
|
|
36
|
+
steps:
|
|
37
|
+
- name: Download build artifacts
|
|
38
|
+
uses: actions/download-artifact@v4
|
|
39
|
+
with:
|
|
40
|
+
name: dist
|
|
41
|
+
path: dist/
|
|
42
|
+
|
|
43
|
+
- name: Publish to PyPI
|
|
44
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
pyqmd-0.1.0/.gitignore
ADDED
pyqmd-0.1.0/BACKLOG.md
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Backlog
|
|
2
|
+
|
|
3
|
+
## Completed
|
|
4
|
+
|
|
5
|
+
- [x] **Config migration to TOML + Pydantic** — Replaced JSON config with TOML on disk and Pydantic models in memory. Per-collection overrides, watch/search config sections.
|
|
6
|
+
- [x] **FTS index fix** — Moved full-text search index creation from query time (where it ran on every search) to store time.
|
|
7
|
+
- [x] **Path-prefix search filter** — `--path-prefix` option on `qmd search` to restrict results by file path. Configurable overfetch multiplier.
|
|
8
|
+
- [x] **Watch command** — `qmd watch` with watchdog filesystem events, optional polling fallback, configurable debounce and ignore patterns.
|
|
9
|
+
|
|
10
|
+
## Tech Debt
|
|
11
|
+
|
|
12
|
+
- [ ] **diskcache unsafe pickle deserialization (CVE-2025-69872)** — Transitive dep via `nano-graphrag` -> `dspy` -> `diskcache<=5.6.3`. No patched version available yet. Dismissed as tolerable risk since exploitation requires local write access to cache dir. Revisit when `diskcache` releases a fix. [Dependabot alert #1](https://github.com/jeffrichley/pyqmd/security/dependabot/1)
|
|
13
|
+
- [ ] **Default value duplication** — Default values (chunk_size=800, embed_model, etc.) are duplicated between `config.py` load() fallbacks and `Collection` model fields. Low risk, but the two copies could diverge.
|
pyqmd-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyqmd
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python-native local search engine for markdown files. Hybrid BM25 + vector search with reranking.
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: lancedb>=0.6.0
|
|
7
|
+
Requires-Dist: markdown-it-py>=3.0.0
|
|
8
|
+
Requires-Dist: nano-graphrag>=0.0.8.2
|
|
9
|
+
Requires-Dist: pyarrow>=14.0.0
|
|
10
|
+
Requires-Dist: pydantic>=2.0
|
|
11
|
+
Requires-Dist: pyyaml>=6.0
|
|
12
|
+
Requires-Dist: rich>=13.0.0
|
|
13
|
+
Requires-Dist: sentence-transformers>=2.2.0
|
|
14
|
+
Requires-Dist: tomli-w>=1.0.0
|
|
15
|
+
Requires-Dist: typer>=0.9.0
|
|
16
|
+
Requires-Dist: watchdog>=3.0.0
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: pytest-tmp-files>=0.0.2; extra == 'dev'
|
|
19
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# pyqmd
|
|
23
|
+
|
|
24
|
+
Python-native local search engine for markdown files. Hybrid BM25 + vector search with reranking.
|
pyqmd-0.1.0/README.md
ADDED
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
# py-qmd Implementation Plan
|
|
2
|
+
|
|
3
|
+
## Tech Stack
|
|
4
|
+
|
|
5
|
+
| Component | Choice | Rationale |
|
|
6
|
+
|-----------|--------|-----------|
|
|
7
|
+
| Package manager | uv | User preference, fast, modern |
|
|
8
|
+
| CLI framework | Typer | User preference, clean API |
|
|
9
|
+
| Logging | Rich | User preference, beautiful output |
|
|
10
|
+
| Primary storage | LanceDB | Embedded, native hybrid search, zero-config |
|
|
11
|
+
| Alt storage | SQLite + FTS5 + sqlite-vec | QMD-parity option, more control |
|
|
12
|
+
| Embeddings | sentence-transformers | Pluggable, local-first, huge model ecosystem |
|
|
13
|
+
| Default embed model | all-MiniLM-L6-v2 | Fast, good quality, 384 dims |
|
|
14
|
+
| Reranking | cross-encoder (sentence-transformers) | Simple, effective, upgradable to ColBERT |
|
|
15
|
+
| Markdown parsing | markdown-it-py or mistune | Fast, extensible |
|
|
16
|
+
| Document conversion | markitdown (Microsoft) | PDF/DOCX/PPTX → markdown |
|
|
17
|
+
|
|
18
|
+
## Architecture
|
|
19
|
+
|
|
20
|
+
```
|
|
21
|
+
┌──────────────────────────────────────────────────────┐
|
|
22
|
+
│ py-qmd CLI (Typer) │
|
|
23
|
+
│ qmd add | qmd search | qmd index | qmd status │
|
|
24
|
+
├──────────────────────────────────────────────────────┤
|
|
25
|
+
│ py-qmd Python API │
|
|
26
|
+
│ PyQMD.add_collection() | .search() | .index() │
|
|
27
|
+
├──────────────┬───────────────────┬───────────────────┤
|
|
28
|
+
│ Indexing │ Querying │ Management │
|
|
29
|
+
│ Pipeline │ Pipeline │ │
|
|
30
|
+
├──────────────┼───────────────────┼───────────────────┤
|
|
31
|
+
│ Storage Layer (pluggable) │
|
|
32
|
+
│ LanceDB | SQLite+FTS5+sqlite-vec │
|
|
33
|
+
├──────────────────────────────────────────────────────┤
|
|
34
|
+
│ Embedding Layer (pluggable) │
|
|
35
|
+
│ sentence-transformers | GGUF | API-based │
|
|
36
|
+
├──────────────────────────────────────────────────────┤
|
|
37
|
+
│ Reranking Layer (pluggable) │
|
|
38
|
+
│ cross-encoder | ColBERT | local LLM | none │
|
|
39
|
+
└──────────────────────────────────────────────────────┘
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Tier 1: QMD Parity + Quick Wins
|
|
43
|
+
|
|
44
|
+
**Goal:** A working py-qmd that matches QMD's core functionality plus two easy
|
|
45
|
+
high-impact additions (contextual retrieval, parent-child retrieval).
|
|
46
|
+
|
|
47
|
+
### 1.1 Core Data Model
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
@dataclass
|
|
51
|
+
class Chunk:
|
|
52
|
+
id: str # SHA-256 hash of content
|
|
53
|
+
content: str # The actual text
|
|
54
|
+
context: str | None # LLM-generated context prefix (contextual retrieval)
|
|
55
|
+
source_file: str # Path to source markdown file
|
|
56
|
+
collection: str # Collection name
|
|
57
|
+
heading_path: list[str] # ["H1 title", "H2 title", "H3 title"]
|
|
58
|
+
parent_id: str | None # ID of parent chunk (parent-child retrieval)
|
|
59
|
+
start_line: int # Line number in source file
|
|
60
|
+
end_line: int # Line number in source file
|
|
61
|
+
metadata: dict # Arbitrary metadata (tags, dates, etc.)
|
|
62
|
+
|
|
63
|
+
@dataclass
|
|
64
|
+
class SearchResult:
|
|
65
|
+
chunk: Chunk
|
|
66
|
+
score: float # Combined score after fusion
|
|
67
|
+
bm25_score: float | None # Individual BM25 score
|
|
68
|
+
vector_score: float | None # Individual vector similarity score
|
|
69
|
+
rerank_score: float | None # Reranker score (if reranking enabled)
|
|
70
|
+
|
|
71
|
+
@dataclass
|
|
72
|
+
class Collection:
|
|
73
|
+
name: str
|
|
74
|
+
paths: list[str] # Directories to index
|
|
75
|
+
mask: str # Glob pattern (default: "**/*.md")
|
|
76
|
+
config: CollectionConfig # Per-collection settings
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### 1.2 Markdown-Aware Chunking
|
|
80
|
+
|
|
81
|
+
Custom chunker inspired by QMD's scoring algorithm:
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
BREAK_SCORES = {
|
|
85
|
+
"h1": 100, # # Heading 1
|
|
86
|
+
"h2": 90, # ## Heading 2
|
|
87
|
+
"h3": 80, # ### Heading 3
|
|
88
|
+
"h4": 70, # #### Heading 4
|
|
89
|
+
"code_block_end": 85, # End of fenced code block
|
|
90
|
+
"hr": 75, # Horizontal rule / thematic break
|
|
91
|
+
"blank_line": 50, # Empty line between paragraphs
|
|
92
|
+
"list_end": 45, # End of a list
|
|
93
|
+
"blockquote_end": 40, # End of a blockquote
|
|
94
|
+
}
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
**Rules:**
|
|
98
|
+
- Target chunk size: ~800 tokens (configurable)
|
|
99
|
+
- Overlap: 15% (configurable)
|
|
100
|
+
- Never split inside fenced code blocks
|
|
101
|
+
- Never split inside tables
|
|
102
|
+
- Preserve heading hierarchy as metadata on each chunk
|
|
103
|
+
- Parent-child: each chunk stores its parent heading's chunk ID
|
|
104
|
+
|
|
105
|
+
### 1.3 Indexing Pipeline
|
|
106
|
+
|
|
107
|
+
```
|
|
108
|
+
File detected (new or modified)
|
|
109
|
+
│
|
|
110
|
+
▼
|
|
111
|
+
Parse markdown → identify structure (headings, code blocks, etc.)
|
|
112
|
+
│
|
|
113
|
+
▼
|
|
114
|
+
Split into chunks using break-point scoring
|
|
115
|
+
│
|
|
116
|
+
▼
|
|
117
|
+
[Optional] Generate context prefix via LLM (contextual retrieval)
|
|
118
|
+
│
|
|
119
|
+
▼
|
|
120
|
+
Compute embeddings (sentence-transformers)
|
|
121
|
+
│
|
|
122
|
+
▼
|
|
123
|
+
Store in LanceDB (text + vector + metadata)
|
|
124
|
+
│
|
|
125
|
+
▼
|
|
126
|
+
Update file hash registry (for incremental updates)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
**Incremental indexing:** Track file hashes (SHA-256 of file content). On re-index,
|
|
130
|
+
skip unchanged files. When a file changes, remove all its chunks and re-index it.
|
|
131
|
+
|
|
132
|
+
### 1.4 Query Pipeline
|
|
133
|
+
|
|
134
|
+
```
|
|
135
|
+
User query
|
|
136
|
+
│
|
|
137
|
+
▼
|
|
138
|
+
[Optional] Query expansion (keyword variants, domain terms)
|
|
139
|
+
│
|
|
140
|
+
▼
|
|
141
|
+
┌───────────┐ ┌──────────────┐
|
|
142
|
+
│ BM25 │ │ Vector │
|
|
143
|
+
│ search │ │ search │
|
|
144
|
+
└─────┬─────┘ └──────┬───────┘
|
|
145
|
+
│ │
|
|
146
|
+
▼ ▼
|
|
147
|
+
Reciprocal Rank Fusion (k=60)
|
|
148
|
+
│
|
|
149
|
+
▼
|
|
150
|
+
[Optional] Cross-encoder reranking
|
|
151
|
+
│
|
|
152
|
+
▼
|
|
153
|
+
[Optional] Parent expansion (return parent chunks for context)
|
|
154
|
+
│
|
|
155
|
+
▼
|
|
156
|
+
Return top-K SearchResults
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### 1.5 CLI Commands
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
# Collection management
|
|
163
|
+
qmd add <name> <path> [--mask "**/*.md"]
|
|
164
|
+
qmd remove <name>
|
|
165
|
+
qmd list # List all collections
|
|
166
|
+
qmd status [name] # Show index stats
|
|
167
|
+
|
|
168
|
+
# Indexing
|
|
169
|
+
qmd index [name] # Index/re-index a collection (or all)
|
|
170
|
+
qmd index --full # Force full re-index (ignore hashes)
|
|
171
|
+
|
|
172
|
+
# Searching
|
|
173
|
+
qmd search "query text" # Search all collections
|
|
174
|
+
qmd search "query" --collection <name> # Search specific collection
|
|
175
|
+
qmd search "query" --top-k 10 # Limit results
|
|
176
|
+
qmd search "query" --no-rerank # Skip reranking step
|
|
177
|
+
qmd search "query" --expand # Enable parent chunk expansion
|
|
178
|
+
qmd search "query" --hyde # Enable HyDE (Tier 2)
|
|
179
|
+
|
|
180
|
+
# Configuration
|
|
181
|
+
qmd config # Show current config
|
|
182
|
+
qmd config set embed_model <model> # Change embedding model
|
|
183
|
+
qmd config set chunk_size 800 # Change target chunk size
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### 1.6 Python API
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
from pyqmd import PyQMD
|
|
190
|
+
|
|
191
|
+
# Initialize
|
|
192
|
+
qmd = PyQMD(data_dir="~/.py-qmd")
|
|
193
|
+
|
|
194
|
+
# Add and index a collection
|
|
195
|
+
qmd.add_collection("notes", paths=["~/notes"], mask="**/*.md")
|
|
196
|
+
qmd.index("notes")
|
|
197
|
+
|
|
198
|
+
# Search
|
|
199
|
+
results = qmd.search("how to handle NaN values", top_k=5)
|
|
200
|
+
for result in results:
|
|
201
|
+
print(f"{result.score:.3f} | {result.chunk.source_file}")
|
|
202
|
+
print(f" {result.chunk.heading_path}")
|
|
203
|
+
print(f" {result.chunk.content[:200]}")
|
|
204
|
+
|
|
205
|
+
# Search with options
|
|
206
|
+
results = qmd.search(
|
|
207
|
+
"indicator lookback period",
|
|
208
|
+
collections=["notes", "docs"],
|
|
209
|
+
top_k=10,
|
|
210
|
+
rerank=True,
|
|
211
|
+
expand_parents=True,
|
|
212
|
+
)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## Tier 2: Beyond QMD
|
|
216
|
+
|
|
217
|
+
### 2.1 HyDE (Hypothetical Document Embeddings)
|
|
218
|
+
|
|
219
|
+
At query time, generate a hypothetical answer via LLM, embed it, and use that
|
|
220
|
+
embedding for vector search. Bridges the vocabulary gap between questions and answers.
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
results = qmd.search("why does my indicator return NaN", hyde=True)
|
|
224
|
+
# Internally:
|
|
225
|
+
# 1. LLM generates: "The indicator returns NaN because the lookback period
|
|
226
|
+
# exceeds the available data..."
|
|
227
|
+
# 2. That hypothetical answer is embedded
|
|
228
|
+
# 3. Vector search uses the hypothetical embedding (closer to real answers)
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
### 2.2 ColBERT Integration
|
|
232
|
+
|
|
233
|
+
Replace or augment single-vector search with ColBERT's per-token late interaction.
|
|
234
|
+
|
|
235
|
+
```python
|
|
236
|
+
qmd.config.set("retriever", "colbert") # or "hybrid+colbert"
|
|
237
|
+
# Uses ragatouille under the hood
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
### 2.3 Advanced Query Expansion
|
|
241
|
+
|
|
242
|
+
Use an LLM to generate multiple sub-queries from a single user query:
|
|
243
|
+
|
|
244
|
+
```python
|
|
245
|
+
# User: "my bollinger bands look wrong"
|
|
246
|
+
# Expanded:
|
|
247
|
+
# - "bollinger bands incorrect values"
|
|
248
|
+
# - "technical indicator calculation error"
|
|
249
|
+
# - "rolling standard deviation pandas"
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
### 2.4 Pluggable Embedding Models
|
|
253
|
+
|
|
254
|
+
```python
|
|
255
|
+
qmd = PyQMD(embed_model="nomic-embed-text") # sentence-transformers
|
|
256
|
+
qmd = PyQMD(embed_model="gguf:model.gguf") # Local GGUF via llama-cpp
|
|
257
|
+
qmd = PyQMD(embed_model="openai:text-embedding-3-small") # API-based
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
## Tier 3: Advanced Features
|
|
261
|
+
|
|
262
|
+
### 3.1 GraphRAG
|
|
263
|
+
|
|
264
|
+
Build a knowledge graph from indexed content. Extract entities (functions, concepts,
|
|
265
|
+
error types) and relationships. Enable multi-hop queries.
|
|
266
|
+
|
|
267
|
+
```python
|
|
268
|
+
qmd.build_graph("course-qa") # Extract entities + relationships
|
|
269
|
+
results = qmd.graph_search("relationship between Sharpe ratio and volatility")
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
### 3.2 RAPTOR
|
|
273
|
+
|
|
274
|
+
Recursive summarization tree for hierarchical content. Best for static collections
|
|
275
|
+
(lecture notes, course docs) that don't change often.
|
|
276
|
+
|
|
277
|
+
```python
|
|
278
|
+
qmd.build_raptor_tree("lectures") # Cluster → summarize → recurse
|
|
279
|
+
results = qmd.search("market microstructure", strategy="raptor")
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
### 3.3 MCP Server
|
|
283
|
+
|
|
284
|
+
Expose py-qmd as an MCP server for Claude Code and other AI tools.
|
|
285
|
+
|
|
286
|
+
```bash
|
|
287
|
+
qmd serve --mcp # Start MCP server
|
|
288
|
+
qmd serve --mcp --port 8080 # Custom port
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
### 3.4 File Watching
|
|
292
|
+
|
|
293
|
+
Watch collections for changes and auto-reindex.
|
|
294
|
+
|
|
295
|
+
```bash
|
|
296
|
+
qmd watch # Watch all collections
|
|
297
|
+
qmd watch --collection notes # Watch specific collection
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
## Project Structure
|
|
301
|
+
|
|
302
|
+
```
|
|
303
|
+
py-qmd/
|
|
304
|
+
├── pyproject.toml # uv project config
|
|
305
|
+
├── README.md
|
|
306
|
+
├── src/
|
|
307
|
+
│ └── py_qmd/
|
|
308
|
+
│ ├── __init__.py # Public API
|
|
309
|
+
│ ├── cli.py # Typer CLI
|
|
310
|
+
│ ├── core.py # PyQMD main class
|
|
311
|
+
│ ├── chunking/
|
|
312
|
+
│ │ ├── __init__.py
|
|
313
|
+
│ │ ├── markdown.py # Markdown-aware chunker
|
|
314
|
+
│ │ ├── scoring.py # Break-point scoring algorithm
|
|
315
|
+
│ │ └── code.py # AST-aware code chunking (tree-sitter)
|
|
316
|
+
│ ├── indexing/
|
|
317
|
+
│ │ ├── __init__.py
|
|
318
|
+
│ │ ├── pipeline.py # Indexing pipeline orchestration
|
|
319
|
+
│ │ ├── contextual.py # Contextual retrieval (LLM context generation)
|
|
320
|
+
│ │ └── hasher.py # File hash tracking for incremental updates
|
|
321
|
+
│ ├── retrieval/
|
|
322
|
+
│ │ ├── __init__.py
|
|
323
|
+
│ │ ├── pipeline.py # Query pipeline orchestration
|
|
324
|
+
│ │ ├── bm25.py # BM25 search
|
|
325
|
+
│ │ ├── vector.py # Vector search
|
|
326
|
+
│ │ ├── fusion.py # RRF + position-aware blending
|
|
327
|
+
│ │ ├── rerank.py # Cross-encoder / ColBERT reranking
|
|
328
|
+
│ │ ├── hyde.py # HyDE query expansion
|
|
329
|
+
│ │ └── parent.py # Parent-child expansion
|
|
330
|
+
│ ├── storage/
|
|
331
|
+
│ │ ├── __init__.py
|
|
332
|
+
│ │ ├── base.py # Abstract storage interface
|
|
333
|
+
│ │ ├── lancedb.py # LanceDB backend
|
|
334
|
+
│ │ └── sqlite.py # SQLite + FTS5 + sqlite-vec backend
|
|
335
|
+
│ ├── embeddings/
|
|
336
|
+
│ │ ├── __init__.py
|
|
337
|
+
│ │ ├── base.py # Abstract embedding interface
|
|
338
|
+
│ │ ├── sentence_transformers.py
|
|
339
|
+
│ │ ├── gguf.py # Local GGUF models
|
|
340
|
+
│ │ └── api.py # API-based embeddings (OpenAI, etc.)
|
|
341
|
+
│ ├── graph/ # Tier 3: GraphRAG
|
|
342
|
+
│ ├── raptor/ # Tier 3: RAPTOR
|
|
343
|
+
│ ├── models.py # Data models (Chunk, SearchResult, etc.)
|
|
344
|
+
│ └── config.py # Configuration management
|
|
345
|
+
├── tests/
|
|
346
|
+
│ ├── test_chunking.py
|
|
347
|
+
│ ├── test_indexing.py
|
|
348
|
+
│ ├── test_retrieval.py
|
|
349
|
+
│ └── fixtures/
|
|
350
|
+
│ └── sample_markdown/ # Test markdown files
|
|
351
|
+
└── docs/
|
|
352
|
+
├── VISION.md
|
|
353
|
+
├── IMPLEMENTATION.md
|
|
354
|
+
└── research/
|
|
355
|
+
├── qmd-architecture.md
|
|
356
|
+
├── advanced-retrieval-techniques.md
|
|
357
|
+
└── python-ecosystem.md
|
|
358
|
+
```
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# py-qmd: Python Query Markup Documents
|
|
2
|
+
|
|
3
|
+
## What Is This?
|
|
4
|
+
|
|
5
|
+
A Python-native local search engine for markdown files, inspired by
|
|
6
|
+
[QMD](https://github.com/tobi/qmd) but going beyond it. py-qmd indexes directories
|
|
7
|
+
of markdown files and makes them searchable via hybrid search (full-text + semantic +
|
|
8
|
+
reranking), all running locally with no cloud dependencies.
|
|
9
|
+
|
|
10
|
+
## Why Build This?
|
|
11
|
+
|
|
12
|
+
QMD (by Tobi Lütke) proved the concept: markdown files as a source of truth, indexed
|
|
13
|
+
for fast hybrid retrieval. But QMD is JavaScript/Bun only. The Python ecosystem has all
|
|
14
|
+
the individual pieces (chunkers, embedding models, vector stores, rerankers) but nobody
|
|
15
|
+
has built the glue layer that ties them together into a cohesive, local-first search
|
|
16
|
+
engine with a clean CLI.
|
|
17
|
+
|
|
18
|
+
py-qmd fills that gap.
|
|
19
|
+
|
|
20
|
+
## Core Principles
|
|
21
|
+
|
|
22
|
+
1. **Markdown is the source of truth.** Files are human-readable, version-controlled,
|
|
23
|
+
git-diffable. py-qmd indexes them without modifying them.
|
|
24
|
+
|
|
25
|
+
2. **Local-first.** Everything runs on your machine. No API keys required for core
|
|
26
|
+
search functionality. (LLM-powered features like contextual retrieval and HyDE
|
|
27
|
+
optionally use an API.)
|
|
28
|
+
|
|
29
|
+
3. **Pluggable.** Swap embedding models, storage backends, rerankers. Start simple,
|
|
30
|
+
upgrade components independently.
|
|
31
|
+
|
|
32
|
+
4. **Beyond QMD.** Incorporate techniques QMD doesn't have: contextual retrieval,
|
|
33
|
+
parent-child retrieval, ColBERT, GraphRAG, HyDE.
|
|
34
|
+
|
|
35
|
+
5. **Python-native.** Built with uv, Rich logging, Typer CLI. First-class Python
|
|
36
|
+
library API alongside the CLI.
|
|
37
|
+
|
|
38
|
+
## Who Is This For?
|
|
39
|
+
|
|
40
|
+
- Developers who keep knowledge in markdown (notes, docs, meeting transcripts)
|
|
41
|
+
- AI agent builders who need a local retrieval backend (Claude Code skills, MCP servers)
|
|
42
|
+
- Educators who need to search across semesters of course materials and forum archives
|
|
43
|
+
- Anyone who wants "search my markdown files" without spinning up Elasticsearch
|
|
44
|
+
|
|
45
|
+
## How It Compares to QMD
|
|
46
|
+
|
|
47
|
+
| Feature | QMD | py-qmd |
|
|
48
|
+
|---------|-----|--------|
|
|
49
|
+
| Language | JavaScript/Bun | Python |
|
|
50
|
+
| Chunking | Markdown-aware, AST for code | Same + configurable scoring |
|
|
51
|
+
| BM25 | SQLite FTS5 | LanceDB native (or SQLite FTS5) |
|
|
52
|
+
| Vector search | sqlite-vec | LanceDB native (or sqlite-vec) |
|
|
53
|
+
| Embeddings | embeddinggemma-300M (GGUF) | Pluggable (sentence-transformers, GGUF, API) |
|
|
54
|
+
| Reranking | Qwen3-Reranker-0.6B | Pluggable (cross-encoder, ColBERT, local LLM) |
|
|
55
|
+
| Fusion | RRF | RRF + position-aware blending |
|
|
56
|
+
| Contextual retrieval | No | Yes (Tier 1) |
|
|
57
|
+
| Parent-child retrieval | No | Yes (Tier 1) |
|
|
58
|
+
| HyDE | Partial | Yes (Tier 2) |
|
|
59
|
+
| ColBERT | No | Yes (Tier 2) |
|
|
60
|
+
| GraphRAG | No | Yes (Tier 3) |
|
|
61
|
+
| RAPTOR | No | Yes (Tier 3) |
|
|
62
|
+
| CLI | Custom | Typer + Rich |
|
|
63
|
+
| Python API | No | First-class |
|
|
64
|
+
| MCP server | Separate wrapper | Built-in option |
|
|
65
|
+
| Claude Code skill | No | Planned |
|
|
66
|
+
|
|
67
|
+
## Relationship to the EdStem Bot Project
|
|
68
|
+
|
|
69
|
+
py-qmd is a standalone library that the EdStem automation system will use as its
|
|
70
|
+
knowledge base and retrieval engine. The dependency flows one way:
|
|
71
|
+
|
|
72
|
+
```
|
|
73
|
+
py-qmd (standalone library, reusable)
|
|
74
|
+
↑
|
|
75
|
+
ed-api (EdStem API client)
|
|
76
|
+
↑
|
|
77
|
+
ed-ingest (scraper + media pipeline → markdown files → py-qmd collections)
|
|
78
|
+
↑
|
|
79
|
+
ed-bot (answer engine, Claude Code skills)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
py-qmd knows nothing about EdStem. It just indexes and searches markdown files.
|