pyqmd 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. pyqmd-0.1.0/.github/workflows/docs.yml +38 -0
  2. pyqmd-0.1.0/.github/workflows/pypi_release.yml +44 -0
  3. pyqmd-0.1.0/.gitignore +5 -0
  4. pyqmd-0.1.0/BACKLOG.md +13 -0
  5. pyqmd-0.1.0/PKG-INFO +24 -0
  6. pyqmd-0.1.0/README.md +3 -0
  7. pyqmd-0.1.0/docs/IMPLEMENTATION.md +358 -0
  8. pyqmd-0.1.0/docs/VISION.md +82 -0
  9. pyqmd-0.1.0/docs/api.md +247 -0
  10. pyqmd-0.1.0/docs/architecture.md +86 -0
  11. pyqmd-0.1.0/docs/cli.md +147 -0
  12. pyqmd-0.1.0/docs/getting-started/installation.md +43 -0
  13. pyqmd-0.1.0/docs/getting-started/quickstart.md +63 -0
  14. pyqmd-0.1.0/docs/guide/collections.md +44 -0
  15. pyqmd-0.1.0/docs/guide/configuration.md +88 -0
  16. pyqmd-0.1.0/docs/guide/graphrag.md +153 -0
  17. pyqmd-0.1.0/docs/guide/indexing.md +112 -0
  18. pyqmd-0.1.0/docs/guide/searching.md +91 -0
  19. pyqmd-0.1.0/docs/index.md +117 -0
  20. pyqmd-0.1.0/docs/overrides/.gitkeep +0 -0
  21. pyqmd-0.1.0/docs/research/advanced-retrieval-techniques.md +181 -0
  22. pyqmd-0.1.0/docs/research/python-ecosystem.md +113 -0
  23. pyqmd-0.1.0/docs/research/qmd-architecture.md +109 -0
  24. pyqmd-0.1.0/docs/stylesheets/iris.css +256 -0
  25. pyqmd-0.1.0/docs/superpowers/plans/2026-04-01-pyqmd-tier1.md +3403 -0
  26. pyqmd-0.1.0/docs/superpowers/plans/2026-04-05-pepper-foundation.md +1298 -0
  27. pyqmd-0.1.0/docs/superpowers/specs/2026-04-01-pyqmd-design.md +337 -0
  28. pyqmd-0.1.0/docs/superpowers/specs/2026-04-05-pepper-foundation-design.md +177 -0
  29. pyqmd-0.1.0/mkdocs.yml +61 -0
  30. pyqmd-0.1.0/pyproject.toml +46 -0
  31. pyqmd-0.1.0/src/pyqmd/__init__.py +6 -0
  32. pyqmd-0.1.0/src/pyqmd/chunking/__init__.py +5 -0
  33. pyqmd-0.1.0/src/pyqmd/chunking/frontmatter.py +34 -0
  34. pyqmd-0.1.0/src/pyqmd/chunking/markdown.py +235 -0
  35. pyqmd-0.1.0/src/pyqmd/chunking/scoring.py +68 -0
  36. pyqmd-0.1.0/src/pyqmd/cli.py +363 -0
  37. pyqmd-0.1.0/src/pyqmd/config.py +158 -0
  38. pyqmd-0.1.0/src/pyqmd/core.py +309 -0
  39. pyqmd-0.1.0/src/pyqmd/embeddings/__init__.py +6 -0
  40. pyqmd-0.1.0/src/pyqmd/embeddings/base.py +14 -0
  41. pyqmd-0.1.0/src/pyqmd/embeddings/sentence_transformers.py +37 -0
  42. pyqmd-0.1.0/src/pyqmd/graph/__init__.py +1 -0
  43. pyqmd-0.1.0/src/pyqmd/graph/engine.py +260 -0
  44. pyqmd-0.1.0/src/pyqmd/indexing/__init__.py +5 -0
  45. pyqmd-0.1.0/src/pyqmd/indexing/contextual.py +125 -0
  46. pyqmd-0.1.0/src/pyqmd/indexing/hasher.py +37 -0
  47. pyqmd-0.1.0/src/pyqmd/indexing/pipeline.py +146 -0
  48. pyqmd-0.1.0/src/pyqmd/models.py +76 -0
  49. pyqmd-0.1.0/src/pyqmd/progress.py +94 -0
  50. pyqmd-0.1.0/src/pyqmd/retrieval/__init__.py +5 -0
  51. pyqmd-0.1.0/src/pyqmd/retrieval/fusion.py +16 -0
  52. pyqmd-0.1.0/src/pyqmd/retrieval/hyde.py +74 -0
  53. pyqmd-0.1.0/src/pyqmd/retrieval/parent.py +23 -0
  54. pyqmd-0.1.0/src/pyqmd/retrieval/pipeline.py +163 -0
  55. pyqmd-0.1.0/src/pyqmd/retrieval/rerank.py +17 -0
  56. pyqmd-0.1.0/src/pyqmd/storage/__init__.py +6 -0
  57. pyqmd-0.1.0/src/pyqmd/storage/base.py +38 -0
  58. pyqmd-0.1.0/src/pyqmd/storage/lancedb_backend.py +143 -0
  59. pyqmd-0.1.0/src/pyqmd/watch.py +200 -0
  60. pyqmd-0.1.0/tests/conftest.py +44 -0
  61. pyqmd-0.1.0/tests/fixtures/sample_markdown/large.md +69 -0
  62. pyqmd-0.1.0/tests/fixtures/sample_markdown/nested_headings.md +33 -0
  63. pyqmd-0.1.0/tests/fixtures/sample_markdown/simple.md +15 -0
  64. pyqmd-0.1.0/tests/fixtures/sample_markdown/with_code.md +24 -0
  65. pyqmd-0.1.0/tests/fixtures/sample_markdown/with_frontmatter.md +18 -0
  66. pyqmd-0.1.0/tests/test_chunking.py +75 -0
  67. pyqmd-0.1.0/tests/test_cli.py +64 -0
  68. pyqmd-0.1.0/tests/test_config.py +154 -0
  69. pyqmd-0.1.0/tests/test_core.py +45 -0
  70. pyqmd-0.1.0/tests/test_embeddings.py +42 -0
  71. pyqmd-0.1.0/tests/test_frontmatter.py +44 -0
  72. pyqmd-0.1.0/tests/test_fusion.py +38 -0
  73. pyqmd-0.1.0/tests/test_hasher.py +41 -0
  74. pyqmd-0.1.0/tests/test_indexing.py +51 -0
  75. pyqmd-0.1.0/tests/test_models.py +96 -0
  76. pyqmd-0.1.0/tests/test_parent.py +44 -0
  77. pyqmd-0.1.0/tests/test_rerank.py +33 -0
  78. pyqmd-0.1.0/tests/test_retrieval.py +86 -0
  79. pyqmd-0.1.0/tests/test_scoring.py +58 -0
  80. pyqmd-0.1.0/tests/test_storage.py +93 -0
  81. pyqmd-0.1.0/tests/test_watch.py +50 -0
  82. pyqmd-0.1.0/uv.lock +4237 -0
@@ -0,0 +1,38 @@
1
+ name: Deploy Docs
2
+ on:
3
+ push:
4
+ branches: [master]
5
+ workflow_dispatch:
6
+
7
+ permissions:
8
+ contents: read
9
+ pages: write
10
+ id-token: write
11
+
12
+ concurrency:
13
+ group: "pages"
14
+ cancel-in-progress: true
15
+
16
+ jobs:
17
+ build:
18
+ runs-on: ubuntu-latest
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+ - uses: actions/setup-python@v5
22
+ with:
23
+ python-version: '3.12'
24
+ - run: pip install mkdocs-material
25
+ - run: mkdocs build --strict
26
+ - uses: actions/upload-pages-artifact@v3
27
+ with:
28
+ path: site
29
+
30
+ deploy:
31
+ needs: build
32
+ runs-on: ubuntu-latest
33
+ environment:
34
+ name: github-pages
35
+ url: ${{ steps.deployment.outputs.page_url }}
36
+ steps:
37
+ - id: deployment
38
+ uses: actions/deploy-pages@v4
@@ -0,0 +1,44 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ build:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+
13
+ - name: Set up Python
14
+ uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.12"
17
+
18
+ - name: Install build dependencies
19
+ run: pip install build
20
+
21
+ - name: Build package
22
+ run: python -m build
23
+
24
+ - name: Upload build artifacts
25
+ uses: actions/upload-artifact@v4
26
+ with:
27
+ name: dist
28
+ path: dist/
29
+
30
+ publish:
31
+ needs: build
32
+ runs-on: ubuntu-latest
33
+ environment: pypi
34
+ permissions:
35
+ id-token: write
36
+ steps:
37
+ - name: Download build artifacts
38
+ uses: actions/download-artifact@v4
39
+ with:
40
+ name: dist
41
+ path: dist/
42
+
43
+ - name: Publish to PyPI
44
+ uses: pypa/gh-action-pypi-publish@release/v1
pyqmd-0.1.0/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ .env
2
+ __pycache__/
3
+ *.pyc
4
+ .venv/
5
+ site/
pyqmd-0.1.0/BACKLOG.md ADDED
@@ -0,0 +1,13 @@
1
+ # Backlog
2
+
3
+ ## Completed
4
+
5
+ - [x] **Config migration to TOML + Pydantic** — Replaced JSON config with TOML on disk and Pydantic models in memory. Per-collection overrides, watch/search config sections.
6
+ - [x] **FTS index fix** — Moved full-text search index creation from every query to store time.
7
+ - [x] **Path-prefix search filter** — `--path-prefix` option on `qmd search` to restrict results by file path. Configurable overfetch multiplier.
8
+ - [x] **Watch command** — `qmd watch` with watchdog filesystem events, optional polling fallback, configurable debounce and ignore patterns.
9
+
10
+ ## Tech Debt
11
+
12
+ - [ ] **diskcache unsafe pickle deserialization (CVE-2025-69872)** — Transitive dependency via `nano-graphrag` -> `dspy` -> `diskcache<=5.6.3`. No patched version is available yet. Dismissed as a tolerable risk because exploitation requires local write access to the cache directory. Revisit when `diskcache` releases a fix. [Dependabot alert #1](https://github.com/jeffrichley/pyqmd/security/dependabot/1)
13
+ - [ ] **Default value duplication** — Default values (chunk_size=800, embed_model, etc.) are duplicated between the `load()` fallbacks in `config.py` and the `Collection` model fields. Low risk, but the two copies could diverge.
pyqmd-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,24 @@
1
+ Metadata-Version: 2.4
2
+ Name: pyqmd
3
+ Version: 0.1.0
4
+ Summary: Python-native local search engine for markdown files. Hybrid BM25 + vector search with reranking.
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: lancedb>=0.6.0
7
+ Requires-Dist: markdown-it-py>=3.0.0
8
+ Requires-Dist: nano-graphrag>=0.0.8.2
9
+ Requires-Dist: pyarrow>=14.0.0
10
+ Requires-Dist: pydantic>=2.0
11
+ Requires-Dist: pyyaml>=6.0
12
+ Requires-Dist: rich>=13.0.0
13
+ Requires-Dist: sentence-transformers>=2.2.0
14
+ Requires-Dist: tomli-w>=1.0.0
15
+ Requires-Dist: typer>=0.9.0
16
+ Requires-Dist: watchdog>=3.0.0
17
+ Provides-Extra: dev
18
+ Requires-Dist: pytest-tmp-files>=0.0.2; extra == 'dev'
19
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
20
+ Description-Content-Type: text/markdown
21
+
22
+ # pyqmd
23
+
24
+ Python-native local search engine for markdown files. Hybrid BM25 + vector search with reranking.
pyqmd-0.1.0/README.md ADDED
@@ -0,0 +1,3 @@
1
+ # pyqmd
2
+
3
+ Python-native local search engine for markdown files. Hybrid BM25 + vector search with reranking.
@@ -0,0 +1,358 @@
1
+ # py-qmd Implementation Plan
2
+
3
+ ## Tech Stack
4
+
5
+ | Component | Choice | Rationale |
6
+ |-----------|--------|-----------|
7
+ | Package manager | uv | User preference, fast, modern |
8
+ | CLI framework | Typer | User preference, clean API |
9
+ | Logging | Rich | User preference, beautiful output |
10
+ | Primary storage | LanceDB | Embedded, native hybrid search, zero-config |
11
+ | Alt storage | SQLite + FTS5 + sqlite-vec | QMD-parity option, more control |
12
+ | Embeddings | sentence-transformers | Pluggable, local-first, huge model ecosystem |
13
+ | Default embed model | all-MiniLM-L6-v2 | Fast, good quality, 384 dims |
14
+ | Reranking | cross-encoder (sentence-transformers) | Simple, effective, upgradable to ColBERT |
15
+ | Markdown parsing | markdown-it-py or mistune | Fast, extensible |
16
+ | Document conversion | markitdown (Microsoft) | PDF/DOCX/PPTX → markdown |
17
+
18
+ ## Architecture
19
+
20
+ ```
21
+ ┌──────────────────────────────────────────────────────┐
22
+ │ py-qmd CLI (Typer) │
23
+ │ qmd add | qmd search | qmd index | qmd status │
24
+ ├──────────────────────────────────────────────────────┤
25
+ │ py-qmd Python API │
26
+ │ PyQMD.add_collection() | .search() | .index() │
27
+ ├──────────────┬───────────────────┬───────────────────┤
28
+ │ Indexing │ Querying │ Management │
29
+ │ Pipeline │ Pipeline │ │
30
+ ├──────────────┼───────────────────┼───────────────────┤
31
+ │ Storage Layer (pluggable) │
32
+ │ LanceDB | SQLite+FTS5+sqlite-vec │
33
+ ├──────────────────────────────────────────────────────┤
34
+ │ Embedding Layer (pluggable) │
35
+ │ sentence-transformers | GGUF | API-based │
36
+ ├──────────────────────────────────────────────────────┤
37
+ │ Reranking Layer (pluggable) │
38
+ │ cross-encoder | ColBERT | local LLM | none │
39
+ └──────────────────────────────────────────────────────┘
40
+ ```
41
+
42
+ ## Tier 1: QMD Parity + Quick Wins
43
+
44
+ **Goal:** A working py-qmd that matches QMD's core functionality plus two easy,
45
+ high-impact additions (contextual retrieval, parent-child retrieval).
46
+
47
+ ### 1.1 Core Data Model
48
+
49
+ ```python
50
+ @dataclass
51
+ class Chunk:
52
+ id: str # SHA-256 hash of content
53
+ content: str # The actual text
54
+ context: str | None # LLM-generated context prefix (contextual retrieval)
55
+ source_file: str # Path to source markdown file
56
+ collection: str # Collection name
57
+ heading_path: list[str] # ["H1 title", "H2 title", "H3 title"]
58
+ parent_id: str | None # ID of parent chunk (parent-child retrieval)
59
+ start_line: int # Line number in source file
60
+ end_line: int # Line number in source file
61
+ metadata: dict # Arbitrary metadata (tags, dates, etc.)
62
+
63
+ @dataclass
64
+ class SearchResult:
65
+ chunk: Chunk
66
+ score: float # Combined score after fusion
67
+ bm25_score: float | None # Individual BM25 score
68
+ vector_score: float | None # Individual vector similarity score
69
+ rerank_score: float | None # Reranker score (if reranking enabled)
70
+
71
+ @dataclass
72
+ class Collection:
73
+ name: str
74
+ paths: list[str] # Directories to index
75
+ mask: str # Glob pattern (default: "**/*.md")
76
+ config: CollectionConfig # Per-collection settings
77
+ ```
78
+
79
+ ### 1.2 Markdown-Aware Chunking
80
+
81
+ Custom chunker inspired by QMD's scoring algorithm:
82
+
83
+ ```python
84
+ BREAK_SCORES = {
85
+ "h1": 100, # # Heading 1
86
+ "h2": 90, # ## Heading 2
87
+ "h3": 80, # ### Heading 3
88
+ "h4": 70, # #### Heading 4
89
+ "code_block_end": 85, # End of fenced code block
90
+ "hr": 75, # Horizontal rule / thematic break
91
+ "blank_line": 50, # Empty line between paragraphs
92
+ "list_end": 45, # End of a list
93
+ "blockquote_end": 40, # End of a blockquote
94
+ }
95
+ ```
96
+
97
+ **Rules:**
98
+ - Target chunk size: ~800 tokens (configurable)
99
+ - Overlap: 15% (configurable)
100
+ - Never split inside fenced code blocks
101
+ - Never split inside tables
102
+ - Preserve heading hierarchy as metadata on each chunk
103
+ - Parent-child: each chunk stores its parent heading's chunk ID
104
+
105
+ ### 1.3 Indexing Pipeline
106
+
107
+ ```
108
+ File detected (new or modified)
109
+
110
+
111
+ Parse markdown → identify structure (headings, code blocks, etc.)
112
+
113
+
114
+ Split into chunks using break-point scoring
115
+
116
+
117
+ [Optional] Generate context prefix via LLM (contextual retrieval)
118
+
119
+
120
+ Compute embeddings (sentence-transformers)
121
+
122
+
123
+ Store in LanceDB (text + vector + metadata)
124
+
125
+
126
+ Update file hash registry (for incremental updates)
127
+ ```
128
+
129
+ **Incremental indexing:** Track file hashes (SHA-256 of file content). On re-index,
130
+ skip unchanged files. When a file changes, remove all its chunks and re-index it.
131
+
132
+ ### 1.4 Query Pipeline
133
+
134
+ ```
135
+ User query
136
+
137
+
138
+ [Optional] Query expansion (keyword variants, domain terms)
139
+
140
+
141
+ ┌───────────┐ ┌──────────────┐
142
+ │ BM25 │ │ Vector │
143
+ │ search │ │ search │
144
+ └─────┬─────┘ └──────┬───────┘
145
+ │ │
146
+ ▼ ▼
147
+ Reciprocal Rank Fusion (k=60)
148
+
149
+
150
+ [Optional] Cross-encoder reranking
151
+
152
+
153
+ [Optional] Parent expansion (return parent chunks for context)
154
+
155
+
156
+ Return top-K SearchResults
157
+ ```
158
+
159
+ ### 1.5 CLI Commands
160
+
161
+ ```bash
162
+ # Collection management
163
+ qmd add <name> <path> [--mask "**/*.md"]
164
+ qmd remove <name>
165
+ qmd list # List all collections
166
+ qmd status [name] # Show index stats
167
+
168
+ # Indexing
169
+ qmd index [name] # Index/re-index a collection (or all)
170
+ qmd index --full # Force full re-index (ignore hashes)
171
+
172
+ # Searching
173
+ qmd search "query text" # Search all collections
174
+ qmd search "query" --collection <name> # Search specific collection
175
+ qmd search "query" --top-k 10 # Limit results
176
+ qmd search "query" --no-rerank # Skip reranking step
177
+ qmd search "query" --expand # Enable parent chunk expansion
178
+ qmd search "query" --hyde # Enable HyDE (Tier 2)
179
+
180
+ # Configuration
181
+ qmd config # Show current config
182
+ qmd config set embed_model <model> # Change embedding model
183
+ qmd config set chunk_size 800 # Change target chunk size
184
+ ```
185
+
186
+ ### 1.6 Python API
187
+
188
+ ```python
189
+ from pyqmd import PyQMD
190
+
191
+ # Initialize
192
+ qmd = PyQMD(data_dir="~/.py-qmd")
193
+
194
+ # Add and index a collection
195
+ qmd.add_collection("notes", paths=["~/notes"], mask="**/*.md")
196
+ qmd.index("notes")
197
+
198
+ # Search
199
+ results = qmd.search("how to handle NaN values", top_k=5)
200
+ for result in results:
201
+ print(f"{result.score:.3f} | {result.chunk.source_file}")
202
+ print(f" {result.chunk.heading_path}")
203
+ print(f" {result.chunk.content[:200]}")
204
+
205
+ # Search with options
206
+ results = qmd.search(
207
+ "indicator lookback period",
208
+ collections=["notes", "docs"],
209
+ top_k=10,
210
+ rerank=True,
211
+ expand_parents=True,
212
+ )
213
+ ```
214
+
215
+ ## Tier 2: Beyond QMD
216
+
217
+ ### 2.1 HyDE (Hypothetical Document Embeddings)
218
+
219
+ At query time, generate a hypothetical answer via LLM, embed it, and use that
220
+ embedding for vector search. Bridges the vocabulary gap between questions and answers.
221
+
222
+ ```python
223
+ results = qmd.search("why does my indicator return NaN", hyde=True)
224
+ # Internally:
225
+ # 1. LLM generates: "The indicator returns NaN because the lookback period
226
+ # exceeds the available data..."
227
+ # 2. That hypothetical answer is embedded
228
+ # 3. Vector search uses the hypothetical embedding (closer to real answers)
229
+ ```
230
+
231
+ ### 2.2 ColBERT Integration
232
+
233
+ Replace or augment single-vector search with ColBERT's per-token late interaction.
234
+
235
+ ```python
236
+ qmd.config.set("retriever", "colbert") # or "hybrid+colbert"
237
+ # Uses ragatouille under the hood
238
+ ```
239
+
240
+ ### 2.3 Advanced Query Expansion
241
+
242
+ Use an LLM to generate multiple sub-queries from a single user query:
243
+
244
+ ```python
245
+ # User: "my bollinger bands look wrong"
246
+ # Expanded:
247
+ # - "bollinger bands incorrect values"
248
+ # - "technical indicator calculation error"
249
+ # - "rolling standard deviation pandas"
250
+ ```
251
+
252
+ ### 2.4 Pluggable Embedding Models
253
+
254
+ ```python
255
+ qmd = PyQMD(embed_model="nomic-embed-text") # sentence-transformers
256
+ qmd = PyQMD(embed_model="gguf:model.gguf") # Local GGUF via llama-cpp
257
+ qmd = PyQMD(embed_model="openai:text-embedding-3-small") # API-based
258
+ ```
259
+
260
+ ## Tier 3: Advanced Features
261
+
262
+ ### 3.1 GraphRAG
263
+
264
+ Build a knowledge graph from indexed content. Extract entities (functions, concepts,
265
+ error types) and relationships. Enable multi-hop queries.
266
+
267
+ ```python
268
+ qmd.build_graph("course-qa") # Extract entities + relationships
269
+ results = qmd.graph_search("relationship between Sharpe ratio and volatility")
270
+ ```
271
+
272
+ ### 3.2 RAPTOR
273
+
274
+ Recursive summarization tree for hierarchical content. Best for static collections
275
+ (lecture notes, course docs) that don't change often.
276
+
277
+ ```python
278
+ qmd.build_raptor_tree("lectures") # Cluster → summarize → recurse
279
+ results = qmd.search("market microstructure", strategy="raptor")
280
+ ```
281
+
282
+ ### 3.3 MCP Server
283
+
284
+ Expose py-qmd as an MCP server for Claude Code and other AI tools.
285
+
286
+ ```bash
287
+ qmd serve --mcp # Start MCP server
288
+ qmd serve --mcp --port 8080 # Custom port
289
+ ```
290
+
291
+ ### 3.4 File Watching
292
+
293
+ Watch collections for changes and auto-reindex.
294
+
295
+ ```bash
296
+ qmd watch # Watch all collections
297
+ qmd watch --collection notes # Watch specific collection
298
+ ```
299
+
300
+ ## Project Structure
301
+
302
+ ```
303
+ py-qmd/
304
+ ├── pyproject.toml # uv project config
305
+ ├── README.md
306
+ ├── src/
307
+ │ └── pyqmd/
308
+ │ ├── __init__.py # Public API
309
+ │ ├── cli.py # Typer CLI
310
+ │ ├── core.py # PyQMD main class
311
+ │ ├── chunking/
312
+ │ │ ├── __init__.py
313
+ │ │ ├── markdown.py # Markdown-aware chunker
314
+ │ │ ├── scoring.py # Break-point scoring algorithm
315
+ │ │ └── code.py # AST-aware code chunking (tree-sitter)
316
+ │ ├── indexing/
317
+ │ │ ├── __init__.py
318
+ │ │ ├── pipeline.py # Indexing pipeline orchestration
319
+ │ │ ├── contextual.py # Contextual retrieval (LLM context generation)
320
+ │ │ └── hasher.py # File hash tracking for incremental updates
321
+ │ ├── retrieval/
322
+ │ │ ├── __init__.py
323
+ │ │ ├── pipeline.py # Query pipeline orchestration
324
+ │ │ ├── bm25.py # BM25 search
325
+ │ │ ├── vector.py # Vector search
326
+ │ │ ├── fusion.py # RRF + position-aware blending
327
+ │ │ ├── rerank.py # Cross-encoder / ColBERT reranking
328
+ │ │ ├── hyde.py # HyDE query expansion
329
+ │ │ └── parent.py # Parent-child expansion
330
+ │ ├── storage/
331
+ │ │ ├── __init__.py
332
+ │ │ ├── base.py # Abstract storage interface
333
+ │ │ ├── lancedb.py # LanceDB backend
334
+ │ │ └── sqlite.py # SQLite + FTS5 + sqlite-vec backend
335
+ │ ├── embeddings/
336
+ │ │ ├── __init__.py
337
+ │ │ ├── base.py # Abstract embedding interface
338
+ │ │ ├── sentence_transformers.py
339
+ │ │ ├── gguf.py # Local GGUF models
340
+ │ │ └── api.py # API-based embeddings (OpenAI, etc.)
341
+ │ ├── graph/ # Tier 3: GraphRAG
342
+ │ ├── raptor/ # Tier 3: RAPTOR
343
+ │ ├── models.py # Data models (Chunk, SearchResult, etc.)
344
+ │ └── config.py # Configuration management
345
+ ├── tests/
346
+ │ ├── test_chunking.py
347
+ │ ├── test_indexing.py
348
+ │ ├── test_retrieval.py
349
+ │ └── fixtures/
350
+ │ └── sample_markdown/ # Test markdown files
351
+ └── docs/
352
+ ├── VISION.md
353
+ ├── IMPLEMENTATION.md
354
+ └── research/
355
+ ├── qmd-architecture.md
356
+ ├── advanced-retrieval-techniques.md
357
+ └── python-ecosystem.md
358
+ ```
@@ -0,0 +1,82 @@
1
+ # py-qmd: Python Query Markup Documents
2
+
3
+ ## What Is This?
4
+
5
+ A Python-native local search engine for markdown files, inspired by
6
+ [QMD](https://github.com/tobi/qmd) but going beyond it. py-qmd indexes directories
7
+ of markdown files and makes them searchable via hybrid search (full-text + semantic +
8
+ reranking), all running locally with no cloud dependencies.
9
+
10
+ ## Why Build This?
11
+
12
+ QMD (by Tobi Lütke) proved the concept: markdown files as a source of truth, indexed
13
+ for fast hybrid retrieval. But QMD is JavaScript/Bun only. The Python ecosystem has all
14
+ the individual pieces (chunkers, embedding models, vector stores, rerankers) but nobody
15
+ has built the glue layer that ties them together into a cohesive, local-first search
16
+ engine with a clean CLI.
17
+
18
+ py-qmd fills that gap.
19
+
20
+ ## Core Principles
21
+
22
+ 1. **Markdown is the source of truth.** Files are human-readable, version-controlled,
23
+ git-diffable. py-qmd indexes them without modifying them.
24
+
25
+ 2. **Local-first.** Everything runs on your machine. No API keys required for core
26
+ search functionality. (LLM-powered features like contextual retrieval and HyDE
27
+ optionally use an API.)
28
+
29
+ 3. **Pluggable.** Swap embedding models, storage backends, rerankers. Start simple,
30
+ upgrade components independently.
31
+
32
+ 4. **Beyond QMD.** Incorporate techniques QMD doesn't have: contextual retrieval,
33
+ parent-child retrieval, ColBERT, GraphRAG, HyDE.
34
+
35
+ 5. **Python-native.** Built with uv, Rich logging, Typer CLI. First-class Python
36
+ library API alongside the CLI.
37
+
38
+ ## Who Is This For?
39
+
40
+ - Developers who keep knowledge in markdown (notes, docs, meeting transcripts)
41
+ - AI agent builders who need a local retrieval backend (Claude Code skills, MCP servers)
42
+ - Educators who need to search across semesters of course materials and forum archives
43
+ - Anyone who wants "search my markdown files" without spinning up Elasticsearch
44
+
45
+ ## How It Compares to QMD
46
+
47
+ | Feature | QMD | py-qmd |
48
+ |---------|-----|--------|
49
+ | Language | JavaScript/Bun | Python |
50
+ | Chunking | Markdown-aware, AST for code | Same + configurable scoring |
51
+ | BM25 | SQLite FTS5 | LanceDB native (or SQLite FTS5) |
52
+ | Vector search | sqlite-vec | LanceDB native (or sqlite-vec) |
53
+ | Embeddings | embeddinggemma-300M (GGUF) | Pluggable (sentence-transformers, GGUF, API) |
54
+ | Reranking | Qwen3-Reranker-0.6B | Pluggable (cross-encoder, ColBERT, local LLM) |
55
+ | Fusion | RRF | RRF + position-aware blending |
56
+ | Contextual retrieval | No | Yes (Tier 1) |
57
+ | Parent-child retrieval | No | Yes (Tier 1) |
58
+ | HyDE | Partial | Yes (Tier 2) |
59
+ | ColBERT | No | Yes (Tier 2) |
60
+ | GraphRAG | No | Yes (Tier 3) |
61
+ | RAPTOR | No | Yes (Tier 3) |
62
+ | CLI | Custom | Typer + Rich |
63
+ | Python API | No | First-class |
64
+ | MCP server | Separate wrapper | Built-in option |
65
+ | Claude Code skill | No | Planned |
66
+
67
+ ## Relationship to the EdStem Bot Project
68
+
69
+ py-qmd is a standalone library that the EdStem automation system will use as its
70
+ knowledge base and retrieval engine. The dependency flows one way:
71
+
72
+ ```
73
+ py-qmd (standalone library, reusable)
74
+
75
+ ed-api (EdStem API client)
76
+
77
+ ed-ingest (scraper + media pipeline → markdown files → py-qmd collections)
78
+
79
+ ed-bot (answer engine, Claude Code skills)
80
+ ```
81
+
82
+ py-qmd knows nothing about EdStem. It just indexes and searches markdown files.