desksearch 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. desksearch-0.1.0/.github/ISSUE_TEMPLATE/bug_report.md +42 -0
  2. desksearch-0.1.0/.github/ISSUE_TEMPLATE/feature_request.md +23 -0
  3. desksearch-0.1.0/.github/workflows/build-release.yml +91 -0
  4. desksearch-0.1.0/.gitignore +30 -0
  5. desksearch-0.1.0/ARCHITECTURE.md +197 -0
  6. desksearch-0.1.0/CONTRIBUTING.md +122 -0
  7. desksearch-0.1.0/DESIGN_DECISIONS.md +116 -0
  8. desksearch-0.1.0/LICENSE +21 -0
  9. desksearch-0.1.0/PKG-INFO +223 -0
  10. desksearch-0.1.0/PLUGINS.md +132 -0
  11. desksearch-0.1.0/README.md +181 -0
  12. desksearch-0.1.0/benchmarks/benchmark.py +608 -0
  13. desksearch-0.1.0/benchmarks/comparison.md +88 -0
  14. desksearch-0.1.0/benchmarks/results.json +97 -0
  15. desksearch-0.1.0/benchmarks/results.md +90 -0
  16. desksearch-0.1.0/conftest.py +8 -0
  17. desksearch-0.1.0/desksearch-backend.spec +57 -0
  18. desksearch-0.1.0/docs/logo.png +0 -0
  19. desksearch-0.1.0/docs/screenshot.png +0 -0
  20. desksearch-0.1.0/electron/assets/.gitkeep +0 -0
  21. desksearch-0.1.0/electron/assets/icon.icns +0 -0
  22. desksearch-0.1.0/electron/assets/icon.ico +0 -0
  23. desksearch-0.1.0/electron/assets/icon.png +0 -0
  24. desksearch-0.1.0/electron/assets/tray-icon.png +0 -0
  25. desksearch-0.1.0/electron/build-config.js +78 -0
  26. desksearch-0.1.0/electron/main.js +236 -0
  27. desksearch-0.1.0/electron/package-lock.json +5278 -0
  28. desksearch-0.1.0/electron/package.json +93 -0
  29. desksearch-0.1.0/electron/preload.js +13 -0
  30. desksearch-0.1.0/pyproject.toml +50 -0
  31. desksearch-0.1.0/scripts/build-app.sh +18 -0
  32. desksearch-0.1.0/src/desksearch/__init__.py +2 -0
  33. desksearch-0.1.0/src/desksearch/__main__.py +501 -0
  34. desksearch-0.1.0/src/desksearch/api/__init__.py +0 -0
  35. desksearch-0.1.0/src/desksearch/api/routes.py +854 -0
  36. desksearch-0.1.0/src/desksearch/api/schemas.py +167 -0
  37. desksearch-0.1.0/src/desksearch/api/server.py +133 -0
  38. desksearch-0.1.0/src/desksearch/config.py +78 -0
  39. desksearch-0.1.0/src/desksearch/core/__init__.py +5 -0
  40. desksearch-0.1.0/src/desksearch/core/bm25.py +125 -0
  41. desksearch-0.1.0/src/desksearch/core/dense.py +266 -0
  42. desksearch-0.1.0/src/desksearch/core/fusion.py +94 -0
  43. desksearch-0.1.0/src/desksearch/core/search.py +208 -0
  44. desksearch-0.1.0/src/desksearch/core/snippets.py +147 -0
  45. desksearch-0.1.0/src/desksearch/daemon/__init__.py +1 -0
  46. desksearch-0.1.0/src/desksearch/daemon/autostart.py +224 -0
  47. desksearch-0.1.0/src/desksearch/daemon/service.py +616 -0
  48. desksearch-0.1.0/src/desksearch/daemon/tray.py +145 -0
  49. desksearch-0.1.0/src/desksearch/indexer/__init__.py +4 -0
  50. desksearch-0.1.0/src/desksearch/indexer/chunker.py +129 -0
  51. desksearch-0.1.0/src/desksearch/indexer/embedder.py +308 -0
  52. desksearch-0.1.0/src/desksearch/indexer/parsers.py +161 -0
  53. desksearch-0.1.0/src/desksearch/indexer/pipeline.py +401 -0
  54. desksearch-0.1.0/src/desksearch/indexer/store.py +224 -0
  55. desksearch-0.1.0/src/desksearch/indexer/watcher.py +111 -0
  56. desksearch-0.1.0/src/desksearch/onboarding.py +263 -0
  57. desksearch-0.1.0/src/desksearch/plugins/__init__.py +26 -0
  58. desksearch-0.1.0/src/desksearch/plugins/base.py +97 -0
  59. desksearch-0.1.0/src/desksearch/plugins/builtin/__init__.py +1 -0
  60. desksearch-0.1.0/src/desksearch/plugins/builtin/browser_bookmarks.py +143 -0
  61. desksearch-0.1.0/src/desksearch/plugins/builtin/clipboard_monitor.py +81 -0
  62. desksearch-0.1.0/src/desksearch/plugins/builtin/email_connector.py +107 -0
  63. desksearch-0.1.0/src/desksearch/plugins/loader.py +126 -0
  64. desksearch-0.1.0/src/desksearch/plugins/registry.py +113 -0
  65. desksearch-0.1.0/src/ui/index.html +13 -0
  66. desksearch-0.1.0/src/ui/package-lock.json +2766 -0
  67. desksearch-0.1.0/src/ui/package.json +25 -0
  68. desksearch-0.1.0/src/ui/postcss.config.js +6 -0
  69. desksearch-0.1.0/src/ui/src/App.tsx +221 -0
  70. desksearch-0.1.0/src/ui/src/components/Dashboard.tsx +184 -0
  71. desksearch-0.1.0/src/ui/src/components/FileExplorer.tsx +279 -0
  72. desksearch-0.1.0/src/ui/src/components/Filters.tsx +117 -0
  73. desksearch-0.1.0/src/ui/src/components/FolderManager.tsx +360 -0
  74. desksearch-0.1.0/src/ui/src/components/IndexingProgress.tsx +62 -0
  75. desksearch-0.1.0/src/ui/src/components/Onboarding.tsx +263 -0
  76. desksearch-0.1.0/src/ui/src/components/ResultCard.tsx +143 -0
  77. desksearch-0.1.0/src/ui/src/components/ResultsList.tsx +75 -0
  78. desksearch-0.1.0/src/ui/src/components/SearchBar.tsx +92 -0
  79. desksearch-0.1.0/src/ui/src/components/Settings.tsx +192 -0
  80. desksearch-0.1.0/src/ui/src/components/StatusBar.tsx +51 -0
  81. desksearch-0.1.0/src/ui/src/config.ts +3 -0
  82. desksearch-0.1.0/src/ui/src/hooks/useIndexStatus.ts +38 -0
  83. desksearch-0.1.0/src/ui/src/hooks/useSearch.ts +56 -0
  84. desksearch-0.1.0/src/ui/src/main.tsx +10 -0
  85. desksearch-0.1.0/src/ui/src/styles/globals.css +35 -0
  86. desksearch-0.1.0/src/ui/src/types.ts +94 -0
  87. desksearch-0.1.0/src/ui/src/vite-env.d.ts +1 -0
  88. desksearch-0.1.0/src/ui/tailwind.config.js +31 -0
  89. desksearch-0.1.0/src/ui/tsconfig.json +21 -0
  90. desksearch-0.1.0/src/ui/vite.config.ts +22 -0
  91. desksearch-0.1.0/tests/__init__.py +0 -0
  92. desksearch-0.1.0/tests/test_api.py +157 -0
  93. desksearch-0.1.0/tests/test_indexer.py +346 -0
  94. desksearch-0.1.0/tests/test_integration.py +399 -0
  95. desksearch-0.1.0/tests/test_search.py +376 -0
@@ -0,0 +1,42 @@
1
+ ---
2
+ name: Bug Report
3
+ about: Report a bug to help us improve DeskSearch
4
+ title: '[Bug] '
5
+ labels: bug
6
+ assignees: ''
7
+ ---
8
+
9
+ ## Describe the Bug
10
+
11
+ A clear and concise description of what the bug is.
12
+
13
+ ## Steps to Reproduce
14
+
15
+ 1. Run `desksearch ...`
16
+ 2. Search for '...'
17
+ 3. See error
18
+
19
+ ## Expected Behavior
20
+
21
+ What you expected to happen.
22
+
23
+ ## Actual Behavior
24
+
25
+ What actually happened.
26
+
27
+ ## Environment
28
+
29
+ - **OS**: [e.g., macOS 14.2, Ubuntu 22.04, Windows 11]
30
+ - **Python version**: [e.g., 3.12.1]
31
+ - **DeskSearch version**: [e.g., 0.1.0]
32
+ - **Install method**: [pip / source / desktop app]
33
+
34
+ ## Logs / Error Output
35
+
36
+ ```
37
+ Paste any error messages or logs here
38
+ ```
39
+
40
+ ## Additional Context
41
+
42
+ Add any other context, screenshots, or file samples (if relevant) here.
@@ -0,0 +1,23 @@
1
+ ---
2
+ name: Feature Request
3
+ about: Suggest a new feature or improvement
4
+ title: '[Feature] '
5
+ labels: enhancement
6
+ assignees: ''
7
+ ---
8
+
9
+ ## Problem
10
+
11
+ What problem does this feature solve? What are you trying to do that you can't do today?
12
+
13
+ ## Proposed Solution
14
+
15
+ Describe the solution you'd like. How would it work from a user's perspective?
16
+
17
+ ## Alternatives Considered
18
+
19
+ Any alternative solutions or features you've considered.
20
+
21
+ ## Additional Context
22
+
23
+ Add any other context, mockups, or examples here.
@@ -0,0 +1,91 @@
1
+ name: Build & Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'v*'
7
+
8
+ permissions:
9
+ contents: write
10
+
11
+ jobs:
12
+ build:
13
+ strategy:
14
+ matrix:
15
+ include:
16
+ - os: macos-latest
17
+ platform: mac
18
+ artifact: "dist/electron/DeskSearch-*.dmg"
19
+ - os: windows-latest
20
+ platform: win
21
+ artifact: "dist/electron/DeskSearch Setup *.exe"
22
+ - os: ubuntu-latest
23
+ platform: linux
24
+ artifact: "dist/electron/DeskSearch-*.AppImage"
25
+
26
+ runs-on: ${{ matrix.os }}
27
+
28
+ steps:
29
+ - uses: actions/checkout@v4
30
+
31
+ - name: Set up Python
32
+ uses: actions/setup-python@v5
33
+ with:
34
+ python-version: '3.12'
35
+
36
+ - name: Set up Node.js
37
+ uses: actions/setup-node@v4
38
+ with:
39
+ node-version: '22'
40
+
41
+ - name: Install Python dependencies
42
+ run: |
43
+ pip install -e ".[dev]"
44
+ pip install pyinstaller
45
+
46
+ - name: Build frontend
47
+ working-directory: src/ui
48
+ run: |
49
+ npm ci
50
+ npm run build
51
+
52
+ - name: Bundle backend with PyInstaller
53
+ run: |
54
+ pyinstaller --onedir --name desksearch-backend \
55
+ --hidden-import desksearch --hidden-import desksearch.api \
56
+ --hidden-import desksearch.api.server --hidden-import desksearch.api.routes \
57
+ --hidden-import desksearch.api.schemas --hidden-import desksearch.core \
58
+ --hidden-import desksearch.core.search --hidden-import desksearch.core.bm25 \
59
+ --hidden-import desksearch.core.dense --hidden-import desksearch.core.fusion \
60
+ --hidden-import desksearch.core.snippets --hidden-import desksearch.indexer \
61
+ --hidden-import desksearch.indexer.pipeline --hidden-import desksearch.indexer.parsers \
62
+ --hidden-import desksearch.indexer.chunker --hidden-import desksearch.indexer.embedder \
63
+ --hidden-import desksearch.indexer.store --hidden-import desksearch.indexer.watcher \
64
+ --hidden-import desksearch.config --hidden-import desksearch.onboarding \
65
+ --hidden-import desksearch.daemon --hidden-import desksearch.plugins \
66
+ --hidden-import uvicorn --hidden-import fastapi --hidden-import tantivy \
67
+ --hidden-import faiss --hidden-import onnxruntime --hidden-import tokenizers \
68
+ --hidden-import huggingface_hub \
69
+ --collect-all desksearch --collect-all tantivy --collect-all onnxruntime \
70
+ --collect-all tokenizers \
71
+ --add-data "src/ui/dist:ui/dist" \
72
+ --noconfirm \
73
+ src/desksearch/__main__.py
74
+
75
+ - name: Build Electron app
76
+ working-directory: electron
77
+ run: |
78
+ npm ci
79
+ npm run dist:${{ matrix.platform }}
80
+
81
+ - name: Upload artifact
82
+ uses: actions/upload-artifact@v4
83
+ with:
84
+ name: desksearch-${{ matrix.platform }}
85
+ path: ${{ matrix.artifact }}
86
+
87
+ - name: Upload to Release
88
+ if: startsWith(github.ref, 'refs/tags/')
89
+ uses: softprops/action-gh-release@v2
90
+ with:
91
+ files: ${{ matrix.artifact }}
@@ -0,0 +1,30 @@
1
+ # Python
2
+ .venv/
3
+ __pycache__/
4
+ *.pyc
5
+ *.egg-info/
6
+ dist/
7
+ build/
8
+ .pytest_cache/
9
+ *.so
10
+
11
+ # Node
12
+ node_modules/
13
+
14
+ # IDE
15
+ .idea/
16
+ .vscode/
17
+ *.swp
18
+ *.swo
19
+
20
+ # OS
21
+ .DS_Store
22
+ Thumbs.db
23
+
24
+ # LocalSearch data
25
+ .localsearch/
26
+
27
+ # Job search / personal
28
+ job-search/
29
+ arxiv-daily/
30
+ OVERNIGHT_PLAN.md
@@ -0,0 +1,197 @@
1
+ # DeskSearch — Private Semantic Search Engine for Your Files
2
+
3
+ ## Vision
4
+ A fast, beautiful, private semantic search engine that runs entirely on your laptop.
5
+ Index everything — documents, emails, notes, code, images — and find anything by asking in natural language.
6
+ "Perplexity for your own files."
7
+
8
+ ## Architecture Overview
9
+
10
+ ```
11
+ ┌─────────────────────────────────────────────────┐
12
+ │ Web UI (React) │
13
+ │ Search bar → Results with snippets & sources │
14
+ │ File preview │ Filters │ Collections │
15
+ └──────────────────────┬──────────────────────────┘
16
+ │ HTTP/WebSocket
17
+ ┌──────────────────────▼──────────────────────────┐
18
+ │ FastAPI Backend (Python) │
19
+ │ /search /index /status /settings │
20
+ │ Query understanding → Hybrid retrieval → │
21
+ │ Reranking → Snippet extraction │
22
+ └──────┬───────────┬───────────┬──────────────────┘
23
+ │ │ │
24
+ ┌──────▼──┐ ┌─────▼────┐ ┌───▼──────────────────┐
25
+ │ BM25 │ │ Dense │ │ Metadata Store │
26
+ │ Index │ │ Index │ │ (SQLite) │
27
+ │(tantivy)│ │ (FAISS/ │ │ file paths, dates, │
28
+ │ │ │ usearch)│ │ types, thumbnails │
29
+ └─────────┘ └──────────┘ └──────────────────────┘
30
+ ▲ ▲
31
+ │ │
32
+ ┌──────┴───────────┴──────────────────────────────┐
33
+ │ Indexing Pipeline │
34
+ │ File watcher (watchdog) → Parser → Chunker → │
35
+ │ Embedder (local model) → Index writer │
36
+ │ │
37
+ │ Parsers: PDF(marker) │ DOCX │ TXT │ MD │ Code │
38
+ │ Images(OCR) │ Email(.eml) │ HTML │
39
+ └─────────────────────────────────────────────────┘
40
+ ```
41
+
42
+ ## Tech Stack
43
+
44
+ ### Core (Python)
45
+ - **FastAPI** — async API server
46
+ - **SQLite + FTS5** — metadata store + full-text fallback
47
+ - **tantivy-py** — fast BM25 index (Rust-based, way faster than Whoosh)
48
+ - **FAISS or usearch** — dense vector index
49
+ - **sentence-transformers** — local embedding model (all-MiniLM-L6-v2 for MVP, upgrade later)
50
+ - **watchdog** — filesystem watcher for live indexing
51
+
52
+ ### Document Parsing
53
+ - **marker** — PDF → markdown (best quality)
54
+ - **python-docx** — Word documents
55
+ - **python-pptx** — PowerPoint
56
+ - **openpyxl** — Excel
57
+ - **beautifulsoup4** — HTML/emails
58
+ - **Pillow + pytesseract** — OCR for images/screenshots
59
+ - **tree-sitter** — code files (syntax-aware chunking)
60
+
61
+ ### UI (React + Vite)
62
+ - **React 18** with TypeScript
63
+ - **Tailwind CSS** — styling
64
+ - **Vite** — build tool
65
+ - Clean, minimal Perplexity-inspired design
66
+ - File preview panel
67
+ - Search filters (date, file type, folder)
68
+
69
+ ### Packaging
70
+ - **pip install desksearch** — Python package
71
+ - **brew install desksearch** — macOS (later)
72
+ - Single command to start: `desksearch serve`
73
+ - Auto-indexes ~/Documents, ~/Desktop, ~/Downloads by default
74
+
75
+ ## Key Design Decisions
76
+
77
+ 1. **100% Local** — No cloud, no API keys needed for MVP. Embedding model runs locally.
78
+ 2. **Hybrid Search** — BM25 + dense embeddings + reciprocal rank fusion. This is where Dylan's expertise shines.
79
+ 3. **Incremental Indexing** — File watcher detects changes, only re-indexes modified files.
80
+ 4. **Chunk with Context** — Each chunk stores parent document reference for full-context answers.
81
+ 5. **Fast Startup** — Index persists on disk. Startup = load index + start server. Should be <2 seconds.
82
+
83
+ ## MVP Scope (v0.1)
84
+
85
+ ### Must Have
86
+ - [ ] Index text files: .txt, .md, .pdf, .docx
87
+ - [ ] Hybrid search: BM25 + dense embeddings
88
+ - [ ] Reciprocal rank fusion for combining results
89
+ - [ ] Web UI with search bar and results
90
+ - [ ] File snippets with highlighted matches
91
+ - [ ] Click result → open file in system default app
92
+ - [ ] CLI: `desksearch index <path>` and `desksearch serve`
93
+ - [ ] Incremental indexing (only new/changed files)
94
+ - [ ] Basic filters: file type, date range
95
+
96
+ ### Nice to Have (v0.2)
97
+ - [ ] Image OCR indexing
98
+ - [ ] Code-aware indexing (tree-sitter)
99
+ - [ ] Answer generation (LLM summarizes top results)
100
+ - [ ] Email indexing (.eml, .mbox)
101
+ - [ ] Collections / saved searches
102
+ - [ ] System tray app (background daemon)
103
+
104
+ ### Future (v1.0)
105
+ - [ ] macOS native app
106
+ - [ ] Browser extension (index bookmarks)
107
+ - [ ] Slack/Discord/Gmail integrations (premium)
108
+ - [ ] Team/shared search (premium)
109
+ - [ ] Mobile companion app
110
+
111
+ ## Agent Assignments
112
+
113
+ ### Agent 1: Core Search Engine (src/core/)
114
+ - Hybrid retrieval: BM25 (tantivy) + dense (FAISS)
115
+ - Reciprocal rank fusion
116
+ - Query processing
117
+ - Result ranking and snippet extraction
118
+ - Tests
119
+
120
+ ### Agent 2: Indexing Pipeline (src/indexer/)
121
+ - File discovery and watching
122
+ - Document parsing (PDF, DOCX, TXT, MD, code)
123
+ - Chunking strategies
124
+ - Embedding generation
125
+ - SQLite metadata store
126
+ - Incremental index updates
127
+ - Tests
128
+
129
+ ### Agent 3: API Server (src/api/)
130
+ - FastAPI endpoints: /search, /index, /status, /settings
131
+ - WebSocket for live indexing progress
132
+ - CORS for UI
133
+ - Settings management
134
+ - Tests
135
+
136
+ ### Agent 4: Web UI (src/ui/)
137
+ - React + Vite + Tailwind
138
+ - Search interface (Perplexity-inspired)
139
+ - Results display with snippets, file icons, dates
140
+ - File type/date filters
141
+ - File preview panel
142
+ - Settings page (indexed folders, reindex trigger)
143
+
144
+ ## File Structure
145
+ ```
146
+ desksearch/
147
+ ├── pyproject.toml
148
+ ├── README.md
149
+ ├── ARCHITECTURE.md
150
+ ├── src/
151
+ │ ├── __init__.py
152
+ │ ├── __main__.py # CLI entry point
153
+ │ ├── config.py # Settings/configuration
154
+ │ ├── core/
155
+ │ │ ├── __init__.py
156
+ │ │ ├── search.py # Hybrid search engine
157
+ │ │ ├── bm25.py # Tantivy BM25 wrapper
158
+ │ │ ├── dense.py # FAISS/usearch dense index
159
+ │ │ ├── fusion.py # Reciprocal rank fusion
160
+ │ │ └── snippets.py # Snippet extraction & highlighting
161
+ │ ├── indexer/
162
+ │ │ ├── __init__.py
163
+ │ │ ├── pipeline.py # Main indexing pipeline
164
+ │ │ ├── watcher.py # Filesystem watcher
165
+ │ │ ├── parsers.py # Document parsers
166
+ │ │ ├── chunker.py # Text chunking
167
+ │ │ ├── embedder.py # Local embedding model
168
+ │ │ └── store.py # SQLite metadata store
169
+ │ ├── api/
170
+ │ │ ├── __init__.py
171
+ │ │ ├── server.py # FastAPI app
172
+ │ │ ├── routes.py # API endpoints
173
+ │ │ └── schemas.py # Pydantic models
174
+ │ └── ui/ # React app (built separately)
175
+ │ ├── package.json
176
+ │ ├── vite.config.ts
177
+ │ ├── index.html
178
+ │ ├── src/
179
+ │ │ ├── App.tsx
180
+ │ │ ├── components/
181
+ │ │ │ ├── SearchBar.tsx
182
+ │ │ │ ├── ResultsList.tsx
183
+ │ │ │ ├── ResultCard.tsx
184
+ │ │ │ ├── FilePreview.tsx
185
+ │ │ │ └── Filters.tsx
186
+ │ │ ├── hooks/
187
+ │ │ │ └── useSearch.ts
188
+ │ │ └── styles/
189
+ │ │ └── globals.css
190
+ │ └── tailwind.config.js
191
+ ├── tests/
192
+ │ ├── test_search.py
193
+ │ ├── test_indexer.py
194
+ │ └── test_api.py
195
+ └── data/ # Default index storage
196
+ └── .gitkeep
197
+ ```
@@ -0,0 +1,122 @@
1
+ # Contributing to DeskSearch
2
+
3
+ Thanks for your interest in contributing! This guide will help you get started.
4
+
5
+ ## Getting Started
6
+
7
+ 1. **Fork** the repository on GitHub
8
+ 2. **Clone** your fork locally:
9
+ ```bash
10
+ git clone https://github.com/YOUR_USERNAME/desksearch.git
11
+ cd desksearch
12
+ ```
13
+ 3. **Install** development dependencies:
14
+ ```bash
15
+ pip install -e ".[dev]"
16
+ ```
17
+ 4. **Create a branch** for your change:
18
+ ```bash
19
+ git checkout -b feature/my-feature
20
+ ```
21
+
22
+ ## Development Setup
23
+
24
+ ### Backend (Python)
25
+
26
+ ```bash
27
+ # Install in development mode
28
+ pip install -e ".[dev]"
29
+
30
+ # Run tests
31
+ pytest
32
+
33
+ # Run a specific test
34
+ pytest tests/test_search.py -v
35
+ ```
36
+
37
+ ### Frontend (Web UI)
38
+
39
+ ```bash
40
+ cd src/ui
41
+ npm install
42
+ npm run dev # Start dev server with hot reload
43
+ npm run build # Production build
44
+ ```
45
+
46
+ ## Making Changes
47
+
48
+ ### Code Style
49
+
50
+ - **Python**: Follow PEP 8. Use type hints where practical.
51
+ - **TypeScript/React**: Follow the existing patterns in `src/ui/`.
52
+ - Keep changes focused — one feature or fix per PR.
53
+
54
+ ### Testing
55
+
56
+ - Add tests for new functionality
57
+ - Ensure all existing tests pass: `pytest`
58
+ - Test across file formats if touching parsing/indexing
59
+
60
+ ### Commit Messages
61
+
62
+ Use clear, descriptive commit messages:
63
+
64
+ ```
65
+ feat: add EPUB parser plugin
66
+ fix: handle empty PDF pages in parser
67
+ docs: update CLI reference with daemon commands
68
+ perf: batch embedding for 3x indexing speedup
69
+ refactor: extract snippet highlighting into module
70
+ ```
71
+
72
+ Prefixes: `feat`, `fix`, `docs`, `perf`, `refactor`, `test`, `chore`
73
+
74
+ ## Pull Request Process
75
+
76
+ 1. **Update documentation** if your change affects user-facing behavior
77
+ 2. **Add tests** for new functionality
78
+ 3. **Run the test suite** and make sure everything passes
79
+ 4. **Push** to your fork and open a Pull Request
80
+ 5. **Describe your changes** — what and why, not just how
81
+ 6. **Link related issues** if applicable
82
+
83
+ ### PR Title Format
84
+
85
+ ```
86
+ feat: add support for PPTX files
87
+ fix: search crash when index is empty
88
+ ```
89
+
90
+ ## Reporting Bugs
91
+
92
+ Open an issue using the [bug report template](.github/ISSUE_TEMPLATE/bug_report.md) and include:
93
+
94
+ - Steps to reproduce
95
+ - Expected vs actual behavior
96
+ - Python version and OS
97
+ - Error messages or logs
98
+
99
+ ## Requesting Features
100
+
101
+ Open an issue using the [feature request template](.github/ISSUE_TEMPLATE/feature_request.md) and describe:
102
+
103
+ - The problem you're trying to solve
104
+ - Your proposed solution
105
+ - Any alternatives you've considered
106
+
107
+ ## Architecture Overview
108
+
109
+ ```
110
+ src/desksearch/
111
+ ├── __main__.py # CLI entry point (Click)
112
+ ├── config.py # Configuration (Pydantic)
113
+ ├── api/ # FastAPI web server
114
+ ├── core/ # Search engines (BM25, dense, fusion, snippets)
115
+ ├── indexer/ # Parsing, chunking, embedding, pipeline
116
+ ├── plugins/ # Plugin system (base classes + loader)
117
+ └── daemon/ # Background service, tray, autostart
118
+ ```
119
+
120
+ ## License
121
+
122
+ By contributing, you agree that your contributions will be licensed under the [MIT License](LICENSE).
@@ -0,0 +1,116 @@
1
+ # Design Decisions — Performance & UX
2
+
3
+ ## 1. Search Must Be INSTANT (<50ms)
4
+
5
+ ### Problem
6
+ Users expect Google-speed search. Loading a sentence-transformer model takes 2-5 seconds. Embedding a query takes 50-200ms. This is too slow for keystroke-by-keystroke search.
7
+
8
+ ### Solution: Dual-Mode Search
9
+ - **Phase 1 (instant, <10ms):** BM25 only via tantivy (Rust-based, sub-millisecond). Results appear as you type.
10
+ - **Phase 2 (background, <200ms):** Dense embedding search runs in background, results merge in via RRF. UI smoothly updates.
11
+ - **Startup:** Tantivy index loads in <100ms from memory-mapped files. Dense index loads async in background.
12
+ - **Model pre-loading:** The embedding model is loaded once at startup and stays in memory. The first search may see a 1-2s delay for the model load; after that, search is instant.
13
+
14
+ ### Alternatives Considered
15
+ - Pre-compute query embeddings for common terms → too much storage, stale
16
+ - Use ONNX runtime instead of PyTorch → 3-5x faster inference, should implement in v0.2
17
+
18
+ ## 2. App-Like Experience
19
+
20
+ ### Problem
21
+ `pip install` + `desksearch serve` is developer-friendly but not consumer-friendly. Regular users want to download an app.
22
+
23
+ ### Solution: Progressive Packaging
24
+ - **v0.1 (now):** pip install + CLI. Target: developers and power users.
25
+ - **v0.2:** System tray app using `pystray`. Runs in background, menubar icon, opens web UI in browser.
26
+ - **v0.3:** Electron or Tauri desktop app. Single .dmg/.exe download. Bundles Python runtime + models.
27
+ - **v1.0:** Native macOS app (Swift) / Windows app. Proper file system integration.
28
+
29
+ ### For MVP: System Tray Daemon
30
+ ```
31
+ desksearch install-daemon # Creates LaunchAgent (macOS) or service (Linux/Windows)
32
+ # App starts on login, sits in system tray
33
+ # Click tray icon → opens search in browser
34
+ # Keyboard shortcut: Cmd+Shift+Space → instant search
35
+ ```
36
+
37
+ ## 3. Search Quality (This is the differentiator)
38
+
39
+ ### Why Our Search Is Better Than Spotlight/Windows Search
40
+ 1. **Hybrid retrieval:** BM25 catches exact terms, dense catches semantic meaning. Neither alone is sufficient.
41
+ 2. **Reciprocal Rank Fusion:** Principled way to combine two ranking signals (Dylan's IR expertise).
42
+ 3. **Smart chunking:** Paragraph-aware, respects document structure. Not blind character splits.
43
+ 4. **Query understanding:** Detect query type (keyword vs question vs phrase) and adjust retrieval strategy.
44
+ 5. **Snippet quality:** Show the MOST RELEVANT passage, not just the first occurrence.
45
+
46
+ ### Ranking Pipeline (v0.2+)
47
+ ```
48
+ Query → Query Analysis → BM25 Search + Dense Search → RRF Fusion → Reranker (cross-encoder) → Results
49
+ ```
50
+ Adding a cross-encoder reranker (e.g., ms-marco-MiniLM) on top-20 fused results would dramatically improve precision. This is standard in production search but NO local search tool does it.
51
+
52
+ ## 4. Indexing Must Be Background & Non-Intrusive
53
+
54
+ ### Requirements
55
+ - First index: scan + parse + embed all files. May take 5-30 min depending on corpus size.
56
+ - Incremental: file watcher detects changes, re-indexes only modified files. <1s per file.
57
+ - CPU usage: cap embedding at 50% CPU. Don't make the laptop fan spin up.
58
+ - Disk usage: index should be <10% of original corpus size.
59
+
60
+ ### Implementation
61
+ - Use ThreadPoolExecutor with max_workers=2 for embedding (limits CPU)
62
+ - Batch embed: collect 32 chunks, embed in one call (much faster than 1-by-1)
63
+ - Progress reporting via WebSocket to UI
64
+ - Pause/resume indexing
65
+
66
+ ## 5. Embedding Model Choice
67
+
68
+ ### For MVP: all-MiniLM-L6-v2
69
+ - 80MB model, loads fast
70
+ - 384-dim embeddings (small index)
71
+ - Good quality for general text
72
+ - Runs on CPU in ~5ms per query
73
+
74
+ ### For v0.2: User-selectable
75
+ - Option to use larger models (e5-large, bge-base) for better quality
76
+ - ONNX runtime for 3-5x speedup
77
+ - Apple Silicon MPS acceleration
78
+
79
+ ### For v1.0: Custom model
80
+ - Train a model specifically optimized for local file search
81
+ - Matryoshka embeddings (Dylan's Starbucks work) for flexible dim/precision tradeoff
82
+ - This is the ultimate differentiator — a model trained FOR this exact use case
83
+
84
+ ## 6. Global Keyboard Shortcut
85
+
86
+ Critical for adoption. Users should be able to:
87
+ - Press `Cmd+Shift+Space` (configurable) from anywhere
88
+ - Search bar pops up immediately
89
+ - Type query → instant results
90
+ - Press Enter → open file
91
+ - Press Escape → dismiss
92
+
93
+ Implementation: Requires native integration.
94
+ - macOS: NSEvent global monitor (Swift helper or pyobjc)
95
+ - Linux: X11/Wayland hotkey binding
96
+ - Windows: RegisterHotKey
97
+
98
+ ## 7. File Type Priority Ranking
99
+
100
+ Not all files are equal. A matching PDF in ~/Documents is more relevant than a .pyc in __pycache__.
101
+
102
+ Default boosts:
103
+ - Documents (.pdf, .docx, .md): 1.5x
104
+ - Notes/text (.txt, .org): 1.3x
105
+ - Code (.py, .js): 1.0x
106
+ - Config (.json, .yaml, .toml): 0.8x
107
+ - Recent files: 1.2x boost for files modified in last 7 days
108
+ - Frequently opened: track open count, boost popular files
109
+
110
+ ## 8. Privacy & Security
111
+
112
+ - ALL processing is local. No network calls except a one-time download of the embedding model.
113
+ - Index stored in ~/.desksearch/ with user-only permissions (0700).
114
+ - No telemetry, no analytics, no phoning home.
115
+ - Option to encrypt index at rest (v0.3).
116
+ - Excluded paths: `.ssh`, `.gnupg`, and `.env` files with secrets are auto-excluded.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Shuai Wang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.