sahara-memory 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. sahara_memory-0.2.1/.gitignore +43 -0
  2. sahara_memory-0.2.1/.saharaignore.template +123 -0
  3. sahara_memory-0.2.1/ARCHITECTURE.md +256 -0
  4. sahara_memory-0.2.1/CHANGELOG.md +110 -0
  5. sahara_memory-0.2.1/CODE_OF_CONDUCT.md +126 -0
  6. sahara_memory-0.2.1/CONTRIBUTING.md +273 -0
  7. sahara_memory-0.2.1/LICENSE +21 -0
  8. sahara_memory-0.2.1/PKG-INFO +328 -0
  9. sahara_memory-0.2.1/README.md +271 -0
  10. sahara_memory-0.2.1/RELEASE_CHECKLIST.md +77 -0
  11. sahara_memory-0.2.1/ROADMAP.md +76 -0
  12. sahara_memory-0.2.1/SECURITY.md +124 -0
  13. sahara_memory-0.2.1/docs/ANSWER_PROVIDERS.md +180 -0
  14. sahara_memory-0.2.1/docs/CLAUDE_DESKTOP.md +276 -0
  15. sahara_memory-0.2.1/docs/COMMAND_REFERENCE.md +207 -0
  16. sahara_memory-0.2.1/docs/GETTING_STARTED.md +160 -0
  17. sahara_memory-0.2.1/docs/demo/README.md +34 -0
  18. sahara_memory-0.2.1/docs/demo/fixtures/Home/Repairs/dishwasher_invoice.md +6 -0
  19. sahara_memory-0.2.1/docs/demo/fixtures/Medical/prescription_receipt.md +8 -0
  20. sahara_memory-0.2.1/docs/demo/fixtures/Travel/portugal_itinerary.md +8 -0
  21. sahara_memory-0.2.1/docs/demo/sahara-demo.tape +54 -0
  22. sahara_memory-0.2.1/docs/images/sahara-mcp-social.png +0 -0
  23. sahara_memory-0.2.1/docs/images/sahara-memory-demo.svg +51 -0
  24. sahara_memory-0.2.1/docs/integrations/chat-agents.md +119 -0
  25. sahara_memory-0.2.1/pyproject.toml +119 -0
  26. sahara_memory-0.2.1/specs/THREE_STEP_PRODUCT_MODEL_PLAN.md +496 -0
  27. sahara_memory-0.2.1/src/sahara/__init__.py +4 -0
  28. sahara_memory-0.2.1/src/sahara/claude_desktop.py +238 -0
  29. sahara_memory-0.2.1/src/sahara/cli.py +2658 -0
  30. sahara_memory-0.2.1/src/sahara/config.py +266 -0
  31. sahara_memory-0.2.1/src/sahara/cost_estimator.py +5 -0
  32. sahara_memory-0.2.1/src/sahara/daemon.py +5 -0
  33. sahara_memory-0.2.1/src/sahara/encryption.py +5 -0
  34. sahara_memory-0.2.1/src/sahara/file_watcher.py +5 -0
  35. sahara_memory-0.2.1/src/sahara/ignore_rules.py +5 -0
  36. sahara_memory-0.2.1/src/sahara/library.py +224 -0
  37. sahara_memory-0.2.1/src/sahara/mcp_server.py +347 -0
  38. sahara_memory-0.2.1/src/sahara/models.py +173 -0
  39. sahara_memory-0.2.1/src/sahara/notifier.py +5 -0
  40. sahara_memory-0.2.1/src/sahara/s3_client.py +5 -0
  41. sahara_memory-0.2.1/src/sahara/search/__init__.py +10 -0
  42. sahara_memory-0.2.1/src/sahara/search/ask_engine.py +212 -0
  43. sahara_memory-0.2.1/src/sahara/search/search_engine.py +340 -0
  44. sahara_memory-0.2.1/src/sahara/search_engine.py +5 -0
  45. sahara_memory-0.2.1/src/sahara/state_db.py +5 -0
  46. sahara_memory-0.2.1/src/sahara/storage/__init__.py +25 -0
  47. sahara_memory-0.2.1/src/sahara/storage/backend.py +95 -0
  48. sahara_memory-0.2.1/src/sahara/storage/cost_estimator.py +271 -0
  49. sahara_memory-0.2.1/src/sahara/storage/dual_write_backend.py +175 -0
  50. sahara_memory-0.2.1/src/sahara/storage/lifecycle.py +204 -0
  51. sahara_memory-0.2.1/src/sahara/storage/local_drive_client.py +321 -0
  52. sahara_memory-0.2.1/src/sahara/storage/s3_client.py +701 -0
  53. sahara_memory-0.2.1/src/sahara/storage/state_db.py +1404 -0
  54. sahara_memory-0.2.1/src/sahara/sync/__init__.py +35 -0
  55. sahara_memory-0.2.1/src/sahara/sync/daemon.py +553 -0
  56. sahara_memory-0.2.1/src/sahara/sync/file_watcher.py +204 -0
  57. sahara_memory-0.2.1/src/sahara/sync/ignore_rules.py +84 -0
  58. sahara_memory-0.2.1/src/sahara/sync/sync_engine.py +1081 -0
  59. sahara_memory-0.2.1/src/sahara/sync_engine.py +5 -0
  60. sahara_memory-0.2.1/src/sahara/utils/__init__.py +37 -0
  61. sahara_memory-0.2.1/src/sahara/utils/encryption.py +307 -0
  62. sahara_memory-0.2.1/src/sahara/utils/hash.py +17 -0
  63. sahara_memory-0.2.1/src/sahara/utils/notifier.py +130 -0
@@ -0,0 +1,43 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.pyo
5
+ *.pyd
6
+ .Python
7
+ *.egg-info/
8
+ *.egg
9
+ .eggs/
10
+ build/
11
+ dist/
12
+ wheels/
13
+ .venv/
14
+ venv/
15
+ env/
16
+
17
+ # Testing / coverage
18
+ .coverage
19
+ .coverage.*
20
+ htmlcov/
21
+ .pytest_cache/
22
+ .tox/
23
+
24
+ # macOS
25
+ .DS_Store
26
+ .AppleDouble
27
+ ._*
28
+
29
+ # IDE
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+
35
+ # Environment / secrets
36
+ .env
37
+ .env.local
38
+ .env.*.local
39
+
40
+ # Sahara runtime files (not source)
41
+ *.db
42
+ *.db-wal
43
+ *.db-shm
@@ -0,0 +1,123 @@
1
+ # .saharaignore — Sahara ignore rules
2
+ #
3
+ # Syntax is identical to .gitignore (gitignore wildmatch patterns).
4
+ # Lines beginning with # are comments.
5
+ # Blank lines are ignored.
6
+ #
7
+ # Patterns are evaluated relative to the indexed content root.
8
+ #
9
+ # Examples:
10
+ # *.log — ignore all .log files anywhere
11
+ # /tmp/ — ignore the "tmp" folder directly under the content root
12
+ # build/ — ignore any folder named "build"
13
+ # docs/**/*.pdf — ignore all PDFs inside docs/ recursively
14
+ # !important.log — un-ignore important.log (exception to a broader rule)
15
+ #
16
+ # ──────────────────────────────────────────────────────────────────────────────
17
+ # Version control
18
+ # ──────────────────────────────────────────────────────────────────────────────
19
+ .git/
20
+ .hg/
21
+ .svn/
22
+
23
+ # ──────────────────────────────────────────────────────────────────────────────
24
+ # Python
25
+ # ──────────────────────────────────────────────────────────────────────────────
26
+ __pycache__/
27
+ *.py[cod]
28
+ *.pyo
29
+ .venv/
30
+ venv/
31
+ .env/
32
+ env/
33
+ *.egg-info/
34
+ dist/
35
+ build/
36
+ .mypy_cache/
37
+ .ruff_cache/
38
+ .pytest_cache/
39
+
40
+ # ──────────────────────────────────────────────────────────────────────────────
41
+ # JavaScript / Node
42
+ # ──────────────────────────────────────────────────────────────────────────────
43
+ node_modules/
44
+ .npm/
45
+ .yarn/
46
+ .pnp.*
47
+
48
+ # ──────────────────────────────────────────────────────────────────────────────
49
+ # macOS
50
+ # ──────────────────────────────────────────────────────────────────────────────
51
+ .DS_Store
52
+ .AppleDouble
53
+ .LSOverride
54
+ ._*
55
+ .Spotlight-V100
56
+ .Trashes
57
+ Icon
58
+
59
+ # ──────────────────────────────────────────────────────────────────────────────
60
+ # Windows
61
+ # ──────────────────────────────────────────────────────────────────────────────
62
+ Thumbs.db
63
+ Desktop.ini
64
+ ehthumbs.db
65
+ $RECYCLE.BIN/
66
+
67
+ # ──────────────────────────────────────────────────────────────────────────────
68
+ # Linux
69
+ # ──────────────────────────────────────────────────────────────────────────────
70
+ *~
71
+ .directory
72
+
73
+ # ──────────────────────────────────────────────────────────────────────────────
74
+ # Editors
75
+ # ──────────────────────────────────────────────────────────────────────────────
76
+ .idea/
77
+ .vscode/
78
+ *.swp
79
+ *.swo
80
+ *.bak
81
+ *.orig
82
+ *.tmp
83
+ *_conflict-*
84
+
85
+ # ──────────────────────────────────────────────────────────────────────────────
86
+ # Logs and databases
87
+ # ──────────────────────────────────────────────────────────────────────────────
88
+ *.log
89
+ *.sqlite
90
+ *.sqlite3
91
+ *.db
92
+
93
+ # ──────────────────────────────────────────────────────────────────────────────
94
+ # Archives (synced by default; uncomment the patterns below to ignore them)
95
+ # ──────────────────────────────────────────────────────────────────────────────
96
+ # *.zip
97
+ # *.tar
98
+ # *.tar.gz
99
+ # *.tar.bz2
100
+ # *.7z
101
+ # *.rar
102
+
103
+ # ──────────────────────────────────────────────────────────────────────────────
104
+ # Secrets (HIGHLY RECOMMENDED to keep these ignored)
105
+ # ──────────────────────────────────────────────────────────────────────────────
106
+ .env
107
+ .env.*
108
+ *.pem
109
+ *.key
110
+ *.p12
111
+ *.pfx
112
+ secrets/
113
+ credentials/
114
+
115
+ # ──────────────────────────────────────────────────────────────────────────────
116
+ # Sahara internals (do not remove)
117
+ # ──────────────────────────────────────────────────────────────────────────────
118
+ .sahara/
119
+ .saharaignore
120
+
121
+ # ──────────────────────────────────────────────────────────────────────────────
122
+ # Add your custom rules below this line
123
+ # ──────────────────────────────────────────────────────────────────────────────
@@ -0,0 +1,256 @@
1
+ # Sahara — Architecture
2
+
3
+ This document explains how Sahara is structured so contributors can find their way around quickly and extend the system without touching unrelated code.
4
+
5
+ ---
6
+
7
+ ## System overview
8
+
9
+ ```
10
+ ┌─────────────┐ ┌────────────────┐ ┌──────────────────────┐
11
+ │ CLI (click) │──▶│ IndexingService│──▶│ SearchEngine │
12
+ │ cli.py │ │ library.py │ │ fastembed + vec │
13
+ └─────────────┘ └────────────────┘ └──────────────────────┘
14
+ │ │ │
15
+ │ ▼ ▼
16
+ │ ┌──────────┐ AskEngine
17
+ │ │ StateDB │ Ollama/OpenAI
18
+ │ └──────────┘
19
+ │ ▲
20
+ ▼ │
21
+ ┌─────────────┐ ┌──────────────────────┐
22
+ │ SyncEngine │──▶│ Optional StorageBackend│
23
+ └─────────────┘ │ S3 / LocalDrive │
24
+ └──────────────────────┘
25
+ ```
26
+
27
+ The CLI is the only public surface. Everything else is an internal library that the CLI composes.
28
+
29
+ ---
30
+
31
+ ## Source layout
32
+
33
+ ```
34
+ src/sahara/
35
+ ├── cli.py # All Click commands — the public API
36
+ ├── config.py # SaharaConfig dataclass + TOML I/O
37
+ ├── library.py # Content-root migration and local indexing service
38
+ ├── models.py # FileRecord, SyncOperation, ManifestEntry, ...
39
+
40
+ ├── storage/
41
+ │ ├── backend.py # StorageBackend Protocol (the interface)
42
+ │ ├── s3_client.py # AWS S3 + MinIO implementation
43
+ │ ├── local_drive_client.py # Local filesystem implementation
44
+ │ ├── dual_write_backend.py # local+glacier dual-write wrapper
45
+ │ ├── state_db.py # SQLite state — files, history, chunks, ...
46
+ │ └── cost_estimator.py # S3 pricing estimates
47
+
48
+ ├── sync/
49
+ │ ├── sync_engine.py # Three-way diff, conflict resolution, execution
50
+ │ ├── daemon.py # Background sync loop
51
+ │ ├── file_watcher.py # watchdog integration
52
+ │ └── ignore_rules.py # .saharaignore parser
53
+
54
+ ├── search/
55
+ │ ├── search_engine.py # Text extraction, chunking, embedding, sqlite-vec KNN
56
+ │ └── ask_engine.py # LLM answer generation (ollama / OpenAI)
57
+
58
+ └── utils/
59
+ ├── encryption.py # AES-256-GCM, PBKDF2, keyring
60
+ ├── hash.py # SHA-256 helpers (shared between sync and search)
61
+ └── notifier.py # OS desktop notification
62
+ ```
63
+
64
+ ---
65
+
66
+ ## Storage backends
67
+
68
+ ### The `StorageBackend` Protocol
69
+
70
+ `src/sahara/storage/backend.py` defines the `StorageBackend` Protocol. Every backend must implement these methods:
71
+
72
+ | Method | Purpose |
73
+ |--------|---------|
74
+ | `upload_file` | Upload a local file, optionally encrypting it first |
75
+ | `download_file` | Download a key to a local path, optionally decrypting |
76
+ | `delete_object` | Delete a key |
77
+ | `copy_object` | Copy within the same backend (rename path) |
78
+ | `get_manifest` / `put_manifest` | Fetch / write the Sahara manifest atomically |
79
+ | `list_all_objects` | Bootstrap when no manifest exists yet |
80
+ | `head_object` | Return metadata (size, etag, storage class) |
81
+ | `validate_bucket_access` | Connectivity check |
82
+ | `check_conditional_put_support` | Whether atomic manifest writes are supported |
83
+ | `restore_object` | Glacier restore (S3 only; raise if unsupported) |
84
+
85
+ `SyncEngine` accepts any `StorageBackend` — it never imports a concrete backend class directly.
86
+
87
+ ### Adding a new backend
88
+
89
+ 1. Create `src/sahara/storage/mybackend_client.py`
90
+ 2. Implement all methods in the `StorageBackend` Protocol (use `LocalDriveClient` as the simplest reference)
91
+ 3. Add an `isinstance` check in `cli.py` where the backend is instantiated (search for `storage_mode`)
92
+ 4. Add tests in `tests/test_mybackend.py` — mock the external service, do not require real network access
93
+
94
+ ### Current backends
95
+
96
+ | Class | Module | Description |
97
+ |-------|--------|-------------|
98
+ | `S3Client` | `storage/s3_client.py` | AWS S3 and MinIO (via `endpoint_url`) |
99
+ | `LocalDriveClient` | `storage/local_drive_client.py` | Local filesystem or network mount |
100
+ | `DualWriteBackend` | `storage/dual_write_backend.py` | Writes to two backends simultaneously (local + glacier) |
101
+
102
+ ---
103
+
104
+ ## Sync pipeline
105
+
106
+ The sync pipeline lives in `sync/sync_engine.py`. The sequence for a full sync:
107
+
108
+ ```
109
+ 1. Load manifest from storage (single JSON object — avoids per-file HeadObject calls)
110
+ 2. Scan local folder → build local snapshot {path → sha256}
111
+ 3. Load last-known-good state from StateDB
112
+ 4. Three-way diff(local, remote_manifest, last_known_good):
113
+ - New local file → upload
114
+ - Deleted locally → delete from remote (or skip if remote was also changed = conflict)
115
+ - New remote file → download
116
+ - Deleted remotely → delete locally
117
+ - Both modified → conflict
118
+ 5. For each operation: execute in thread pool (max_workers parallel)
119
+ 6. Write updated manifest back to storage (atomic via If-Match ETag check)
120
+ 7. Update StateDB with new sync state
121
+ ```
122
+
123
+ ### Why the manifest?
124
+
125
+ Without the manifest, every sync would need to call `HeadObject` on every file in the bucket to check its current state — at $0.0004 per 1,000 calls and 50k files, that is $0.02 per sync, or about $7/month at roughly a dozen syncs per day. The manifest is a single JSON blob stored at `.sahara/manifest.json` in the bucket. One `GetObject` replaces thousands of `HeadObject` calls.
126
+
127
+ ### Conflict resolution
128
+
129
+ Conflict strategy is set in config (`backup` / `local` / `remote` / `ask`). The `backup` strategy (default) renames the local copy to `filename.conflict-TIMESTAMP.ext` and downloads the remote version — no data loss.
130
+
131
+ ---
132
+
133
+ ## Search pipeline
134
+
135
+ The search pipeline runs entirely locally. `library.py` scans every registered content
136
+ root directly; it does not depend on sync records or a storage backend.
137
+
138
+ ```
139
+ 1. IndexingService.index():
140
+ a. Load content roots from StateDB
141
+ b. Walk each root with .saharaignore rules
142
+ c. Maintain index_entries inventory and detect missing files
143
+ d. Call SearchEngine for supported local files
144
+
145
+ 2. SearchEngine.index_file(path):
146
+ a. Extract text (TextExtractor) — supports .txt, .md, .py, .pdf, .docx, and plain-text heuristic
147
+ b. Chunk text: 1600-char chunks with 320-char overlap
148
+ c. Embed each chunk independently with BAAI/bge-small-en-v1.5 (384-dim) via fastembed
149
+ d. Insert rows into `chunks` table and `vec_chunks` virtual table (sqlite-vec)
150
+
151
+ 3. search(query):
152
+ a. Embed the query string
153
+ b. KNN query against vec_chunks (computed natively inside sqlite-vec, not a Python cosine loop)
154
+ c. Join against `chunks` to get file paths and snippet text
155
+ d. Deduplicate: keep best chunk score per file
156
+ e. Return ranked list of {relative_path, score, snippet}
157
+ ```
158
+
159
+ ### Why chunked indexing?
160
+
161
+ A 50-page PDF has ~25,000 words. Embedding the whole document as one vector would mean the embedding averages over all content, making any specific detail on page 30 nearly unretrievable. By splitting into chunks of roughly 400 tokens (1,600 characters) with an 80-token (320-character) overlap, each chunk can be matched independently, so a query about page 30 will find the right chunk.
162
+
163
+ ### Adding a new file parser
164
+
165
+ `TextExtractor.extract()` in `search/search_engine.py` dispatches on file extension. Add a new `elif suffix == ".xyz"` branch there. For heavier parsers (OCR, audio transcription) consider wrapping the import in a `try/except ImportError` so the base install does not require the dependency.
166
+
167
+ ---
168
+
169
+ ## Ask pipeline
170
+
171
+ `search/ask_engine.py` wraps `SearchEngine` with an LLM layer.
172
+
173
+ ```
174
+ 1. Run search(question, top_k)
175
+ 2. Build context string from top chunk texts (capped at 6,000 chars)
176
+ 3. Try LLM in priority order:
177
+ a. OpenAI if OPENAI_API_KEY is set
178
+ b. Ollama at http://localhost:11434 if reachable
179
+ c. Degrade: return search results with snippets, no generated answer
180
+ 4. Return AskResult(answer, sources, degraded, model_used)
181
+ ```
182
+
183
+ Degraded mode is intentional — `sahara ask` is useful even without any LLM installed, because the ranked snippets alone often answer the question at a glance.
184
+
185
+ ---
186
+
187
+ ## Daemon and file watcher
188
+
189
+ `sync/daemon.py` runs a background loop that calls `SyncEngine.sync()` on a configurable interval. `sync/file_watcher.py` wraps watchdog's `Observer` and triggers an immediate partial sync when specific files change, rather than waiting for the interval.
190
+
191
+ The daemon writes a PID file to `~/.sahara/daemon.pid` and logs to `~/.sahara/daemon.log`. The CLI's `sahara daemon start/stop/status` commands manage it.
192
+
193
+ ---
194
+
195
+ ## SQLite schema
196
+
197
+ All state is stored in `~/.sahara/state.db`. WAL mode is enabled on every connection for safe concurrent reads.
198
+
199
+ | Table | Purpose |
200
+ |-------|---------|
201
+ | `files` | One row per synced file — sha256, size, tier, timestamps, is_deleted |
202
+ | `history` | Append-only log of every sync operation |
203
+ | `pending_multipart` | In-flight multipart upload state (crash recovery) |
204
+ | `sync_targets` | Registered (local_path, s3_prefix) pairs |
205
+ | `content_roots` | Canonical indexed folders with primary and sync-enabled flags |
206
+ | `index_entries` | Local indexing inventory and indexed/unsupported/missing status |
207
+ | `storage_residency` | Explicit present/offloaded/missing state for stored files |
208
+ | `config_kv` | Key-value store for runtime config values |
209
+ | `embeddings` | Legacy single-vector-per-file index (superseded by `chunks`) |
210
+ | `chunks` | One row per text chunk — path, chunk_index, content_hash, chunk_text |
211
+ | `vec_chunks` | sqlite-vec virtual table — one float[384] vector per chunk (rowid matches `chunks.id`) |
212
+
213
+ The `chunks` and `vec_chunks` tables work as a pair. `vec_chunks` stores the raw vectors; `chunks` stores the text and metadata. A JOIN on `vec_chunks.rowid = chunks.id` links them.
214
+
215
+ ### Offload lifecycle
216
+
217
+ `StorageLifecycle.offload()` requires a synced, indexed file. It downloads the stored
218
+ object to temporary storage, decrypts it when needed, verifies the plaintext SHA-256,
219
+ marks the file offloaded, and then removes the local source. Chunks and embeddings are
220
+ retained. `fetch()` downloads atomically, verifies the same checksum, and marks the file
221
+ present again. Sync ignores intentional offloads so they cannot be mistaken for local
222
+ deletions.
223
+
224
+ ---
225
+
226
+ ## Configuration
227
+
228
+ Config lives at `~/.sahara/config.toml`. `storage_mode = "none"` is the fresh-install
229
+ default. Existing configuration files that predate `storage_mode` are loaded as S3
230
+ configurations for compatibility. The CLI reads configuration at startup and passes a
231
+ snapshot down to each subsystem.
232
+
233
+ The TOML format is stable and user-editable. Do not add auto-generated comments or machine-managed sections to the config file.
234
+
235
+ ---
236
+
237
+ ## Known limitations
238
+
239
+ - **No reranker yet.** Results from sqlite-vec KNN are re-sorted by score but not re-ranked by a cross-encoder. Precision is good but not state-of-the-art for ambiguous queries.
240
+ - **Single embedding model.** Only `BAAI/bge-small-en-v1.5` (384-dim) is supported. Switching models requires re-indexing all files.
241
+ - **No incremental scanning.** `sahara index` re-scans the whole collection on every run. Content-hash tracking means only changed files are re-embedded, but the change check is still O(n) on the files table.
242
+ - **Single-user only.** The manifest + SQLite architecture assumes one writer at a time. Multiple machines syncing to the same bucket will serialize through the manifest ETag check.
243
+
244
+ ---
245
+
246
+ ## Where to start
247
+
248
+ | Contribution area | Start here |
249
+ |-------------------|-----------|
250
+ | New storage backend | `storage/backend.py` (Protocol) → `storage/local_drive_client.py` (simplest impl) |
251
+ | New file parser | `search/search_engine.py` `TextExtractor.extract()` |
252
+ | Improve search ranking | `search/search_engine.py` `SearchEngine.search()` |
253
+ | New CLI command | `cli.py` — add a `@main.command()` function |
254
+ | Sync bug | `sync/sync_engine.py` `DiffResult` and `_execute_operations()` |
255
+ | Daemon / watcher | `sync/daemon.py`, `sync/file_watcher.py` |
256
+ | Encryption | `utils/encryption.py` |
@@ -0,0 +1,110 @@
1
+ # Changelog
2
+
3
+ All notable changes to Sahara are documented here.
4
+
5
+ Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). Sahara uses [Semantic Versioning](https://semver.org/).
6
+
7
+ ---
8
+
9
+ ## [Unreleased]
10
+
11
+ ---
12
+
13
+ ## [0.2.1] — 2026-06-07
14
+
15
+ ### Added
16
+
17
+ - Trusted Publishing workflows for verified TestPyPI and PyPI releases
18
+ - Contributor Covenant 3.0 code of conduct with private incident reporting
19
+ - Claude Desktop launch guide with platform configuration, exact MCP tool contracts,
20
+ security boundaries, verification, and troubleshooting
21
+ - Three-step product plan for basic indexing with optional local-drive or AWS storage
22
+ - Basic index-only mode with non-interactive `sahara init --mode basic --folder <path>`
23
+ - Canonical content-root and index-inventory database tables
24
+ - `sahara folder add/list/remove/sync` commands for index and sync scope management
25
+ - `sahara storage configure local/aws` for upgrading an existing basic library
26
+ - Checksum-verified `sahara offload` and `sahara fetch` with retained search metadata
27
+ - Explicit storage residency in CLI search/list/status and MCP results
28
+ - Local indexing that scans content roots without requiring sync records or storage
29
+ - `sahara mcp install-claude` for merge-safe, one-command Claude Desktop setup on
30
+ macOS and Windows
31
+
32
+ ### Changed
33
+
34
+ - Restored full mypy checking for the daemon and filesystem watcher
35
+ - Renamed the Python distribution from `sahara` to `sahara-memory` to avoid the
36
+ unrelated OpenStack Sahara project on PyPI; the product name, `sahara` CLI,
37
+ and `sahara` import package are unchanged
38
+ - First-time indexing now explains the local embedding-model download and clarifies
39
+ that Hugging Face authentication warnings do not require user action
40
+ - Package and license metadata now identify Nidheesh Puthalath as the maintainer
41
+ - README quick start now demonstrates both CLI retrieval and cited Claude Desktop use
42
+ - Added fictional, privacy-safe README, social, and reproducible terminal demo assets
43
+ - Ollama is the initial answer provider; OpenAI can be selected explicitly or saved
44
+ as the user's default without installing Ollama
45
+ - Added first-run Ollama and optional OpenAI setup guidance
46
+ - Streamlined the README around local search first, with answers, MCP, and storage
47
+ introduced as optional extensions
48
+ - Added a categorized reference covering every CLI command
49
+ - Documentation consolidated around current user, contributor, release, and architecture
50
+ guidance; superseded specifications remain available through Git history
51
+ - Fresh installations default to local indexing; legacy configs without `storage_mode`
52
+ retain their previous S3 behavior
53
+ - `index-report` now reads the local index inventory rather than the sync file table
54
+
55
+ ---
56
+
57
+ ## [0.2.0] — 2026-06-06
58
+
59
+ ### Added
60
+
61
+ - **Semantic search** — `sahara index` extracts and embeds file content; `sahara search <query>` retrieves files by meaning using sqlite-vec KNN
62
+ - **Chunked indexing** — long documents (PDFs, DOCX) are split into overlapping 400-token chunks so content past the first page is retrievable
63
+ - **`sahara ask`** — natural language question answering; uses local Ollama or OpenAI when available, degrades gracefully to ranked snippets
64
+ - **MinIO backend** — S3-compatible self-hosted storage via `endpoint_url` configuration
65
+ - **Local drive backend** — sync to a second local drive or NAS with no cloud account required
66
+ - **`local+glacier` dual-write mode** — writes to a local drive and S3 Glacier simultaneously
67
+ - **`StorageBackend` Protocol** — formal structural interface for all storage backends; `SyncEngine` no longer imports concrete backend classes
68
+ - **`BAAI/bge-small-en-v1.5` embedding model** — 384-dim vectors via `fastembed`; fast enough for CPU-only indexing
69
+ - **PDF and DOCX extraction** — `pypdf` and `python-docx` are optional dependencies under `[search]`
70
+ - **`sahara doctor --repair`** — diagnose and auto-fix common configuration problems
71
+ - **SHA-256 utility** — shared `utils/hash.py` used by both sync and search (previously duplicated)
72
+ - **Read-only MCP server** — exposes search, ask, chunk reads, folder listing, and index status to Claude Desktop and other MCP clients
73
+ - **Authenticated remote MCP** — HTTP/streamable transport with bearer-token protection for secure tunnel and remote-client workflows
74
+ - **MCP exposure controls** — tool and storage-prefix allowlists, snippet-size limits, and warnings for non-loopback bindings
75
+ - **`sahara index-report`** — reports indexed/unindexed file counts, skip reasons, and sample indexing gaps
76
+ - **MIT license file** — included in the repository, wheel metadata, and source distribution
77
+
78
+ ### Changed
79
+
80
+ - Public positioning updated to "Sahara: extended storage, searchable memory and instant retrieval"
81
+ - `_require_config` guard: local drive mode no longer requires a bucket to be configured
82
+ - Storage modules reorganised into `src/sahara/storage/`, sync modules into `src/sahara/sync/`
83
+ - Indexing skips unsupported binary media instead of attempting noisy text extraction
84
+
85
+ ### Fixed
86
+
87
+ - Manifest locking race condition under concurrent syncs
88
+ - False abort in local drive mode due to missing bucket check
89
+ - Optional MCP dependency tests now skip cleanly when the `[mcp]` extra is not installed
90
+
91
+ ---
92
+
93
+ ## [0.1.0] — 2024-03-16
94
+
95
+ ### Added
96
+
97
+ - **Bidirectional sync** to AWS S3 with three-way diff (local / remote / last-known-good)
98
+ - **Client-side AES-256-GCM encryption** with PBKDF2-HMAC-SHA256 key derivation (600,000 iterations)
99
+ - **Glacier archiving** — `sahara archive`, `sahara restore`, `sahara restore-download`
100
+ - **Background daemon** with file-watching via watchdog
101
+ - **Rename detection** — moves are tracked as copy + delete rather than delete + upload
102
+ - **Conflict resolution** — backup, local, remote, and ask strategies
103
+ - **Cost reporting** — `sahara usage` shows storage usage and estimated monthly S3 cost
104
+ - **`.saharaignore`** — gitignore-style rules for excluding files from sync
105
+ - **Multipart uploads** — automatic for files above a configurable threshold
106
+ - **`sahara doctor`** — connectivity and configuration diagnostics
107
+ - `sahara init` interactive setup wizard
108
+ - `sahara config show/get/set` configuration management
109
+ - `sahara history` sync operation log
110
+ - `sahara conflicts` and `sahara resolve`
@@ -0,0 +1,126 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ We pledge to make our community welcoming, safe, and equitable for all.
6
+
7
+ We are committed to fostering an environment that respects and promotes the dignity,
8
+ rights, and contributions of all individuals, regardless of characteristics including
9
+ race, ethnicity, caste, color, age, physical characteristics, neurodiversity,
10
+ disability, sex or gender, gender identity or expression, sexual orientation, language,
11
+ philosophy or religion, national or social origin, socio-economic position, level of
12
+ education, or other status. The same privileges of participation are extended to
13
+ everyone who participates in good faith and in accordance with this Covenant.
14
+
15
+ ## Encouraged Behaviors
16
+
17
+ While acknowledging differences in social norms, we all strive to meet our community's
18
+ expectations for positive behavior. We also understand that our words and actions may
19
+ be interpreted differently than we intend based on culture, background, or native
20
+ language.
21
+
22
+ With these considerations in mind, we agree to behave mindfully toward each other and
23
+ act in ways that center our shared values, including:
24
+
25
+ 1. Respecting the **purpose of our community**, our activities, and our ways of
26
+ gathering.
27
+ 2. Engaging **kindly and honestly** with others.
28
+ 3. Respecting **different viewpoints** and experiences.
29
+ 4. **Taking responsibility** for our actions and contributions.
30
+ 5. Gracefully giving and accepting **constructive feedback**.
31
+ 6. Committing to **repairing harm** when it occurs.
32
+ 7. Behaving in other ways that promote and sustain the **well-being of our community**.
33
+
34
+ ## Restricted Behaviors
35
+
36
+ We agree to restrict the following behaviors in our community. Instances, threats, and
37
+ promotion of these behaviors are violations of this Code of Conduct.
38
+
39
+ 1. **Harassment.** Violating explicitly expressed boundaries or engaging in unnecessary
40
+ personal attention after any clear request to stop.
41
+ 2. **Character attacks.** Making insulting, demeaning, or pejorative comments directed
42
+ at a community member or group of people.
43
+ 3. **Stereotyping or discrimination.** Characterizing anyone's personality or behavior
44
+ on the basis of immutable identities or traits.
45
+ 4. **Sexualization.** Behaving in a way that would generally be considered
46
+ inappropriately intimate in the context or purpose of the community.
47
+ 5. **Violating confidentiality.** Sharing or acting on someone's personal or private
48
+ information without their permission.
49
+ 6. **Endangerment.** Causing, encouraging, or threatening violence or other harm toward
50
+ any person or group.
51
+ 7. Behaving in other ways that **threaten the well-being** of our community.
52
+
53
+ ### Other Restrictions
54
+
55
+ 1. **Misleading identity.** Impersonating someone else for any reason, or pretending to
56
+ be someone else to evade enforcement actions.
57
+ 2. **Failing to credit sources.** Not properly crediting the sources of content you
58
+ contribute.
59
+ 3. **Promotional materials.** Sharing marketing or other commercial content in a way
60
+ that is outside the norms of the community.
61
+ 4. **Irresponsible communication.** Failing to responsibly present content which
62
+ includes, links, or describes any other restricted behaviors.
63
+
64
+ ## Reporting an Issue
65
+
66
+ Tensions can occur between community members even when they are trying their best to
67
+ collaborate. Not every conflict represents a code of conduct violation, and this Code
68
+ of Conduct reinforces encouraged behaviors and norms that can help avoid conflicts and
69
+ minimize harm.
70
+
71
+ Report possible violations privately through the repository's
72
+ [private reporting channel](https://github.com/nidheesh-p/sahara/security/advisories/new).
73
+ Start the report title with `[Conduct]` and include relevant links, context, and any
74
+ supporting records. Do not report conduct incidents in a public issue.
75
+
76
+ The maintainer will take reports seriously and make every effort to respond promptly.
77
+ Reports will be investigated by reviewing available messages, logs, and other relevant
78
+ records. Investigation and enforcement actions will prioritize safety and
79
+ confidentiality.
80
+
81
+ ## Addressing and Repairing Harm
82
+
83
+ If an investigation finds that this Code of Conduct has been violated, the following
84
+ enforcement ladder may be used. Depending on the severity of a violation, lower steps
85
+ may be skipped.
86
+
87
+ 1. **Warning**
88
+ - Event: A violation involving a single incident or series of incidents.
89
+ - Consequence: A private, written warning from the maintainer.
90
+ - Repair: A private apology, acknowledgement of responsibility, or clarification of
91
+ expectations may be requested.
92
+ 2. **Temporarily Limited Activities**
93
+ - Event: A repeated violation after a warning, or the first occurrence of a more
94
+ serious violation.
95
+ - Consequence: A private warning with a time-limited cooldown period or restricted
96
+ participation.
97
+ - Repair: The person may be asked to reflect, apologize, and re-enter community
98
+ spaces thoughtfully.
99
+ 3. **Temporary Suspension**
100
+ - Event: A pattern of repeated violations or a single serious violation.
101
+ - Consequence: Temporary removal from community spaces with conditions for return.
102
+ - Repair: The person must respect the suspension and meet the stated conditions
103
+ before returning.
104
+ 4. **Permanent Ban**
105
+ - Event: Repeated violations that other steps have not resolved, or a violation so
106
+ serious that continued participation would endanger the community.
107
+ - Consequence: Permanent removal from community spaces and communication channels.
108
+ - Repair: There is no possible repair in cases of this severity.
109
+
110
+ This enforcement ladder is a guideline. The maintainer may use discretion and judgment
111
+ in the best interests of the community.
112
+
113
+ ## Scope
114
+
115
+ This Code of Conduct applies within all project community spaces and when an individual
116
+ is officially representing the project in public or other spaces. Examples include
117
+ posting through an official account or acting as an appointed representative at an
118
+ online or offline event.
119
+
120
+ ## Attribution
121
+
122
+ This Code of Conduct is adapted from the
123
+ [Contributor Covenant, version 3.0](https://www.contributor-covenant.org/version/3/0/).
124
+
125
+ Contributor Covenant is stewarded by the Organization for Ethical Source and licensed
126
+ under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).