sahara-memory 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sahara_memory-0.2.1/.gitignore +43 -0
- sahara_memory-0.2.1/.saharaignore.template +123 -0
- sahara_memory-0.2.1/ARCHITECTURE.md +256 -0
- sahara_memory-0.2.1/CHANGELOG.md +110 -0
- sahara_memory-0.2.1/CODE_OF_CONDUCT.md +126 -0
- sahara_memory-0.2.1/CONTRIBUTING.md +273 -0
- sahara_memory-0.2.1/LICENSE +21 -0
- sahara_memory-0.2.1/PKG-INFO +328 -0
- sahara_memory-0.2.1/README.md +271 -0
- sahara_memory-0.2.1/RELEASE_CHECKLIST.md +77 -0
- sahara_memory-0.2.1/ROADMAP.md +76 -0
- sahara_memory-0.2.1/SECURITY.md +124 -0
- sahara_memory-0.2.1/docs/ANSWER_PROVIDERS.md +180 -0
- sahara_memory-0.2.1/docs/CLAUDE_DESKTOP.md +276 -0
- sahara_memory-0.2.1/docs/COMMAND_REFERENCE.md +207 -0
- sahara_memory-0.2.1/docs/GETTING_STARTED.md +160 -0
- sahara_memory-0.2.1/docs/demo/README.md +34 -0
- sahara_memory-0.2.1/docs/demo/fixtures/Home/Repairs/dishwasher_invoice.md +6 -0
- sahara_memory-0.2.1/docs/demo/fixtures/Medical/prescription_receipt.md +8 -0
- sahara_memory-0.2.1/docs/demo/fixtures/Travel/portugal_itinerary.md +8 -0
- sahara_memory-0.2.1/docs/demo/sahara-demo.tape +54 -0
- sahara_memory-0.2.1/docs/images/sahara-mcp-social.png +0 -0
- sahara_memory-0.2.1/docs/images/sahara-memory-demo.svg +51 -0
- sahara_memory-0.2.1/docs/integrations/chat-agents.md +119 -0
- sahara_memory-0.2.1/pyproject.toml +119 -0
- sahara_memory-0.2.1/specs/THREE_STEP_PRODUCT_MODEL_PLAN.md +496 -0
- sahara_memory-0.2.1/src/sahara/__init__.py +4 -0
- sahara_memory-0.2.1/src/sahara/claude_desktop.py +238 -0
- sahara_memory-0.2.1/src/sahara/cli.py +2658 -0
- sahara_memory-0.2.1/src/sahara/config.py +266 -0
- sahara_memory-0.2.1/src/sahara/cost_estimator.py +5 -0
- sahara_memory-0.2.1/src/sahara/daemon.py +5 -0
- sahara_memory-0.2.1/src/sahara/encryption.py +5 -0
- sahara_memory-0.2.1/src/sahara/file_watcher.py +5 -0
- sahara_memory-0.2.1/src/sahara/ignore_rules.py +5 -0
- sahara_memory-0.2.1/src/sahara/library.py +224 -0
- sahara_memory-0.2.1/src/sahara/mcp_server.py +347 -0
- sahara_memory-0.2.1/src/sahara/models.py +173 -0
- sahara_memory-0.2.1/src/sahara/notifier.py +5 -0
- sahara_memory-0.2.1/src/sahara/s3_client.py +5 -0
- sahara_memory-0.2.1/src/sahara/search/__init__.py +10 -0
- sahara_memory-0.2.1/src/sahara/search/ask_engine.py +212 -0
- sahara_memory-0.2.1/src/sahara/search/search_engine.py +340 -0
- sahara_memory-0.2.1/src/sahara/search_engine.py +5 -0
- sahara_memory-0.2.1/src/sahara/state_db.py +5 -0
- sahara_memory-0.2.1/src/sahara/storage/__init__.py +25 -0
- sahara_memory-0.2.1/src/sahara/storage/backend.py +95 -0
- sahara_memory-0.2.1/src/sahara/storage/cost_estimator.py +271 -0
- sahara_memory-0.2.1/src/sahara/storage/dual_write_backend.py +175 -0
- sahara_memory-0.2.1/src/sahara/storage/lifecycle.py +204 -0
- sahara_memory-0.2.1/src/sahara/storage/local_drive_client.py +321 -0
- sahara_memory-0.2.1/src/sahara/storage/s3_client.py +701 -0
- sahara_memory-0.2.1/src/sahara/storage/state_db.py +1404 -0
- sahara_memory-0.2.1/src/sahara/sync/__init__.py +35 -0
- sahara_memory-0.2.1/src/sahara/sync/daemon.py +553 -0
- sahara_memory-0.2.1/src/sahara/sync/file_watcher.py +204 -0
- sahara_memory-0.2.1/src/sahara/sync/ignore_rules.py +84 -0
- sahara_memory-0.2.1/src/sahara/sync/sync_engine.py +1081 -0
- sahara_memory-0.2.1/src/sahara/sync_engine.py +5 -0
- sahara_memory-0.2.1/src/sahara/utils/__init__.py +37 -0
- sahara_memory-0.2.1/src/sahara/utils/encryption.py +307 -0
- sahara_memory-0.2.1/src/sahara/utils/hash.py +17 -0
- sahara_memory-0.2.1/src/sahara/utils/notifier.py +130 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.pyo
|
|
5
|
+
*.pyd
|
|
6
|
+
.Python
|
|
7
|
+
*.egg-info/
|
|
8
|
+
*.egg
|
|
9
|
+
.eggs/
|
|
10
|
+
build/
|
|
11
|
+
dist/
|
|
12
|
+
wheels/
|
|
13
|
+
.venv/
|
|
14
|
+
venv/
|
|
15
|
+
env/
|
|
16
|
+
|
|
17
|
+
# Testing / coverage
|
|
18
|
+
.coverage
|
|
19
|
+
.coverage.*
|
|
20
|
+
htmlcov/
|
|
21
|
+
.pytest_cache/
|
|
22
|
+
.tox/
|
|
23
|
+
|
|
24
|
+
# macOS
|
|
25
|
+
.DS_Store
|
|
26
|
+
.AppleDouble
|
|
27
|
+
._*
|
|
28
|
+
|
|
29
|
+
# IDE
|
|
30
|
+
.idea/
|
|
31
|
+
.vscode/
|
|
32
|
+
*.swp
|
|
33
|
+
*.swo
|
|
34
|
+
|
|
35
|
+
# Environment / secrets
|
|
36
|
+
.env
|
|
37
|
+
.env.local
|
|
38
|
+
.env.*.local
|
|
39
|
+
|
|
40
|
+
# Sahara runtime files (not source)
|
|
41
|
+
*.db
|
|
42
|
+
*.db-wal
|
|
43
|
+
*.db-shm
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# .saharaignore — Sahara ignore rules
|
|
2
|
+
#
|
|
3
|
+
# Syntax is identical to .gitignore (gitignore wildmatch patterns).
|
|
4
|
+
# Lines beginning with # are comments.
|
|
5
|
+
# Blank lines are ignored.
|
|
6
|
+
#
|
|
7
|
+
# Patterns are evaluated relative to the indexed content root.
|
|
8
|
+
#
|
|
9
|
+
# Examples:
|
|
10
|
+
# *.log — ignore all .log files anywhere
|
|
11
|
+
# /tmp/ — ignore the "tmp" folder directly under the content root
|
|
12
|
+
# build/ — ignore any folder named "build"
|
|
13
|
+
# docs/**/*.pdf — ignore all PDFs inside docs/ recursively
|
|
14
|
+
# !important.log — un-ignore important.log (exception to a broader rule)
|
|
15
|
+
#
|
|
16
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
17
|
+
# Version control
|
|
18
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
19
|
+
.git/
|
|
20
|
+
.hg/
|
|
21
|
+
.svn/
|
|
22
|
+
|
|
23
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
24
|
+
# Python
|
|
25
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
26
|
+
__pycache__/
|
|
27
|
+
*.py[cod]
|
|
28
|
+
*.pyo
|
|
29
|
+
.venv/
|
|
30
|
+
venv/
|
|
31
|
+
.env/
|
|
32
|
+
env/
|
|
33
|
+
*.egg-info/
|
|
34
|
+
dist/
|
|
35
|
+
build/
|
|
36
|
+
.mypy_cache/
|
|
37
|
+
.ruff_cache/
|
|
38
|
+
.pytest_cache/
|
|
39
|
+
|
|
40
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
41
|
+
# JavaScript / Node
|
|
42
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
43
|
+
node_modules/
|
|
44
|
+
.npm/
|
|
45
|
+
.yarn/
|
|
46
|
+
.pnp.*
|
|
47
|
+
|
|
48
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
49
|
+
# macOS
|
|
50
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
51
|
+
.DS_Store
|
|
52
|
+
.AppleDouble
|
|
53
|
+
.LSOverride
|
|
54
|
+
._*
|
|
55
|
+
.Spotlight-V100
|
|
56
|
+
.Trashes
|
|
57
|
+
Icon
|
|
58
|
+
|
|
59
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
60
|
+
# Windows
|
|
61
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
62
|
+
Thumbs.db
|
|
63
|
+
Desktop.ini
|
|
64
|
+
ehthumbs.db
|
|
65
|
+
$RECYCLE.BIN/
|
|
66
|
+
|
|
67
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
68
|
+
# Linux
|
|
69
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
70
|
+
*~
|
|
71
|
+
.directory
|
|
72
|
+
|
|
73
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
74
|
+
# Editors
|
|
75
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
76
|
+
.idea/
|
|
77
|
+
.vscode/
|
|
78
|
+
*.swp
|
|
79
|
+
*.swo
|
|
80
|
+
*.bak
|
|
81
|
+
*.orig
|
|
82
|
+
*.tmp
|
|
83
|
+
*_conflict-*
|
|
84
|
+
|
|
85
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
86
|
+
# Logs and databases
|
|
87
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
88
|
+
*.log
|
|
89
|
+
*.sqlite
|
|
90
|
+
*.sqlite3
|
|
91
|
+
*.db
|
|
92
|
+
|
|
93
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
94
|
+
# Archives (included by default; uncomment the patterns below to ignore them)
|
|
95
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
96
|
+
# *.zip
|
|
97
|
+
# *.tar
|
|
98
|
+
# *.tar.gz
|
|
99
|
+
# *.tar.bz2
|
|
100
|
+
# *.7z
|
|
101
|
+
# *.rar
|
|
102
|
+
|
|
103
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
104
|
+
# Secrets (HIGHLY RECOMMENDED to keep these ignored)
|
|
105
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
106
|
+
.env
|
|
107
|
+
.env.*
|
|
108
|
+
*.pem
|
|
109
|
+
*.key
|
|
110
|
+
*.p12
|
|
111
|
+
*.pfx
|
|
112
|
+
secrets/
|
|
113
|
+
credentials/
|
|
114
|
+
|
|
115
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
116
|
+
# Sahara internals (do not remove)
|
|
117
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
118
|
+
.sahara/
|
|
119
|
+
.saharaignore
|
|
120
|
+
|
|
121
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
122
|
+
# Add your custom rules below this line
|
|
123
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
# Sahara — Architecture
|
|
2
|
+
|
|
3
|
+
This document explains how Sahara is structured so contributors can find their way around quickly and extend the system without touching unrelated code.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## System overview
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
┌─────────────┐ ┌────────────────┐ ┌──────────────────────┐
|
|
11
|
+
│ CLI (click) │──▶│ IndexingService│──▶│ SearchEngine │
|
|
12
|
+
│ cli.py │ │ library.py │ │ fastembed + vec │
|
|
13
|
+
└─────────────┘ └────────────────┘ └──────────────────────┘
|
|
14
|
+
│ │ │
|
|
15
|
+
│ ▼ ▼
|
|
16
|
+
│ ┌──────────┐ AskEngine
|
|
17
|
+
│ │ StateDB │ Ollama/OpenAI
|
|
18
|
+
│ └──────────┘
|
|
19
|
+
│ ▲
|
|
20
|
+
▼ │
|
|
21
|
+
┌─────────────┐ ┌──────────────────────┐
|
|
22
|
+
│ SyncEngine │──▶│ Optional StorageBackend│
|
|
23
|
+
└─────────────┘ │ S3 / LocalDrive │
|
|
24
|
+
└──────────────────────┘
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
The CLI is the only public surface. Everything else is an internal library that the CLI composes.
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Source layout
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
src/sahara/
|
|
35
|
+
├── cli.py # All Click commands — the public API
|
|
36
|
+
├── config.py # SaharaConfig dataclass + TOML I/O
|
|
37
|
+
├── library.py # Content-root migration and local indexing service
|
|
38
|
+
├── models.py # FileRecord, SyncOperation, ManifestEntry, ...
|
|
39
|
+
│
|
|
40
|
+
├── storage/
|
|
41
|
+
│ ├── backend.py # StorageBackend Protocol (the interface)
|
|
42
|
+
│ ├── s3_client.py # AWS S3 + MinIO implementation
|
|
43
|
+
│ ├── local_drive_client.py # Local filesystem implementation
|
|
44
|
+
│ ├── dual_write_backend.py # local+glacier dual-write wrapper
|
|
45
|
+
│ ├── state_db.py # SQLite state — files, history, chunks, ...
|
|
46
|
+
│ └── cost_estimator.py # S3 pricing estimates
|
|
47
|
+
│
|
|
48
|
+
├── sync/
|
|
49
|
+
│ ├── sync_engine.py # Three-way diff, conflict resolution, execution
|
|
50
|
+
│ ├── daemon.py # Background sync loop
|
|
51
|
+
│ ├── file_watcher.py # watchdog integration
|
|
52
|
+
│ └── ignore_rules.py # .saharaignore parser
|
|
53
|
+
│
|
|
54
|
+
├── search/
|
|
55
|
+
│ ├── search_engine.py # Text extraction, chunking, embedding, sqlite-vec KNN
|
|
56
|
+
│ └── ask_engine.py # LLM answer generation (ollama / OpenAI)
|
|
57
|
+
│
|
|
58
|
+
└── utils/
|
|
59
|
+
├── encryption.py # AES-256-GCM, PBKDF2, keyring
|
|
60
|
+
├── hash.py # SHA-256 helpers (shared between sync and search)
|
|
61
|
+
└── notifier.py # OS desktop notification
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## Storage backends
|
|
67
|
+
|
|
68
|
+
### The `StorageBackend` Protocol
|
|
69
|
+
|
|
70
|
+
`src/sahara/storage/backend.py` defines the `StorageBackend` Protocol. Every backend must implement these methods:
|
|
71
|
+
|
|
72
|
+
| Method | Purpose |
|
|
73
|
+
|--------|---------|
|
|
74
|
+
| `upload_file` | Upload a local file, optionally encrypting it first |
|
|
75
|
+
| `download_file` | Download a key to a local path, optionally decrypting |
|
|
76
|
+
| `delete_object` | Delete a key |
|
|
77
|
+
| `copy_object` | Copy within the same backend (rename path) |
|
|
78
|
+
| `get_manifest` / `put_manifest` | Fetch / write the Sahara manifest atomically |
|
|
79
|
+
| `list_all_objects` | Bootstrap when no manifest exists yet |
|
|
80
|
+
| `head_object` | Return metadata (size, etag, storage class) |
|
|
81
|
+
| `validate_bucket_access` | Connectivity check |
|
|
82
|
+
| `check_conditional_put_support` | Whether atomic manifest writes are supported |
|
|
83
|
+
| `restore_object` | Glacier restore (S3 only; raise if unsupported) |
|
|
84
|
+
|
|
85
|
+
`SyncEngine` accepts any `StorageBackend` — it never imports a concrete backend class directly.
|
|
86
|
+
|
|
87
|
+
### Adding a new backend
|
|
88
|
+
|
|
89
|
+
1. Create `src/sahara/storage/mybackend_client.py`
|
|
90
|
+
2. Implement all methods in the `StorageBackend` Protocol (use `LocalDriveClient` as the simplest reference)
|
|
91
|
+
3. Add an `isinstance` check in `cli.py` where the backend is instantiated (search for `storage_mode`)
|
|
92
|
+
4. Add tests in `tests/test_mybackend.py` — mock the external service, do not require real network access
|
|
93
|
+
|
|
94
|
+
### Current backends
|
|
95
|
+
|
|
96
|
+
| Class | Module | Description |
|
|
97
|
+
|-------|--------|-------------|
|
|
98
|
+
| `S3Client` | `storage/s3_client.py` | AWS S3 and MinIO (via `endpoint_url`) |
|
|
99
|
+
| `LocalDriveClient` | `storage/local_drive_client.py` | Local filesystem or network mount |
|
|
100
|
+
| `DualWriteBackend` | `storage/dual_write_backend.py` | Writes to two backends simultaneously (local + glacier) |
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
## Sync pipeline
|
|
105
|
+
|
|
106
|
+
The sync pipeline lives in `sync/sync_engine.py`. The sequence for a full sync:
|
|
107
|
+
|
|
108
|
+
```
|
|
109
|
+
1. Load manifest from storage (single JSON object — avoids per-file HeadObject calls)
|
|
110
|
+
2. Scan local folder → build local snapshot {path → sha256}
|
|
111
|
+
3. Load last-known-good state from StateDB
|
|
112
|
+
4. Three-way diff(local, remote_manifest, last_known_good):
|
|
113
|
+
- New local file → upload
|
|
114
|
+
- Deleted locally → delete from remote (or skip if remote was also changed = conflict)
|
|
115
|
+
- New remote file → download
|
|
116
|
+
- Deleted remotely → delete locally
|
|
117
|
+
- Both modified → conflict
|
|
118
|
+
5. For each operation: execute in thread pool (max_workers parallel)
|
|
119
|
+
6. Write updated manifest back to storage (atomic via If-Match ETag check)
|
|
120
|
+
7. Update StateDB with new sync state
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Why the manifest?
|
|
124
|
+
|
|
125
|
+
Without the manifest, every sync would need to call `HeadObject` on every file in the bucket to check its current state — at $0.0004 per 1,000 calls and 50k files, that is $0.02 per sync, or roughly $7/month at about a dozen syncs per day. The manifest is a single JSON blob stored at `.sahara/manifest.json` in the bucket. One `GetObject` replaces thousands of `HeadObject` calls.
|
|
126
|
+
|
|
127
|
+
### Conflict resolution
|
|
128
|
+
|
|
129
|
+
Conflict strategy is set in config (`backup` / `local` / `remote` / `ask`). The `backup` strategy (default) renames the local copy to `filename.conflict-TIMESTAMP.ext` and downloads the remote version — no data loss.
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## Search pipeline
|
|
134
|
+
|
|
135
|
+
The search pipeline runs entirely locally. `library.py` scans every registered content
|
|
136
|
+
root directly; it does not depend on sync records or a storage backend.
|
|
137
|
+
|
|
138
|
+
```
|
|
139
|
+
1. IndexingService.index():
|
|
140
|
+
a. Load content roots from StateDB
|
|
141
|
+
b. Walk each root with .saharaignore rules
|
|
142
|
+
c. Maintain index_entries inventory and detect missing files
|
|
143
|
+
d. Call SearchEngine for supported local files
|
|
144
|
+
|
|
145
|
+
2. SearchEngine.index_file(path):
|
|
146
|
+
a. Extract text (TextExtractor) — supports .txt, .md, .py, .pdf, .docx, and plain-text heuristic
|
|
147
|
+
b. Chunk text: 1600-char chunks with 320-char overlap
|
|
148
|
+
c. Embed each chunk independently with BAAI/bge-small-en-v1.5 (384-dim) via fastembed
|
|
149
|
+
d. Insert rows into `chunks` table and `vec_chunks` virtual table (sqlite-vec)
|
|
150
|
+
|
|
151
|
+
3. search(query):
|
|
152
|
+
a. Embed the query string
|
|
153
|
+
b. KNN query against vec_chunks (executed natively by sqlite-vec, not a Python cosine loop)
|
|
154
|
+
c. Join against `chunks` to get file paths and snippet text
|
|
155
|
+
d. Deduplicate: keep best chunk score per file
|
|
156
|
+
e. Return ranked list of {relative_path, score, snippet}
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Why chunked indexing?
|
|
160
|
+
|
|
161
|
+
A 50-page PDF has ~25,000 words. Embedding the whole document as one vector would mean the embedding averages over all content, making any specific detail on page 30 nearly unretrievable. By splitting into chunks of roughly 400 tokens (1,600 characters) with roughly 80 tokens (320 characters) of overlap, each chunk can be matched independently, so a query about page 30 will find the right chunk.
|
|
162
|
+
|
|
163
|
+
### Adding a new file parser
|
|
164
|
+
|
|
165
|
+
`TextExtractor.extract()` in `search/search_engine.py` dispatches on file extension. Add a new `elif suffix == ".xyz"` branch there. For heavier parsers (OCR, audio transcription) consider wrapping the import in a `try/except ImportError` so the base install does not require the dependency.
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## Ask pipeline
|
|
170
|
+
|
|
171
|
+
`search/ask_engine.py` wraps `SearchEngine` with an LLM layer.
|
|
172
|
+
|
|
173
|
+
```
|
|
174
|
+
1. Run search(question, top_k)
|
|
175
|
+
2. Build context string from top chunk texts (capped at 6,000 chars)
|
|
176
|
+
3. Try LLM in priority order:
|
|
177
|
+
a. OpenAI if OPENAI_API_KEY is set
|
|
178
|
+
b. Ollama at http://localhost:11434 if reachable
|
|
179
|
+
c. Degrade: return search results with snippets, no generated answer
|
|
180
|
+
4. Return AskResult(answer, sources, degraded, model_used)
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
Degraded mode is intentional — `sahara ask` is useful even without any LLM installed, because the ranked snippets alone often answer the question at a glance.
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## Daemon and file watcher
|
|
188
|
+
|
|
189
|
+
`sync/daemon.py` runs a background loop that calls `SyncEngine.sync()` on a configurable interval. `sync/file_watcher.py` wraps watchdog's `Observer` and triggers an immediate partial sync when specific files change, rather than waiting for the interval.
|
|
190
|
+
|
|
191
|
+
The daemon writes a PID file to `~/.sahara/daemon.pid` and logs to `~/.sahara/daemon.log`. The CLI's `sahara daemon start/stop/status` commands manage it.
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## SQLite schema
|
|
196
|
+
|
|
197
|
+
All state is stored in `~/.sahara/state.db`. WAL mode is enabled on every connection for safe concurrent reads.
|
|
198
|
+
|
|
199
|
+
| Table | Purpose |
|
|
200
|
+
|-------|---------|
|
|
201
|
+
| `files` | One row per synced file — sha256, size, tier, timestamps, is_deleted |
|
|
202
|
+
| `history` | Append-only log of every sync operation |
|
|
203
|
+
| `pending_multipart` | In-flight multipart upload state (crash recovery) |
|
|
204
|
+
| `sync_targets` | Registered (local_path, s3_prefix) pairs |
|
|
205
|
+
| `content_roots` | Canonical indexed folders with primary and sync-enabled flags |
|
|
206
|
+
| `index_entries` | Local indexing inventory and indexed/unsupported/missing status |
|
|
207
|
+
| `storage_residency` | Explicit present/offloaded/missing state for stored files |
|
|
208
|
+
| `config_kv` | Key-value store for runtime config values |
|
|
209
|
+
| `embeddings` | Legacy single-vector-per-file index (superseded by `chunks`) |
|
|
210
|
+
| `chunks` | One row per text chunk — path, chunk_index, content_hash, chunk_text |
|
|
211
|
+
| `vec_chunks` | sqlite-vec virtual table — one float[384] vector per chunk (rowid matches `chunks.id`) |
|
|
212
|
+
|
|
213
|
+
The `chunks` and `vec_chunks` tables work as a pair. `vec_chunks` stores the raw vectors; `chunks` stores the text and metadata. A JOIN on `rowid = id` links them.
|
|
214
|
+
|
|
215
|
+
### Offload lifecycle
|
|
216
|
+
|
|
217
|
+
`StorageLifecycle.offload()` requires a synced, indexed file. It downloads the stored
|
|
218
|
+
object to temporary storage, decrypts it when needed, verifies the plaintext SHA-256,
|
|
219
|
+
marks the file offloaded, and then removes the local source. Chunks and embeddings are
|
|
220
|
+
retained. `fetch()` downloads atomically, verifies the same checksum, and marks the file
|
|
221
|
+
present again. Sync ignores intentional offloads so they cannot be mistaken for local
|
|
222
|
+
deletions.
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## Configuration
|
|
227
|
+
|
|
228
|
+
Config lives at `~/.sahara/config.toml`. `storage_mode = "none"` is the fresh-install
|
|
229
|
+
default. Existing configuration files that predate `storage_mode` are loaded as S3
|
|
230
|
+
configurations for compatibility. The CLI reads configuration at startup and passes a
|
|
231
|
+
snapshot down to each subsystem.
|
|
232
|
+
|
|
233
|
+
The TOML format is stable and user-editable. Do not add auto-generated comments or machine-managed sections to the config file.
|
|
234
|
+
|
|
235
|
+
---
|
|
236
|
+
|
|
237
|
+
## Known limitations
|
|
238
|
+
|
|
239
|
+
- **No reranker yet.** Results from sqlite-vec KNN are re-sorted by score but not re-ranked by a cross-encoder. Precision is good but not state-of-the-art for ambiguous queries.
|
|
240
|
+
- **Single embedding model.** Only `BAAI/bge-small-en-v1.5` (384-dim) is supported. Switching models requires re-indexing all files.
|
|
241
|
+
- **No incremental scanning.** `sahara index` re-scans the whole collection on every run. Content-hash tracking means only changed files are re-embedded, but the change check itself is still O(n) on the files table.
|
|
242
|
+
- **Single-user only.** The manifest + SQLite architecture assumes one writer at a time. Multiple machines syncing to the same bucket will serialize through the manifest ETag check.
|
|
243
|
+
|
|
244
|
+
---
|
|
245
|
+
|
|
246
|
+
## Where to start
|
|
247
|
+
|
|
248
|
+
| Contribution area | Start here |
|
|
249
|
+
|-------------------|-----------|
|
|
250
|
+
| New storage backend | `storage/backend.py` (Protocol) → `storage/local_drive_client.py` (simplest impl) |
|
|
251
|
+
| New file parser | `search/search_engine.py` `TextExtractor.extract()` |
|
|
252
|
+
| Improve search ranking | `search/search_engine.py` `SearchEngine.search()` |
|
|
253
|
+
| New CLI command | `cli.py` — add a `@main.command()` function |
|
|
254
|
+
| Sync bug | `sync/sync_engine.py` `DiffResult` and `_execute_operations()` |
|
|
255
|
+
| Daemon / watcher | `sync/daemon.py`, `sync/file_watcher.py` |
|
|
256
|
+
| Encryption | `utils/encryption.py` |
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to Sahara are documented here.
|
|
4
|
+
|
|
5
|
+
Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). Sahara uses [Semantic Versioning](https://semver.org/).
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## [Unreleased]
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## [0.2.1] — 2026-06-07
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
|
|
17
|
+
- Trusted Publishing workflows for verified TestPyPI and PyPI releases
|
|
18
|
+
- Contributor Covenant 3.0 code of conduct with private incident reporting
|
|
19
|
+
- Claude Desktop launch guide with platform configuration, exact MCP tool contracts,
|
|
20
|
+
security boundaries, verification, and troubleshooting
|
|
21
|
+
- Three-step product plan for basic indexing with optional local-drive or AWS storage
|
|
22
|
+
- Basic index-only mode with non-interactive `sahara init --mode basic --folder <path>`
|
|
23
|
+
- Canonical content-root and index-inventory database tables
|
|
24
|
+
- `sahara folder add/list/remove/sync` commands for index and sync scope management
|
|
25
|
+
- `sahara storage configure local/aws` for upgrading an existing basic library
|
|
26
|
+
- Checksum-verified `sahara offload` and `sahara fetch` with retained search metadata
|
|
27
|
+
- Explicit storage residency in CLI search/list/status and MCP results
|
|
28
|
+
- Local indexing that scans content roots without requiring sync records or storage
|
|
29
|
+
- `sahara mcp install-claude` for merge-safe, one-command Claude Desktop setup on
|
|
30
|
+
macOS and Windows
|
|
31
|
+
|
|
32
|
+
### Changed
|
|
33
|
+
|
|
34
|
+
- Restored full mypy checking for the daemon and filesystem watcher
|
|
35
|
+
- Renamed the Python distribution from `sahara` to `sahara-memory` to avoid the
|
|
36
|
+
unrelated OpenStack Sahara project on PyPI; the product name, `sahara` CLI,
|
|
37
|
+
and `sahara` import package are unchanged
|
|
38
|
+
- First-time indexing now explains the local embedding-model download and clarifies
|
|
39
|
+
that Hugging Face authentication warnings do not require user action
|
|
40
|
+
- Package and license metadata now identify Nidheesh Puthalath as the maintainer
|
|
41
|
+
- README quick start now demonstrates both CLI retrieval and cited Claude Desktop use
|
|
42
|
+
- Added fictional, privacy-safe README, social, and reproducible terminal demo assets
|
|
43
|
+
- Ollama is the initial answer provider; OpenAI can be selected explicitly or saved
|
|
44
|
+
as the user's default without installing Ollama
|
|
45
|
+
- Added first-run Ollama and optional OpenAI setup guidance
|
|
46
|
+
- Streamlined the README around local search first, with answers, MCP, and storage
|
|
47
|
+
introduced as optional extensions
|
|
48
|
+
- Added a categorized reference covering every CLI command
|
|
49
|
+
- Documentation consolidated around current user, contributor, release, and architecture
|
|
50
|
+
guidance; superseded specifications remain available through Git history
|
|
51
|
+
- Fresh installations default to local indexing; legacy configs without `storage_mode`
|
|
52
|
+
retain their previous S3 behavior
|
|
53
|
+
- `index-report` now reads the local index inventory rather than the sync file table
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## [0.2.0] — 2026-06-06
|
|
58
|
+
|
|
59
|
+
### Added
|
|
60
|
+
|
|
61
|
+
- **Semantic search** — `sahara index` extracts and embeds file content; `sahara search <query>` retrieves files by meaning using sqlite-vec KNN
|
|
62
|
+
- **Chunked indexing** — long documents (PDFs, DOCX) are split into overlapping 400-token chunks so content past the first page is retrievable
|
|
63
|
+
- **`sahara ask`** — natural language question answering; uses local Ollama or OpenAI when available, degrades gracefully to ranked snippets
|
|
64
|
+
- **MinIO backend** — S3-compatible self-hosted storage via `endpoint_url` configuration
|
|
65
|
+
- **Local drive backend** — sync to a second local drive or NAS with no cloud account required
|
|
66
|
+
- **`local+glacier` dual-write mode** — writes to a local drive and S3 Glacier simultaneously
|
|
67
|
+
- **`StorageBackend` Protocol** — formal structural interface for all storage backends; `SyncEngine` no longer imports concrete backend classes
|
|
68
|
+
- **`BAAI/bge-small-en-v1.5` embedding model** — 384-dim vectors via `fastembed`; fast enough for CPU-only indexing
|
|
69
|
+
- **PDF and DOCX extraction** — `pypdf` and `python-docx` are optional dependencies under `[search]`
|
|
70
|
+
- **`sahara doctor --repair`** — diagnose and auto-fix common configuration problems
|
|
71
|
+
- **SHA-256 utility** — shared `utils/hash.py` used by both sync and search (previously duplicated)
|
|
72
|
+
- **Read-only MCP server** — exposes search, ask, chunk reads, folder listing, and index status to Claude Desktop and other MCP clients
|
|
73
|
+
- **Authenticated remote MCP** — HTTP/streamable transport with bearer-token protection for secure tunnel and remote-client workflows
|
|
74
|
+
- **MCP exposure controls** — tool and storage-prefix allowlists, snippet-size limits, and warnings for non-loopback bindings
|
|
75
|
+
- **`sahara index-report`** — reports indexed/unindexed file counts, skip reasons, and sample indexing gaps
|
|
76
|
+
- **MIT license file** — included in the repository, wheel metadata, and source distribution
|
|
77
|
+
|
|
78
|
+
### Changed
|
|
79
|
+
|
|
80
|
+
- Public positioning updated to "Sahara: extended storage, searchable memory and instant retrieval"
|
|
81
|
+
- `_require_config` guard: local drive mode no longer requires a bucket to be configured
|
|
82
|
+
- Storage modules reorganised into `src/sahara/storage/`, sync modules into `src/sahara/sync/`
|
|
83
|
+
- Indexing skips unsupported binary media instead of attempting noisy text extraction
|
|
84
|
+
|
|
85
|
+
### Fixed
|
|
86
|
+
|
|
87
|
+
- Manifest locking race condition under concurrent syncs
|
|
88
|
+
- False abort in local drive mode due to missing bucket check
|
|
89
|
+
- Optional MCP dependency tests now skip cleanly when the `[mcp]` extra is not installed
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## [0.1.0] — 2024-03-16
|
|
94
|
+
|
|
95
|
+
### Added
|
|
96
|
+
|
|
97
|
+
- **Bidirectional sync** to AWS S3 with three-way diff (local / remote / last-known-good)
|
|
98
|
+
- **Client-side AES-256-GCM encryption** with PBKDF2-HMAC-SHA256 key derivation (600,000 iterations)
|
|
99
|
+
- **Glacier archiving** — `sahara archive`, `sahara restore`, `sahara restore-download`
|
|
100
|
+
- **Background daemon** with file-watching via watchdog
|
|
101
|
+
- **Rename detection** — moves are tracked as copy + delete rather than delete + upload
|
|
102
|
+
- **Conflict resolution** — backup, local, remote, and ask strategies
|
|
103
|
+
- **Cost reporting** — `sahara usage` shows storage usage and estimated monthly S3 cost
|
|
104
|
+
- **`.saharaignore`** — gitignore-style rules for excluding files from sync
|
|
105
|
+
- **Multipart uploads** — automatic for files above a configurable threshold
|
|
106
|
+
- **`sahara doctor`** — connectivity and configuration diagnostics
|
|
107
|
+
- `sahara init` interactive setup wizard
|
|
108
|
+
- `sahara config show/get/set` configuration management
|
|
109
|
+
- `sahara history` sync operation log
|
|
110
|
+
- `sahara conflicts` and `sahara resolve`
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# Contributor Covenant Code of Conduct
|
|
2
|
+
|
|
3
|
+
## Our Pledge
|
|
4
|
+
|
|
5
|
+
We pledge to make our community welcoming, safe, and equitable for all.
|
|
6
|
+
|
|
7
|
+
We are committed to fostering an environment that respects and promotes the dignity,
|
|
8
|
+
rights, and contributions of all individuals, regardless of characteristics including
|
|
9
|
+
race, ethnicity, caste, color, age, physical characteristics, neurodiversity,
|
|
10
|
+
disability, sex or gender, gender identity or expression, sexual orientation, language,
|
|
11
|
+
philosophy or religion, national or social origin, socio-economic position, level of
|
|
12
|
+
education, or other status. The same privileges of participation are extended to
|
|
13
|
+
everyone who participates in good faith and in accordance with this Covenant.
|
|
14
|
+
|
|
15
|
+
## Encouraged Behaviors
|
|
16
|
+
|
|
17
|
+
While acknowledging differences in social norms, we all strive to meet our community's
|
|
18
|
+
expectations for positive behavior. We also understand that our words and actions may
|
|
19
|
+
be interpreted differently than we intend based on culture, background, or native
|
|
20
|
+
language.
|
|
21
|
+
|
|
22
|
+
With these considerations in mind, we agree to behave mindfully toward each other and
|
|
23
|
+
act in ways that center our shared values, including:
|
|
24
|
+
|
|
25
|
+
1. Respecting the **purpose of our community**, our activities, and our ways of
|
|
26
|
+
gathering.
|
|
27
|
+
2. Engaging **kindly and honestly** with others.
|
|
28
|
+
3. Respecting **different viewpoints** and experiences.
|
|
29
|
+
4. **Taking responsibility** for our actions and contributions.
|
|
30
|
+
5. Gracefully giving and accepting **constructive feedback**.
|
|
31
|
+
6. Committing to **repairing harm** when it occurs.
|
|
32
|
+
7. Behaving in other ways that promote and sustain the **well-being of our community**.
|
|
33
|
+
|
|
34
|
+
## Restricted Behaviors
|
|
35
|
+
|
|
36
|
+
We agree to restrict the following behaviors in our community. Instances, threats, and
|
|
37
|
+
promotion of these behaviors are violations of this Code of Conduct.
|
|
38
|
+
|
|
39
|
+
1. **Harassment.** Violating explicitly expressed boundaries or engaging in unnecessary
|
|
40
|
+
personal attention after any clear request to stop.
|
|
41
|
+
2. **Character attacks.** Making insulting, demeaning, or pejorative comments directed
|
|
42
|
+
at a community member or group of people.
|
|
43
|
+
3. **Stereotyping or discrimination.** Characterizing anyone's personality or behavior
|
|
44
|
+
on the basis of immutable identities or traits.
|
|
45
|
+
4. **Sexualization.** Behaving in a way that would generally be considered
|
|
46
|
+
inappropriately intimate in the context or purpose of the community.
|
|
47
|
+
5. **Violating confidentiality.** Sharing or acting on someone's personal or private
|
|
48
|
+
information without their permission.
|
|
49
|
+
6. **Endangerment.** Causing, encouraging, or threatening violence or other harm toward
|
|
50
|
+
any person or group.
|
|
51
|
+
7. Behaving in other ways that **threaten the well-being** of our community.
|
|
52
|
+
|
|
53
|
+
### Other Restrictions
|
|
54
|
+
|
|
55
|
+
1. **Misleading identity.** Impersonating someone else for any reason, or pretending to
|
|
56
|
+
be someone else to evade enforcement actions.
|
|
57
|
+
2. **Failing to credit sources.** Not properly crediting the sources of content you
|
|
58
|
+
contribute.
|
|
59
|
+
3. **Promotional materials.** Sharing marketing or other commercial content in a way
|
|
60
|
+
that is outside the norms of the community.
|
|
61
|
+
4. **Irresponsible communication.** Failing to responsibly present content which
|
|
62
|
+
includes, links to, or describes any other restricted behaviors.
|
|
63
|
+
|
|
64
|
+
## Reporting an Issue
|
|
65
|
+
|
|
66
|
+
Tensions can occur between community members even when they are trying their best to
|
|
67
|
+
collaborate. Not every conflict represents a code of conduct violation, and this Code
|
|
68
|
+
of Conduct reinforces encouraged behaviors and norms that can help avoid conflicts and
|
|
69
|
+
minimize harm.
|
|
70
|
+
|
|
71
|
+
Report possible violations privately through the repository's
|
|
72
|
+
[private reporting channel](https://github.com/nidheesh-p/sahara/security/advisories/new).
|
|
73
|
+
Start the report title with `[Conduct]` and include relevant links, context, and any
|
|
74
|
+
supporting records. Do not report conduct incidents in a public issue.
|
|
75
|
+
|
|
76
|
+
The maintainer will take reports seriously and make every effort to respond promptly.
|
|
77
|
+
Reports will be investigated by reviewing available messages, logs, and other relevant
|
|
78
|
+
records. Investigation and enforcement actions will prioritize safety and
|
|
79
|
+
confidentiality.
|
|
80
|
+
|
|
81
|
+
## Addressing and Repairing Harm
|
|
82
|
+
|
|
83
|
+
If an investigation finds that this Code of Conduct has been violated, the following
|
|
84
|
+
enforcement ladder may be used. Depending on the severity of a violation, lower steps
|
|
85
|
+
may be skipped.
|
|
86
|
+
|
|
87
|
+
1. **Warning**
|
|
88
|
+
- Event: A violation involving a single incident or series of incidents.
|
|
89
|
+
- Consequence: A private, written warning from the maintainer.
|
|
90
|
+
- Repair: A private apology, acknowledgement of responsibility, or clarification of
|
|
91
|
+
expectations may be requested.
|
|
92
|
+
2. **Temporarily Limited Activities**
|
|
93
|
+
- Event: A repeated violation after a warning, or the first occurrence of a more
|
|
94
|
+
serious violation.
|
|
95
|
+
- Consequence: A private warning with a time-limited cooldown period or restricted
|
|
96
|
+
participation.
|
|
97
|
+
- Repair: The person may be asked to reflect, apologize, and re-enter community
|
|
98
|
+
spaces thoughtfully.
|
|
99
|
+
3. **Temporary Suspension**
|
|
100
|
+
- Event: A pattern of repeated violations or a single serious violation.
|
|
101
|
+
- Consequence: Temporary removal from community spaces with conditions for return.
|
|
102
|
+
- Repair: The person must respect the suspension and meet the stated conditions
|
|
103
|
+
before returning.
|
|
104
|
+
4. **Permanent Ban**
|
|
105
|
+
- Event: Repeated violations that other steps have not resolved, or a violation so
|
|
106
|
+
serious that continued participation would endanger the community.
|
|
107
|
+
- Consequence: Permanent removal from community spaces and communication channels.
|
|
108
|
+
- Repair: There is no possible repair in cases of this severity.
|
|
109
|
+
|
|
110
|
+
This enforcement ladder is a guideline. The maintainer may use discretion and judgment
|
|
111
|
+
in the best interests of the community.
|
|
112
|
+
|
|
113
|
+
## Scope
|
|
114
|
+
|
|
115
|
+
This Code of Conduct applies within all project community spaces and when an individual
|
|
116
|
+
is officially representing the project in public or other spaces. Examples include
|
|
117
|
+
posting through an official account or acting as an appointed representative at an
|
|
118
|
+
online or offline event.
|
|
119
|
+
|
|
120
|
+
## Attribution
|
|
121
|
+
|
|
122
|
+
This Code of Conduct is adapted from the
|
|
123
|
+
[Contributor Covenant, version 3.0](https://www.contributor-covenant.org/version/3/0/).
|
|
124
|
+
|
|
125
|
+
Contributor Covenant is stewarded by the Organization for Ethical Source and licensed
|
|
126
|
+
under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).
|