okb 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. okb-1.1.0/PKG-INFO +448 -0
  2. okb-1.1.0/README.md +398 -0
  3. okb-1.1.0/okb/__init__.py +3 -0
  4. okb-1.1.0/okb/cli.py +2465 -0
  5. okb-1.1.0/okb/config.py +779 -0
  6. okb-1.1.0/okb/data/init.sql +92 -0
  7. okb-1.1.0/okb/http_server.py +669 -0
  8. okb-1.1.0/okb/ingest.py +1589 -0
  9. okb-1.1.0/okb/llm/__init__.py +86 -0
  10. okb-1.1.0/okb/llm/analyze.py +524 -0
  11. okb-1.1.0/okb/llm/base.py +83 -0
  12. okb-1.1.0/okb/llm/cache.py +217 -0
  13. okb-1.1.0/okb/llm/consolidate.py +685 -0
  14. okb-1.1.0/okb/llm/enrich.py +723 -0
  15. okb-1.1.0/okb/llm/extractors/__init__.py +13 -0
  16. okb-1.1.0/okb/llm/extractors/base.py +44 -0
  17. okb-1.1.0/okb/llm/extractors/cross_doc.py +478 -0
  18. okb-1.1.0/okb/llm/extractors/dedup.py +499 -0
  19. okb-1.1.0/okb/llm/extractors/entity.py +369 -0
  20. okb-1.1.0/okb/llm/extractors/todo.py +149 -0
  21. okb-1.1.0/okb/llm/filter.py +187 -0
  22. okb-1.1.0/okb/llm/providers.py +325 -0
  23. okb-1.1.0/okb/local_embedder.py +87 -0
  24. okb-1.1.0/okb/mcp_server.py +2660 -0
  25. okb-1.1.0/okb/migrate.py +53 -0
  26. okb-1.1.0/okb/migrations/0001.initial-schema.sql +91 -0
  27. okb-1.1.0/okb/migrations/0002.sync-state.sql +22 -0
  28. okb-1.1.0/okb/migrations/0003.structured-fields.sql +22 -0
  29. okb-1.1.0/okb/migrations/0004.tokens.sql +13 -0
  30. okb-1.1.0/okb/migrations/0005.database-metadata.sql +19 -0
  31. okb-1.1.0/okb/migrations/0006.llm-cache.sql +13 -0
  32. okb-1.1.0/okb/migrations/0008.enrichment.sql +46 -0
  33. okb-1.1.0/okb/migrations/0009.entity-consolidation.sql +120 -0
  34. okb-1.1.0/okb/migrations/0010.token-id.sql +7 -0
  35. okb-1.1.0/okb/modal_embedder.py +120 -0
  36. okb-1.1.0/okb/modal_llm.py +196 -0
  37. okb-1.1.0/okb/plugins/__init__.py +8 -0
  38. okb-1.1.0/okb/plugins/base.py +110 -0
  39. okb-1.1.0/okb/plugins/registry.py +123 -0
  40. okb-1.1.0/okb/plugins/sources/__init__.py +6 -0
  41. okb-1.1.0/okb/plugins/sources/dropbox_paper.py +223 -0
  42. okb-1.1.0/okb/plugins/sources/github.py +484 -0
  43. okb-1.1.0/okb/plugins/sources/todoist.py +254 -0
  44. okb-1.1.0/okb/rescan.py +227 -0
  45. okb-1.1.0/okb/scripts/__init__.py +1 -0
  46. okb-1.1.0/okb/scripts/watch.py +206 -0
  47. okb-1.1.0/okb/tokens.py +299 -0
  48. okb-1.1.0/pyproject.toml +87 -0
okb-1.1.0/PKG-INFO ADDED
@@ -0,0 +1,448 @@
1
+ Metadata-Version: 2.3
2
+ Name: okb
3
+ Version: 1.1.0
4
+ Summary: Personal knowledge base with semantic search for LLMs
5
+ Requires-Python: >=3.11
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: Programming Language :: Python :: 3.11
8
+ Classifier: Programming Language :: Python :: 3.12
9
+ Classifier: Programming Language :: Python :: 3.13
10
+ Provides-Extra: all
11
+ Provides-Extra: dev
12
+ Provides-Extra: docx
13
+ Provides-Extra: llm
14
+ Provides-Extra: llm-bedrock
15
+ Provides-Extra: pdf
16
+ Provides-Extra: todoist
17
+ Provides-Extra: web
18
+ Requires-Dist: PyGithub (>=2.0.0)
19
+ Requires-Dist: anthropic (>=0.40.0) ; extra == "all"
20
+ Requires-Dist: anthropic (>=0.40.0) ; extra == "llm"
21
+ Requires-Dist: anthropic (>=0.40.0) ; extra == "llm-bedrock"
22
+ Requires-Dist: boto3 (>=1.28.0) ; extra == "llm-bedrock"
23
+ Requires-Dist: botocore (>=1.31.0) ; extra == "llm-bedrock"
24
+ Requires-Dist: click (>=8.0.0)
25
+ Requires-Dist: dropbox (>=12.0.0)
26
+ Requires-Dist: einops (>=0.7.0)
27
+ Requires-Dist: mcp (>=1.0.0)
28
+ Requires-Dist: modal (>=1.0.0)
29
+ Requires-Dist: pgvector (>=0.2.0)
30
+ Requires-Dist: psycopg[binary] (>=3.1.0)
31
+ Requires-Dist: pymupdf (>=1.23.0) ; extra == "all"
32
+ Requires-Dist: pymupdf (>=1.23.0) ; extra == "pdf"
33
+ Requires-Dist: pytest (>=7.0.0) ; extra == "dev"
34
+ Requires-Dist: python-docx (>=1.1.0) ; extra == "all"
35
+ Requires-Dist: python-docx (>=1.1.0) ; extra == "docx"
36
+ Requires-Dist: pyyaml (>=6.0)
37
+ Requires-Dist: ruff (>=0.1.0) ; extra == "dev"
38
+ Requires-Dist: sentence-transformers (>=2.2.0)
39
+ Requires-Dist: todoist-api-python (>=3.0.0) ; extra == "all"
40
+ Requires-Dist: todoist-api-python (>=3.0.0) ; extra == "todoist"
41
+ Requires-Dist: trafilatura (>=1.6.0) ; extra == "all"
42
+ Requires-Dist: trafilatura (>=1.6.0) ; extra == "web"
43
+ Requires-Dist: watchdog (>=3.0.0)
44
+ Requires-Dist: yoyo-migrations (>=8.0.0)
45
+ Project-URL: Homepage, https://github.com/username/okb
46
+ Project-URL: Issues, https://github.com/username/okb/issues
47
+ Project-URL: Repository, https://github.com/username/okb
48
+ Description-Content-Type: text/markdown
49
+
50
+ # Owned Knowledge Base (OKB)
51
+
52
+ A local-first semantic search system for personal documents with Claude Code integration via MCP.
53
+
54
+ ## Installation
55
+
56
+ Using pipx (preferred):
57
+ ```bash
58
+ pipx install okb
59
+ ```
60
+
61
+ Or pip:
62
+ ```bash
63
+ pip install okb
64
+ ```
65
+
66
+ ## Quick Start
67
+
68
+ ```bash
69
+ # 1. Start the database
70
+ okb db start
71
+
72
+ # 2. (Optional) Deploy Modal embedder for faster batch ingestion
73
+ okb modal deploy
74
+
75
+ # 3. Ingest your documents
76
+ okb ingest ~/notes ~/docs
77
+
78
+ # 4. Configure Claude Code MCP (see below)
79
+ ```
80
+
81
+ ## CLI Commands
82
+
83
+ | Command | Description |
84
+ |---------|-------------|
85
+ | `okb db start` | Start pgvector database container |
86
+ | `okb db stop` | Stop database container |
87
+ | `okb db status` | Show database status |
88
+ | `okb db migrate [name]` | Apply pending migrations (optionally for a specific database) |
89
+ | `okb db list` | List configured databases |
90
+ | `okb db destroy` | Remove container and volume (destructive) |
91
+ | `okb ingest <paths>` | Ingest documents into knowledge base |
92
+ | `okb ingest <paths> --local` | Ingest using local GPU/CPU embedding (no Modal) |
93
+ | `okb serve` | Start MCP server (stdio, for Claude Code) |
94
+ | `okb serve --http` | Start HTTP MCP server with token auth |
95
+ | `okb watch <paths>` | Watch directories for changes |
96
+ | `okb config init` | Create default config file |
97
+ | `okb config show` | Show current configuration |
98
+ | `okb config path` | Print config file path |
99
+ | `okb modal deploy` | Deploy GPU embedder to Modal |
100
+ | `okb token create` | Create API token for HTTP server |
101
+ | `okb token list` | List tokens for a database |
102
+ | `okb token revoke [TOKEN] --id <n>` | Revoke token by full value or ID |
103
+ | `okb sync list` | List available API sources (plugins) |
104
+ | `okb sync list-projects <source>` | List projects from source (for config) |
105
+ | `okb sync run <sources>` | Sync data from external APIs |
106
+ | `okb sync auth <source>` | Interactive OAuth setup (e.g., dropbox-paper) |
107
+ | `okb sync status` | Show last sync times |
108
+ | `okb rescan` | Check indexed files for changes, re-ingest stale |
109
+ | `okb rescan --dry-run` | Show what would change without executing |
110
+ | `okb rescan --delete` | Also remove documents for missing files |
111
+ | `okb llm status` | Show LLM config and connectivity |
112
+ | `okb llm deploy` | Deploy Modal LLM for open model inference |
113
+ | `okb llm clear-cache` | Clear LLM response cache |
114
+ | `okb enrich run` | Extract TODOs and entities from documents |
115
+ | `okb enrich run --dry-run` | Show what would be enriched |
116
+ | `okb enrich pending` | List entities awaiting review |
117
+ | `okb enrich approve <id>` | Approve a pending entity |
118
+ | `okb enrich reject <id>` | Reject a pending entity |
119
+ | `okb enrich analyze` | Analyze database and update description/topics |
120
+ | `okb enrich consolidate` | Run entity consolidation (duplicates, clusters) |
121
+ | `okb enrich merge-proposals` | List pending merge proposals |
122
+ | `okb enrich approve-merge <id>` | Approve an entity merge |
123
+ | `okb enrich reject-merge <id>` | Reject an entity merge |
124
+ | `okb enrich clusters` | List topic clusters |
125
+ | `okb enrich relationships` | List entity relationships |
126
+
127
+
128
+ ## Configuration
129
+
130
+ Configuration is loaded from `~/.config/okb/config.yaml` (or `$XDG_CONFIG_HOME/okb/config.yaml`).
131
+
132
+ Create default config:
133
+ ```bash
134
+ okb config init
135
+ ```
136
+
137
+ Example config:
138
+ ```yaml
139
+ databases:
140
+   personal:
140
+     url: postgresql://knowledge:localdev@localhost:5433/personal_kb
141
+     default: true # Used when --db not specified (only one can be default)
142
+     managed: true # okb manages via Docker
143
+   work:
144
+     url: postgresql://knowledge:localdev@localhost:5433/work_kb
145
+     managed: true
146
+
147
+ docker:
148
+   port: 5433
149
+   container_name: okb-pgvector
150
+
151
+ chunking:
152
+   chunk_size: 512
153
+   chunk_overlap: 64
155
+ ```
156
+
157
+ Use `--db <name>` to target a specific database with any command.
158
+
159
+ Environment variables override config file settings:
160
+ - `OKB_DATABASE_URL` - Database connection string
161
+ - `OKB_DOCKER_PORT` - Docker port mapping
162
+ - `OKB_CONTAINER_NAME` - Docker container name
163
+
164
+ ### Project-Local Config
165
+
166
+ Override global config per-project with `.okbconf.yaml` (searched from CWD upward):
167
+
168
+ ```yaml
169
+ # .okbconf.yaml
170
+ default_database: work # Use 'work' db in this project
171
+
172
+ extensions:
173
+   skip_directories: # Extends global list
173
+     - test_fixtures
175
+ ```
176
+
177
+ Merge rules: scalar values replace the global value, lists extend the global list, and dicts are deep-merged.
178
+
179
+ ### LLM Integration (Optional)
180
+
181
+ Enable LLM-based document classification, filtering, and enrichment:
182
+
183
+ ```yaml
184
+ llm:
185
+   provider: claude # "claude", "modal", or null (disabled)
186
+   model: claude-haiku-4-5-20251001
187
+   timeout: 30
188
+   cache_responses: true
189
+ ```
190
+
191
+ **Providers:**
192
+ | Provider | Setup | Cost |
193
+ |----------|-------|------|
194
+ | `claude` | `export ANTHROPIC_API_KEY=...` | ~$0.25/1M tokens |
195
+ | `modal` | `okb llm deploy` | ~$0.02/min GPU |
196
+
197
+ **Modal LLM Setup** (no API key needed, runs on Modal's GPUs):
198
+
199
+ ```yaml
200
+ llm:
201
+ provider: modal
202
+ model: microsoft/Phi-3-mini-4k-instruct # Recommended: no gating
203
+ ```
204
+
205
+ Non-gated models (work immediately):
206
+ - `microsoft/Phi-3-mini-4k-instruct` - Good quality, 4K context
207
+ - `Qwen/Qwen2-1.5B-Instruct` - Smaller/faster
208
+
209
+ Gated models (require HuggingFace approval + token):
210
+ - `meta-llama/Llama-3.2-3B-Instruct` - Requires accepting license at HuggingFace
211
+ - Setup: `modal secret create huggingface HF_TOKEN=hf_...`
212
+
213
+ Deploy after configuring:
214
+ ```bash
215
+ okb llm deploy
216
+ ```
217
+
218
+ **Pre-ingest filtering** - skip low-value content during sync:
219
+ ```yaml
220
+ plugins:
221
+ sources:
222
+ dropbox-paper:
223
+ llm_filter:
224
+ enabled: true
225
+ prompt: "Skip meeting notes and drafts"
226
+ action_on_skip: discard # or "archive"
227
+ ```
228
+
229
+ ### Document Enrichment
230
+
231
+ Extract TODOs and entities (people, projects, technologies) from documents using LLM:
232
+
233
+ ```bash
234
+ okb enrich run # Enrich un-enriched documents
235
+ okb enrich run --dry-run # Preview what would be enriched
236
+ okb enrich run --source-type markdown # Only markdown files
237
+ okb enrich run --query "meeting" # Filter by semantic search
238
+ ```
239
+
240
+ Entities are created as pending suggestions for review:
241
+ ```bash
242
+ okb enrich pending # List pending entities
243
+ okb enrich approve <id> # Approve → creates entity document
244
+ okb enrich reject <id> # Reject → hidden from future suggestions
245
+ ```
246
+
247
+ Configure enrichment behavior:
248
+ ```yaml
249
+ enrichment:
250
+ enabled: true
251
+ extract_todos: true
252
+ extract_entities: true
253
+ auto_create_todos: true # TODOs created immediately
254
+ auto_create_entities: false # Entities go to pending review
255
+ min_confidence_todo: 0.7
256
+ min_confidence_entity: 0.8
257
+ ```
258
+
259
+ Related LLM CLI commands:
260
+ ```bash
261
+ okb llm status # Show config and connectivity
262
+ okb llm deploy # Deploy Modal LLM (for provider: modal)
263
+ okb llm clear-cache # Clear response cache
264
+ ```
265
+
266
+ ## Claude Code MCP Config
267
+
268
+ ### stdio mode (default)
269
+
270
+ Add to your Claude Code MCP configuration:
271
+
272
+ ```json
273
+ {
274
+ "mcpServers": {
275
+ "knowledge-base": {
276
+ "command": "okb",
277
+ "args": ["serve"]
278
+ }
279
+ }
280
+ }
281
+ ```
282
+
283
+ ### HTTP mode (for remote/shared servers)
284
+
285
+ First, start the HTTP server and create a token:
286
+
287
+ ```bash
288
+ # Create a token
289
+ okb token create --db default -d "Claude Code"
290
+ # Output: okb_default_rw_a1b2c3d4e5f6g7h8
291
+
292
+ # Start HTTP server
293
+ okb serve --http --host 0.0.0.0 --port 8080
294
+ ```
295
+
296
+ Then configure Claude Code to connect via SSE:
297
+
298
+ ```json
299
+ {
300
+ "mcpServers": {
301
+ "knowledge-base": {
302
+ "type": "sse",
303
+ "url": "http://localhost:8080/sse",
304
+ "headers": {
305
+ "Authorization": "Bearer okb_default_rw_a1b2c3d4e5f6g7h8"
306
+ }
307
+ }
308
+ }
309
+ }
310
+ ```
311
+
312
+ ## MCP Tools Available to the LLM
313
+
314
+ | Tool | Purpose |
315
+ |------|---------|
316
+ | `search_knowledge` | Semantic search with natural language queries |
317
+ | `keyword_search` | Exact keyword/symbol matching |
318
+ | `hybrid_search` | Combined semantic + keyword (RRF fusion) |
319
+ | `get_document` | Retrieve full document by path |
320
+ | `list_sources` | Show indexed document stats |
321
+ | `list_projects` | List known projects |
322
+ | `recent_documents` | Show recently indexed files |
323
+ | `save_knowledge` | Save knowledge from Claude for future reference |
324
+ | `delete_knowledge` | Delete a Claude-saved knowledge entry |
325
+ | `get_actionable_items` | Query tasks/events with structured filters |
326
+ | `get_database_info` | Get database description, topics, and stats |
327
+ | `set_database_description` | Update database description/topics (LLM can self-document) |
328
+ | `add_todo` | Create a TODO item in the knowledge base |
329
+ | `trigger_sync` | Sync API sources (Todoist, GitHub, Dropbox Paper) |
330
+ | `trigger_rescan` | Check indexed files for changes and re-ingest |
331
+ | `list_sync_sources` | List available API sync sources with status |
332
+ | `enrich_document` | Run LLM enrichment to extract TODOs/entities |
333
+ | `list_pending_entities` | List entities awaiting review |
334
+ | `approve_entity` | Approve a pending entity |
335
+ | `reject_entity` | Reject a pending entity |
336
+ | `analyze_knowledge_base` | Analyze content and generate description/topics |
337
+ | `find_entity_duplicates` | Find potential duplicate entities |
338
+ | `merge_entities` | Merge duplicate entities |
339
+ | `list_pending_merges` | List pending merge proposals |
340
+ | `approve_merge` | Approve a merge proposal |
341
+ | `reject_merge` | Reject a merge proposal |
342
+ | `get_topic_clusters` | Get topic clusters from consolidation |
343
+ | `get_entity_relationships` | Get relationships between entities |
344
+ | `run_consolidation` | Run full entity consolidation pipeline |
345
+
346
+ ## Contextual Chunking
347
+
348
+ Documents are chunked with context for better retrieval:
349
+
350
+ ```
351
+ Document: Django Performance Notes
352
+ Project: student-app ← inferred from path or frontmatter
353
+ Section: Query Optimization ← extracted from markdown headers
354
+ Topics: django, performance ← from frontmatter tags
355
+ Content: Use `select_related()` to avoid N+1 queries...
356
+ ```
357
+
358
+ ### Frontmatter Example
359
+
360
+ ```markdown
361
+ ---
362
+ tags: [django, postgresql, performance]
363
+ project: student-app
364
+ category: backend
365
+ ---
366
+
367
+ # Your Document Title
368
+
369
+ Content here...
370
+ ```
371
+
372
+ ## Plugin System
373
+
374
+ OKB supports plugins for custom file parsers and API data sources (GitHub, Todoist, etc).
375
+
376
+ ### Creating a Plugin
377
+
378
+ ```python
379
+ # File parser plugin
380
+ from okb.plugins import FileParser, Document
381
+
382
+ class EpubParser:
383
+ extensions = ['.epub']
384
+ source_type = 'epub'
385
+
386
+ def can_parse(self, path): return path.suffix.lower() == '.epub'
387
+ def parse(self, path, extra_metadata=None) -> Document: ...
388
+
389
+ # API source plugin
390
+ from okb.plugins import APISource, SyncState, Document
391
+
392
+ class GitHubSource:
393
+ name = 'github'
394
+ source_type = 'github-issue'
395
+
396
+ def configure(self, config): ...
397
+ def fetch(self, state: SyncState | None) -> tuple[list[Document], SyncState]: ...
398
+ ```
399
+
400
+ ### Registering Plugins
401
+
402
+ In your plugin's `pyproject.toml`:
403
+ ```toml
404
+ [project.entry-points."okb.parsers"]
405
+ epub = "okb_epub:EpubParser"
406
+
407
+ [project.entry-points."okb.sources"]
408
+ github = "okb_github:GitHubSource"
409
+ ```
410
+
411
+ ### Configuring API Sources
412
+
413
+ ```yaml
414
+ # ~/.config/okb/config.yaml
415
+ plugins:
416
+ sources:
417
+ github:
418
+ enabled: true
419
+ token: ${GITHUB_TOKEN} # Resolved from environment
420
+ repos: [owner/repo1, owner/repo2]
421
+ todoist:
422
+ enabled: true
423
+ token: ${TODOIST_TOKEN}
424
+ include_completed: false # Sync completed tasks
425
+ completed_days: 30 # Days of completed history
426
+ include_comments: false # Include task comments (1 API call per task)
427
+ project_filter: [] # List of project IDs (use sync list-projects to find)
428
+ dropbox-paper:
429
+ enabled: true
430
+ # Option 1: Refresh token (recommended, auto-refreshes)
431
+ app_key: ${DROPBOX_APP_KEY}
432
+ app_secret: ${DROPBOX_APP_SECRET}
433
+ refresh_token: ${DROPBOX_REFRESH_TOKEN}
434
+ # Option 2: Access token (short-lived, expires after ~4 hours)
435
+ # token: ${DROPBOX_TOKEN}
436
+ folders: [/] # Optional: filter to specific folders
437
+ ```
438
+
439
+ **Dropbox Paper OAuth Setup:**
440
+ ```bash
441
+ okb sync auth dropbox-paper
442
+ ```
443
+ This interactive command will guide you through getting a refresh token from Dropbox.
444
+
445
+ ## License
446
+
447
+ MIT
448
+