okb 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- okb/__init__.py +3 -0
- okb/cli.py +1272 -0
- okb/config.py +661 -0
- okb/data/init.sql +92 -0
- okb/http_server.py +463 -0
- okb/ingest.py +1589 -0
- okb/llm/__init__.py +86 -0
- okb/llm/base.py +83 -0
- okb/llm/cache.py +217 -0
- okb/llm/filter.py +187 -0
- okb/llm/providers.py +322 -0
- okb/local_embedder.py +87 -0
- okb/mcp_server.py +1393 -0
- okb/migrate.py +53 -0
- okb/migrations/0001.initial-schema.sql +91 -0
- okb/migrations/0002.sync-state.sql +22 -0
- okb/migrations/0003.structured-fields.sql +22 -0
- okb/migrations/0004.tokens.sql +13 -0
- okb/migrations/0005.database-metadata.sql +19 -0
- okb/migrations/0006.llm-cache.sql +13 -0
- okb/modal_embedder.py +120 -0
- okb/modal_llm.py +178 -0
- okb/plugins/__init__.py +8 -0
- okb/plugins/base.py +110 -0
- okb/plugins/registry.py +123 -0
- okb/plugins/sources/__init__.py +5 -0
- okb/plugins/sources/dropbox_paper.py +188 -0
- okb/plugins/sources/github.py +484 -0
- okb/rescan.py +227 -0
- okb/scripts/__init__.py +1 -0
- okb/scripts/watch.py +206 -0
- okb/tokens.py +277 -0
- okb-1.0.0.dist-info/METADATA +397 -0
- okb-1.0.0.dist-info/RECORD +36 -0
- okb-1.0.0.dist-info/WHEEL +4 -0
- okb-1.0.0.dist-info/entry_points.txt +9 -0
okb/tokens.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"""Token management for HTTP authentication.
|
|
2
|
+
|
|
3
|
+
Token format: okb_<database>_<ro|rw>_<random16hex>
|
|
4
|
+
Example: okb_personal_ro_a1b2c3d4e5f60718
|
|
5
|
+
|
|
6
|
+
Tokens are stored hashed (SHA256) in the database for security.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
import re
|
|
13
|
+
import secrets
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from datetime import datetime
|
|
16
|
+
|
|
17
|
+
import psycopg
|
|
18
|
+
from psycopg.rows import dict_row
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class TokenInfo:
    """Metadata record for one stored token (carries the hash, not the token)."""

    # Hex-encoded SHA256 digest of the full token string.
    token_hash: str
    # Name of the database this token is associated with.
    database: str
    # Access level: 'ro' or 'rw'.
    permissions: str
    # Optional free-form label supplied when the token was created.
    description: str | None
    # Creation timestamp as read from the tokens table.
    created_at: datetime
    # Timestamp of the last recorded use; None when the column is NULL.
    last_used_at: datetime | None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Token format regex: okb_<database>_<ro|rw>_<hex16>
# Anchored with \Z rather than $: in Python, $ also matches just before a
# single trailing newline, which would let "<token>\n" pass as well-formed.
TOKEN_PATTERN = re.compile(r"^okb_([a-z0-9_-]+)_(ro|rw)_([a-f0-9]{16})\Z")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def generate_token(database: str, permissions: str = "rw") -> str:
    """Generate a new random token for a database.

    Args:
        database: Database name; lowercase letters, digits, '_' or '-' only.
        permissions: 'ro' for read-only, 'rw' for read-write.

    Returns:
        Token string like 'okb_personal_ro_a1b2c3d4e5f60718'.

    Raises:
        ValueError: If permissions is not 'ro'/'rw' or the database name
            contains characters outside the allowed set.
    """
    if permissions not in ("ro", "rw"):
        raise ValueError("permissions must be 'ro' or 'rw'")
    # fullmatch instead of match with '$': '$' also matches just before a
    # trailing newline, so "name\n" used to slip through validation.
    if not re.fullmatch(r"[a-z0-9_-]+", database):
        raise ValueError("database name must be lowercase alphanumeric with _ or -")

    random_part = secrets.token_hex(8)  # 8 random bytes -> 16 hex chars
    return f"okb_{database}_{permissions}_{random_part}"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def hash_token(token: str) -> str:
    """Return the SHA256 digest of a token as a hex string.

    Args:
        token: Full token string

    Returns:
        Hex-encoded SHA256 hash
    """
    digest = hashlib.sha256()
    digest.update(token.encode())
    return digest.hexdigest()
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def parse_token(token: str) -> tuple[str, str] | None:
    """Split a token into its database name and permission level.

    Args:
        token: Full token string

    Returns:
        Tuple of (database, permissions), or None when the token is empty
        or does not match TOKEN_PATTERN
    """
    # Empty/None tokens never reach the regex.
    matched = TOKEN_PATTERN.match(token) if token else None
    if matched is None:
        return None
    database, permissions = matched.group(1, 2)
    return (database, permissions)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def create_token(
    db_url: str,
    database: str,
    permissions: str = "rw",
    description: str | None = None,
) -> str:
    """Mint a token and persist its SHA256 hash in the tokens table.

    Args:
        db_url: Database connection URL
        database: Database name embedded in the token
        permissions: 'ro' or 'rw'
        description: Optional description stored alongside the hash

    Returns:
        The plaintext token; only its hash is written to the database, so
        this return value is the only place the plaintext is available

    Raises:
        ValueError: If generate_token rejects database or permissions
        RuntimeError: If token could not be saved to database
    """
    plaintext = generate_token(database, permissions)
    # Only the digest is stored; the plaintext never reaches the database.
    row = (hash_token(plaintext), permissions, description)

    try:
        with psycopg.connect(db_url) as conn:
            conn.execute(
                """
                INSERT INTO tokens (token_hash, permissions, description)
                VALUES (%s, %s, %s)
                """,
                row,
            )
            conn.commit()
    except psycopg.Error as e:
        raise RuntimeError(f"Failed to save token to database: {e}") from e

    return plaintext
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def list_tokens(db_url: str) -> list[TokenInfo]:
    """Return metadata for every row of the tokens table, newest first.

    The plaintext tokens are not recoverable; each entry carries the hash.

    Args:
        db_url: Database connection URL

    Returns:
        List of TokenInfo objects ordered by created_at descending

    Raises:
        RuntimeError: If database connection fails
    """
    from urllib.parse import urlparse

    # Display name for every entry: the URL path without leading slashes,
    # falling back to "default" when the URL has no path.
    url_path = urlparse(db_url).path
    db_name = url_path.lstrip("/") or "default"

    try:
        with psycopg.connect(db_url, row_factory=dict_row) as conn:
            rows = conn.execute(
                """
                SELECT token_hash, permissions, description, created_at, last_used_at
                FROM tokens
                ORDER BY created_at DESC
                """
            ).fetchall()

            tokens = []
            for row in rows:
                info = TokenInfo(
                    token_hash=row["token_hash"],
                    database=db_name,
                    permissions=row["permissions"],
                    description=row["description"],
                    created_at=row["created_at"],
                    last_used_at=row["last_used_at"],
                )
                tokens.append(info)
            return tokens
    except psycopg.OperationalError as e:
        raise RuntimeError(
            f"Failed to connect to database. Ensure the database is running: {e}"
        ) from e
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def delete_token(db_url: str, token_or_prefix: str) -> bool:
    """Delete a token given its full plaintext value.

    Only SHA256 hashes are stored, so a row can be located only by hashing
    the complete token. Prefix-based deletion is NOT implemented: any value
    that is not a well-formed full token returns False without touching the
    database.

    Args:
        db_url: Database connection URL
        token_or_prefix: Full token (e.g., 'okb_personal_ro_a1b2c3d4e5f60718').
            A bare prefix such as 'okb_personal_ro' is accepted by the
            signature but never matches anything.

    Returns:
        True if a token was deleted; False if no row matched or the
        argument is not a full token
    """
    # Decide before connecting: a value that is not a full token can never
    # match a stored hash, so there is no reason to open a connection.
    # TODO: support prefix deletion by storing a token prefix/identifier
    # column in the tokens table.
    if not TOKEN_PATTERN.match(token_or_prefix):
        return False

    token_hash = hash_token(token_or_prefix)
    with psycopg.connect(db_url) as conn:
        # RETURNING lets us tell "deleted" apart from "no such token".
        result = conn.execute(
            "DELETE FROM tokens WHERE token_hash = %s RETURNING token_hash",
            (token_hash,),
        ).fetchone()
        conn.commit()
    return result is not None
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def verify_token(token: str, get_db_url_fn) -> TokenInfo | None:
    """Verify a token and return its info if valid.

    The token is parsed to find its database, the database URL is resolved
    through get_db_url_fn, and the token's SHA256 hash is looked up in that
    database's tokens table. On a hit, last_used_at is updated to NOW().

    Args:
        token: Full token string
        get_db_url_fn: Function that takes a database name and returns its
            URL; a ValueError from it is treated as "unknown database"

    Returns:
        TokenInfo if valid, None if invalid. Permissions come from the
        stored row, and last_used_at is the value read before this call's
        update. Any error during the database lookup also yields None
        (fail closed) and is logged.
    """
    import logging

    parsed = parse_token(token)
    if not parsed:
        return None

    # The permission segment of the token text is not used for the result;
    # the stored row is authoritative.
    database, _ = parsed
    token_hash = hash_token(token)

    try:
        db_url = get_db_url_fn(database)
    except ValueError:
        return None

    try:
        with psycopg.connect(db_url, row_factory=dict_row) as conn:
            result = conn.execute(
                """
                SELECT token_hash, permissions, description, created_at, last_used_at
                FROM tokens
                WHERE token_hash = %s
                """,
                (token_hash,),
            ).fetchone()

            if not result:
                return None

            # Update last_used_at
            conn.execute(
                "UPDATE tokens SET last_used_at = NOW() WHERE token_hash = %s",
                (token_hash,),
            )
            conn.commit()

            return TokenInfo(
                token_hash=result["token_hash"],
                database=database,
                permissions=result["permissions"],
                description=result["description"],
                created_at=result["created_at"],
                last_used_at=result["last_used_at"],
            )
    except Exception:
        # Fail closed so an infrastructure problem never grants access, but
        # record it: otherwise a database outage is indistinguishable from
        # an invalid token. The token itself is deliberately not logged.
        logging.getLogger(__name__).exception(
            "Token verification failed for database %r", database
        )
        return None
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
class OKBTokenVerifier:
    """Binds a database-URL resolver so HTTP middleware can verify tokens."""

    def __init__(self, get_db_url_fn):
        """Remember the resolver used by later verify() calls.

        Args:
            get_db_url_fn: Function that takes a database name and returns its URL
        """
        self.get_db_url_fn = get_db_url_fn

    def verify(self, token: str) -> TokenInfo | None:
        """Check a token by delegating to verify_token with the stored resolver.

        Args:
            token: Full token string (without 'Bearer ' prefix)

        Returns:
            TokenInfo if valid, None if invalid
        """
        resolver = self.get_db_url_fn
        return verify_token(token, resolver)
|
|
@@ -0,0 +1,397 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: okb
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Personal knowledge base with semantic search for LLMs
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
10
|
+
Provides-Extra: all
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Provides-Extra: docx
|
|
13
|
+
Provides-Extra: llm
|
|
14
|
+
Provides-Extra: llm-bedrock
|
|
15
|
+
Provides-Extra: pdf
|
|
16
|
+
Provides-Extra: web
|
|
17
|
+
Requires-Dist: PyGithub (>=2.0.0)
|
|
18
|
+
Requires-Dist: anthropic (>=0.40.0) ; extra == "all"
|
|
19
|
+
Requires-Dist: anthropic (>=0.40.0) ; extra == "llm"
|
|
20
|
+
Requires-Dist: anthropic (>=0.40.0) ; extra == "llm-bedrock"
|
|
21
|
+
Requires-Dist: boto3 (>=1.28.0) ; extra == "llm-bedrock"
|
|
22
|
+
Requires-Dist: botocore (>=1.31.0) ; extra == "llm-bedrock"
|
|
23
|
+
Requires-Dist: click (>=8.0.0)
|
|
24
|
+
Requires-Dist: dropbox (>=12.0.0)
|
|
25
|
+
Requires-Dist: einops (>=0.7.0)
|
|
26
|
+
Requires-Dist: mcp (>=1.0.0)
|
|
27
|
+
Requires-Dist: modal (>=1.0.0)
|
|
28
|
+
Requires-Dist: pgvector (>=0.2.0)
|
|
29
|
+
Requires-Dist: psycopg[binary] (>=3.1.0)
|
|
30
|
+
Requires-Dist: pymupdf (>=1.23.0) ; extra == "all"
|
|
31
|
+
Requires-Dist: pymupdf (>=1.23.0) ; extra == "pdf"
|
|
32
|
+
Requires-Dist: pytest (>=7.0.0) ; extra == "dev"
|
|
33
|
+
Requires-Dist: python-docx (>=1.1.0) ; extra == "all"
|
|
34
|
+
Requires-Dist: python-docx (>=1.1.0) ; extra == "docx"
|
|
35
|
+
Requires-Dist: pyyaml (>=6.0)
|
|
36
|
+
Requires-Dist: ruff (>=0.1.0) ; extra == "dev"
|
|
37
|
+
Requires-Dist: sentence-transformers (>=2.2.0)
|
|
38
|
+
Requires-Dist: trafilatura (>=1.6.0) ; extra == "all"
|
|
39
|
+
Requires-Dist: trafilatura (>=1.6.0) ; extra == "web"
|
|
40
|
+
Requires-Dist: watchdog (>=3.0.0)
|
|
41
|
+
Requires-Dist: yoyo-migrations (>=8.0.0)
|
|
42
|
+
Description-Content-Type: text/markdown
|
|
43
|
+
|
|
44
|
+
# Owned Knowledge Base (OKB)
|
|
45
|
+
|
|
46
|
+
A local-first semantic search system for personal documents with Claude Code integration via MCP.
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install okb
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Or from source:
|
|
55
|
+
```bash
|
|
56
|
+
git clone https://github.com/yourusername/okb
|
|
57
|
+
cd okb
|
|
58
|
+
pip install -e .
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Quick Start
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# 1. Start the database
|
|
65
|
+
okb db start
|
|
66
|
+
|
|
67
|
+
# 2. (Optional) Deploy Modal embedder for faster batch ingestion
|
|
68
|
+
okb modal deploy
|
|
69
|
+
|
|
70
|
+
# 3. Ingest your documents
|
|
71
|
+
okb ingest ~/notes ~/docs
|
|
72
|
+
|
|
73
|
+
# 4. Configure Claude Code MCP (see below)
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## CLI Commands
|
|
77
|
+
|
|
78
|
+
| Command | Description |
|
|
79
|
+
|---------|-------------|
|
|
80
|
+
| `okb db start` | Start pgvector database container |
|
|
81
|
+
| `okb db stop` | Stop database container |
|
|
82
|
+
| `okb db status` | Show database status |
|
|
83
|
+
| `okb db destroy` | Remove container and volume (destructive) |
|
|
84
|
+
| `okb ingest <paths>` | Ingest documents into knowledge base |
|
|
85
|
+
| `okb ingest <paths> --local` | Ingest using CPU embedding (no Modal) |
|
|
86
|
+
| `okb serve` | Start MCP server (stdio, for Claude Code) |
|
|
87
|
+
| `okb serve --http` | Start HTTP MCP server with token auth |
|
|
88
|
+
| `okb watch <paths>` | Watch directories for changes |
|
|
89
|
+
| `okb config init` | Create default config file |
|
|
90
|
+
| `okb config show` | Show current configuration |
|
|
91
|
+
| `okb modal deploy` | Deploy GPU embedder to Modal |
|
|
92
|
+
| `okb token create` | Create API token for HTTP server |
|
|
93
|
+
| `okb token list` | List tokens for a database |
|
|
94
|
+
| `okb token revoke` | Revoke an API token |
|
|
95
|
+
| `okb sync list` | List available API sources (plugins) |
|
|
96
|
+
| `okb sync run <sources>` | Sync data from external APIs |
|
|
97
|
+
| `okb sync status` | Show last sync times |
|
|
98
|
+
| `okb rescan` | Check indexed files for changes, re-ingest stale |
|
|
99
|
+
| `okb rescan --dry-run` | Show what would change without executing |
|
|
100
|
+
| `okb rescan --delete` | Also remove documents for missing files |
|
|
101
|
+
| `okb llm status` | Show LLM config and connectivity |
|
|
102
|
+
| `okb llm deploy` | Deploy Modal LLM for open model inference |
|
|
103
|
+
| `okb llm clear-cache` | Clear LLM response cache |
|
|
104
|
+
|
|
105
|
+
## Architecture
|
|
106
|
+
|
|
107
|
+
```
|
|
108
|
+
┌─────────────────────────────────────────────────────────────────────┐
|
|
109
|
+
│ INGESTION (Burst GPU) │
|
|
110
|
+
│ │
|
|
111
|
+
│ Local Files → Contextual Chunking → Modal (GPU T4) → pgvector │
|
|
112
|
+
│ │
|
|
113
|
+
│ ~/notes/project-x/api-design.md │
|
|
114
|
+
│ ↓ │
|
|
115
|
+
│ "Document: API Design Notes │
|
|
116
|
+
│ Project: project-x │
|
|
117
|
+
│ Section: Authentication │
|
|
118
|
+
│ Content: Use JWT tokens with..." │
|
|
119
|
+
│ ↓ │
|
|
120
|
+
│ [0.23, -0.41, 0.87, ...] → pgvector │
|
|
121
|
+
└─────────────────────────────────────────────────────────────────────┘
|
|
122
|
+
|
|
123
|
+
┌─────────────────────────────────────────────────────────────────────┐
|
|
124
|
+
│ RETRIEVAL (Always-on, Local) │
|
|
125
|
+
│ │
|
|
126
|
+
│ Claude Code → MCP Server → CPU Embedding → pgvector → Results │
|
|
127
|
+
│ │
|
|
128
|
+
│ "How do I handle auth?" │
|
|
129
|
+
│ ↓ │
|
|
130
|
+
│ [0.19, -0.38, 0.91, ...] (local CPU, ~300ms) │
|
|
131
|
+
│ ↓ │
|
|
132
|
+
│ Cosine similarity search → Top 5 chunks with context │
|
|
133
|
+
└─────────────────────────────────────────────────────────────────────┘
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Configuration
|
|
137
|
+
|
|
138
|
+
Configuration is loaded from `~/.config/okb/config.yaml` (or `$XDG_CONFIG_HOME/okb/config.yaml`).
|
|
139
|
+
|
|
140
|
+
Create default config:
|
|
141
|
+
```bash
|
|
142
|
+
okb config init
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Example config:
|
|
146
|
+
```yaml
|
|
147
|
+
databases:
|
|
148
|
+
personal:
|
|
149
|
+
url: postgresql://knowledge:localdev@localhost:5433/personal_kb
|
|
150
|
+
default: true # Used when --db not specified (only one can be default)
|
|
151
|
+
managed: true # okb manages via Docker
|
|
152
|
+
work:
|
|
153
|
+
url: postgresql://knowledge:localdev@localhost:5433/work_kb
|
|
154
|
+
managed: true
|
|
155
|
+
|
|
156
|
+
docker:
|
|
157
|
+
port: 5433
|
|
158
|
+
container_name: okb-pgvector
|
|
159
|
+
|
|
160
|
+
chunking:
|
|
161
|
+
chunk_size: 512
|
|
162
|
+
chunk_overlap: 64
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Use `--db <name>` to target a specific database with any command.
|
|
166
|
+
|
|
167
|
+
Environment variables override config file settings:
|
|
168
|
+
- `KB_DATABASE_URL` - Database connection string
|
|
169
|
+
- `OKB_DOCKER_PORT` - Docker port mapping
|
|
170
|
+
- `OKB_CONTAINER_NAME` - Docker container name
|
|
171
|
+
|
|
172
|
+
### Project-Local Config
|
|
173
|
+
|
|
174
|
+
Override global config per-project with `.okbconf.yaml` (searched from CWD upward):
|
|
175
|
+
|
|
176
|
+
```yaml
|
|
177
|
+
# .okbconf.yaml
|
|
178
|
+
default_database: work # Use 'work' db in this project
|
|
179
|
+
|
|
180
|
+
extensions:
|
|
181
|
+
skip_directories: # Extends global list
|
|
182
|
+
- test_fixtures
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
Merge: scalars replace, lists extend, dicts deep-merge.
|
|
186
|
+
|
|
187
|
+
### LLM Integration (Optional)
|
|
188
|
+
|
|
189
|
+
Enable LLM-based document classification and filtering:
|
|
190
|
+
|
|
191
|
+
```yaml
|
|
192
|
+
llm:
|
|
193
|
+
provider: claude # "claude", "modal", or null (disabled)
|
|
194
|
+
model: claude-haiku-4-5-20251001
|
|
195
|
+
timeout: 30
|
|
196
|
+
cache_responses: true
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
**Providers:**
|
|
200
|
+
| Provider | Setup | Cost |
|
|
201
|
+
|----------|-------|------|
|
|
202
|
+
| `claude` | `export ANTHROPIC_API_KEY=...` | ~$0.25/1M tokens |
|
|
203
|
+
| `modal` | `okb llm deploy` | ~$0.02/min GPU |
|
|
204
|
+
|
|
205
|
+
For Modal (no API key needed):
|
|
206
|
+
```yaml
|
|
207
|
+
llm:
|
|
208
|
+
provider: modal
|
|
209
|
+
model: meta-llama/Llama-3.2-3B-Instruct
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
**Pre-ingest filtering** - skip low-value content during sync:
|
|
213
|
+
```yaml
|
|
214
|
+
plugins:
|
|
215
|
+
sources:
|
|
216
|
+
dropbox-paper:
|
|
217
|
+
llm_filter:
|
|
218
|
+
enabled: true
|
|
219
|
+
prompt: "Skip meeting notes and drafts"
|
|
220
|
+
action_on_skip: discard # or "archive"
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
CLI commands:
|
|
224
|
+
```bash
|
|
225
|
+
okb llm status # Show config and connectivity
|
|
226
|
+
okb llm deploy # Deploy Modal LLM (for provider: modal)
|
|
227
|
+
okb llm clear-cache # Clear response cache
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
## Claude Code MCP Config
|
|
231
|
+
|
|
232
|
+
### stdio mode (default)
|
|
233
|
+
|
|
234
|
+
Add to your Claude Code MCP configuration:
|
|
235
|
+
|
|
236
|
+
```json
|
|
237
|
+
{
|
|
238
|
+
"mcpServers": {
|
|
239
|
+
"knowledge-base": {
|
|
240
|
+
"command": "okb",
|
|
241
|
+
"args": ["serve"]
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
### HTTP mode (for remote/shared servers)
|
|
248
|
+
|
|
249
|
+
First, create a token and start the HTTP server:
|
|
250
|
+
|
|
251
|
+
```bash
|
|
252
|
+
# Create a token
|
|
253
|
+
okb token create --db default -d "Claude Code"
|
|
254
|
+
# Output: okb_default_rw_a1b2c3d4e5f6g7h8
|
|
255
|
+
|
|
256
|
+
# Start HTTP server
|
|
257
|
+
okb serve --http --host 0.0.0.0 --port 8080
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
Then configure Claude Code to connect via SSE:
|
|
261
|
+
|
|
262
|
+
```json
|
|
263
|
+
{
|
|
264
|
+
"mcpServers": {
|
|
265
|
+
"knowledge-base": {
|
|
266
|
+
"type": "sse",
|
|
267
|
+
"url": "http://localhost:8080/sse",
|
|
268
|
+
"headers": {
|
|
269
|
+
"Authorization": "Bearer okb_default_rw_a1b2c3d4e5f6g7h8"
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
## MCP Tools (Available in Claude Code)
|
|
277
|
+
|
|
278
|
+
| Tool | Purpose |
|
|
279
|
+
|------|---------|
|
|
280
|
+
| `search_knowledge` | Semantic search with natural language queries |
|
|
281
|
+
| `keyword_search` | Exact keyword/symbol matching |
|
|
282
|
+
| `hybrid_search` | Combined semantic + keyword (RRF fusion) |
|
|
283
|
+
| `get_document` | Retrieve full document by path |
|
|
284
|
+
| `list_sources` | Show indexed document stats |
|
|
285
|
+
| `list_projects` | List known projects |
|
|
286
|
+
| `recent_documents` | Show recently indexed files |
|
|
287
|
+
| `save_knowledge` | Save knowledge from Claude for future reference |
|
|
288
|
+
| `delete_knowledge` | Delete a Claude-saved knowledge entry |
|
|
289
|
+
| `get_actionable_items` | Query tasks/events with structured filters |
|
|
290
|
+
|
|
291
|
+
## Contextual Chunking
|
|
292
|
+
|
|
293
|
+
Documents are chunked with context for better retrieval:
|
|
294
|
+
|
|
295
|
+
```
|
|
296
|
+
Document: Django Performance Notes
|
|
297
|
+
Project: student-app ← inferred from path or frontmatter
|
|
298
|
+
Section: Query Optimization ← extracted from markdown headers
|
|
299
|
+
Topics: django, performance ← from frontmatter tags
|
|
300
|
+
Content: Use `select_related()` to avoid N+1 queries...
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
### Frontmatter Example
|
|
304
|
+
|
|
305
|
+
```markdown
|
|
306
|
+
---
|
|
307
|
+
tags: [django, postgresql, performance]
|
|
308
|
+
project: student-app
|
|
309
|
+
category: backend
|
|
310
|
+
---
|
|
311
|
+
|
|
312
|
+
# Query Optimization
|
|
313
|
+
|
|
314
|
+
Use `select_related()` for foreign keys...
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
## Cost Estimate
|
|
318
|
+
|
|
319
|
+
| Component | Local | Cloud Alternative |
|
|
320
|
+
|-----------|-------|-------------------|
|
|
321
|
+
| pgvector | $0 | ~$15-30/mo (CloudSQL) |
|
|
322
|
+
| MCP Server | $0 | ~$5/mo (small VM) |
|
|
323
|
+
| Modal embedding | ~$0.50-2/mo | N/A |
|
|
324
|
+
| **Total** | **~$1-2/mo** | **~$20-35/mo** |
|
|
325
|
+
|
|
326
|
+
## Development
|
|
327
|
+
|
|
328
|
+
```bash
|
|
329
|
+
# Install dev dependencies
|
|
330
|
+
pip install -e ".[dev]"
|
|
331
|
+
|
|
332
|
+
# Run tests
|
|
333
|
+
pytest
|
|
334
|
+
|
|
335
|
+
# Lint and format
|
|
336
|
+
ruff check . && ruff format .
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
## Plugin System
|
|
340
|
+
|
|
341
|
+
OKB supports plugins for custom file parsers and API data sources (GitHub, Todoist, etc).
|
|
342
|
+
|
|
343
|
+
### Creating a Plugin
|
|
344
|
+
|
|
345
|
+
```python
|
|
346
|
+
# File parser plugin
|
|
347
|
+
from okb.plugins import FileParser, Document
|
|
348
|
+
|
|
349
|
+
class EpubParser:
|
|
350
|
+
extensions = ['.epub']
|
|
351
|
+
source_type = 'epub'
|
|
352
|
+
|
|
353
|
+
def can_parse(self, path): return path.suffix.lower() == '.epub'
|
|
354
|
+
def parse(self, path, extra_metadata=None) -> Document: ...
|
|
355
|
+
|
|
356
|
+
# API source plugin
|
|
357
|
+
from okb.plugins import APISource, SyncState, Document
|
|
358
|
+
|
|
359
|
+
class GitHubSource:
|
|
360
|
+
name = 'github'
|
|
361
|
+
source_type = 'github-issue'
|
|
362
|
+
|
|
363
|
+
def configure(self, config): ...
|
|
364
|
+
def fetch(self, state: SyncState | None) -> tuple[list[Document], SyncState]: ...
|
|
365
|
+
```
|
|
366
|
+
|
|
367
|
+
### Registering Plugins
|
|
368
|
+
|
|
369
|
+
In your plugin's `pyproject.toml`:
|
|
370
|
+
```toml
|
|
371
|
+
[project.entry-points."okb.parsers"]
|
|
372
|
+
epub = "okb_epub:EpubParser"
|
|
373
|
+
|
|
374
|
+
[project.entry-points."okb.sources"]
|
|
375
|
+
github = "okb_github:GitHubSource"
|
|
376
|
+
```
|
|
377
|
+
|
|
378
|
+
### Configuring API Sources
|
|
379
|
+
|
|
380
|
+
```yaml
|
|
381
|
+
# ~/.config/okb/config.yaml
|
|
382
|
+
plugins:
|
|
383
|
+
sources:
|
|
384
|
+
github:
|
|
385
|
+
enabled: true
|
|
386
|
+
token: ${GITHUB_TOKEN} # Resolved from environment
|
|
387
|
+
repos: [owner/repo1, owner/repo2]
|
|
388
|
+
dropbox-paper:
|
|
389
|
+
enabled: true
|
|
390
|
+
token: ${DROPBOX_TOKEN}
|
|
391
|
+
folders: [/] # Optional: filter to specific folders
|
|
392
|
+
```
|
|
393
|
+
|
|
394
|
+
## License
|
|
395
|
+
|
|
396
|
+
MIT
|
|
397
|
+
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
okb/__init__.py,sha256=2yaWIYQbho7N2O2zwTn3ZH11b8b3SaoDVlxluVTqwy4,92
|
|
2
|
+
okb/cli.py,sha256=KZ8LmzHYiD19NZtrTx7UoFeiH4MMFZX621gs5JElLc4,43663
|
|
3
|
+
okb/config.py,sha256=DKmX2fgteGdh0QMsA-Immu-mZcvLjHWeB8HIf9rcM5o,22898
|
|
4
|
+
okb/data/init.sql,sha256=QpsicUN7PQ7d8zyOCRNChOu5XKdUVC3xySlRDPyKSN8,2728
|
|
5
|
+
okb/http_server.py,sha256=H8AvsRiStxupr-KnXrIlHnoRtqeA-2SgphVgGAOnqmQ,18673
|
|
6
|
+
okb/ingest.py,sha256=D5plxCC2tQXZenMNUa482dUDqsyuaq2APAQqaIgRAqU,54505
|
|
7
|
+
okb/llm/__init__.py,sha256=4jelqgXvF-eEPyLCuAmcxagN0H923wI9pBJJZKv4r0E,2368
|
|
8
|
+
okb/llm/base.py,sha256=gOm7zBiNdHrj7xxJfpb-4qZdYxWM0lA0vKfrBStO60E,2279
|
|
9
|
+
okb/llm/cache.py,sha256=rxRPMNBtP336MSpGWA8F7rDZnF0O2RM3rEsNtoxS0Zk,6142
|
|
10
|
+
okb/llm/filter.py,sha256=y20bc3vHtp5gj7T7AhsJ45ZkAkBgztj6WPjsVAmvEeo,5447
|
|
11
|
+
okb/llm/providers.py,sha256=AdVw9FFgv58-KJEfXv9JqWlkxBl-LcRWOao95CsjqWA,9718
|
|
12
|
+
okb/local_embedder.py,sha256=zzjBUFp4IH2xsvKyKjKZyX9dJuE_3PDMHMwpyRYSISQ,2098
|
|
13
|
+
okb/mcp_server.py,sha256=95e1wEtan0UrzQveCNNyhHXvtpZ18nDsxfaCKYJeX1I,50701
|
|
14
|
+
okb/migrate.py,sha256=2faYL-SHiQCkGXpTUlBFMCj0B-6JYCHqZl9u6vOlui8,1693
|
|
15
|
+
okb/migrations/0001.initial-schema.sql,sha256=0s5pj9Ad6f0u_mxODAM_-DbDI3aI37Wdu5XMPAzAIqw,2577
|
|
16
|
+
okb/migrations/0002.sync-state.sql,sha256=w34dOA9sPg60NMS1aHvOhORff1k_Di9cO2ghwVQSPHU,696
|
|
17
|
+
okb/migrations/0003.structured-fields.sql,sha256=rPCSrdtotCoRpOfjHf1Ifx0pfizpYS9n4MD4CHxrv_c,1225
|
|
18
|
+
okb/migrations/0004.tokens.sql,sha256=VtcLfA1_SVVQLkEKZ-av_93Fg0ksVWLm1tlR7nJXoaQ,448
|
|
19
|
+
okb/migrations/0005.database-metadata.sql,sha256=0X4LyuUUX34s3ph2C70FnBBau5HEBwR4xyY-hwPEX90,709
|
|
20
|
+
okb/migrations/0006.llm-cache.sql,sha256=azjPpj00WH_8tx4JI8PJKZ1AOAJEhbkneVvYa3ZRZ1w,493
|
|
21
|
+
okb/modal_embedder.py,sha256=V1cpGWrtEo1MGkrD9Nc-5wRmf9e7IwKPsQj7nVuLlyg,3469
|
|
22
|
+
okb/modal_llm.py,sha256=4rYE3VZ_T09HXCgTIYFLu1s_C2FRC9y4dgMUGqJuO2M,5368
|
|
23
|
+
okb/plugins/__init__.py,sha256=50LNAH4bvfIw5CHT82sknGjdCldQ-4ds0wxo1zM9E2k,324
|
|
24
|
+
okb/plugins/base.py,sha256=6TIN1UIItmuIsP4NDJhuRMH0ngKkQiGmtHTeYj1K8OU,3171
|
|
25
|
+
okb/plugins/registry.py,sha256=fN7NfoOaRnMyXSWT2srd6vEr4riJjmncQFfberf0IE8,3741
|
|
26
|
+
okb/plugins/sources/__init__.py,sha256=CTqgfRXCHWjKiQS0dpx2jRXj98ogcBClNORfVQwzL0I,143
|
|
27
|
+
okb/plugins/sources/dropbox_paper.py,sha256=YdNDkMEsaKz7pDfwfirwApg2v3Ub-GsbgWkSm9E4Yi0,6144
|
|
28
|
+
okb/plugins/sources/github.py,sha256=ozdTZPkU8h2-ZIx5o1FB58QBZ6P0eoVntluWL3vG87I,16309
|
|
29
|
+
okb/rescan.py,sha256=dVdQEkVUjsrtOKAGZc0LC2uwcnkjB8hn2SOVWHnY-R8,8396
|
|
30
|
+
okb/scripts/__init__.py,sha256=HPp8YCtIeo9XMOtOGCtntiwYr9eCxAJ1MF9Lo9WVzUA,53
|
|
31
|
+
okb/scripts/watch.py,sha256=b8oGPTN3flNdNQJETeqQ1RNZ8U1LiKvHntLwvHRIviA,6354
|
|
32
|
+
okb/tokens.py,sha256=JJ1C-mvtnT2O0cmjSu57PI9Nt53Sl9DqbmPuLnHlN6g,8043
|
|
33
|
+
okb-1.0.0.dist-info/METADATA,sha256=NVrVwX-U0Nj5qYks0w-UJNoDsiBPFth_J6oxzwftCBM,12002
|
|
34
|
+
okb-1.0.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
35
|
+
okb-1.0.0.dist-info/entry_points.txt,sha256=P2XvigL7DxPy8F5KvgRilX2G_rRh14bI9hyJpGgi80Y,180
|
|
36
|
+
okb-1.0.0.dist-info/RECORD,,
|