okb-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- okb/__init__.py +3 -0
- okb/cli.py +1272 -0
- okb/config.py +661 -0
- okb/data/init.sql +92 -0
- okb/http_server.py +463 -0
- okb/ingest.py +1589 -0
- okb/llm/__init__.py +86 -0
- okb/llm/base.py +83 -0
- okb/llm/cache.py +217 -0
- okb/llm/filter.py +187 -0
- okb/llm/providers.py +322 -0
- okb/local_embedder.py +87 -0
- okb/mcp_server.py +1393 -0
- okb/migrate.py +53 -0
- okb/migrations/0001.initial-schema.sql +91 -0
- okb/migrations/0002.sync-state.sql +22 -0
- okb/migrations/0003.structured-fields.sql +22 -0
- okb/migrations/0004.tokens.sql +13 -0
- okb/migrations/0005.database-metadata.sql +19 -0
- okb/migrations/0006.llm-cache.sql +13 -0
- okb/modal_embedder.py +120 -0
- okb/modal_llm.py +178 -0
- okb/plugins/__init__.py +8 -0
- okb/plugins/base.py +110 -0
- okb/plugins/registry.py +123 -0
- okb/plugins/sources/__init__.py +5 -0
- okb/plugins/sources/dropbox_paper.py +188 -0
- okb/plugins/sources/github.py +484 -0
- okb/rescan.py +227 -0
- okb/scripts/__init__.py +1 -0
- okb/scripts/watch.py +206 -0
- okb/tokens.py +277 -0
- okb-1.0.0.dist-info/METADATA +397 -0
- okb-1.0.0.dist-info/RECORD +36 -0
- okb-1.0.0.dist-info/WHEEL +4 -0
- okb-1.0.0.dist-info/entry_points.txt +9 -0
okb/cli.py
ADDED
@@ -0,0 +1,1272 @@
"""Command-line interface for Local Knowledge Base."""

from __future__ import annotations

import importlib.resources
import json
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

import click
import yaml

from .config import config, get_config_dir, get_config_path, get_default_config_yaml


@click.group()
@click.version_option(package_name="okb")
@click.option("--db", "database", default=None, help="Database to use")
@click.pass_context
def main(ctx, database):
    """Local Knowledge Base - semantic search for personal documents."""
    ctx.ensure_object(dict)
    ctx.obj["database"] = database
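
# Example invocations (illustrative sketch; the subcommands are defined below):
#   okb db start                 # bring up the pgvector container
#   okb ingest ~/notes --local   # ingest documents with local CPU embeddings
#   okb --db work rescan         # re-check documents in the "work" database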


# =============================================================================
# Database commands
# =============================================================================


@main.group()
@click.pass_context
def db(ctx):
    """Manage the pgvector database container."""
    pass


def _check_docker() -> bool:
    """Check if docker is available."""
    return shutil.which("docker") is not None


def _get_container_status() -> str | None:
    """Get the status of the lkb container. Returns None if not found."""
    try:
        result = subprocess.run(
            [
                "docker",
                "container",
                "inspect",
                "-f",
                "{{.State.Status}}",
                config.docker_container_name,
            ],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if result.returncode == 0:
            return result.stdout.strip()
        return None
    except subprocess.TimeoutExpired:
        return None


def _get_init_sql_path() -> Path:
    """Get the path to init.sql, extracting from package if needed."""
    # Try to access init.sql from package data
    try:
        ref = importlib.resources.files("okb.data").joinpath("init.sql")
        # If it's a real file path, return it directly
        with importlib.resources.as_file(ref) as path:
            return path
    except Exception:
        # Fallback: look relative to this file
        return Path(__file__).parent / "data" / "init.sql"


def _wait_for_db_ready(timeout: int = 30) -> bool:
    """Wait for database to be ready to accept connections."""
    import time

    click.echo("Waiting for database to be ready...", nl=False)
    for _ in range(timeout):
        try:
            result = subprocess.run(
                [
                    "docker",
                    "exec",
                    config.docker_container_name,
                    "pg_isready",
                    "-U",
                    "knowledge",
                    "-d",
                    "knowledge_base",
                ],
                capture_output=True,
                text=True,
                timeout=5,
            )
            if result.returncode == 0:
                click.echo(" ready.")
                return True
        except subprocess.TimeoutExpired:
            pass
        click.echo(".", nl=False)
        time.sleep(1)
    click.echo(" timeout!")
    return False


def _run_migrations_for_db(db_cfg):
    """Run pending migrations for a specific database."""
    from .migrate import get_pending, run_migrations

    try:
        pending = get_pending(db_cfg.url)
        if pending:
            click.echo(f" {db_cfg.name}: applying {len(pending)} migration(s)...")
            applied = run_migrations(db_cfg.url)
            for m in applied:
                click.echo(f" ✓ {m}")
        else:
            click.echo(f" {db_cfg.name}: up to date")
    except Exception as e:
        click.echo(f" {db_cfg.name}: error ({e})", err=True)


def _run_migrations_all():
    """Run pending migrations on all managed databases."""
    managed_dbs = [db for db in config.databases.values() if db.managed]
    if managed_dbs:
        click.echo("Running migrations...")
        for db_cfg in managed_dbs:
            _run_migrations_for_db(db_cfg)


def _ensure_databases_exist():
    """Create databases in PostgreSQL container if they don't exist."""
    import psycopg
    from psycopg import sql

    managed_dbs = [db for db in config.databases.values() if db.managed]
    if not managed_dbs:
        return

    # Connect to postgres database (admin db) to create others
    admin_url = (
        f"postgresql://knowledge:{config.docker_password}@localhost:{config.docker_port}/postgres"
    )

    try:
        with psycopg.connect(admin_url, autocommit=True) as conn:
            # Get existing databases
            result = conn.execute("SELECT datname FROM pg_database WHERE datistemplate = false")
            existing = {row[0] for row in result.fetchall()}

            for db_cfg in managed_dbs:
                db_name = db_cfg.database_name
                if db_name not in existing:
                    click.echo(f"Creating database: {db_name}")
                    conn.execute(sql.SQL("CREATE DATABASE {}").format(sql.Identifier(db_name)))

                    # Enable pgvector extension on the new database
                    new_db_url = (
                        f"postgresql://knowledge:{config.docker_password}@"
                        f"localhost:{config.docker_port}/{db_name}"
                    )
                    with psycopg.connect(new_db_url, autocommit=True) as new_conn:
                        new_conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
    except Exception as e:
        click.echo(f"Warning: Could not create databases: {e}", err=True)


@db.command()
def start():
    """Start the pgvector database container."""
    if not _check_docker():
        click.echo("Error: docker is not installed or not in PATH", err=True)
        sys.exit(1)

    status = _get_container_status()
    if status == "running":
        click.echo(f"Container '{config.docker_container_name}' is already running.")
        return

    if status == "exited":
        # Container exists but is stopped, start it
        click.echo(f"Starting existing container '{config.docker_container_name}'...")
        try:
            result = subprocess.run(
                ["docker", "start", config.docker_container_name],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except subprocess.TimeoutExpired:
            click.echo("Error: docker start timed out", err=True)
            sys.exit(1)
        if result.returncode != 0:
            click.echo(f"Error starting container: {result.stderr}", err=True)
            sys.exit(1)
        click.echo("Database started.")
        _wait_for_db_ready()
        _ensure_databases_exist()
        _run_migrations_all()
        return

    # Container doesn't exist, create it
    click.echo(f"Creating container '{config.docker_container_name}'...")

    # Get init.sql path - we need to handle the case where it's in a package
    init_sql = _get_init_sql_path()

    # If init.sql is inside a zip/egg, we need to extract it to a temp location
    if not init_sql.exists():
        ref = importlib.resources.files("okb.data").joinpath("init.sql")
        init_sql_content = ref.read_text()
        # Write to temp file
        temp_dir = Path(tempfile.gettempdir()) / "okb"
        temp_dir.mkdir(exist_ok=True)
        init_sql = temp_dir / "init.sql"
        init_sql.write_text(init_sql_content)

    cmd = [
        "docker",
        "run",
        "-d",
        "--name",
        config.docker_container_name,
        "-e",
        "POSTGRES_USER=knowledge",
        "-e",
        f"POSTGRES_PASSWORD={config.docker_password}",
        "-e",
        "POSTGRES_DB=knowledge_base",
        "-v",
        f"{config.docker_volume_name}:/var/lib/postgresql/data",
        "-v",
        f"{init_sql}:/docker-entrypoint-initdb.d/init.sql:ro",
        "-p",
        f"{config.docker_port}:5432",
        "--restart",
        "unless-stopped",
        "pgvector/pgvector:pg16",
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    except subprocess.TimeoutExpired:
        click.echo("Error: docker run timed out (may need to pull image manually)", err=True)
        sys.exit(1)
    if result.returncode != 0:
        click.echo(f"Error creating container: {result.stderr}", err=True)
        sys.exit(1)

    click.echo("Database started.")
    click.echo(f" Container: {config.docker_container_name}")
    click.echo(f" Port: {config.docker_port}")
    click.echo(f" Volume: {config.docker_volume_name}")
    _wait_for_db_ready()
    _ensure_databases_exist()
    _run_migrations_all()


@db.command()
def stop():
    """Stop the pgvector database container."""
    if not _check_docker():
        click.echo("Error: docker is not installed or not in PATH", err=True)
        sys.exit(1)

    status = _get_container_status()
    if status is None:
        click.echo(f"Container '{config.docker_container_name}' does not exist.")
        return

    if status != "running":
        click.echo(f"Container '{config.docker_container_name}' is not running (status: {status}).")
        return

    click.echo(f"Stopping container '{config.docker_container_name}'...")
    try:
        result = subprocess.run(
            ["docker", "stop", config.docker_container_name],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        click.echo("Error: docker stop timed out", err=True)
        sys.exit(1)
    if result.returncode != 0:
        click.echo(f"Error stopping container: {result.stderr}", err=True)
        sys.exit(1)

    click.echo("Database stopped.")


@db.command()
def status():
    """Show database container status."""
    if not _check_docker():
        click.echo("Error: docker is not installed or not in PATH", err=True)
        sys.exit(1)

    container_status = _get_container_status()
    if container_status is None:
        click.echo(f"Container '{config.docker_container_name}' does not exist.")
        click.echo("Run 'okb db start' to create it.")
        return

    click.echo(f"Container: {config.docker_container_name}")
    click.echo(f"Status: {container_status}")
    click.echo(f"Port: {config.docker_port}")
    click.echo(f"Volume: {config.docker_volume_name}")

    if container_status == "running":
        # Try to get more info
        try:
            result = subprocess.run(
                [
                    "docker",
                    "exec",
                    config.docker_container_name,
                    "pg_isready",
                    "-U",
                    "knowledge",
                    "-d",
                    "knowledge_base",
                ],
                capture_output=True,
                text=True,
                timeout=10,
            )
        except subprocess.TimeoutExpired:
            click.echo("Database: check timed out")
            return
        if result.returncode == 0:
            click.echo("Database: ready")
            # Show migration status
            try:
                from .migrate import get_applied, get_pending

                applied = get_applied(config.db_url)
                pending = get_pending(config.db_url)
                click.echo(f"Migrations: {len(applied)} applied, {len(pending)} pending")
                if pending:
                    click.echo(" Run 'okb db migrate' to apply pending migrations.")
            except Exception as e:
                click.echo(f"Migrations: error checking ({e})")
        else:
            click.echo("Database: not ready")


@db.command()
@click.argument("name", required=False)
def migrate(name):
    """Apply pending database migrations.

    If NAME is provided, migrate only that database.
    Otherwise, migrate all configured databases.

    Creates missing databases automatically for managed databases.
    """
    # Ensure managed databases exist before migrating
    _ensure_databases_exist()

    if name:
        # Migrate specific database
        try:
            db_cfg = config.get_database(name)
        except ValueError as e:
            click.echo(f"Error: {e}", err=True)
            sys.exit(1)
        _run_migrations_for_db(db_cfg)
    else:
        # Migrate all databases
        for db_cfg in config.databases.values():
            _run_migrations_for_db(db_cfg)
    click.echo("Done.")


@db.command()
def destroy():
    """Remove the database container and volume (destructive!)."""
    if not _check_docker():
        click.echo("Error: docker is not installed or not in PATH", err=True)
        sys.exit(1)

    if not click.confirm(
        f"This will delete container '{config.docker_container_name}' and volume "
        f"'{config.docker_volume_name}'. All data will be lost. Continue?"
    ):
        return

    # Stop and remove container
    subprocess.run(
        ["docker", "rm", "-f", config.docker_container_name],
        capture_output=True,
        timeout=30,
    )
    click.echo(f"Removed container '{config.docker_container_name}'.")

    # Remove volume
    subprocess.run(
        ["docker", "volume", "rm", config.docker_volume_name],
        capture_output=True,
        timeout=30,
    )
    click.echo(f"Removed volume '{config.docker_volume_name}'.")


@db.command("list")
def db_list():
    """List all configured databases."""
    click.echo("Configured databases:")
    for name, db_cfg in config.databases.items():
        markers = []
        if db_cfg.default:
            markers.append("default")
        markers.append("managed" if db_cfg.managed else "external")
        click.echo(f" {name} [{', '.join(markers)}]")
        click.echo(f" URL: {db_cfg.url}")


# =============================================================================
# Config commands
# =============================================================================


@main.group("config")
def config_cmd():
    """Manage configuration."""
    pass


@config_cmd.command("init")
@click.option("--force", is_flag=True, help="Overwrite existing config file")
def config_init(force: bool):
    """Create default config file at ~/.config/okb/config.yaml."""
    config_path = get_config_path()

    if config_path.exists() and not force:
        click.echo(f"Config file already exists at {config_path}")
        click.echo("Use --force to overwrite.")
        return

    config_dir = get_config_dir()
    config_dir.mkdir(parents=True, exist_ok=True)

    config_path.write_text(get_default_config_yaml())
    click.echo(f"Created config file at {config_path}")


@config_cmd.command("show")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
def config_show(as_json: bool):
    """Show current configuration."""
    config_path = get_config_path()

    if as_json:
        click.echo(json.dumps(config.to_dict(), indent=2))
    else:
        click.echo(f"Config file: {config_path}")
        click.echo(f" Exists: {config_path.exists()}")
        click.echo("")
        click.echo(yaml.dump(config.to_dict(), default_flow_style=False, sort_keys=False))


@config_cmd.command("path")
def config_path_cmd():
    """Print the config file path."""
    click.echo(get_config_path())


# =============================================================================
# Ingest command
# =============================================================================


@main.command()
@click.argument("paths", nargs=-1, required=True)
@click.option("--metadata", "-m", default="{}", help="JSON metadata to attach")
@click.option("--local", is_flag=True, help="Use local CPU embedding instead of Modal")
@click.option("--db", "database", default=None, help="Database to ingest into")
@click.pass_context
def ingest(ctx, paths: tuple[str, ...], metadata: str, local: bool, database: str | None):
    """Ingest documents or URLs into the knowledge base."""
    import json as json_module
    from pathlib import Path

    from .ingest import (
        Ingester,
        check_file_skip,
        collect_documents,
        is_text_file,
        is_url,
        parse_document,
        parse_url,
    )

    try:
        extra_metadata = json_module.loads(metadata)
    except json_module.JSONDecodeError as e:
        click.echo(f"Error parsing metadata JSON: {e}", err=True)
        sys.exit(1)

    # Get database URL from --db option or context
    db_name = database or ctx.obj.get("database")
    db_cfg = config.get_database(db_name)
    ingester = Ingester(db_cfg.url, use_modal=not local)

    documents = []
    for path_str in paths:
        # Check if it's a URL first
        if is_url(path_str):
            click.echo(f"Fetching: {path_str}")
            try:
                documents.append(parse_url(path_str, extra_metadata))
            except Exception as e:
                click.echo(f"Error fetching URL: {e}", err=True)
            continue

        path = Path(path_str).resolve()
        if path.is_dir():
            documents.extend(collect_documents(path, extra_metadata))
        elif path.is_file():
            # Check security patterns first
            skip_check = check_file_skip(path)
            if skip_check.should_skip:
                prefix = "BLOCKED" if skip_check.is_security else "Skipping"
                click.echo(f"{prefix}: {path} ({skip_check.reason})", err=True)
                continue

            # For explicitly provided files, try to parse even with unknown extension
            # Always allow .pdf and .docx even if not in config (user may have old config)
            if path.suffix in config.all_extensions or path.suffix in (".pdf", ".docx"):
                try:
                    documents.extend(parse_document(path, extra_metadata))
                except ValueError as e:
                    click.echo(f"Skipping: {e}", err=True)
                    continue
            elif is_text_file(path):
                # Unknown extension but appears to be text - parse as code/config
                click.echo(f"Parsing as text: {path}")
                documents.extend(parse_document(path, extra_metadata, force=True))
            else:
                click.echo(f"Skipping binary file: {path}", err=True)
        else:
            click.echo(f"Not found: {path_str}", err=True)

    if not documents:
        click.echo("No documents found to ingest.")
        return

    click.echo(f"Found {len(documents)} documents to process")
    ingester.ingest_documents(documents)
    click.echo("Done!")


# =============================================================================
# Rescan command
# =============================================================================


@main.command()
@click.option("--db", "database", default=None, help="Database to rescan")
@click.option("--local", is_flag=True, help="Use local CPU embedding instead of Modal")
@click.option("--dry-run", is_flag=True, help="Show changes without executing")
@click.option("--delete", "delete_missing", is_flag=True, help="Remove documents for missing files")
@click.pass_context
def rescan(ctx, database: str | None, local: bool, dry_run: bool, delete_missing: bool):
    """Check indexed documents for freshness and re-ingest changed ones.

    Compares stored file modification times against actual file mtimes.
    Files that have changed are deleted and re-ingested. Missing files
    are reported (use --delete to remove them from the index).

    Examples:

        okb rescan            # Rescan default database

        okb rescan --dry-run  # Show what would change

        okb rescan --delete   # Also remove missing files

        okb rescan --db work  # Rescan specific database
    """
    from .rescan import Rescanner

    # Get database URL from --db option or context
    db_name = database or ctx.obj.get("database")
    db_cfg = config.get_database(db_name)

    click.echo(f"Scanning database '{db_cfg.name}'...")
    if dry_run:
        click.echo("(dry run - no changes will be made)")

    rescanner = Rescanner(db_cfg.url, use_modal=not local)
    result = rescanner.rescan(dry_run=dry_run, delete_missing=delete_missing, verbose=True)

    # Print summary
    click.echo("")
    summary_parts = []
    if result.updated:
        summary_parts.append(f"{len(result.updated)} updated")
    if result.deleted:
        summary_parts.append(f"{len(result.deleted)} deleted")
    if result.missing:
        summary_parts.append(f"{len(result.missing)} missing")
    summary_parts.append(f"{result.unchanged} unchanged")

    if result.errors:
        summary_parts.append(f"{len(result.errors)} errors")

    click.echo(f"Summary: {', '.join(summary_parts)}")

    if result.missing and not delete_missing:
        click.echo("Use --delete to remove missing files from the index.")


# =============================================================================
# Serve command
# =============================================================================


@main.command()
@click.option("--db", "database", default=None, help="Database to serve")
@click.option("--http", "use_http", is_flag=True, help="Use HTTP transport instead of stdio")
@click.option("--host", default=None, help="HTTP server host (default: 127.0.0.1)")
@click.option("--port", type=int, default=None, help="HTTP server port (default: 8080)")
@click.pass_context
def serve(ctx, database: str | None, use_http: bool, host: str | None, port: int | None):
    """Start the MCP server for Claude Code integration.

    By default, runs in stdio mode for direct Claude Code integration.
    Use --http to run as an HTTP server with token authentication.
    """
    import asyncio

    if use_http:
        from .http_server import run_http_server

        http_host = host or config.http_host
        http_port = port or config.http_port
        run_http_server(host=http_host, port=http_port)
    else:
        from .mcp_server import main as mcp_main

        # Get database URL from --db option or context
        db_name = database or ctx.obj.get("database")
        db_cfg = config.get_database(db_name)
        asyncio.run(mcp_main(db_cfg.url))
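
# Example invocations (illustrative):
#   okb serve                      # stdio MCP server for Claude Code
#   okb serve --http --port 8080   # HTTP transport with token auth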


# =============================================================================
# Watch command
# =============================================================================


@main.command()
@click.argument("paths", nargs=-1, required=True, type=click.Path(exists=True))
@click.option("--metadata", "-m", default="{}", help="JSON metadata to attach")
@click.option("--local", is_flag=True, help="Use local CPU embedding instead of Modal")
@click.option("--db", "database", default=None, help="Database to watch for")
@click.pass_context
def watch(ctx, paths: tuple[str, ...], metadata: str, local: bool, database: str | None):
    """Watch directories for changes and auto-ingest."""
    from .scripts.watch import main as watch_main

    # Get database URL from --db option or context
    db_name = database or ctx.obj.get("database")
    db_cfg = config.get_database(db_name)

    # Convert to the format watch.py expects
    sys.argv = ["okb-watch"] + list(paths)
    sys.argv.extend(["--db-url", db_cfg.url])
    if metadata != "{}":
        sys.argv.extend(["--metadata", metadata])
    if local:
        sys.argv.append("--local")

    watch_main()
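
# Example invocation (illustrative): okb watch ~/notes --local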


# =============================================================================
# Modal commands
# =============================================================================


@main.group()
def modal():
    """Manage Modal GPU embedder."""
    pass


@modal.command()
def deploy():
    """Deploy embedder to Modal."""
    if not shutil.which("modal"):
        click.echo("Error: modal CLI is not installed.", err=True)
        click.echo("Install with: pip install modal", err=True)
        sys.exit(1)

    # Find modal_embedder.py in the package
    embedder_path = Path(__file__).parent / "modal_embedder.py"
    if not embedder_path.exists():
        click.echo(f"Error: modal_embedder.py not found at {embedder_path}", err=True)
        sys.exit(1)

    click.echo(f"Deploying {embedder_path} to Modal...")
    result = subprocess.run(
        ["modal", "deploy", str(embedder_path)],
        cwd=embedder_path.parent,
    )
    sys.exit(result.returncode)


# =============================================================================
# Sync commands (plugin system)
# =============================================================================


@main.group()
def sync():
    """Sync data from external API sources (plugins)."""
    pass


def _get_sync_state(conn, source_name: str, db_name: str):
    """Get sync state from database."""
    from .plugins.base import SyncState

    result = conn.execute(
        """SELECT last_sync, cursor, extra FROM sync_state
           WHERE source_name = %s AND database_name = %s""",
        (source_name, db_name),
    ).fetchone()

    if result:
        return SyncState(
            last_sync=result["last_sync"],
            cursor=result["cursor"],
            extra=result["extra"] or {},
        )
    return None


def _save_sync_state(conn, source_name: str, db_name: str, state):
    """Save sync state to database."""
    conn.execute(
        """INSERT INTO sync_state (source_name, database_name, last_sync, cursor, extra, updated_at)
           VALUES (%s, %s, %s, %s, %s, NOW())
           ON CONFLICT (source_name, database_name)
           DO UPDATE SET last_sync = EXCLUDED.last_sync,
                         cursor = EXCLUDED.cursor,
                         extra = EXCLUDED.extra,
                         updated_at = NOW()""",
        (source_name, db_name, state.last_sync, state.cursor, json.dumps(state.extra)),
    )
    conn.commit()
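
# The two helpers above assume the sync_state table (presumably created by
# migrations/0002.sync-state.sql): one row per (source_name, database_name)
# pair, holding last_sync, an opaque pagination cursor, and a JSON extra blob.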


def _apply_llm_filter(documents: list, filter_cfg: dict, source_name: str) -> list:
    """Apply LLM filtering to documents.

    Args:
        documents: List of documents to filter
        filter_cfg: Filter configuration with 'prompt' and 'action_on_skip'
        source_name: Name of the source (for logging)

    Returns:
        Filtered list of documents
    """
    from .llm import FilterAction, filter_document

    custom_prompt = filter_cfg.get("prompt")
    action_on_skip = filter_cfg.get("action_on_skip", "discard")

    filtered = []
    skipped = 0
    review = 0

    for doc in documents:
        result = filter_document(doc, custom_prompt=custom_prompt)

        if result.action == FilterAction.SKIP:
            skipped += 1
            if action_on_skip == "archive":
                # Store without embedding (future: add flag to document)
                pass
            # Otherwise discard
            continue
        elif result.action == FilterAction.REVIEW:
            review += 1
            # Still ingest, but could flag for review (future: add metadata)

        filtered.append(doc)

    if skipped or review:
        click.echo(f" Filter: {len(filtered)} ingested, {skipped} skipped, {review} for review")

    return filtered
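
# Sketch of the filter_cfg shape this function reads (keys inferred from the
# lookups above; the prompt text is a hypothetical example):
#   {"enabled": True,
#    "prompt": "Keep only engineering notes",
#    "action_on_skip": "discard"}  # or "archive"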


@sync.command("run")
@click.argument("sources", nargs=-1)
@click.option("--all", "sync_all", is_flag=True, help="Sync all enabled sources")
@click.option("--full", is_flag=True, help="Ignore incremental state, do full sync")
@click.option("--local", is_flag=True, help="Use local CPU embedding instead of Modal")
@click.option("--db", "database", default=None, help="Database to sync into")
@click.option("--folder", multiple=True, help="Filter to specific folder path (can repeat)")
@click.option("--doc", "doc_ids", multiple=True, help="Sync specific document ID (can repeat)")
# GitHub-specific options
@click.option("--repo", multiple=True, help="GitHub repo to sync (owner/repo, can repeat)")
@click.option(
    "--source", "include_source", is_flag=True, help="Sync all source files (not just README+docs)"
)
@click.option("--issues", "include_issues", is_flag=True, help="Include GitHub issues")
@click.option("--prs", "include_prs", is_flag=True, help="Include GitHub pull requests")
@click.option("--wiki", "include_wiki", is_flag=True, help="Include GitHub wiki pages")
@click.pass_context
def sync_run(
    ctx,
    sources: tuple[str, ...],
    sync_all: bool,
    full: bool,
    local: bool,
    database: str | None,
    folder: tuple[str, ...],
    doc_ids: tuple[str, ...],
    repo: tuple[str, ...],
    include_source: bool,
    include_issues: bool,
    include_prs: bool,
    include_wiki: bool,
):
    """Sync from API sources.

    Example: okb sync run github --repo owner/repo
    """
    import psycopg
    from psycopg.rows import dict_row

    from .ingest import Ingester
    from .plugins.registry import PluginRegistry

    # Get database
    db_name = database or ctx.obj.get("database")
    db_cfg = config.get_database(db_name)

    # Determine which sources to sync
    if sync_all:
        source_names = config.list_enabled_sources()
    elif sources:
        source_names = list(sources)
    else:
        click.echo("Error: Specify sources to sync or use --all", err=True)
        click.echo("Available sources: ", nl=False)
        click.echo(", ".join(PluginRegistry.list_sources()) or "(none installed)")
        sys.exit(1)

    if not source_names:
        click.echo("No sources to sync.")
        return

    ingester = Ingester(db_cfg.url, use_modal=not local)

    with psycopg.connect(db_cfg.url, row_factory=dict_row) as conn:
        for source_name in source_names:
            # Get the plugin
            source = PluginRegistry.get_source(source_name)
            if source is None:
                click.echo(f"Error: Source '{source_name}' not found.", err=True)
                click.echo(f"Installed sources: {', '.join(PluginRegistry.list_sources())}")
                continue

            # Get and resolve config
            source_cfg = config.get_source_config(source_name)
            if source_cfg is None:
                click.echo(f"Skipping '{source_name}': not configured or disabled", err=True)
                continue

            # Merge CLI options into config (override config file values)
            if folder:
                source_cfg["folders"] = list(folder)
            if doc_ids:
                source_cfg["doc_ids"] = list(doc_ids)
            # GitHub-specific options
            if repo:
                source_cfg["repos"] = list(repo)
            if include_source:
                source_cfg["include_source"] = True
            if include_issues:
                source_cfg["include_issues"] = True
            if include_prs:
                source_cfg["include_prs"] = True
            if include_wiki:
                source_cfg["include_wiki"] = True

            try:
                source.configure(source_cfg)
            except Exception as e:
                click.echo(f"Error configuring '{source_name}': {e}", err=True)
                continue

            # Get sync state (unless --full)
            state = None if full else _get_sync_state(conn, source_name, db_cfg.name)

            click.echo(f"Syncing {source_name}..." + (" (full)" if full else ""))

            try:
                documents, new_state = source.fetch(state)
            except Exception as e:
                click.echo(f"Error fetching from '{source_name}': {e}", err=True)
                continue

            if documents:
                click.echo(f" Fetched {len(documents)} documents")

                # Apply LLM filtering if configured
                llm_filter_cfg = source_cfg.get("llm_filter", {})
                if llm_filter_cfg.get("enabled"):
                    documents = _apply_llm_filter(
                        documents,
                        llm_filter_cfg,
                        source_name,
                    )

                if documents:
                    ingester.ingest_documents(documents)
                else:
                    click.echo(" All documents filtered out")
            else:
                click.echo(" No new documents")

            # Save state
            _save_sync_state(conn, source_name, db_cfg.name, new_state)

    click.echo("Done!")


@sync.command("list")
def sync_list():
    """List available API sources."""
    from .plugins.registry import PluginRegistry

    installed = PluginRegistry.list_sources()
    configured = config.list_enabled_sources()

    click.echo("Installed sources:")
    if installed:
        for name in installed:
            status = "configured" if name in configured else "not configured"
            click.echo(f" {name} [{status}]")
    else:
        click.echo(" (none)")

    # Show configured but not installed
    not_installed = set(configured) - set(installed)
    if not_installed:
        click.echo("\nConfigured but not installed:")
        for name in not_installed:
            click.echo(f" {name}")


@sync.command("status")
@click.argument("source", required=False)
@click.option("--db", "database", default=None, help="Database to check")
@click.pass_context
def sync_status(ctx, source: str | None, database: str | None):
    """Show sync status and last sync times."""
    import psycopg
    from psycopg.rows import dict_row

    db_name = database or ctx.obj.get("database")
    db_cfg = config.get_database(db_name)

    with psycopg.connect(db_cfg.url, row_factory=dict_row) as conn:
        if source:
            # Show status for specific source
            result = conn.execute(
                """SELECT source_name, last_sync, cursor, extra, updated_at
                   FROM sync_state
                   WHERE source_name = %s AND database_name = %s""",
                (source, db_cfg.name),
            ).fetchone()

            if result:
                click.echo(f"Source: {result['source_name']}")
                click.echo(f" Last sync: {result['last_sync'] or 'never'}")
                click.echo(f" Updated: {result['updated_at']}")
                if result["cursor"]:
                    click.echo(f" Cursor: {result['cursor'][:50]}...")
            else:
                click.echo(f"No sync history for '{source}'")

            # Show document count
            doc_count = conn.execute(
                """SELECT COUNT(*) as count FROM documents
                   WHERE metadata->>'sync_source' = %s""",
                (source,),
            ).fetchone()
            click.echo(f" Documents: {doc_count['count']}")
        else:
            # Show all sync states
            results = conn.execute(
                """SELECT source_name, last_sync, updated_at
                   FROM sync_state
                   WHERE database_name = %s
                   ORDER BY updated_at DESC""",
                (db_cfg.name,),
            ).fetchall()

            if results:
                click.echo(f"Sync status for database '{db_cfg.name}':")
                for row in results:
                    if row["last_sync"]:
                        last = row["last_sync"].strftime("%Y-%m-%d %H:%M")
                    else:
                        last = "never"
                    click.echo(f" {row['source_name']}: {last}")
            else:
                click.echo("No sync history")


# =============================================================================
# Token commands
# =============================================================================


@main.group()
def token():
    """Manage API tokens for HTTP access."""
    pass


@token.command("create")
@click.option("--db", "database", default=None, help="Database to create token for")
@click.option("--ro", "read_only", is_flag=True, help="Create read-only token (default: rw)")
@click.option("-d", "--description", default=None, help="Token description")
@click.pass_context
def token_create(ctx, database: str | None, read_only: bool, description: str | None):
    """Create a new API token."""
    from .tokens import create_token

    db_name = database or ctx.obj.get("database")
    db_cfg = config.get_database(db_name)
    permissions = "ro" if read_only else "rw"

    try:
        token = create_token(db_cfg.url, db_cfg.name, permissions, description)
        click.echo(f"Token created for database '{db_cfg.name}' ({permissions}):")
        click.echo(f" {token}")
        click.echo("")
        click.echo("Save this token - it cannot be retrieved later.")
    except Exception as e:
        click.echo(f"Error creating token: {e}", err=True)
        sys.exit(1)
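
# Example invocation (illustrative): okb token create --ro -d "read-only laptop token"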


@token.command("list")
@click.option("--db", "database", default=None, help="Database to list tokens for")
@click.pass_context
def token_list(ctx, database: str | None):
    """List all tokens for a database."""
    from .tokens import list_tokens

    db_name = database or ctx.obj.get("database")
    db_cfg = config.get_database(db_name)

    try:
        tokens = list_tokens(db_cfg.url)
        if not tokens:
            click.echo(f"No tokens found for database '{db_cfg.name}'")
            return

        click.echo(f"Tokens for database '{db_cfg.name}':")
        for t in tokens:
            desc = f" - {t.description}" if t.description else ""
            last_used = t.last_used_at.strftime("%Y-%m-%d %H:%M") if t.last_used_at else "never"
            click.echo(f" [{t.permissions}] {t.token_hash[:12]}...{desc}")
            created = t.created_at.strftime("%Y-%m-%d %H:%M")
            click.echo(f" Created: {created}, Last used: {last_used}")
    except Exception as e:
        click.echo(f"Error listing tokens: {e}", err=True)
        sys.exit(1)


@token.command("revoke")
@click.argument("token_value")
@click.option("--db", "database", default=None, help="Database to revoke token from")
@click.pass_context
def token_revoke(ctx, token_value: str, database: str | None):
    """Revoke (delete) an API token.

    TOKEN_VALUE must be the full token string.
    """
    from .tokens import delete_token

    db_name = database or ctx.obj.get("database")
    db_cfg = config.get_database(db_name)

    try:
        deleted = delete_token(db_cfg.url, token_value)
        if deleted:
            click.echo("Token revoked.")
        else:
            click.echo("Token not found. Make sure you're using the full token string.", err=True)
            sys.exit(1)
    except Exception as e:
        click.echo(f"Error revoking token: {e}", err=True)
        sys.exit(1)


# =============================================================================
# LLM commands
# =============================================================================


@main.group()
def llm():
    """Manage LLM integration for document classification."""
    pass


@llm.command("status")
@click.option("--db", "database", default=None, help="Database to check cache for")
@click.pass_context
def llm_status(ctx, database: str | None):
    """Show LLM configuration and connectivity status.

    Displays current provider settings, tests connectivity,
    and shows cache statistics.
    """
    import os

    click.echo("LLM Configuration")
    click.echo("-" * 40)

    # Show config
    click.echo(f"Provider: {config.llm_provider or '(disabled)'}")
    if config.llm_provider:
        click.echo(f"Model: {config.llm_model}")
        click.echo(f"Timeout: {config.llm_timeout}s")
        click.echo(f"Cache responses: {config.llm_cache_responses}")

        if config.llm_provider == "modal":
            click.echo("Backend: Modal GPU (deploy with: okb llm deploy)")
        elif config.llm_use_bedrock:
            click.echo(f"Backend: AWS Bedrock (region: {config.llm_aws_region})")
        else:
            api_key_set = bool(os.environ.get("ANTHROPIC_API_KEY"))
            click.echo(f"API key set: {'yes' if api_key_set else 'no (set ANTHROPIC_API_KEY)'}")

    click.echo("")

    # Test connectivity if provider is configured
    if config.llm_provider:
        click.echo("Connectivity Test")
        click.echo("-" * 40)
        try:
            from .llm.providers import get_provider

            provider = get_provider()
            if provider is None:
                click.echo("Status: provider initialization failed")
            elif provider.is_available():
                click.echo("Status: available")
                # List models
                if hasattr(provider, "list_models"):
                    models = provider.list_models()
                    click.echo(f"Available models: {', '.join(models[:3])}...")
            else:
                click.echo("Status: not available (check API key or credentials)")
        except ImportError:
            click.echo("Status: missing dependencies")
            click.echo(" Install with: pip install 'okb[llm]'")
        except Exception as e:
            click.echo(f"Status: error - {e}")

    # Show cache stats if database is available
    click.echo("")
    click.echo("Cache Statistics")
    click.echo("-" * 40)
    try:
        db_name = database or ctx.obj.get("database")
        db_cfg = config.get_database(db_name)

        from .llm.cache import get_cache_stats

        stats = get_cache_stats(db_cfg.url)
        click.echo(f"Total cached responses: {stats['total_entries']}")
        if stats["by_provider"]:
            for entry in stats["by_provider"]:
                click.echo(f" {entry['provider']}/{entry['model']}: {entry['count']}")
        if stats["oldest_entry"]:
            click.echo(f"Oldest entry: {stats['oldest_entry']}")
    except Exception as e:
        click.echo(f"Cache unavailable: {e}")


@llm.command("clear-cache")
@click.option("--db", "database", default=None, help="Database to clear cache for")
@click.option(
    "--older-than", "days", type=int, default=None, help="Only clear entries older than N days"
)
@click.option("--yes", is_flag=True, help="Skip confirmation")
@click.pass_context
def llm_clear_cache(ctx, database: str | None, days: int | None, yes: bool):
    """Clear the LLM response cache."""
    from datetime import UTC, datetime, timedelta

    db_name = database or ctx.obj.get("database")
    db_cfg = config.get_database(db_name)

    if days:
        older_than = datetime.now(UTC) - timedelta(days=days)
        msg = f"Clear cache entries older than {days} days?"
    else:
        older_than = None
        msg = "Clear ALL cache entries?"

    if not yes:
        if not click.confirm(msg):
            click.echo("Cancelled.")
            return

    from .llm.cache import clear_cache

    deleted = clear_cache(older_than=older_than, db_url=db_cfg.url)
    click.echo(f"Deleted {deleted} cache entries.")


@llm.command("deploy")
def llm_deploy():
    """Deploy the Modal LLM app for open model inference.

    This deploys a GPU-accelerated LLM service on Modal using Llama 3.2.
    Required when your config sets provider: modal.

    Requires Modal CLI to be installed and authenticated:
        pip install modal
        modal token new
    """
    if not shutil.which("modal"):
        click.echo("Error: modal CLI is not installed.", err=True)
        click.echo("Install with: pip install modal", err=True)
        click.echo("Then authenticate: modal token new", err=True)
        sys.exit(1)

    # Find modal_llm.py in the package
    llm_path = Path(__file__).parent / "modal_llm.py"
    if not llm_path.exists():
        click.echo(f"Error: modal_llm.py not found at {llm_path}", err=True)
        sys.exit(1)

    click.echo(f"Deploying {llm_path} to Modal...")
    click.echo("Note: First deploy downloads the model (~2GB) and may take a few minutes.")
    result = subprocess.run(
        ["modal", "deploy", str(llm_path)],
        cwd=llm_path.parent,
    )
    sys.exit(result.returncode)


if __name__ == "__main__":
    main()