okb-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
okb/cli.py ADDED
@@ -0,0 +1,1272 @@
+ """Command-line interface for Local Knowledge Base."""
+
+ from __future__ import annotations
+
+ import importlib.resources
+ import json
+ import shutil
+ import subprocess
+ import sys
+ import tempfile
+ from pathlib import Path
+
+ import click
+ import yaml
+
+ from .config import config, get_config_dir, get_config_path, get_default_config_yaml
+
+
+ @click.group()
+ @click.version_option(package_name="okb")
+ @click.option("--db", "database", default=None, help="Database to use")
+ @click.pass_context
+ def main(ctx, database):
+     """Local Knowledge Base - semantic search for personal documents."""
+     ctx.ensure_object(dict)
+     ctx.obj["database"] = database
+
+
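A note on the pattern above: the group-level --db value is stashed on the Click context so every subcommand can fall back to it when its own --db flag is absent. A minimal, self-contained sketch of that hand-off (the names here are illustrative, not okb's):

import click

@click.group()
@click.option("--db", "database", default=None)
@click.pass_context
def cli(ctx, database):
    ctx.ensure_object(dict)          # guarantee ctx.obj is a dict before storing on it
    ctx.obj["database"] = database   # group-level value, visible to all subcommands

@cli.command()
@click.option("--db", "database", default=None)
@click.pass_context
def which(ctx, database):
    # The subcommand's own flag wins; otherwise inherit the group-level --db.
    click.echo(database or ctx.obj.get("database") or "(default)")

if __name__ == "__main__":
    cli()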
+ # =============================================================================
+ # Database commands
+ # =============================================================================
+
+
+ @main.group()
+ @click.pass_context
+ def db(ctx):
+     """Manage the pgvector database container."""
+     pass
+
+
+ def _check_docker() -> bool:
+     """Check if docker is available."""
+     return shutil.which("docker") is not None
+
+
+ def _get_container_status() -> str | None:
+     """Get the status of the database container. Returns None if not found."""
+     try:
+         result = subprocess.run(
+             [
+                 "docker",
+                 "container",
+                 "inspect",
+                 "-f",
+                 "{{.State.Status}}",
+                 config.docker_container_name,
+             ],
+             capture_output=True,
+             text=True,
+             timeout=10,
+         )
+         if result.returncode == 0:
+             return result.stdout.strip()
+         return None
+     except subprocess.TimeoutExpired:
+         return None
+
+
+ def _get_init_sql_path() -> Path:
+     """Get the path to init.sql, extracting from package if needed."""
+     # Try to access init.sql from package data
+     try:
+         ref = importlib.resources.files("okb.data").joinpath("init.sql")
+         # NOTE: if as_file() extracted the resource to a temporary location, the
+         # path may vanish once the context exits; start() re-checks exists() and
+         # re-extracts to a stable temp dir if so.
+         with importlib.resources.as_file(ref) as path:
+             return path
+     except Exception:
+         # Fallback: look relative to this file
+         return Path(__file__).parent / "data" / "init.sql"
+
+
+ def _wait_for_db_ready(timeout: int = 30) -> bool:
+     """Wait for database to be ready to accept connections."""
+     import time
+
+     click.echo("Waiting for database to be ready...", nl=False)
+     for _ in range(timeout):
+         try:
+             result = subprocess.run(
+                 [
+                     "docker",
+                     "exec",
+                     config.docker_container_name,
+                     "pg_isready",
+                     "-U",
+                     "knowledge",
+                     "-d",
+                     "knowledge_base",
+                 ],
+                 capture_output=True,
+                 text=True,
+                 timeout=5,
+             )
+             if result.returncode == 0:
+                 click.echo(" ready.")
+                 return True
+         except subprocess.TimeoutExpired:
+             pass
+         click.echo(".", nl=False)
+         time.sleep(1)
+     click.echo(" timeout!")
+     return False
+
+
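_wait_for_db_ready probes through `docker exec pg_isready`, which requires local Docker access. An equivalent host-side probe, sketched below under the assumption that the mapped port is reachable directly, is to retry a short-timeout psycopg connection (the URL is illustrative):

import time

import psycopg

def wait_for_db(url: str, timeout: int = 30) -> bool:
    # Retry a real connection once per second until the server accepts it.
    for _ in range(timeout):
        try:
            with psycopg.connect(url, connect_timeout=3):
                return True
        except psycopg.OperationalError:
            time.sleep(1)
    return False

# wait_for_db("postgresql://knowledge:secret@localhost:5433/knowledge_base")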
+ def _run_migrations_for_db(db_cfg):
+     """Run pending migrations for a specific database."""
+     from .migrate import get_pending, run_migrations
+
+     try:
+         pending = get_pending(db_cfg.url)
+         if pending:
+             click.echo(f" {db_cfg.name}: applying {len(pending)} migration(s)...")
+             applied = run_migrations(db_cfg.url)
+             for m in applied:
+                 click.echo(f" ✓ {m}")
+         else:
+             click.echo(f" {db_cfg.name}: up to date")
+     except Exception as e:
+         click.echo(f" {db_cfg.name}: error ({e})", err=True)
+
+
+ def _run_migrations_all():
+     """Run pending migrations on all managed databases."""
+     managed_dbs = [db for db in config.databases.values() if db.managed]
+     if managed_dbs:
+         click.echo("Running migrations...")
+         for db_cfg in managed_dbs:
+             _run_migrations_for_db(db_cfg)
+
+
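These two helpers lean on a small contract from okb.migrate. The signatures below are inferred from how the functions are called in this file, so treat them as assumptions rather than documented API:

from okb.migrate import get_applied, get_pending, run_migrations

url = "postgresql://knowledge:secret@localhost:5433/knowledge_base"  # illustrative
pending = get_pending(url)      # names of migrations not yet applied
applied = run_migrations(url)   # applies them, returns the applied names in order
history = get_applied(url)      # full history, used later by 'okb db status'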
+ def _ensure_databases_exist():
+     """Create databases in PostgreSQL container if they don't exist."""
+     import psycopg
+     from psycopg import sql
+
+     managed_dbs = [db for db in config.databases.values() if db.managed]
+     if not managed_dbs:
+         return
+
+     # Connect to postgres database (admin db) to create others
+     admin_url = (
+         f"postgresql://knowledge:{config.docker_password}@localhost:{config.docker_port}/postgres"
+     )
+
+     try:
+         with psycopg.connect(admin_url, autocommit=True) as conn:
+             # Get existing databases
+             result = conn.execute("SELECT datname FROM pg_database WHERE datistemplate = false")
+             existing = {row[0] for row in result.fetchall()}
+
+             for db_cfg in managed_dbs:
+                 db_name = db_cfg.database_name
+                 if db_name not in existing:
+                     click.echo(f"Creating database: {db_name}")
+                     conn.execute(sql.SQL("CREATE DATABASE {}").format(sql.Identifier(db_name)))
+
+                     # Enable pgvector extension on the new database
+                     new_db_url = (
+                         f"postgresql://knowledge:{config.docker_password}@"
+                         f"localhost:{config.docker_port}/{db_name}"
+                     )
+                     with psycopg.connect(new_db_url, autocommit=True) as new_conn:
+                         new_conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
+     except Exception as e:
+         click.echo(f"Warning: Could not create databases: {e}", err=True)
+
+
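Two details in _ensure_databases_exist are easy to miss: CREATE DATABASE is refused inside a transaction block, hence autocommit=True, and a database name cannot be sent as a query parameter, hence psycopg's sql.Identifier for safe quoting. A minimal sketch of the same pattern:

import psycopg
from psycopg import sql

def create_database(admin_url: str, name: str) -> None:
    # autocommit=True because CREATE DATABASE cannot run inside a transaction.
    with psycopg.connect(admin_url, autocommit=True) as conn:
        # sql.Identifier double-quotes the name and escapes embedded quotes,
        # so names like my-db or MixedCase are composed safely.
        conn.execute(sql.SQL("CREATE DATABASE {}").format(sql.Identifier(name)))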
+ @db.command()
+ def start():
+     """Start the pgvector database container."""
+     if not _check_docker():
+         click.echo("Error: docker is not installed or not in PATH", err=True)
+         sys.exit(1)
+
+     status = _get_container_status()
+     if status == "running":
+         click.echo(f"Container '{config.docker_container_name}' is already running.")
+         return
+
+     if status == "exited":
+         # Container exists but is stopped, start it
+         click.echo(f"Starting existing container '{config.docker_container_name}'...")
+         try:
+             result = subprocess.run(
+                 ["docker", "start", config.docker_container_name],
+                 capture_output=True,
+                 text=True,
+                 timeout=30,
+             )
+         except subprocess.TimeoutExpired:
+             click.echo("Error: docker start timed out", err=True)
+             sys.exit(1)
+         if result.returncode != 0:
+             click.echo(f"Error starting container: {result.stderr}", err=True)
+             sys.exit(1)
+         click.echo("Database started.")
+         _wait_for_db_ready()
+         _ensure_databases_exist()
+         _run_migrations_all()
+         return
+
+     # Container doesn't exist, create it
+     click.echo(f"Creating container '{config.docker_container_name}'...")
+
+     # Get init.sql path - we need to handle the case where it's in a package
+     init_sql = _get_init_sql_path()
+
+     # If init.sql is inside a zip/egg, we need to extract it to a temp location
+     if not init_sql.exists():
+         ref = importlib.resources.files("okb.data").joinpath("init.sql")
+         init_sql_content = ref.read_text()
+         # Write to temp file
+         temp_dir = Path(tempfile.gettempdir()) / "okb"
+         temp_dir.mkdir(exist_ok=True)
+         init_sql = temp_dir / "init.sql"
+         init_sql.write_text(init_sql_content)
+
+     cmd = [
+         "docker",
+         "run",
+         "-d",
+         "--name",
+         config.docker_container_name,
+         "-e",
+         "POSTGRES_USER=knowledge",
+         "-e",
+         f"POSTGRES_PASSWORD={config.docker_password}",
+         "-e",
+         "POSTGRES_DB=knowledge_base",
+         "-v",
+         f"{config.docker_volume_name}:/var/lib/postgresql/data",
+         "-v",
+         f"{init_sql}:/docker-entrypoint-initdb.d/init.sql:ro",
+         "-p",
+         f"{config.docker_port}:5432",
+         "--restart",
+         "unless-stopped",
+         "pgvector/pgvector:pg16",
+     ]
+
+     try:
+         result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
+     except subprocess.TimeoutExpired:
+         click.echo("Error: docker run timed out (may need to pull image manually)", err=True)
+         sys.exit(1)
+     if result.returncode != 0:
+         click.echo(f"Error creating container: {result.stderr}", err=True)
+         sys.exit(1)
+
+     click.echo("Database started.")
+     click.echo(f" Container: {config.docker_container_name}")
+     click.echo(f" Port: {config.docker_port}")
+     click.echo(f" Volume: {config.docker_volume_name}")
+     _wait_for_db_ready()
+     _ensure_databases_exist()
+     _run_migrations_all()
+
+
+ @db.command()
+ def stop():
+     """Stop the pgvector database container."""
+     if not _check_docker():
+         click.echo("Error: docker is not installed or not in PATH", err=True)
+         sys.exit(1)
+
+     status = _get_container_status()
+     if status is None:
+         click.echo(f"Container '{config.docker_container_name}' does not exist.")
+         return
+
+     if status != "running":
+         click.echo(f"Container '{config.docker_container_name}' is not running (status: {status}).")
+         return
+
+     click.echo(f"Stopping container '{config.docker_container_name}'...")
+     try:
+         result = subprocess.run(
+             ["docker", "stop", config.docker_container_name],
+             capture_output=True,
+             text=True,
+             timeout=30,
+         )
+     except subprocess.TimeoutExpired:
+         click.echo("Error: docker stop timed out", err=True)
+         sys.exit(1)
+     if result.returncode != 0:
+         click.echo(f"Error stopping container: {result.stderr}", err=True)
+         sys.exit(1)
+
+     click.echo("Database stopped.")
+
+
+ @db.command()
+ def status():
+     """Show database container status."""
+     if not _check_docker():
+         click.echo("Error: docker is not installed or not in PATH", err=True)
+         sys.exit(1)
+
+     container_status = _get_container_status()
+     if container_status is None:
+         click.echo(f"Container '{config.docker_container_name}' does not exist.")
+         click.echo("Run 'okb db start' to create it.")
+         return
+
+     click.echo(f"Container: {config.docker_container_name}")
+     click.echo(f"Status: {container_status}")
+     click.echo(f"Port: {config.docker_port}")
+     click.echo(f"Volume: {config.docker_volume_name}")
+
+     if container_status == "running":
+         # Try to get more info
+         try:
+             result = subprocess.run(
+                 [
+                     "docker",
+                     "exec",
+                     config.docker_container_name,
+                     "pg_isready",
+                     "-U",
+                     "knowledge",
+                     "-d",
+                     "knowledge_base",
+                 ],
+                 capture_output=True,
+                 text=True,
+                 timeout=10,
+             )
+         except subprocess.TimeoutExpired:
+             click.echo("Database: check timed out")
+             return
+         if result.returncode == 0:
+             click.echo("Database: ready")
+             # Show migration status
+             try:
+                 from .migrate import get_applied, get_pending
+
+                 applied = get_applied(config.db_url)
+                 pending = get_pending(config.db_url)
+                 click.echo(f"Migrations: {len(applied)} applied, {len(pending)} pending")
+                 if pending:
+                     click.echo(" Run 'okb db migrate' to apply pending migrations.")
+             except Exception as e:
+                 click.echo(f"Migrations: error checking ({e})")
+         else:
+             click.echo("Database: not ready")
+
+
+ @db.command()
+ @click.argument("name", required=False)
+ def migrate(name):
+     """Apply pending database migrations.
+
+     If NAME is provided, migrate only that database.
+     Otherwise, migrate all configured databases.
+
+     Creates missing databases automatically for managed databases.
+     """
+     # Ensure managed databases exist before migrating
+     _ensure_databases_exist()
+
+     if name:
+         # Migrate specific database
+         try:
+             db_cfg = config.get_database(name)
+         except ValueError as e:
+             click.echo(f"Error: {e}", err=True)
+             sys.exit(1)
+         _run_migrations_for_db(db_cfg)
+     else:
+         # Migrate all databases
+         for db_cfg in config.databases.values():
+             _run_migrations_for_db(db_cfg)
+     click.echo("Done.")
+
+
+ @db.command()
+ def destroy():
+     """Remove the database container and volume (destructive!)."""
+     if not _check_docker():
+         click.echo("Error: docker is not installed or not in PATH", err=True)
+         sys.exit(1)
+
+     if not click.confirm(
+         f"This will delete container '{config.docker_container_name}' and volume "
+         f"'{config.docker_volume_name}'. All data will be lost. Continue?"
+     ):
+         return
+
+     # Stop and remove container
+     subprocess.run(
+         ["docker", "rm", "-f", config.docker_container_name],
+         capture_output=True,
+         timeout=30,
+     )
+     click.echo(f"Removed container '{config.docker_container_name}'.")
+
+     # Remove volume
+     subprocess.run(
+         ["docker", "volume", "rm", config.docker_volume_name],
+         capture_output=True,
+         timeout=30,
+     )
+     click.echo(f"Removed volume '{config.docker_volume_name}'.")
+
+
+ @db.command("list")
+ def db_list():
+     """List all configured databases."""
+     click.echo("Configured databases:")
+     for name, db_cfg in config.databases.items():
+         markers = []
+         if db_cfg.default:
+             markers.append("default")
+         markers.append("managed" if db_cfg.managed else "external")
+         click.echo(f" {name} [{', '.join(markers)}]")
+         click.echo(f" URL: {db_cfg.url}")
+
+
+ # =============================================================================
+ # Config commands
+ # =============================================================================
+
+
+ @main.group("config")
+ def config_cmd():
+     """Manage configuration."""
+     pass
+
+
+ @config_cmd.command("init")
+ @click.option("--force", is_flag=True, help="Overwrite existing config file")
+ def config_init(force: bool):
+     """Create default config file at ~/.config/okb/config.yaml."""
+     config_path = get_config_path()
+
+     if config_path.exists() and not force:
+         click.echo(f"Config file already exists at {config_path}")
+         click.echo("Use --force to overwrite.")
+         return
+
+     config_dir = get_config_dir()
+     config_dir.mkdir(parents=True, exist_ok=True)
+
+     config_path.write_text(get_default_config_yaml())
+     click.echo(f"Created config file at {config_path}")
+
+
+ @config_cmd.command("show")
+ @click.option("--json", "as_json", is_flag=True, help="Output as JSON")
+ def config_show(as_json: bool):
+     """Show current configuration."""
+     config_path = get_config_path()
+
+     if as_json:
+         click.echo(json.dumps(config.to_dict(), indent=2))
+     else:
+         click.echo(f"Config file: {config_path}")
+         click.echo(f" Exists: {config_path.exists()}")
+         click.echo("")
+         click.echo(yaml.dump(config.to_dict(), default_flow_style=False, sort_keys=False))
+
+
+ @config_cmd.command("path")
+ def config_path_cmd():
+     """Print the config file path."""
+     click.echo(get_config_path())
+
+
+ # =============================================================================
+ # Ingest command
+ # =============================================================================
+
+
+ @main.command()
+ @click.argument("paths", nargs=-1, required=True)
+ @click.option("--metadata", "-m", default="{}", help="JSON metadata to attach")
+ @click.option("--local", is_flag=True, help="Use local CPU embedding instead of Modal")
+ @click.option("--db", "database", default=None, help="Database to ingest into")
+ @click.pass_context
+ def ingest(ctx, paths: tuple[str, ...], metadata: str, local: bool, database: str | None):
+     """Ingest documents or URLs into the knowledge base."""
+     import json as json_module
+     from pathlib import Path
+
+     from .ingest import (
+         Ingester,
+         check_file_skip,
+         collect_documents,
+         is_text_file,
+         is_url,
+         parse_document,
+         parse_url,
+     )
+
+     try:
+         extra_metadata = json_module.loads(metadata)
+     except json_module.JSONDecodeError as e:
+         click.echo(f"Error parsing metadata JSON: {e}", err=True)
+         sys.exit(1)
+
+     # Get database URL from --db option or context
+     db_name = database or ctx.obj.get("database")
+     db_cfg = config.get_database(db_name)
+     ingester = Ingester(db_cfg.url, use_modal=not local)
+
+     documents = []
+     for path_str in paths:
+         # Check if it's a URL first
+         if is_url(path_str):
+             click.echo(f"Fetching: {path_str}")
+             try:
+                 documents.append(parse_url(path_str, extra_metadata))
+             except Exception as e:
+                 click.echo(f"Error fetching URL: {e}", err=True)
+             continue
+
+         path = Path(path_str).resolve()
+         if path.is_dir():
+             documents.extend(collect_documents(path, extra_metadata))
+         elif path.is_file():
+             # Check security patterns first
+             skip_check = check_file_skip(path)
+             if skip_check.should_skip:
+                 prefix = "BLOCKED" if skip_check.is_security else "Skipping"
+                 click.echo(f"{prefix}: {path} ({skip_check.reason})", err=True)
+                 continue
+
+             # For explicitly provided files, try to parse even with unknown extension
+             # Always allow .pdf and .docx even if not in config (user may have old config)
+             if path.suffix in config.all_extensions or path.suffix in (".pdf", ".docx"):
+                 try:
+                     documents.extend(parse_document(path, extra_metadata))
+                 except ValueError as e:
+                     click.echo(f"Skipping: {e}", err=True)
+                     continue
+             elif is_text_file(path):
+                 # Unknown extension but appears to be text - parse as code/config
+                 click.echo(f"Parsing as text: {path}")
+                 documents.extend(parse_document(path, extra_metadata, force=True))
+             else:
+                 click.echo(f"Skipping binary file: {path}", err=True)
+         else:
+             click.echo(f"Not found: {path_str}", err=True)
+
+     if not documents:
+         click.echo("No documents found to ingest.")
+         return
+
+     click.echo(f"Found {len(documents)} documents to process")
+     ingester.ingest_documents(documents)
+     click.echo("Done!")
+
+
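The same building blocks the command wires together can be driven from Python. The sketch below infers the signatures (collect_documents, Ingester.ingest_documents) from this file alone, so treat them as assumptions rather than documented API:

from pathlib import Path

from okb.config import config
from okb.ingest import Ingester, collect_documents

db_cfg = config.get_database(None)                # None selects the default database
ingester = Ingester(db_cfg.url, use_modal=False)  # local CPU embedding
docs = collect_documents(Path.home() / "notes", {"project": "demo"})  # illustrative path
if docs:
    ingester.ingest_documents(docs)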
+ # =============================================================================
+ # Rescan command
+ # =============================================================================
+
+
+ @main.command()
+ @click.option("--db", "database", default=None, help="Database to rescan")
+ @click.option("--local", is_flag=True, help="Use local CPU embedding instead of Modal")
+ @click.option("--dry-run", is_flag=True, help="Show changes without executing")
+ @click.option("--delete", "delete_missing", is_flag=True, help="Remove documents for missing files")
+ @click.pass_context
+ def rescan(ctx, database: str | None, local: bool, dry_run: bool, delete_missing: bool):
+     """Check indexed documents for freshness and re-ingest changed ones.
+
+     Compares stored file modification times against actual file mtimes.
+     Files that have changed are deleted and re-ingested. Missing files
+     are reported (use --delete to remove them from the index).
+
+     Examples:
+
+         okb rescan            # Rescan default database
+
+         okb rescan --dry-run  # Show what would change
+
+         okb rescan --delete   # Also remove missing files
+
+         okb rescan --db work  # Rescan specific database
+     """
+     from .rescan import Rescanner
+
+     # Get database URL from --db option or context
+     db_name = database or ctx.obj.get("database")
+     db_cfg = config.get_database(db_name)
+
+     click.echo(f"Scanning database '{db_cfg.name}'...")
+     if dry_run:
+         click.echo("(dry run - no changes will be made)")
+
+     rescanner = Rescanner(db_cfg.url, use_modal=not local)
+     result = rescanner.rescan(dry_run=dry_run, delete_missing=delete_missing, verbose=True)
+
+     # Print summary
+     click.echo("")
+     summary_parts = []
+     if result.updated:
+         summary_parts.append(f"{len(result.updated)} updated")
+     if result.deleted:
+         summary_parts.append(f"{len(result.deleted)} deleted")
+     if result.missing:
+         summary_parts.append(f"{len(result.missing)} missing")
+     summary_parts.append(f"{result.unchanged} unchanged")
+
+     if result.errors:
+         summary_parts.append(f"{len(result.errors)} errors")
+
+     click.echo(f"Summary: {', '.join(summary_parts)}")
+
+     if result.missing and not delete_missing:
+         click.echo("Use --delete to remove missing files from the index.")
+
+
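The freshness rule the docstring describes boils down to an mtime comparison per indexed file. A hedged sketch of that decision (the stored-mtime type is an assumption; the real logic lives in okb.rescan):

from pathlib import Path

def classify(path: Path, stored_mtime: float) -> str:
    if not path.exists():
        return "missing"    # reported; removed only when --delete is passed
    if path.stat().st_mtime > stored_mtime:
        return "changed"    # document is deleted and re-ingested
    return "unchanged"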
+ # =============================================================================
+ # Serve command
+ # =============================================================================
+
+
+ @main.command()
+ @click.option("--db", "database", default=None, help="Database to serve")
+ @click.option("--http", "use_http", is_flag=True, help="Use HTTP transport instead of stdio")
+ @click.option("--host", default=None, help="HTTP server host (default: 127.0.0.1)")
+ @click.option("--port", type=int, default=None, help="HTTP server port (default: 8080)")
+ @click.pass_context
+ def serve(ctx, database: str | None, use_http: bool, host: str | None, port: int | None):
+     """Start the MCP server for Claude Code integration.
+
+     By default, runs in stdio mode for direct Claude Code integration.
+     Use --http to run as an HTTP server with token authentication.
+     """
+     import asyncio
+
+     if use_http:
+         from .http_server import run_http_server
+
+         http_host = host or config.http_host
+         http_port = port or config.http_port
+         run_http_server(host=http_host, port=http_port)
+     else:
+         from .mcp_server import main as mcp_main
+
+         # Get database URL from --db option or context
+         db_name = database or ctx.obj.get("database")
+         db_cfg = config.get_database(db_name)
+         asyncio.run(mcp_main(db_cfg.url))
+
+
+ # =============================================================================
+ # Watch command
+ # =============================================================================
+
+
+ @main.command()
+ @click.argument("paths", nargs=-1, required=True, type=click.Path(exists=True))
+ @click.option("--metadata", "-m", default="{}", help="JSON metadata to attach")
+ @click.option("--local", is_flag=True, help="Use local CPU embedding instead of Modal")
+ @click.option("--db", "database", default=None, help="Database to watch for")
+ @click.pass_context
+ def watch(ctx, paths: tuple[str, ...], metadata: str, local: bool, database: str | None):
+     """Watch directories for changes and auto-ingest."""
+     from .scripts.watch import main as watch_main
+
+     # Get database URL from --db option or context
+     db_name = database or ctx.obj.get("database")
+     db_cfg = config.get_database(db_name)
+
+     # Convert to the format watch.py expects
+     sys.argv = ["okb-watch"] + list(paths)
+     sys.argv.extend(["--db-url", db_cfg.url])
+     if metadata != "{}":
+         sys.argv.extend(["--metadata", metadata])
+     if local:
+         sys.argv.append("--local")
+
+     watch_main()
+
+
+ # =============================================================================
+ # Modal commands
+ # =============================================================================
+
+
+ @main.group()
+ def modal():
+     """Manage Modal GPU embedder."""
+     pass
+
+
+ @modal.command()
+ def deploy():
+     """Deploy embedder to Modal."""
+     if not shutil.which("modal"):
+         click.echo("Error: modal CLI is not installed.", err=True)
+         click.echo("Install with: pip install modal", err=True)
+         sys.exit(1)
+
+     # Find modal_embedder.py in the package
+     embedder_path = Path(__file__).parent / "modal_embedder.py"
+     if not embedder_path.exists():
+         click.echo(f"Error: modal_embedder.py not found at {embedder_path}", err=True)
+         sys.exit(1)
+
+     click.echo(f"Deploying {embedder_path} to Modal...")
+     result = subprocess.run(
+         ["modal", "deploy", str(embedder_path)],
+         cwd=embedder_path.parent,
+     )
+     sys.exit(result.returncode)
+
+
+ # =============================================================================
+ # Sync commands (plugin system)
+ # =============================================================================
+
+
+ @main.group()
+ def sync():
+     """Sync data from external API sources (plugins)."""
+     pass
+
+
+ def _get_sync_state(conn, source_name: str, db_name: str):
+     """Get sync state from database."""
+     from .plugins.base import SyncState
+
+     result = conn.execute(
+         """SELECT last_sync, cursor, extra FROM sync_state
+            WHERE source_name = %s AND database_name = %s""",
+         (source_name, db_name),
+     ).fetchone()
+
+     if result:
+         return SyncState(
+             last_sync=result["last_sync"],
+             cursor=result["cursor"],
+             extra=result["extra"] or {},
+         )
+     return None
+
+
+ def _save_sync_state(conn, source_name: str, db_name: str, state):
+     """Save sync state to database."""
+     conn.execute(
+         """INSERT INTO sync_state (source_name, database_name, last_sync, cursor, extra, updated_at)
+            VALUES (%s, %s, %s, %s, %s, NOW())
+            ON CONFLICT (source_name, database_name)
+            DO UPDATE SET last_sync = EXCLUDED.last_sync,
+                          cursor = EXCLUDED.cursor,
+                          extra = EXCLUDED.extra,
+                          updated_at = NOW()""",
+         (source_name, db_name, state.last_sync, state.cursor, json.dumps(state.extra)),
+     )
+     conn.commit()
+
+
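Read together, these two helpers imply the shape of SyncState and of a sync_state table keyed by (source_name, database_name). A sketch of the implied dataclass; the real definition lives in okb.plugins.base and may differ:

from dataclasses import dataclass, field
from datetime import datetime

@dataclass
class SyncState:
    last_sync: datetime | None = None          # when the source was last synced
    cursor: str | None = None                  # opaque pagination cursor from the API
    extra: dict = field(default_factory=dict)  # plugin-specific state, stored as JSON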
+ def _apply_llm_filter(documents: list, filter_cfg: dict, source_name: str) -> list:
+     """Apply LLM filtering to documents.
+
+     Args:
+         documents: List of documents to filter
+         filter_cfg: Filter configuration with 'prompt' and 'action_on_skip'
+         source_name: Name of the source (for logging)
+
+     Returns:
+         Filtered list of documents
+     """
+     from .llm import FilterAction, filter_document
+
+     custom_prompt = filter_cfg.get("prompt")
+     action_on_skip = filter_cfg.get("action_on_skip", "discard")
+
+     filtered = []
+     skipped = 0
+     review = 0
+
+     for doc in documents:
+         result = filter_document(doc, custom_prompt=custom_prompt)
+
+         if result.action == FilterAction.SKIP:
+             skipped += 1
+             if action_on_skip == "archive":
+                 # Store without embedding (future: add flag to document)
+                 pass
+             # Otherwise discard
+             continue
+         elif result.action == FilterAction.REVIEW:
+             review += 1
+             # Still ingest, but could flag for review (future: add metadata)
+
+         filtered.append(doc)
+
+     if skipped or review:
+         click.echo(f" Filter: {len(filtered)} ingested, {skipped} skipped, {review} for review")
+
+     return filtered
+
+
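Only SKIP and REVIEW are consulted in this loop, so any other FilterAction value falls through to plain ingestion. A hedged sketch of the contract the loop assumes; member names other than SKIP and REVIEW are hypothetical:

from enum import Enum

class FilterAction(Enum):
    INGEST = "ingest"   # hypothetical default: keep the document
    SKIP = "skip"       # dropped (or archived, once that path is implemented)
    REVIEW = "review"   # ingested now, flagged for later human review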
+ @sync.command("run")
+ @click.argument("sources", nargs=-1)
+ @click.option("--all", "sync_all", is_flag=True, help="Sync all enabled sources")
+ @click.option("--full", is_flag=True, help="Ignore incremental state, do full sync")
+ @click.option("--local", is_flag=True, help="Use local CPU embedding instead of Modal")
+ @click.option("--db", "database", default=None, help="Database to sync into")
+ @click.option("--folder", multiple=True, help="Filter to specific folder path (can repeat)")
+ @click.option("--doc", "doc_ids", multiple=True, help="Sync specific document ID (can repeat)")
+ # GitHub-specific options
+ @click.option("--repo", multiple=True, help="GitHub repo to sync (owner/repo, can repeat)")
+ @click.option(
+     "--source", "include_source", is_flag=True, help="Sync all source files (not just README+docs)"
+ )
+ @click.option("--issues", "include_issues", is_flag=True, help="Include GitHub issues")
+ @click.option("--prs", "include_prs", is_flag=True, help="Include GitHub pull requests")
+ @click.option("--wiki", "include_wiki", is_flag=True, help="Include GitHub wiki pages")
+ @click.pass_context
+ def sync_run(
+     ctx,
+     sources: tuple[str, ...],
+     sync_all: bool,
+     full: bool,
+     local: bool,
+     database: str | None,
+     folder: tuple[str, ...],
+     doc_ids: tuple[str, ...],
+     repo: tuple[str, ...],
+     include_source: bool,
+     include_issues: bool,
+     include_prs: bool,
+     include_wiki: bool,
+ ):
+     """Sync from API sources.
+
+     Example: okb sync run github --repo owner/repo
+     """
+     import psycopg
+     from psycopg.rows import dict_row
+
+     from .ingest import Ingester
+     from .plugins.registry import PluginRegistry
+
+     # Get database
+     db_name = database or ctx.obj.get("database")
+     db_cfg = config.get_database(db_name)
+
+     # Determine which sources to sync
+     if sync_all:
+         source_names = config.list_enabled_sources()
+     elif sources:
+         source_names = list(sources)
+     else:
+         click.echo("Error: Specify sources to sync or use --all", err=True)
+         click.echo("Available sources: ", nl=False)
+         click.echo(", ".join(PluginRegistry.list_sources()) or "(none installed)")
+         sys.exit(1)
+
+     if not source_names:
+         click.echo("No sources to sync.")
+         return
+
+     ingester = Ingester(db_cfg.url, use_modal=not local)
+
+     with psycopg.connect(db_cfg.url, row_factory=dict_row) as conn:
+         for source_name in source_names:
+             # Get the plugin
+             source = PluginRegistry.get_source(source_name)
+             if source is None:
+                 click.echo(f"Error: Source '{source_name}' not found.", err=True)
+                 click.echo(f"Installed sources: {', '.join(PluginRegistry.list_sources())}")
+                 continue
+
+             # Get and resolve config
+             source_cfg = config.get_source_config(source_name)
+             if source_cfg is None:
+                 click.echo(f"Skipping '{source_name}': not configured or disabled", err=True)
+                 continue
+
+             # Merge CLI options into config (override config file values)
+             if folder:
+                 source_cfg["folders"] = list(folder)
+             if doc_ids:
+                 source_cfg["doc_ids"] = list(doc_ids)
+             # GitHub-specific options
+             if repo:
+                 source_cfg["repos"] = list(repo)
+             if include_source:
+                 source_cfg["include_source"] = True
+             if include_issues:
+                 source_cfg["include_issues"] = True
+             if include_prs:
+                 source_cfg["include_prs"] = True
+             if include_wiki:
+                 source_cfg["include_wiki"] = True
+
+             try:
+                 source.configure(source_cfg)
+             except Exception as e:
+                 click.echo(f"Error configuring '{source_name}': {e}", err=True)
+                 continue
+
+             # Get sync state (unless --full)
+             state = None if full else _get_sync_state(conn, source_name, db_cfg.name)
+
+             click.echo(f"Syncing {source_name}..." + (" (full)" if full else ""))
+
+             try:
+                 documents, new_state = source.fetch(state)
+             except Exception as e:
+                 click.echo(f"Error fetching from '{source_name}': {e}", err=True)
+                 continue
+
+             if documents:
+                 click.echo(f" Fetched {len(documents)} documents")
+
+                 # Apply LLM filtering if configured
+                 llm_filter_cfg = source_cfg.get("llm_filter", {})
+                 if llm_filter_cfg.get("enabled"):
+                     documents = _apply_llm_filter(
+                         documents,
+                         llm_filter_cfg,
+                         source_name,
+                     )
+
+                 if documents:
+                     ingester.ingest_documents(documents)
+                 else:
+                     click.echo(" All documents filtered out")
+             else:
+                 click.echo(" No new documents")
+
+             # Save state
+             _save_sync_state(conn, source_name, db_cfg.name, new_state)
+
+     click.echo("Done!")
+
+
+ @sync.command("list")
+ def sync_list():
+     """List available API sources."""
+     from .plugins.registry import PluginRegistry
+
+     installed = PluginRegistry.list_sources()
+     configured = config.list_enabled_sources()
+
+     click.echo("Installed sources:")
+     if installed:
+         for name in installed:
+             status = "configured" if name in configured else "not configured"
+             click.echo(f" {name} [{status}]")
+     else:
+         click.echo(" (none)")
+
+     # Show configured but not installed
+     not_installed = set(configured) - set(installed)
+     if not_installed:
+         click.echo("\nConfigured but not installed:")
+         for name in not_installed:
+             click.echo(f" {name}")
+
+
+ @sync.command("status")
+ @click.argument("source", required=False)
+ @click.option("--db", "database", default=None, help="Database to check")
+ @click.pass_context
+ def sync_status(ctx, source: str | None, database: str | None):
+     """Show sync status and last sync times."""
+     import psycopg
+     from psycopg.rows import dict_row
+
+     db_name = database or ctx.obj.get("database")
+     db_cfg = config.get_database(db_name)
+
+     with psycopg.connect(db_cfg.url, row_factory=dict_row) as conn:
+         if source:
+             # Show status for specific source
+             result = conn.execute(
+                 """SELECT source_name, last_sync, cursor, extra, updated_at
+                    FROM sync_state
+                    WHERE source_name = %s AND database_name = %s""",
+                 (source, db_cfg.name),
+             ).fetchone()
+
+             if result:
+                 click.echo(f"Source: {result['source_name']}")
+                 click.echo(f" Last sync: {result['last_sync'] or 'never'}")
+                 click.echo(f" Updated: {result['updated_at']}")
+                 if result["cursor"]:
+                     click.echo(f" Cursor: {result['cursor'][:50]}...")
+             else:
+                 click.echo(f"No sync history for '{source}'")
+
+             # Show document count
+             doc_count = conn.execute(
+                 """SELECT COUNT(*) as count FROM documents
+                    WHERE metadata->>'sync_source' = %s""",
+                 (source,),
+             ).fetchone()
+             click.echo(f" Documents: {doc_count['count']}")
+         else:
+             # Show all sync states
+             results = conn.execute(
+                 """SELECT source_name, last_sync, updated_at
+                    FROM sync_state
+                    WHERE database_name = %s
+                    ORDER BY updated_at DESC""",
+                 (db_cfg.name,),
+             ).fetchall()
+
+             if results:
+                 click.echo(f"Sync status for database '{db_cfg.name}':")
+                 for row in results:
+                     if row["last_sync"]:
+                         last = row["last_sync"].strftime("%Y-%m-%d %H:%M")
+                     else:
+                         last = "never"
+                     click.echo(f" {row['source_name']}: {last}")
+             else:
+                 click.echo("No sync history")
+
+
+ # =============================================================================
+ # Token commands
+ # =============================================================================
+
+
+ @main.group()
+ def token():
+     """Manage API tokens for HTTP access."""
+     pass
+
+
+ @token.command("create")
+ @click.option("--db", "database", default=None, help="Database to create token for")
+ @click.option("--ro", "read_only", is_flag=True, help="Create read-only token (default: rw)")
+ @click.option("-d", "--description", default=None, help="Token description")
+ @click.pass_context
+ def token_create(ctx, database: str | None, read_only: bool, description: str | None):
+     """Create a new API token."""
+     from .tokens import create_token
+
+     db_name = database or ctx.obj.get("database")
+     db_cfg = config.get_database(db_name)
+     permissions = "ro" if read_only else "rw"
+
+     try:
+         token = create_token(db_cfg.url, db_cfg.name, permissions, description)
+         click.echo(f"Token created for database '{db_cfg.name}' ({permissions}):")
+         click.echo(f" {token}")
+         click.echo("")
+         click.echo("Save this token - it cannot be retrieved later.")
+     except Exception as e:
+         click.echo(f"Error creating token: {e}", err=True)
+         sys.exit(1)
+
+
+ @token.command("list")
+ @click.option("--db", "database", default=None, help="Database to list tokens for")
+ @click.pass_context
+ def token_list(ctx, database: str | None):
+     """List all tokens for a database."""
+     from .tokens import list_tokens
+
+     db_name = database or ctx.obj.get("database")
+     db_cfg = config.get_database(db_name)
+
+     try:
+         tokens = list_tokens(db_cfg.url)
+         if not tokens:
+             click.echo(f"No tokens found for database '{db_cfg.name}'")
+             return
+
+         click.echo(f"Tokens for database '{db_cfg.name}':")
+         for t in tokens:
+             desc = f" - {t.description}" if t.description else ""
+             last_used = t.last_used_at.strftime("%Y-%m-%d %H:%M") if t.last_used_at else "never"
+             click.echo(f" [{t.permissions}] {t.token_hash[:12]}...{desc}")
+             created = t.created_at.strftime("%Y-%m-%d %H:%M")
+             click.echo(f" Created: {created}, Last used: {last_used}")
+     except Exception as e:
+         click.echo(f"Error listing tokens: {e}", err=True)
+         sys.exit(1)
+
+
+ @token.command("revoke")
+ @click.argument("token_value")
+ @click.option("--db", "database", default=None, help="Database to revoke token from")
+ @click.pass_context
+ def token_revoke(ctx, token_value: str, database: str | None):
+     """Revoke (delete) an API token.
+
+     TOKEN_VALUE must be the full token string.
+     """
+     from .tokens import delete_token
+
+     db_name = database or ctx.obj.get("database")
+     db_cfg = config.get_database(db_name)
+
+     try:
+         deleted = delete_token(db_cfg.url, token_value)
+         if deleted:
+             click.echo("Token revoked.")
+         else:
+             click.echo("Token not found. Make sure you're using the full token string.", err=True)
+             sys.exit(1)
+     except Exception as e:
+         click.echo(f"Error revoking token: {e}", err=True)
+         sys.exit(1)
+
+
+ # =============================================================================
+ # LLM commands
+ # =============================================================================
+
+
+ @main.group()
+ def llm():
+     """Manage LLM integration for document classification."""
+     pass
+
+
+ @llm.command("status")
+ @click.option("--db", "database", default=None, help="Database to check cache for")
+ @click.pass_context
+ def llm_status(ctx, database: str | None):
+     """Show LLM configuration and connectivity status.
+
+     Displays current provider settings, tests connectivity,
+     and shows cache statistics.
+     """
+     import os
+
+     click.echo("LLM Configuration")
+     click.echo("-" * 40)
+
+     # Show config
+     click.echo(f"Provider: {config.llm_provider or '(disabled)'}")
+     if config.llm_provider:
+         click.echo(f"Model: {config.llm_model}")
+         click.echo(f"Timeout: {config.llm_timeout}s")
+         click.echo(f"Cache responses: {config.llm_cache_responses}")
+
+         if config.llm_provider == "modal":
+             click.echo("Backend: Modal GPU (deploy with: okb llm deploy)")
+         elif config.llm_use_bedrock:
+             click.echo(f"Backend: AWS Bedrock (region: {config.llm_aws_region})")
+         else:
+             api_key_set = bool(os.environ.get("ANTHROPIC_API_KEY"))
+             click.echo(f"API key set: {'yes' if api_key_set else 'no (set ANTHROPIC_API_KEY)'}")
+
+     click.echo("")
+
+     # Test connectivity if provider is configured
+     if config.llm_provider:
+         click.echo("Connectivity Test")
+         click.echo("-" * 40)
+         try:
+             from .llm.providers import get_provider
+
+             provider = get_provider()
+             if provider is None:
+                 click.echo("Status: provider initialization failed")
+             elif provider.is_available():
+                 click.echo("Status: available")
+                 # List models
+                 if hasattr(provider, "list_models"):
+                     models = provider.list_models()
+                     click.echo(f"Available models: {', '.join(models[:3])}...")
+             else:
+                 click.echo("Status: not available (check API key or credentials)")
+         except ImportError:
+             click.echo("Status: missing dependencies")
+             click.echo(" Install with: pip install 'okb[llm]'")
+         except Exception as e:
+             click.echo(f"Status: error - {e}")
+
+     # Show cache stats if database is available
+     click.echo("")
+     click.echo("Cache Statistics")
+     click.echo("-" * 40)
+     try:
+         db_name = database or ctx.obj.get("database")
+         db_cfg = config.get_database(db_name)
+
+         from .llm.cache import get_cache_stats
+
+         stats = get_cache_stats(db_cfg.url)
+         click.echo(f"Total cached responses: {stats['total_entries']}")
+         if stats["by_provider"]:
+             for entry in stats["by_provider"]:
+                 click.echo(f" {entry['provider']}/{entry['model']}: {entry['count']}")
+         if stats["oldest_entry"]:
+             click.echo(f"Oldest entry: {stats['oldest_entry']}")
+     except Exception as e:
+         click.echo(f"Cache unavailable: {e}")
+
+
+ @llm.command("clear-cache")
+ @click.option("--db", "database", default=None, help="Database to clear cache for")
+ @click.option(
+     "--older-than", "days", type=int, default=None, help="Only clear entries older than N days"
+ )
+ @click.option("--yes", is_flag=True, help="Skip confirmation")
+ @click.pass_context
+ def llm_clear_cache(ctx, database: str | None, days: int | None, yes: bool):
+     """Clear the LLM response cache."""
+     from datetime import UTC, datetime, timedelta
+
+     db_name = database or ctx.obj.get("database")
+     db_cfg = config.get_database(db_name)
+
+     if days:
+         older_than = datetime.now(UTC) - timedelta(days=days)
+         msg = f"Clear cache entries older than {days} days?"
+     else:
+         older_than = None
+         msg = "Clear ALL cache entries?"
+
+     if not yes:
+         if not click.confirm(msg):
+             click.echo("Cancelled.")
+             return
+
+     from .llm.cache import clear_cache
+
+     deleted = clear_cache(older_than=older_than, db_url=db_cfg.url)
+     click.echo(f"Deleted {deleted} cache entries.")
+
+
+ @llm.command("deploy")
+ def llm_deploy():
+     """Deploy the Modal LLM app for open model inference.
+
+     This deploys a GPU-accelerated LLM service on Modal using Llama 3.2.
+     Required for using provider: modal in your config.
+
+     Requires Modal CLI to be installed and authenticated:
+         pip install modal
+         modal token new
+     """
+     if not shutil.which("modal"):
+         click.echo("Error: modal CLI is not installed.", err=True)
+         click.echo("Install with: pip install modal", err=True)
+         click.echo("Then authenticate: modal token new", err=True)
+         sys.exit(1)
+
+     # Find modal_llm.py in the package
+     llm_path = Path(__file__).parent / "modal_llm.py"
+     if not llm_path.exists():
+         click.echo(f"Error: modal_llm.py not found at {llm_path}", err=True)
+         sys.exit(1)
+
+     click.echo(f"Deploying {llm_path} to Modal...")
+     click.echo("Note: First deploy downloads the model (~2GB) and may take a few minutes.")
+     result = subprocess.run(
+         ["modal", "deploy", str(llm_path)],
+         cwd=llm_path.parent,
+     )
+     sys.exit(result.returncode)
+
+
+ if __name__ == "__main__":
+     main()