okb 1.1.0__py3-none-any.whl → 1.1.0a0__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- okb/cli.py +16 -1083
- okb/config.py +4 -122
- okb/http_server.py +2 -163
- okb/llm/providers.py +6 -9
- okb/mcp_server.py +12 -1036
- okb/modal_llm.py +8 -26
- okb/plugins/sources/github.py +5 -5
- okb/tokens.py +3 -25
- {okb-1.1.0.dist-info → okb-1.1.0a0.dist-info}/METADATA +6 -83
- {okb-1.1.0.dist-info → okb-1.1.0a0.dist-info}/RECORD +12 -24
- okb/llm/analyze.py +0 -524
- okb/llm/consolidate.py +0 -685
- okb/llm/enrich.py +0 -723
- okb/llm/extractors/__init__.py +0 -13
- okb/llm/extractors/base.py +0 -44
- okb/llm/extractors/cross_doc.py +0 -478
- okb/llm/extractors/dedup.py +0 -499
- okb/llm/extractors/entity.py +0 -369
- okb/llm/extractors/todo.py +0 -149
- okb/migrations/0008.enrichment.sql +0 -46
- okb/migrations/0009.entity-consolidation.sql +0 -120
- okb/migrations/0010.token-id.sql +0 -7
- {okb-1.1.0.dist-info → okb-1.1.0a0.dist-info}/WHEEL +0 -0
- {okb-1.1.0.dist-info → okb-1.1.0a0.dist-info}/entry_points.txt +0 -0
okb/cli.py
CHANGED
|
@@ -4,7 +4,6 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
import importlib.resources
|
|
6
6
|
import json
|
|
7
|
-
import os
|
|
8
7
|
import shutil
|
|
9
8
|
import subprocess
|
|
10
9
|
import sys
|
|
@@ -45,7 +44,7 @@ def _check_docker() -> bool:
|
|
|
45
44
|
|
|
46
45
|
|
|
47
46
|
def _get_container_status() -> str | None:
|
|
48
|
-
"""Get the status of the
|
|
47
|
+
"""Get the status of the lkb container. Returns None if not found."""
|
|
49
48
|
try:
|
|
50
49
|
result = subprocess.run(
|
|
51
50
|
[
|
|
@@ -842,7 +841,7 @@ def sync_run(
|
|
|
842
841
|
):
|
|
843
842
|
"""Sync from API sources.
|
|
844
843
|
|
|
845
|
-
Example:
|
|
844
|
+
Example: lkb sync run github --repo owner/repo
|
|
846
845
|
"""
|
|
847
846
|
import psycopg
|
|
848
847
|
from psycopg.rows import dict_row
|
|
@@ -1210,7 +1209,7 @@ def token_list(ctx, database: str | None):
|
|
|
1210
1209
|
for t in tokens:
|
|
1211
1210
|
desc = f" - {t.description}" if t.description else ""
|
|
1212
1211
|
last_used = t.last_used_at.strftime("%Y-%m-%d %H:%M") if t.last_used_at else "never"
|
|
1213
|
-
click.echo(f"
|
|
1212
|
+
click.echo(f" [{t.permissions}] {t.token_hash[:12]}...{desc}")
|
|
1214
1213
|
created = t.created_at.strftime("%Y-%m-%d %H:%M")
|
|
1215
1214
|
click.echo(f" Created: {created}, Last used: {last_used}")
|
|
1216
1215
|
except Exception as e:
|
|
@@ -1219,43 +1218,26 @@ def token_list(ctx, database: str | None):
|
|
|
1219
1218
|
|
|
1220
1219
|
|
|
1221
1220
|
@token.command("revoke")
|
|
1222
|
-
@click.argument("token_value"
|
|
1223
|
-
@click.option("--id", "token_id", type=int, default=None, help="Token ID to revoke (from 'okb token list')")
|
|
1221
|
+
@click.argument("token_value")
|
|
1224
1222
|
@click.option("--db", "database", default=None, help="Database to revoke token from")
|
|
1225
1223
|
@click.pass_context
|
|
1226
|
-
def token_revoke(ctx, token_value: str
|
|
1224
|
+
def token_revoke(ctx, token_value: str, database: str | None):
|
|
1227
1225
|
"""Revoke (delete) an API token.
|
|
1228
1226
|
|
|
1229
|
-
|
|
1227
|
+
TOKEN_VALUE must be the full token string.
|
|
1230
1228
|
"""
|
|
1231
|
-
from .tokens import delete_token
|
|
1232
|
-
|
|
1233
|
-
if not token_value and not token_id:
|
|
1234
|
-
click.echo("Error: Provide either TOKEN_VALUE or --id", err=True)
|
|
1235
|
-
sys.exit(1)
|
|
1236
|
-
|
|
1237
|
-
if token_value and token_id:
|
|
1238
|
-
click.echo("Error: Provide either TOKEN_VALUE or --id, not both", err=True)
|
|
1239
|
-
sys.exit(1)
|
|
1229
|
+
from .tokens import delete_token
|
|
1240
1230
|
|
|
1241
1231
|
db_name = database or ctx.obj.get("database")
|
|
1242
1232
|
db_cfg = config.get_database(db_name)
|
|
1243
1233
|
|
|
1244
1234
|
try:
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
click.echo(f"Token ID {token_id} revoked.")
|
|
1249
|
-
else:
|
|
1250
|
-
click.echo(f"Token ID {token_id} not found.", err=True)
|
|
1251
|
-
sys.exit(1)
|
|
1235
|
+
deleted = delete_token(db_cfg.url, token_value)
|
|
1236
|
+
if deleted:
|
|
1237
|
+
click.echo("Token revoked.")
|
|
1252
1238
|
else:
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
click.echo("Token revoked.")
|
|
1256
|
-
else:
|
|
1257
|
-
click.echo("Token not found. Use --id or provide the full token string.", err=True)
|
|
1258
|
-
sys.exit(1)
|
|
1239
|
+
click.echo("Token not found. Make sure you're using the full token string.", err=True)
|
|
1240
|
+
sys.exit(1)
|
|
1259
1241
|
except Exception as e:
|
|
1260
1242
|
click.echo(f"Error revoking token: {e}", err=True)
|
|
1261
1243
|
sys.exit(1)
|
|
@@ -1294,7 +1276,7 @@ def llm_status(ctx, database: str | None):
|
|
|
1294
1276
|
click.echo(f"Cache responses: {config.llm_cache_responses}")
|
|
1295
1277
|
|
|
1296
1278
|
if config.llm_provider == "modal":
|
|
1297
|
-
click.echo("Backend: Modal GPU (deploy with:
|
|
1279
|
+
click.echo("Backend: Modal GPU (deploy with: lkb llm deploy)")
|
|
1298
1280
|
elif config.llm_use_bedrock:
|
|
1299
1281
|
click.echo(f"Backend: AWS Bedrock (region: {config.llm_aws_region})")
|
|
1300
1282
|
else:
|
|
@@ -1384,9 +1366,7 @@ def llm_clear_cache(ctx, database: str | None, days: int | None, yes: bool):
|
|
|
1384
1366
|
def llm_deploy():
|
|
1385
1367
|
"""Deploy the Modal LLM app for open model inference.
|
|
1386
1368
|
|
|
1387
|
-
|
|
1388
|
-
Default: microsoft/Phi-3-mini-4k-instruct (no HuggingFace approval needed).
|
|
1389
|
-
|
|
1369
|
+
This deploys a GPU-accelerated LLM service on Modal using Llama 3.2.
|
|
1390
1370
|
Required for using provider: modal in your config.
|
|
1391
1371
|
|
|
1392
1372
|
Requires Modal CLI to be installed and authenticated:
|
|
@@ -1405,1061 +1385,14 @@ def llm_deploy():
|
|
|
1405
1385
|
click.echo(f"Error: modal_llm.py not found at {llm_path}", err=True)
|
|
1406
1386
|
sys.exit(1)
|
|
1407
1387
|
|
|
1408
|
-
|
|
1409
|
-
model
|
|
1410
|
-
gpu = config.llm_modal_gpu or "L4"
|
|
1411
|
-
click.echo("Deploying Modal LLM:")
|
|
1412
|
-
click.echo(f" Model: {model}")
|
|
1413
|
-
click.echo(f" GPU: {gpu}")
|
|
1414
|
-
click.echo("Note: First deploy downloads the model and may take a few minutes.")
|
|
1415
|
-
|
|
1416
|
-
# Set model and GPU in environment for Modal to pick up
|
|
1417
|
-
env = os.environ.copy()
|
|
1418
|
-
env["OKB_LLM_MODEL"] = model
|
|
1419
|
-
env["OKB_MODAL_GPU"] = gpu
|
|
1420
|
-
|
|
1388
|
+
click.echo(f"Deploying {llm_path} to Modal...")
|
|
1389
|
+
click.echo("Note: First deploy downloads the model (~2GB) and may take a few minutes.")
|
|
1421
1390
|
result = subprocess.run(
|
|
1422
1391
|
["modal", "deploy", str(llm_path)],
|
|
1423
1392
|
cwd=llm_path.parent,
|
|
1424
|
-
env=env,
|
|
1425
1393
|
)
|
|
1426
1394
|
sys.exit(result.returncode)
|
|
1427
1395
|
|
|
1428
1396
|
|
|
1429
|
-
# =============================================================================
|
|
1430
|
-
# Enrich commands
|
|
1431
|
-
# =============================================================================
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
@main.group()
|
|
1435
|
-
def enrich():
|
|
1436
|
-
"""LLM-based document enrichment (extract TODOs and entities)."""
|
|
1437
|
-
pass
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
@enrich.command("run")
|
|
1441
|
-
@click.option("--db", "database", default=None, help="Database to enrich")
|
|
1442
|
-
@click.option("--source-type", default=None, help="Filter by source type")
|
|
1443
|
-
@click.option("--project", default=None, help="Filter by project")
|
|
1444
|
-
@click.option("--query", default=None, help="Semantic search query to filter documents")
|
|
1445
|
-
@click.option("--path-pattern", default=None, help="SQL LIKE pattern for source_path")
|
|
1446
|
-
@click.option(
|
|
1447
|
-
"--all", "enrich_all", is_flag=True, help="Re-enrich all documents (ignore enriched_at)"
|
|
1448
|
-
)
|
|
1449
|
-
@click.option("--dry-run", is_flag=True, help="Show what would be enriched without executing")
|
|
1450
|
-
@click.option("--limit", default=100, help="Maximum documents to process")
|
|
1451
|
-
@click.option("--workers", default=None, type=int, help="Parallel workers (default: docs/5, min 1)")
|
|
1452
|
-
@click.pass_context
|
|
1453
|
-
def enrich_run(
|
|
1454
|
-
ctx,
|
|
1455
|
-
database: str | None,
|
|
1456
|
-
source_type: str | None,
|
|
1457
|
-
project: str | None,
|
|
1458
|
-
query: str | None,
|
|
1459
|
-
path_pattern: str | None,
|
|
1460
|
-
enrich_all: bool,
|
|
1461
|
-
dry_run: bool,
|
|
1462
|
-
limit: int,
|
|
1463
|
-
workers: int | None,
|
|
1464
|
-
):
|
|
1465
|
-
"""Run enrichment on documents to extract TODOs and entities.
|
|
1466
|
-
|
|
1467
|
-
By default, only processes documents that haven't been enriched yet.
|
|
1468
|
-
Use --all to re-enrich all documents (e.g., after changing enrichment config).
|
|
1469
|
-
|
|
1470
|
-
Examples:
|
|
1471
|
-
|
|
1472
|
-
okb enrich run # Enrich un-enriched documents
|
|
1473
|
-
|
|
1474
|
-
okb enrich run --dry-run # Show what would be enriched
|
|
1475
|
-
|
|
1476
|
-
okb enrich run --all # Re-enrich everything
|
|
1477
|
-
|
|
1478
|
-
okb enrich run --source-type markdown # Only markdown files
|
|
1479
|
-
|
|
1480
|
-
okb enrich run --query "meeting notes" # Filter by semantic search
|
|
1481
|
-
|
|
1482
|
-
okb enrich run --path-pattern '%myrepo%' # Filter by source path
|
|
1483
|
-
|
|
1484
|
-
okb enrich run --workers 8 # Use 8 parallel workers
|
|
1485
|
-
"""
|
|
1486
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
1487
|
-
|
|
1488
|
-
from .llm import get_llm
|
|
1489
|
-
from .llm.enrich import EnrichmentConfig, get_unenriched_documents, process_enrichment
|
|
1490
|
-
|
|
1491
|
-
# Check LLM is configured before doing any work
|
|
1492
|
-
if get_llm() is None:
|
|
1493
|
-
click.echo("Error: No LLM provider configured.", err=True)
|
|
1494
|
-
click.echo("", err=True)
|
|
1495
|
-
click.echo("Enrichment requires an LLM to extract TODOs and entities.", err=True)
|
|
1496
|
-
click.echo("Set ANTHROPIC_API_KEY or configure in ~/.config/okb/config.yaml:", err=True)
|
|
1497
|
-
click.echo("", err=True)
|
|
1498
|
-
click.echo(" llm:", err=True)
|
|
1499
|
-
click.echo(" provider: claude", err=True)
|
|
1500
|
-
click.echo(" model: claude-haiku-4-5-20251001", err=True)
|
|
1501
|
-
click.echo("", err=True)
|
|
1502
|
-
click.echo("Run 'okb llm status' to check configuration.", err=True)
|
|
1503
|
-
ctx.exit(1)
|
|
1504
|
-
|
|
1505
|
-
db_name = database or ctx.obj.get("database")
|
|
1506
|
-
db_cfg = config.get_database(db_name)
|
|
1507
|
-
|
|
1508
|
-
# Get enrichment version for re-enrichment check
|
|
1509
|
-
enrichment_version = config.enrichment_version if enrich_all else None
|
|
1510
|
-
|
|
1511
|
-
click.echo(f"Scanning database '{db_cfg.name}' for documents to enrich...")
|
|
1512
|
-
if dry_run:
|
|
1513
|
-
click.echo("(dry run - no changes will be made)")
|
|
1514
|
-
|
|
1515
|
-
docs = get_unenriched_documents(
|
|
1516
|
-
db_url=db_cfg.url,
|
|
1517
|
-
source_type=source_type,
|
|
1518
|
-
project=project,
|
|
1519
|
-
query=query,
|
|
1520
|
-
path_pattern=path_pattern,
|
|
1521
|
-
enrichment_version=enrichment_version,
|
|
1522
|
-
limit=limit,
|
|
1523
|
-
)
|
|
1524
|
-
|
|
1525
|
-
if not docs:
|
|
1526
|
-
click.echo("No documents need enrichment.")
|
|
1527
|
-
return
|
|
1528
|
-
|
|
1529
|
-
click.echo(f"Found {len(docs)} documents to enrich")
|
|
1530
|
-
|
|
1531
|
-
if dry_run:
|
|
1532
|
-
for doc in docs[:20]:
|
|
1533
|
-
click.echo(f" - {doc['title']} ({doc['source_type']})")
|
|
1534
|
-
if len(docs) > 20:
|
|
1535
|
-
click.echo(f" ... and {len(docs) - 20} more")
|
|
1536
|
-
return
|
|
1537
|
-
|
|
1538
|
-
# Calculate workers if not specified: floor(docs/5), minimum 1
|
|
1539
|
-
if workers is None:
|
|
1540
|
-
workers = max(1, len(docs) // 5)
|
|
1541
|
-
|
|
1542
|
-
# Build config
|
|
1543
|
-
enrich_config = EnrichmentConfig.from_config(
|
|
1544
|
-
{
|
|
1545
|
-
"enabled": config.enrichment_enabled,
|
|
1546
|
-
"version": config.enrichment_version,
|
|
1547
|
-
"extract_todos": config.enrichment_extract_todos,
|
|
1548
|
-
"extract_entities": config.enrichment_extract_entities,
|
|
1549
|
-
"auto_create_todos": config.enrichment_auto_create_todos,
|
|
1550
|
-
"auto_create_entities": config.enrichment_auto_create_entities,
|
|
1551
|
-
"min_confidence_todo": config.enrichment_min_confidence_todo,
|
|
1552
|
-
"min_confidence_entity": config.enrichment_min_confidence_entity,
|
|
1553
|
-
}
|
|
1554
|
-
)
|
|
1555
|
-
|
|
1556
|
-
total_todos = 0
|
|
1557
|
-
total_entities_pending = 0
|
|
1558
|
-
total_entities_created = 0
|
|
1559
|
-
completed = 0
|
|
1560
|
-
errors = 0
|
|
1561
|
-
|
|
1562
|
-
def enrich_one(doc: dict) -> tuple[dict, dict | None, str | None]:
|
|
1563
|
-
"""Process a single document. Returns (doc, stats, error)."""
|
|
1564
|
-
proj = doc["metadata"].get("project") if doc["metadata"] else None
|
|
1565
|
-
try:
|
|
1566
|
-
stats = process_enrichment(
|
|
1567
|
-
document_id=str(doc["id"]),
|
|
1568
|
-
source_path=doc["source_path"],
|
|
1569
|
-
title=doc["title"],
|
|
1570
|
-
content=doc["content"],
|
|
1571
|
-
source_type=doc["source_type"],
|
|
1572
|
-
db_url=db_cfg.url,
|
|
1573
|
-
config=enrich_config,
|
|
1574
|
-
project=proj,
|
|
1575
|
-
)
|
|
1576
|
-
return doc, stats, None
|
|
1577
|
-
except Exception as e:
|
|
1578
|
-
return doc, None, str(e)
|
|
1579
|
-
|
|
1580
|
-
click.echo(f"Processing with {workers} parallel workers...")
|
|
1581
|
-
|
|
1582
|
-
with ThreadPoolExecutor(max_workers=workers) as executor:
|
|
1583
|
-
futures = {executor.submit(enrich_one, doc): doc for doc in docs}
|
|
1584
|
-
|
|
1585
|
-
for future in as_completed(futures):
|
|
1586
|
-
doc, stats, error = future.result()
|
|
1587
|
-
completed += 1
|
|
1588
|
-
title = doc["title"][:40] if doc["title"] else "Untitled"
|
|
1589
|
-
|
|
1590
|
-
if error:
|
|
1591
|
-
errors += 1
|
|
1592
|
-
click.echo(f"[{completed}/{len(docs)}] {title}... -> error: {error[:50]}")
|
|
1593
|
-
continue
|
|
1594
|
-
|
|
1595
|
-
total_todos += stats["todos_created"]
|
|
1596
|
-
total_entities_pending += stats["entities_pending"]
|
|
1597
|
-
total_entities_created += stats["entities_created"]
|
|
1598
|
-
|
|
1599
|
-
parts = []
|
|
1600
|
-
if stats["todos_created"]:
|
|
1601
|
-
parts.append(f"{stats['todos_created']} TODOs")
|
|
1602
|
-
if stats["entities_pending"]:
|
|
1603
|
-
parts.append(f"{stats['entities_pending']} pending")
|
|
1604
|
-
if stats["entities_created"]:
|
|
1605
|
-
parts.append(f"{stats['entities_created']} entities")
|
|
1606
|
-
if parts:
|
|
1607
|
-
click.echo(f"[{completed}/{len(docs)}] {title}... -> {', '.join(parts)}")
|
|
1608
|
-
else:
|
|
1609
|
-
click.echo(f"[{completed}/{len(docs)}] {title}... -> nothing extracted")
|
|
1610
|
-
|
|
1611
|
-
click.echo("")
|
|
1612
|
-
click.echo("Summary:")
|
|
1613
|
-
click.echo(f" Documents processed: {len(docs)}")
|
|
1614
|
-
if errors:
|
|
1615
|
-
click.echo(f" Errors: {errors}")
|
|
1616
|
-
click.echo(f" TODOs created: {total_todos}")
|
|
1617
|
-
click.echo(f" Entities pending review: {total_entities_pending}")
|
|
1618
|
-
click.echo(f" Entities auto-created: {total_entities_created}")
|
|
1619
|
-
|
|
1620
|
-
|
|
1621
|
-
@enrich.command("pending")
|
|
1622
|
-
@click.option("--db", "database", default=None, help="Database to check")
|
|
1623
|
-
@click.option("--type", "entity_type", default=None, help="Filter by entity type")
|
|
1624
|
-
@click.option("--limit", default=50, help="Maximum results")
|
|
1625
|
-
@click.pass_context
|
|
1626
|
-
def enrich_pending(ctx, database: str | None, entity_type: str | None, limit: int):
|
|
1627
|
-
"""List pending entity suggestions awaiting review.
|
|
1628
|
-
|
|
1629
|
-
Shows entities extracted from documents that need approval before
|
|
1630
|
-
becoming searchable. Use 'okb enrich approve' or 'okb enrich reject'
|
|
1631
|
-
to process them.
|
|
1632
|
-
"""
|
|
1633
|
-
from .llm.enrich import list_pending_entities
|
|
1634
|
-
|
|
1635
|
-
db_name = database or ctx.obj.get("database")
|
|
1636
|
-
db_cfg = config.get_database(db_name)
|
|
1637
|
-
|
|
1638
|
-
entities = list_pending_entities(db_cfg.url, entity_type=entity_type, limit=limit)
|
|
1639
|
-
|
|
1640
|
-
if not entities:
|
|
1641
|
-
click.echo("No pending entity suggestions.")
|
|
1642
|
-
return
|
|
1643
|
-
|
|
1644
|
-
click.echo(f"Pending entities ({len(entities)}):\n")
|
|
1645
|
-
for e in entities:
|
|
1646
|
-
confidence = e.get("confidence", 0)
|
|
1647
|
-
confidence_str = f" ({confidence:.0%})" if confidence else ""
|
|
1648
|
-
click.echo(f" [{e['entity_type']}] {e['entity_name']}{confidence_str}")
|
|
1649
|
-
click.echo(f" ID: {e['id']}")
|
|
1650
|
-
if e.get("description"):
|
|
1651
|
-
desc = (
|
|
1652
|
-
e["description"][:60] + "..."
|
|
1653
|
-
if len(e.get("description", "")) > 60
|
|
1654
|
-
else e["description"]
|
|
1655
|
-
)
|
|
1656
|
-
click.echo(f" {desc}")
|
|
1657
|
-
if e.get("aliases"):
|
|
1658
|
-
click.echo(f" Aliases: {', '.join(e['aliases'][:3])}")
|
|
1659
|
-
click.echo(f" Source: {e['source_title']}")
|
|
1660
|
-
click.echo("")
|
|
1661
|
-
|
|
1662
|
-
click.echo("Use 'okb enrich approve <id>' or 'okb enrich reject <id>' to process.")
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
@enrich.command("approve")
|
|
1666
|
-
@click.argument("pending_id")
|
|
1667
|
-
@click.option("--db", "database", default=None, help="Database")
|
|
1668
|
-
@click.option("--local", is_flag=True, help="Use local CPU embedding instead of Modal")
|
|
1669
|
-
@click.pass_context
|
|
1670
|
-
def enrich_approve(ctx, pending_id: str, database: str | None, local: bool):
|
|
1671
|
-
"""Approve a pending entity, creating it as a searchable document."""
|
|
1672
|
-
from .llm.enrich import approve_entity
|
|
1673
|
-
|
|
1674
|
-
db_name = database or ctx.obj.get("database")
|
|
1675
|
-
db_cfg = config.get_database(db_name)
|
|
1676
|
-
|
|
1677
|
-
source_path = approve_entity(db_cfg.url, pending_id, use_modal=not local)
|
|
1678
|
-
if source_path:
|
|
1679
|
-
click.echo(f"Entity approved and created: {source_path}")
|
|
1680
|
-
else:
|
|
1681
|
-
click.echo("Failed to approve entity. ID may be invalid or already processed.", err=True)
|
|
1682
|
-
sys.exit(1)
|
|
1683
|
-
|
|
1684
|
-
|
|
1685
|
-
@enrich.command("reject")
|
|
1686
|
-
@click.argument("pending_id")
|
|
1687
|
-
@click.option("--db", "database", default=None, help="Database")
|
|
1688
|
-
@click.pass_context
|
|
1689
|
-
def enrich_reject(ctx, pending_id: str, database: str | None):
|
|
1690
|
-
"""Reject a pending entity suggestion."""
|
|
1691
|
-
from .llm.enrich import reject_entity
|
|
1692
|
-
|
|
1693
|
-
db_name = database or ctx.obj.get("database")
|
|
1694
|
-
db_cfg = config.get_database(db_name)
|
|
1695
|
-
|
|
1696
|
-
if reject_entity(db_cfg.url, pending_id):
|
|
1697
|
-
click.echo("Entity rejected.")
|
|
1698
|
-
else:
|
|
1699
|
-
click.echo("Failed to reject entity. ID may be invalid or already processed.", err=True)
|
|
1700
|
-
sys.exit(1)
|
|
1701
|
-
|
|
1702
|
-
|
|
1703
|
-
@enrich.command("analyze")
|
|
1704
|
-
@click.option("--db", "database", default=None, help="Database to analyze")
|
|
1705
|
-
@click.option("--project", default=None, help="Analyze specific project only")
|
|
1706
|
-
@click.option("--sample-size", default=15, help="Number of documents to sample")
|
|
1707
|
-
@click.option("--no-update", is_flag=True, help="Don't update database metadata")
|
|
1708
|
-
@click.option("--stats-only", is_flag=True, help="Show stats without LLM analysis")
|
|
1709
|
-
@click.pass_context
|
|
1710
|
-
def enrich_analyze(
|
|
1711
|
-
ctx,
|
|
1712
|
-
database: str | None,
|
|
1713
|
-
project: str | None,
|
|
1714
|
-
sample_size: int,
|
|
1715
|
-
no_update: bool,
|
|
1716
|
-
stats_only: bool,
|
|
1717
|
-
):
|
|
1718
|
-
"""Analyze knowledge base and update description/topics.
|
|
1719
|
-
|
|
1720
|
-
Uses entity aggregation and document sampling to understand the overall
|
|
1721
|
-
content and themes in the knowledge base. Generates a description and
|
|
1722
|
-
topic keywords using LLM analysis.
|
|
1723
|
-
|
|
1724
|
-
Examples:
|
|
1725
|
-
|
|
1726
|
-
okb enrich analyze # Analyze entire database
|
|
1727
|
-
|
|
1728
|
-
okb enrich analyze --stats-only # Show stats without LLM call
|
|
1729
|
-
|
|
1730
|
-
okb enrich analyze --project myproject # Analyze specific project
|
|
1731
|
-
|
|
1732
|
-
okb enrich analyze --no-update # Analyze without updating metadata
|
|
1733
|
-
"""
|
|
1734
|
-
from .llm.analyze import (
|
|
1735
|
-
analyze_database,
|
|
1736
|
-
get_content_stats,
|
|
1737
|
-
get_entity_summary,
|
|
1738
|
-
)
|
|
1739
|
-
|
|
1740
|
-
db_name = database or ctx.obj.get("database")
|
|
1741
|
-
db_cfg = config.get_database(db_name)
|
|
1742
|
-
|
|
1743
|
-
scope = f"project '{project}'" if project else f"database '{db_cfg.name}'"
|
|
1744
|
-
click.echo(f"Analyzing {scope}...\n")
|
|
1745
|
-
|
|
1746
|
-
# Always get stats
|
|
1747
|
-
stats = get_content_stats(db_cfg.url, project)
|
|
1748
|
-
entities = get_entity_summary(db_cfg.url, project, limit=20)
|
|
1749
|
-
|
|
1750
|
-
# Show stats
|
|
1751
|
-
click.echo("Content Statistics:")
|
|
1752
|
-
click.echo(f" Documents: {stats['total_documents']:,}")
|
|
1753
|
-
click.echo(f" Tokens: ~{stats['total_tokens']:,}")
|
|
1754
|
-
if stats["source_types"]:
|
|
1755
|
-
sorted_types = sorted(stats["source_types"].items(), key=lambda x: -x[1])
|
|
1756
|
-
types_parts = [f"{t}: {c}" for t, c in sorted_types]
|
|
1757
|
-
# Break into multiple lines if many types
|
|
1758
|
-
if len(types_parts) > 4:
|
|
1759
|
-
click.echo(" Source types:")
|
|
1760
|
-
for tp in types_parts:
|
|
1761
|
-
click.echo(f" {tp}")
|
|
1762
|
-
else:
|
|
1763
|
-
click.echo(f" Source types: {', '.join(types_parts)}")
|
|
1764
|
-
if stats["projects"]:
|
|
1765
|
-
click.echo(f" Projects: {', '.join(stats['projects'])}")
|
|
1766
|
-
if stats["date_range"]["earliest"]:
|
|
1767
|
-
earliest = stats["date_range"]["earliest"]
|
|
1768
|
-
latest = stats["date_range"]["latest"]
|
|
1769
|
-
click.echo(f" Date range: {earliest} to {latest}")
|
|
1770
|
-
|
|
1771
|
-
click.echo("")
|
|
1772
|
-
|
|
1773
|
-
# Show top entities
|
|
1774
|
-
if entities:
|
|
1775
|
-
click.echo("Top Entities (by mentions):")
|
|
1776
|
-
for i, e in enumerate(entities[:10], 1):
|
|
1777
|
-
name, etype = e["name"], e["type"]
|
|
1778
|
-
refs, docs = e["ref_count"], e["doc_count"]
|
|
1779
|
-
click.echo(f" {i}. {name} ({etype}) - {refs} mentions in {docs} docs")
|
|
1780
|
-
click.echo("")
|
|
1781
|
-
else:
|
|
1782
|
-
click.echo("No entities extracted yet.")
|
|
1783
|
-
click.echo("Run 'okb enrich run' to extract entities from documents.\n")
|
|
1784
|
-
|
|
1785
|
-
if stats_only:
|
|
1786
|
-
return
|
|
1787
|
-
|
|
1788
|
-
# Check LLM is configured
|
|
1789
|
-
from .llm import get_llm
|
|
1790
|
-
|
|
1791
|
-
if get_llm() is None:
|
|
1792
|
-
click.echo("Error: No LLM provider configured.", err=True)
|
|
1793
|
-
click.echo("", err=True)
|
|
1794
|
-
click.echo("Analysis requires an LLM to generate description and topics.", err=True)
|
|
1795
|
-
click.echo("Set ANTHROPIC_API_KEY or configure in ~/.config/okb/config.yaml:", err=True)
|
|
1796
|
-
click.echo("", err=True)
|
|
1797
|
-
click.echo(" llm:", err=True)
|
|
1798
|
-
click.echo(" provider: claude", err=True)
|
|
1799
|
-
click.echo("", err=True)
|
|
1800
|
-
click.echo("Use --stats-only to see statistics without LLM analysis.", err=True)
|
|
1801
|
-
ctx.exit(1)
|
|
1802
|
-
|
|
1803
|
-
click.echo(f"Sampling {sample_size} documents for analysis...")
|
|
1804
|
-
click.echo("Generating description and topics...")
|
|
1805
|
-
click.echo("")
|
|
1806
|
-
|
|
1807
|
-
try:
|
|
1808
|
-
result = analyze_database(
|
|
1809
|
-
db_url=db_cfg.url,
|
|
1810
|
-
project=project,
|
|
1811
|
-
sample_size=sample_size,
|
|
1812
|
-
auto_update=not no_update,
|
|
1813
|
-
)
|
|
1814
|
-
|
|
1815
|
-
click.echo("Analysis Complete:")
|
|
1816
|
-
click.echo(f" Description: {result.description}")
|
|
1817
|
-
click.echo(f" Topics: {', '.join(result.topics)}")
|
|
1818
|
-
|
|
1819
|
-
if not no_update:
|
|
1820
|
-
click.echo("")
|
|
1821
|
-
click.echo("Updated database metadata.")
|
|
1822
|
-
else:
|
|
1823
|
-
click.echo("")
|
|
1824
|
-
click.echo("(metadata not updated - use without --no-update to save)")
|
|
1825
|
-
|
|
1826
|
-
except Exception as e:
|
|
1827
|
-
click.echo(f"Error during analysis: {e}", err=True)
|
|
1828
|
-
ctx.exit(1)
|
|
1829
|
-
|
|
1830
|
-
|
|
1831
|
-
@enrich.command("consolidate")
|
|
1832
|
-
@click.option("--db", "database", default=None, help="Database to consolidate")
|
|
1833
|
-
@click.option("--duplicates/--no-duplicates", "detect_duplicates", default=True,
|
|
1834
|
-
help="Detect duplicate entities")
|
|
1835
|
-
@click.option("--cross-doc/--no-cross-doc", "detect_cross_doc", default=True,
|
|
1836
|
-
help="Detect cross-document entities")
|
|
1837
|
-
@click.option("--clusters/--no-clusters", "build_clusters", default=True,
|
|
1838
|
-
help="Build topic clusters")
|
|
1839
|
-
@click.option("--relationships/--no-relationships", "extract_relationships", default=True,
|
|
1840
|
-
help="Extract entity relationships")
|
|
1841
|
-
@click.option("--dry-run", is_flag=True, help="Show what would be found without creating proposals")
|
|
1842
|
-
@click.pass_context
|
|
1843
|
-
def enrich_consolidate(
|
|
1844
|
-
ctx,
|
|
1845
|
-
database: str | None,
|
|
1846
|
-
detect_duplicates: bool,
|
|
1847
|
-
detect_cross_doc: bool,
|
|
1848
|
-
build_clusters: bool,
|
|
1849
|
-
extract_relationships: bool,
|
|
1850
|
-
dry_run: bool,
|
|
1851
|
-
):
|
|
1852
|
-
"""Run entity consolidation pipeline.
|
|
1853
|
-
|
|
1854
|
-
Detects duplicate entities, cross-document mentions, builds topic clusters,
|
|
1855
|
-
and extracts entity relationships. Creates pending proposals for review
|
|
1856
|
-
rather than auto-applying changes.
|
|
1857
|
-
|
|
1858
|
-
Examples:
|
|
1859
|
-
|
|
1860
|
-
okb enrich consolidate # Run full consolidation
|
|
1861
|
-
|
|
1862
|
-
okb enrich consolidate --dry-run # Show what would be found
|
|
1863
|
-
|
|
1864
|
-
okb enrich consolidate --no-clusters # Skip clustering
|
|
1865
|
-
|
|
1866
|
-
okb enrich consolidate --duplicates --no-cross-doc --no-clusters --no-relationships
|
|
1867
|
-
"""
|
|
1868
|
-
from .llm import get_llm
|
|
1869
|
-
from .llm.consolidate import format_consolidation_result, run_consolidation
|
|
1870
|
-
|
|
1871
|
-
# Check LLM is configured if needed
|
|
1872
|
-
if get_llm() is None:
|
|
1873
|
-
click.echo("Error: No LLM provider configured.", err=True)
|
|
1874
|
-
click.echo("Consolidation requires an LLM for deduplication and clustering.", err=True)
|
|
1875
|
-
click.echo("Set ANTHROPIC_API_KEY or configure in ~/.config/okb/config.yaml", err=True)
|
|
1876
|
-
ctx.exit(1)
|
|
1877
|
-
|
|
1878
|
-
db_name = database or ctx.obj.get("database")
|
|
1879
|
-
db_cfg = config.get_database(db_name)
|
|
1880
|
-
|
|
1881
|
-
click.echo(f"Running consolidation on database '{db_cfg.name}'...")
|
|
1882
|
-
if dry_run:
|
|
1883
|
-
click.echo("(dry run - no proposals will be created)")
|
|
1884
|
-
|
|
1885
|
-
result = run_consolidation(
|
|
1886
|
-
db_url=db_cfg.url,
|
|
1887
|
-
detect_duplicates=detect_duplicates,
|
|
1888
|
-
detect_cross_doc=detect_cross_doc,
|
|
1889
|
-
build_clusters=build_clusters,
|
|
1890
|
-
extract_relationships=extract_relationships,
|
|
1891
|
-
dry_run=dry_run,
|
|
1892
|
-
)
|
|
1893
|
-
|
|
1894
|
-
# Format and display result
|
|
1895
|
-
output = format_consolidation_result(result)
|
|
1896
|
-
click.echo("")
|
|
1897
|
-
click.echo(output)
|
|
1898
|
-
|
|
1899
|
-
if not dry_run and (result.duplicates_found > 0 or result.cross_doc_candidates > 0):
|
|
1900
|
-
click.echo("")
|
|
1901
|
-
click.echo("Use 'okb enrich merge-proposals' to review pending merges.")
|
|
1902
|
-
|
|
1903
|
-
|
|
1904
|
-
@enrich.command("merge-proposals")
|
|
1905
|
-
@click.option("--db", "database", default=None, help="Database to check")
|
|
1906
|
-
@click.option("--limit", default=50, help="Maximum results")
|
|
1907
|
-
@click.pass_context
|
|
1908
|
-
def enrich_merge_proposals(ctx, database: str | None, limit: int):
|
|
1909
|
-
"""List pending entity merge proposals.
|
|
1910
|
-
|
|
1911
|
-
Shows duplicate entities and cross-document mentions awaiting review.
|
|
1912
|
-
Use 'okb enrich approve-merge' or 'okb enrich reject-merge' to process.
|
|
1913
|
-
"""
|
|
1914
|
-
from .llm.extractors.dedup import list_pending_merges
|
|
1915
|
-
|
|
1916
|
-
db_name = database or ctx.obj.get("database")
|
|
1917
|
-
db_cfg = config.get_database(db_name)
|
|
1918
|
-
|
|
1919
|
-
merges = list_pending_merges(db_cfg.url, limit=limit)
|
|
1920
|
-
|
|
1921
|
-
if not merges:
|
|
1922
|
-
click.echo("No pending merge proposals.")
|
|
1923
|
-
return
|
|
1924
|
-
|
|
1925
|
-
click.echo(f"Pending merge proposals ({len(merges)}):\n")
|
|
1926
|
-
for m in merges:
|
|
1927
|
-
confidence = m.get("confidence", 0)
|
|
1928
|
-
confidence_str = f" ({confidence:.0%})" if confidence else ""
|
|
1929
|
-
click.echo(f" {m['canonical_name']} <- {m['duplicate_name']}{confidence_str}")
|
|
1930
|
-
click.echo(f" ID: {m['id']}")
|
|
1931
|
-
click.echo(f" Reason: {m.get('reason', 'similarity')}")
|
|
1932
|
-
click.echo("")
|
|
1933
|
-
|
|
1934
|
-
click.echo("Use 'okb enrich approve-merge <id>' or 'okb enrich reject-merge <id>' to process.")
|
|
1935
|
-
|
|
1936
|
-
|
|
1937
|
-
@enrich.command("approve-merge")
|
|
1938
|
-
@click.argument("merge_id")
|
|
1939
|
-
@click.option("--db", "database", default=None, help="Database")
|
|
1940
|
-
@click.pass_context
|
|
1941
|
-
def enrich_approve_merge(ctx, merge_id: str, database: str | None):
|
|
1942
|
-
"""Approve a pending entity merge.
|
|
1943
|
-
|
|
1944
|
-
Merges the duplicate entity into the canonical entity:
|
|
1945
|
-
- Redirects all entity references from duplicate to canonical
|
|
1946
|
-
- Adds duplicate's name as an alias for canonical
|
|
1947
|
-
- Deletes the duplicate entity document
|
|
1948
|
-
"""
|
|
1949
|
-
from .llm.extractors.dedup import approve_merge
|
|
1950
|
-
|
|
1951
|
-
db_name = database or ctx.obj.get("database")
|
|
1952
|
-
db_cfg = config.get_database(db_name)
|
|
1953
|
-
|
|
1954
|
-
if approve_merge(db_cfg.url, merge_id):
|
|
1955
|
-
click.echo("Merge approved and executed.")
|
|
1956
|
-
else:
|
|
1957
|
-
click.echo("Failed to approve merge. ID may be invalid or already processed.", err=True)
|
|
1958
|
-
sys.exit(1)
|
|
1959
|
-
|
|
1960
|
-
|
|
1961
|
-
@enrich.command("reject-merge")
|
|
1962
|
-
@click.argument("merge_id")
|
|
1963
|
-
@click.option("--db", "database", default=None, help="Database")
|
|
1964
|
-
@click.pass_context
|
|
1965
|
-
def enrich_reject_merge(ctx, merge_id: str, database: str | None):
|
|
1966
|
-
"""Reject a pending entity merge proposal."""
|
|
1967
|
-
from .llm.extractors.dedup import reject_merge
|
|
1968
|
-
|
|
1969
|
-
db_name = database or ctx.obj.get("database")
|
|
1970
|
-
db_cfg = config.get_database(db_name)
|
|
1971
|
-
|
|
1972
|
-
if reject_merge(db_cfg.url, merge_id):
|
|
1973
|
-
click.echo("Merge rejected.")
|
|
1974
|
-
else:
|
|
1975
|
-
click.echo("Failed to reject merge. ID may be invalid or already processed.", err=True)
|
|
1976
|
-
sys.exit(1)
|
|
1977
|
-
|
|
1978
|
-
|
|
1979
|
-
@enrich.command("clusters")
|
|
1980
|
-
@click.option("--db", "database", default=None, help="Database to check")
|
|
1981
|
-
@click.option("--limit", default=20, help="Maximum clusters to show")
|
|
1982
|
-
@click.pass_context
|
|
1983
|
-
def enrich_clusters(ctx, database: str | None, limit: int):
|
|
1984
|
-
"""List topic clusters.
|
|
1985
|
-
|
|
1986
|
-
Shows groups of related entities and documents organized by theme.
|
|
1987
|
-
"""
|
|
1988
|
-
from .llm.consolidate import get_topic_clusters
|
|
1989
|
-
|
|
1990
|
-
db_name = database or ctx.obj.get("database")
|
|
1991
|
-
db_cfg = config.get_database(db_name)
|
|
1992
|
-
|
|
1993
|
-
clusters = get_topic_clusters(db_cfg.url, limit=limit)
|
|
1994
|
-
|
|
1995
|
-
if not clusters:
|
|
1996
|
-
click.echo("No topic clusters found.")
|
|
1997
|
-
click.echo("Run 'okb enrich consolidate' to generate clusters.")
|
|
1998
|
-
return
|
|
1999
|
-
|
|
2000
|
-
click.echo(f"Topic clusters ({len(clusters)}):\n")
|
|
2001
|
-
for c in clusters:
|
|
2002
|
-
click.echo(f" {c['name']}")
|
|
2003
|
-
if c.get("description"):
|
|
2004
|
-
desc = c["description"][:70] + "..." if len(c["description"]) > 70 else c["description"]
|
|
2005
|
-
click.echo(f" {desc}")
|
|
2006
|
-
click.echo(f" Members: {c['member_count']} entities/documents")
|
|
2007
|
-
if c.get("sample_members"):
|
|
2008
|
-
samples = ", ".join(c["sample_members"][:5])
|
|
2009
|
-
click.echo(f" Examples: {samples}")
|
|
2010
|
-
click.echo("")
|
|
2011
|
-
|
|
2012
|
-
|
|
2013
|
-
@enrich.command("relationships")
|
|
2014
|
-
@click.option("--db", "database", default=None, help="Database to check")
|
|
2015
|
-
@click.option("--entity", "entity_name", default=None, help="Filter to specific entity")
|
|
2016
|
-
@click.option("--type", "relationship_type", default=None,
|
|
2017
|
-
help="Filter by relationship type (works_for, uses, belongs_to, related_to)")
|
|
2018
|
-
@click.option("--limit", default=50, help="Maximum results")
|
|
2019
|
-
@click.pass_context
|
|
2020
|
-
def enrich_relationships(
|
|
2021
|
-
ctx,
|
|
2022
|
-
database: str | None,
|
|
2023
|
-
entity_name: str | None,
|
|
2024
|
-
relationship_type: str | None,
|
|
2025
|
-
limit: int,
|
|
2026
|
-
):
|
|
2027
|
-
"""List entity relationships.
|
|
2028
|
-
|
|
2029
|
-
Shows connections between entities (person→org, tech→project, etc.).
|
|
2030
|
-
|
|
2031
|
-
Examples:
|
|
2032
|
-
|
|
2033
|
-
okb enrich relationships # All relationships
|
|
2034
|
-
|
|
2035
|
-
okb enrich relationships --entity "Django" # Filter to one entity
|
|
2036
|
-
|
|
2037
|
-
okb enrich relationships --type works_for # Filter by type
|
|
2038
|
-
"""
|
|
2039
|
-
from .llm.consolidate import get_entity_relationships
|
|
2040
|
-
|
|
2041
|
-
db_name = database or ctx.obj.get("database")
|
|
2042
|
-
db_cfg = config.get_database(db_name)
|
|
2043
|
-
|
|
2044
|
-
relationships = get_entity_relationships(
|
|
2045
|
-
db_cfg.url,
|
|
2046
|
-
entity_name=entity_name,
|
|
2047
|
-
relationship_type=relationship_type,
|
|
2048
|
-
limit=limit,
|
|
2049
|
-
)
|
|
2050
|
-
|
|
2051
|
-
if not relationships:
|
|
2052
|
-
if entity_name:
|
|
2053
|
-
click.echo(f"No relationships found for entity '{entity_name}'.")
|
|
2054
|
-
else:
|
|
2055
|
-
click.echo("No relationships found.")
|
|
2056
|
-
click.echo("Run 'okb enrich consolidate' to extract relationships.")
|
|
2057
|
-
return
|
|
2058
|
-
|
|
2059
|
-
click.echo(f"Entity relationships ({len(relationships)}):\n")
|
|
2060
|
-
for r in relationships:
|
|
2061
|
-
confidence = r.get("confidence", 0)
|
|
2062
|
-
conf_str = f" ({confidence:.0%})" if confidence else ""
|
|
2063
|
-
click.echo(f" {r['source_name']} --[{r['relationship_type']}]--> {r['target_name']}{conf_str}")
|
|
2064
|
-
if r.get("evidence"):
|
|
2065
|
-
evidence = r["evidence"][:60] + "..." if len(r["evidence"]) > 60 else r["evidence"]
|
|
2066
|
-
click.echo(f" Evidence: {evidence}")
|
|
2067
|
-
click.echo("")
|
|
2068
|
-
|
|
2069
|
-
|
|
2070
|
-
@enrich.command("all")
|
|
2071
|
-
@click.option("--db", "database", default=None, help="Database to enrich")
|
|
2072
|
-
@click.option("--source-type", default=None, help="Filter by source type")
|
|
2073
|
-
@click.option("--project", default=None, help="Filter by project")
|
|
2074
|
-
@click.option("--query", default=None, help="Semantic search query to filter documents")
|
|
2075
|
-
@click.option("--path-pattern", default=None, help="SQL LIKE pattern for source_path")
|
|
2076
|
-
@click.option("--limit", default=100, help="Maximum documents to process")
|
|
2077
|
-
@click.option("--workers", default=None, type=int, help="Parallel workers (default: docs/5, min 1)")
|
|
2078
|
-
@click.option("--dry-run", is_flag=True, help="Show what would be done without executing")
|
|
2079
|
-
@click.option("--skip-consolidate", is_flag=True, help="Skip consolidation phase")
|
|
2080
|
-
@click.option("--duplicates/--no-duplicates", "detect_duplicates", default=True,
|
|
2081
|
-
help="Detect duplicate entities during consolidation")
|
|
2082
|
-
@click.option("--clusters/--no-clusters", "build_clusters", default=True,
|
|
2083
|
-
help="Build topic clusters during consolidation")
|
|
2084
|
-
@click.option("--relationships/--no-relationships", "extract_relationships", default=True,
|
|
2085
|
-
help="Extract entity relationships during consolidation")
|
|
2086
|
-
@click.pass_context
|
|
2087
|
-
def enrich_all(
|
|
2088
|
-
ctx,
|
|
2089
|
-
database: str | None,
|
|
2090
|
-
source_type: str | None,
|
|
2091
|
-
project: str | None,
|
|
2092
|
-
query: str | None,
|
|
2093
|
-
path_pattern: str | None,
|
|
2094
|
-
limit: int,
|
|
2095
|
-
workers: int | None,
|
|
2096
|
-
dry_run: bool,
|
|
2097
|
-
skip_consolidate: bool,
|
|
2098
|
-
detect_duplicates: bool,
|
|
2099
|
-
build_clusters: bool,
|
|
2100
|
-
extract_relationships: bool,
|
|
2101
|
-
):
|
|
2102
|
-
"""Run full enrichment pipeline: extraction + consolidation.
|
|
2103
|
-
|
|
2104
|
-
Combines 'enrich run' and 'enrich consolidate' in one command for
|
|
2105
|
-
one-shot enrichment of documents.
|
|
2106
|
-
|
|
2107
|
-
Examples:
|
|
2108
|
-
|
|
2109
|
-
okb enrich all # Run full pipeline
|
|
2110
|
-
|
|
2111
|
-
okb enrich all --dry-run # Preview what would happen
|
|
2112
|
-
|
|
2113
|
-
okb enrich all --skip-consolidate # Run extraction only
|
|
2114
|
-
|
|
2115
|
-
okb enrich all --source-type markdown # Filter to markdown files
|
|
2116
|
-
|
|
2117
|
-
okb enrich all --no-clusters # Skip cluster building
|
|
2118
|
-
"""
|
|
2119
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
2120
|
-
|
|
2121
|
-
from .llm import get_llm
|
|
2122
|
-
from .llm.consolidate import format_consolidation_result, run_consolidation
|
|
2123
|
-
from .llm.enrich import EnrichmentConfig, get_unenriched_documents, process_enrichment
|
|
2124
|
-
|
|
2125
|
-
# Check LLM is configured
|
|
2126
|
-
if get_llm() is None:
|
|
2127
|
-
click.echo("Error: No LLM provider configured.", err=True)
|
|
2128
|
-
click.echo("Set ANTHROPIC_API_KEY or configure in ~/.config/okb/config.yaml", err=True)
|
|
2129
|
-
ctx.exit(1)
|
|
2130
|
-
|
|
2131
|
-
db_name = database or ctx.obj.get("database")
|
|
2132
|
-
db_cfg = config.get_database(db_name)
|
|
2133
|
-
|
|
2134
|
-
# Phase 1: Enrichment
|
|
2135
|
-
click.echo("=== Phase 1: Enrichment ===")
|
|
2136
|
-
click.echo(f"Scanning database '{db_cfg.name}' for documents to enrich...")
|
|
2137
|
-
if dry_run:
|
|
2138
|
-
click.echo("(dry run - no changes will be made)")
|
|
2139
|
-
|
|
2140
|
-
docs = get_unenriched_documents(
|
|
2141
|
-
db_url=db_cfg.url,
|
|
2142
|
-
source_type=source_type,
|
|
2143
|
-
project=project,
|
|
2144
|
-
query=query,
|
|
2145
|
-
path_pattern=path_pattern,
|
|
2146
|
-
limit=limit,
|
|
2147
|
-
)
|
|
2148
|
-
|
|
2149
|
-
total_todos = 0
|
|
2150
|
-
total_entities_pending = 0
|
|
2151
|
-
total_entities_created = 0
|
|
2152
|
-
|
|
2153
|
-
if not docs:
|
|
2154
|
-
click.echo("No documents need enrichment.")
|
|
2155
|
-
else:
|
|
2156
|
-
click.echo(f"Found {len(docs)} documents to enrich")
|
|
2157
|
-
|
|
2158
|
-
if dry_run:
|
|
2159
|
-
for doc in docs[:20]:
|
|
2160
|
-
click.echo(f" - {doc['title']} ({doc['source_type']})")
|
|
2161
|
-
if len(docs) > 20:
|
|
2162
|
-
click.echo(f" ... and {len(docs) - 20} more")
|
|
2163
|
-
else:
|
|
2164
|
-
# Build config
|
|
2165
|
-
enrich_config = EnrichmentConfig.from_config(
|
|
2166
|
-
{
|
|
2167
|
-
"enabled": config.enrichment_enabled,
|
|
2168
|
-
"version": config.enrichment_version,
|
|
2169
|
-
"extract_todos": config.enrichment_extract_todos,
|
|
2170
|
-
"extract_entities": config.enrichment_extract_entities,
|
|
2171
|
-
"auto_create_todos": config.enrichment_auto_create_todos,
|
|
2172
|
-
"auto_create_entities": config.enrichment_auto_create_entities,
|
|
2173
|
-
"min_confidence_todo": config.enrichment_min_confidence_todo,
|
|
2174
|
-
"min_confidence_entity": config.enrichment_min_confidence_entity,
|
|
2175
|
-
}
|
|
2176
|
-
)
|
|
2177
|
-
|
|
2178
|
-
# Calculate workers
|
|
2179
|
-
if workers is None:
|
|
2180
|
-
workers = max(1, len(docs) // 5)
|
|
2181
|
-
|
|
2182
|
-
completed = 0
|
|
2183
|
-
errors = 0
|
|
2184
|
-
|
|
2185
|
-
def enrich_one(doc: dict) -> tuple[dict, dict | None, str | None]:
|
|
2186
|
-
proj = doc["metadata"].get("project") if doc["metadata"] else None
|
|
2187
|
-
try:
|
|
2188
|
-
stats = process_enrichment(
|
|
2189
|
-
document_id=str(doc["id"]),
|
|
2190
|
-
source_path=doc["source_path"],
|
|
2191
|
-
title=doc["title"],
|
|
2192
|
-
content=doc["content"],
|
|
2193
|
-
source_type=doc["source_type"],
|
|
2194
|
-
db_url=db_cfg.url,
|
|
2195
|
-
config=enrich_config,
|
|
2196
|
-
project=proj,
|
|
2197
|
-
)
|
|
2198
|
-
return doc, stats, None
|
|
2199
|
-
except Exception as e:
|
|
2200
|
-
return doc, None, str(e)
|
|
2201
|
-
|
|
2202
|
-
click.echo(f"Processing with {workers} parallel workers...")
|
|
2203
|
-
|
|
2204
|
-
with ThreadPoolExecutor(max_workers=workers) as executor:
|
|
2205
|
-
futures = {executor.submit(enrich_one, doc): doc for doc in docs}
|
|
2206
|
-
|
|
2207
|
-
for future in as_completed(futures):
|
|
2208
|
-
doc, stats, error = future.result()
|
|
2209
|
-
completed += 1
|
|
2210
|
-
title = doc["title"][:40] if doc["title"] else "Untitled"
|
|
2211
|
-
|
|
2212
|
-
if error:
|
|
2213
|
-
errors += 1
|
|
2214
|
-
click.echo(f"[{completed}/{len(docs)}] {title}... -> error: {error[:50]}")
|
|
2215
|
-
continue
|
|
2216
|
-
|
|
2217
|
-
total_todos += stats["todos_created"]
|
|
2218
|
-
total_entities_pending += stats["entities_pending"]
|
|
2219
|
-
total_entities_created += stats["entities_created"]
|
|
2220
|
-
|
|
2221
|
-
parts = []
|
|
2222
|
-
if stats["todos_created"]:
|
|
2223
|
-
parts.append(f"{stats['todos_created']} TODOs")
|
|
2224
|
-
if stats["entities_pending"]:
|
|
2225
|
-
parts.append(f"{stats['entities_pending']} pending")
|
|
2226
|
-
if stats["entities_created"]:
|
|
2227
|
-
parts.append(f"{stats['entities_created']} entities")
|
|
2228
|
-
if parts:
|
|
2229
|
-
click.echo(f"[{completed}/{len(docs)}] {title}... -> {', '.join(parts)}")
|
|
2230
|
-
else:
|
|
2231
|
-
click.echo(f"[{completed}/{len(docs)}] {title}... -> nothing extracted")
|
|
2232
|
-
|
|
2233
|
-
click.echo("")
|
|
2234
|
-
click.echo("Enrichment summary:")
|
|
2235
|
-
click.echo(f" Documents processed: {len(docs)}")
|
|
2236
|
-
if errors:
|
|
2237
|
-
click.echo(f" Errors: {errors}")
|
|
2238
|
-
click.echo(f" TODOs created: {total_todos}")
|
|
2239
|
-
click.echo(f" Entities pending review: {total_entities_pending}")
|
|
2240
|
-
click.echo(f" Entities auto-created: {total_entities_created}")
|
|
2241
|
-
|
|
2242
|
-
# Phase 2: Consolidation
|
|
2243
|
-
if skip_consolidate:
|
|
2244
|
-
click.echo("")
|
|
2245
|
-
click.echo("Skipping consolidation (--skip-consolidate)")
|
|
2246
|
-
return
|
|
2247
|
-
|
|
2248
|
-
click.echo("")
|
|
2249
|
-
click.echo("=== Phase 2: Consolidation ===")
|
|
2250
|
-
|
|
2251
|
-
result = run_consolidation(
|
|
2252
|
-
db_url=db_cfg.url,
|
|
2253
|
-
detect_duplicates=detect_duplicates,
|
|
2254
|
-
detect_cross_doc=True,
|
|
2255
|
-
build_clusters=build_clusters,
|
|
2256
|
-
extract_relationships=extract_relationships,
|
|
2257
|
-
dry_run=dry_run,
|
|
2258
|
-
)
|
|
2259
|
-
|
|
2260
|
-
output = format_consolidation_result(result)
|
|
2261
|
-
click.echo(output)
|
|
2262
|
-
|
|
2263
|
-
if not dry_run and (result.duplicates_found > 0 or result.cross_doc_candidates > 0):
|
|
2264
|
-
click.echo("")
|
|
2265
|
-
click.echo("Use 'okb enrich review' to review pending entities and merges.")
|
|
2266
|
-
|
|
2267
|
-
|
|
2268
|
-
@enrich.command("review")
|
|
2269
|
-
@click.option("--db", "database", default=None, help="Database to review")
|
|
2270
|
-
@click.option("--entities-only", is_flag=True, help="Only review pending entities")
|
|
2271
|
-
@click.option("--merges-only", is_flag=True, help="Only review pending merges")
|
|
2272
|
-
@click.option("--local", is_flag=True, help="Use local CPU embedding instead of Modal")
|
|
2273
|
-
@click.option("--wait/--no-wait", default=True, help="Wait for embeddings to complete")
|
|
2274
|
-
@click.pass_context
|
|
2275
|
-
def enrich_review(
|
|
2276
|
-
ctx, database: str | None, entities_only: bool, merges_only: bool, local: bool, wait: bool
|
|
2277
|
-
):
|
|
2278
|
-
"""Interactive review of pending entities and merge proposals.
|
|
2279
|
-
|
|
2280
|
-
Loops through pending items with approve/reject prompts.
|
|
2281
|
-
Press Q to quit early - remaining items stay pending for later.
|
|
2282
|
-
|
|
2283
|
-
Entity approvals run asynchronously - you can continue reviewing while
|
|
2284
|
-
embeddings are generated. Use --no-wait to exit immediately after reviewing.
|
|
2285
|
-
|
|
2286
|
-
Examples:
|
|
2287
|
-
|
|
2288
|
-
okb enrich review # Review all pending items
|
|
2289
|
-
|
|
2290
|
-
okb enrich review --entities-only # Only review entities
|
|
2291
|
-
|
|
2292
|
-
okb enrich review --merges-only # Only review merges
|
|
2293
|
-
|
|
2294
|
-
okb enrich review --local # Use local CPU embedding
|
|
2295
|
-
|
|
2296
|
-
okb enrich review --no-wait # Don't wait for embeddings
|
|
2297
|
-
"""
|
|
2298
|
-
|
|
2299
|
-
from .llm.enrich import (
|
|
2300
|
-
approve_entity_async,
|
|
2301
|
-
list_pending_entities,
|
|
2302
|
-
reject_entity,
|
|
2303
|
-
shutdown_executor,
|
|
2304
|
-
)
|
|
2305
|
-
from .llm.extractors.dedup import approve_merge, list_pending_merges, reject_merge
|
|
2306
|
-
|
|
2307
|
-
db_name = database or ctx.obj.get("database")
|
|
2308
|
-
db_cfg = config.get_database(db_name)
|
|
2309
|
-
use_modal = not local
|
|
2310
|
-
|
|
2311
|
-
# Get pending items
|
|
2312
|
-
entities = [] if merges_only else list_pending_entities(db_cfg.url, limit=100)
|
|
2313
|
-
merges = [] if entities_only else list_pending_merges(db_cfg.url, limit=100)
|
|
2314
|
-
|
|
2315
|
-
if not entities and not merges:
|
|
2316
|
-
click.echo("No pending items to review.")
|
|
2317
|
-
return
|
|
2318
|
-
|
|
2319
|
-
click.echo(f"Pending: {len(entities)} entities, {len(merges)} merges")
|
|
2320
|
-
click.echo("")
|
|
2321
|
-
|
|
2322
|
-
# Counters
|
|
2323
|
-
approved = 0
|
|
2324
|
-
rejected = 0
|
|
2325
|
-
skipped = 0
|
|
2326
|
-
|
|
2327
|
-
# Track async approval futures
|
|
2328
|
-
pending_futures: list[tuple] = [] # (future, entity_name)
|
|
2329
|
-
|
|
2330
|
-
# Review entities
|
|
2331
|
-
choice = None
|
|
2332
|
-
if entities and not merges_only:
|
|
2333
|
-
for i, e in enumerate(entities, 1):
|
|
2334
|
-
# Check for completed futures
|
|
2335
|
-
done_count = sum(1 for f, _ in pending_futures if f.done())
|
|
2336
|
-
if pending_futures and done_count > 0:
|
|
2337
|
-
total = len(pending_futures)
|
|
2338
|
-
click.echo(click.style(f" ({done_count}/{total} embeddings done)", dim=True))
|
|
2339
|
-
|
|
2340
|
-
click.echo(click.style(f"=== Entity Review [{i}/{len(entities)}] ===", bold=True))
|
|
2341
|
-
click.echo(f"Name: {click.style(e['entity_name'], fg='cyan')}")
|
|
2342
|
-
click.echo(f"Type: {e['entity_type']}")
|
|
2343
|
-
confidence = e.get("confidence", 0)
|
|
2344
|
-
if confidence:
|
|
2345
|
-
click.echo(f"Confidence: {confidence:.0%}")
|
|
2346
|
-
if e.get("description"):
|
|
2347
|
-
d = e["description"]
|
|
2348
|
-
desc = d[:80] + "..." if len(d) > 80 else d
|
|
2349
|
-
click.echo(f"Description: {desc}")
|
|
2350
|
-
if e.get("aliases"):
|
|
2351
|
-
click.echo(f"Aliases: {', '.join(e['aliases'][:5])}")
|
|
2352
|
-
click.echo(f"Source: {e['source_title']}")
|
|
2353
|
-
click.echo("")
|
|
2354
|
-
|
|
2355
|
-
choice = click.prompt(
|
|
2356
|
-
"[A]pprove [R]eject [S]kip [Q]uit",
|
|
2357
|
-
type=click.Choice(["A", "R", "S", "Q", "a", "r", "s", "q"]),
|
|
2358
|
-
show_choices=False,
|
|
2359
|
-
).upper()
|
|
2360
|
-
|
|
2361
|
-
if choice == "Q":
|
|
2362
|
-
click.echo("Quitting review...")
|
|
2363
|
-
break
|
|
2364
|
-
elif choice == "A":
|
|
2365
|
-
# Submit async approval
|
|
2366
|
-
future = approve_entity_async(db_cfg.url, str(e["id"]), use_modal)
|
|
2367
|
-
pending_futures.append((future, e["entity_name"]))
|
|
2368
|
-
click.echo(click.style("⏳ Queued for approval", fg="cyan"))
|
|
2369
|
-
approved += 1
|
|
2370
|
-
elif choice == "R":
|
|
2371
|
-
if reject_entity(db_cfg.url, str(e["id"])):
|
|
2372
|
-
click.echo(click.style("✗ Rejected", fg="yellow"))
|
|
2373
|
-
rejected += 1
|
|
2374
|
-
else:
|
|
2375
|
-
click.echo(click.style("✗ Failed to reject", fg="red"))
|
|
2376
|
-
else:
|
|
2377
|
-
click.echo("Skipped")
|
|
2378
|
-
skipped += 1
|
|
2379
|
-
|
|
2380
|
-
click.echo("")
|
|
2381
|
-
else:
|
|
2382
|
-
# Completed all entities, continue to merges
|
|
2383
|
-
pass
|
|
2384
|
-
|
|
2385
|
-
# Review merges (only if we didn't quit early)
|
|
2386
|
-
if merges and not entities_only and (not entities or choice != "Q"):
|
|
2387
|
-
for i, m in enumerate(merges, 1):
|
|
2388
|
-
click.echo(click.style(f"=== Merge Review [{i}/{len(merges)}] ===", bold=True))
|
|
2389
|
-
cname = click.style(m["canonical_name"], fg="cyan")
|
|
2390
|
-
ctype = m.get("canonical_type", "unknown")
|
|
2391
|
-
click.echo(f"Canonical: {cname} ({ctype})")
|
|
2392
|
-
dname = click.style(m["duplicate_name"], fg="yellow")
|
|
2393
|
-
dtype = m.get("duplicate_type", "unknown")
|
|
2394
|
-
click.echo(f"Duplicate: {dname} ({dtype})")
|
|
2395
|
-
confidence = m.get("confidence", 0)
|
|
2396
|
-
if confidence:
|
|
2397
|
-
click.echo(f"Confidence: {confidence:.0%}")
|
|
2398
|
-
click.echo(f"Reason: {m.get('reason', 'similarity')}")
|
|
2399
|
-
click.echo("")
|
|
2400
|
-
|
|
2401
|
-
choice = click.prompt(
|
|
2402
|
-
"[A]pprove [R]eject [S]kip [Q]uit",
|
|
2403
|
-
type=click.Choice(["A", "R", "S", "Q", "a", "r", "s", "q"]),
|
|
2404
|
-
show_choices=False,
|
|
2405
|
-
).upper()
|
|
2406
|
-
|
|
2407
|
-
if choice == "Q":
|
|
2408
|
-
click.echo("Quitting review...")
|
|
2409
|
-
break
|
|
2410
|
-
elif choice == "A":
|
|
2411
|
-
if approve_merge(db_cfg.url, str(m["id"])):
|
|
2412
|
-
click.echo(click.style("✓ Merged", fg="green"))
|
|
2413
|
-
approved += 1
|
|
2414
|
-
else:
|
|
2415
|
-
click.echo(click.style("✗ Failed to merge", fg="red"))
|
|
2416
|
-
elif choice == "R":
|
|
2417
|
-
if reject_merge(db_cfg.url, str(m["id"])):
|
|
2418
|
-
click.echo(click.style("✗ Rejected", fg="yellow"))
|
|
2419
|
-
rejected += 1
|
|
2420
|
-
else:
|
|
2421
|
-
click.echo(click.style("✗ Failed to reject", fg="red"))
|
|
2422
|
-
else:
|
|
2423
|
-
click.echo("Skipped")
|
|
2424
|
-
skipped += 1
|
|
2425
|
-
|
|
2426
|
-
click.echo("")
|
|
2427
|
-
|
|
2428
|
-
# Wait for pending approvals if requested
|
|
2429
|
-
if pending_futures:
|
|
2430
|
-
if wait:
|
|
2431
|
-
click.echo(f"Waiting for {len(pending_futures)} pending approvals...")
|
|
2432
|
-
succeeded = 0
|
|
2433
|
-
failed = 0
|
|
2434
|
-
for future, name in pending_futures:
|
|
2435
|
-
try:
|
|
2436
|
-
result = future.result(timeout=120)
|
|
2437
|
-
if result:
|
|
2438
|
-
click.echo(click.style(f" ✓ {name}", fg="green"))
|
|
2439
|
-
succeeded += 1
|
|
2440
|
-
else:
|
|
2441
|
-
click.echo(click.style(f" ✗ {name} failed", fg="red"))
|
|
2442
|
-
failed += 1
|
|
2443
|
-
except Exception as e:
|
|
2444
|
-
click.echo(click.style(f" ✗ {name}: {e}", fg="red"))
|
|
2445
|
-
failed += 1
|
|
2446
|
-
click.echo(f"Embeddings: {succeeded} succeeded, {failed} failed")
|
|
2447
|
-
else:
|
|
2448
|
-
done_count = sum(1 for f, _ in pending_futures if f.done())
|
|
2449
|
-
pending_count = len(pending_futures) - done_count
|
|
2450
|
-
if pending_count > 0:
|
|
2451
|
-
click.echo(f"{pending_count} embeddings still processing in background...")
|
|
2452
|
-
|
|
2453
|
-
# Cleanup executor
|
|
2454
|
-
shutdown_executor(wait=wait)
|
|
2455
|
-
|
|
2456
|
-
# Summary
|
|
2457
|
-
click.echo("")
|
|
2458
|
-
click.echo(click.style("Review complete:", bold=True))
|
|
2459
|
-
click.echo(f" {click.style(str(approved), fg='green')} approved")
|
|
2460
|
-
click.echo(f" {click.style(str(rejected), fg='yellow')} rejected")
|
|
2461
|
-
click.echo(f" {skipped} skipped")
|
|
2462
|
-
|
|
2463
|
-
|
|
2464
1397
|
if __name__ == "__main__":
|
|
2465
1398
|
main()
|