okb 1.1.0a0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- okb/cli.py +1083 -16
- okb/config.py +122 -4
- okb/http_server.py +356 -91
- okb/llm/analyze.py +524 -0
- okb/llm/consolidate.py +685 -0
- okb/llm/enrich.py +723 -0
- okb/llm/extractors/__init__.py +13 -0
- okb/llm/extractors/base.py +44 -0
- okb/llm/extractors/cross_doc.py +478 -0
- okb/llm/extractors/dedup.py +499 -0
- okb/llm/extractors/entity.py +369 -0
- okb/llm/extractors/todo.py +149 -0
- okb/llm/providers.py +9 -6
- okb/mcp_server.py +1036 -12
- okb/migrations/0008.enrichment.sql +46 -0
- okb/migrations/0009.entity-consolidation.sql +120 -0
- okb/migrations/0010.token-id.sql +7 -0
- okb/modal_llm.py +26 -8
- okb/plugins/sources/github.py +5 -5
- okb/tokens.py +25 -3
- {okb-1.1.0a0.dist-info → okb-1.1.2.dist-info}/METADATA +91 -8
- {okb-1.1.0a0.dist-info → okb-1.1.2.dist-info}/RECORD +24 -12
- {okb-1.1.0a0.dist-info → okb-1.1.2.dist-info}/WHEEL +0 -0
- {okb-1.1.0a0.dist-info → okb-1.1.2.dist-info}/entry_points.txt +0 -0
okb/cli.py
CHANGED
|
@@ -4,6 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
import importlib.resources
|
|
6
6
|
import json
|
|
7
|
+
import os
|
|
7
8
|
import shutil
|
|
8
9
|
import subprocess
|
|
9
10
|
import sys
|
|
@@ -44,7 +45,7 @@ def _check_docker() -> bool:
|
|
|
44
45
|
|
|
45
46
|
|
|
46
47
|
def _get_container_status() -> str | None:
|
|
47
|
-
"""Get the status of the
|
|
48
|
+
"""Get the status of the okb container. Returns None if not found."""
|
|
48
49
|
try:
|
|
49
50
|
result = subprocess.run(
|
|
50
51
|
[
|
|
@@ -841,7 +842,7 @@ def sync_run(
|
|
|
841
842
|
):
|
|
842
843
|
"""Sync from API sources.
|
|
843
844
|
|
|
844
|
-
Example:
|
|
845
|
+
Example: okb sync run github --repo owner/repo
|
|
845
846
|
"""
|
|
846
847
|
import psycopg
|
|
847
848
|
from psycopg.rows import dict_row
|
|
@@ -1209,7 +1210,7 @@ def token_list(ctx, database: str | None):
|
|
|
1209
1210
|
for t in tokens:
|
|
1210
1211
|
desc = f" - {t.description}" if t.description else ""
|
|
1211
1212
|
last_used = t.last_used_at.strftime("%Y-%m-%d %H:%M") if t.last_used_at else "never"
|
|
1212
|
-
click.echo(f" [{t.permissions}] {t.token_hash[:12]}...{desc}")
|
|
1213
|
+
click.echo(f" ID {t.id} [{t.permissions}] {t.token_hash[:12]}...{desc}")
|
|
1213
1214
|
created = t.created_at.strftime("%Y-%m-%d %H:%M")
|
|
1214
1215
|
click.echo(f" Created: {created}, Last used: {last_used}")
|
|
1215
1216
|
except Exception as e:
|
|
@@ -1218,26 +1219,43 @@ def token_list(ctx, database: str | None):
|
|
|
1218
1219
|
|
|
1219
1220
|
|
|
1220
1221
|
@token.command("revoke")
|
|
1221
|
-
@click.argument("token_value")
|
|
1222
|
+
@click.argument("token_value", required=False)
|
|
1223
|
+
@click.option("--id", "token_id", type=int, default=None, help="Token ID to revoke (from 'okb token list')")
|
|
1222
1224
|
@click.option("--db", "database", default=None, help="Database to revoke token from")
|
|
1223
1225
|
@click.pass_context
|
|
1224
|
-
def token_revoke(ctx, token_value: str, database: str | None):
|
|
1226
|
+
def token_revoke(ctx, token_value: str | None, token_id: int | None, database: str | None):
|
|
1225
1227
|
"""Revoke (delete) an API token.
|
|
1226
1228
|
|
|
1227
|
-
TOKEN_VALUE
|
|
1229
|
+
Either provide the full TOKEN_VALUE or use --id with the token ID from 'okb token list'.
|
|
1228
1230
|
"""
|
|
1229
|
-
from .tokens import delete_token
|
|
1231
|
+
from .tokens import delete_token, delete_token_by_id
|
|
1232
|
+
|
|
1233
|
+
if not token_value and not token_id:
|
|
1234
|
+
click.echo("Error: Provide either TOKEN_VALUE or --id", err=True)
|
|
1235
|
+
sys.exit(1)
|
|
1236
|
+
|
|
1237
|
+
if token_value and token_id:
|
|
1238
|
+
click.echo("Error: Provide either TOKEN_VALUE or --id, not both", err=True)
|
|
1239
|
+
sys.exit(1)
|
|
1230
1240
|
|
|
1231
1241
|
db_name = database or ctx.obj.get("database")
|
|
1232
1242
|
db_cfg = config.get_database(db_name)
|
|
1233
1243
|
|
|
1234
1244
|
try:
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1245
|
+
if token_id:
|
|
1246
|
+
deleted = delete_token_by_id(db_cfg.url, token_id)
|
|
1247
|
+
if deleted:
|
|
1248
|
+
click.echo(f"Token ID {token_id} revoked.")
|
|
1249
|
+
else:
|
|
1250
|
+
click.echo(f"Token ID {token_id} not found.", err=True)
|
|
1251
|
+
sys.exit(1)
|
|
1238
1252
|
else:
|
|
1239
|
-
|
|
1240
|
-
|
|
1253
|
+
deleted = delete_token(db_cfg.url, token_value)
|
|
1254
|
+
if deleted:
|
|
1255
|
+
click.echo("Token revoked.")
|
|
1256
|
+
else:
|
|
1257
|
+
click.echo("Token not found. Use --id or provide the full token string.", err=True)
|
|
1258
|
+
sys.exit(1)
|
|
1241
1259
|
except Exception as e:
|
|
1242
1260
|
click.echo(f"Error revoking token: {e}", err=True)
|
|
1243
1261
|
sys.exit(1)
|
|
@@ -1276,7 +1294,7 @@ def llm_status(ctx, database: str | None):
|
|
|
1276
1294
|
click.echo(f"Cache responses: {config.llm_cache_responses}")
|
|
1277
1295
|
|
|
1278
1296
|
if config.llm_provider == "modal":
|
|
1279
|
-
click.echo("Backend: Modal GPU (deploy with:
|
|
1297
|
+
click.echo("Backend: Modal GPU (deploy with: okb llm deploy)")
|
|
1280
1298
|
elif config.llm_use_bedrock:
|
|
1281
1299
|
click.echo(f"Backend: AWS Bedrock (region: {config.llm_aws_region})")
|
|
1282
1300
|
else:
|
|
@@ -1366,7 +1384,9 @@ def llm_clear_cache(ctx, database: str | None, days: int | None, yes: bool):
|
|
|
1366
1384
|
def llm_deploy():
|
|
1367
1385
|
"""Deploy the Modal LLM app for open model inference.
|
|
1368
1386
|
|
|
1369
|
-
|
|
1387
|
+
Deploys a GPU-accelerated LLM service on Modal using the model from your config.
|
|
1388
|
+
Default: microsoft/Phi-3-mini-4k-instruct (no HuggingFace approval needed).
|
|
1389
|
+
|
|
1370
1390
|
Required for using provider: modal in your config.
|
|
1371
1391
|
|
|
1372
1392
|
Requires Modal CLI to be installed and authenticated:
|
|
@@ -1385,14 +1405,1061 @@ def llm_deploy():
|
|
|
1385
1405
|
click.echo(f"Error: modal_llm.py not found at {llm_path}", err=True)
|
|
1386
1406
|
sys.exit(1)
|
|
1387
1407
|
|
|
1388
|
-
|
|
1389
|
-
|
|
1408
|
+
# Get model and GPU from config
|
|
1409
|
+
model = config.llm_model or "microsoft/Phi-3-mini-4k-instruct"
|
|
1410
|
+
gpu = config.llm_modal_gpu or "L4"
|
|
1411
|
+
click.echo("Deploying Modal LLM:")
|
|
1412
|
+
click.echo(f" Model: {model}")
|
|
1413
|
+
click.echo(f" GPU: {gpu}")
|
|
1414
|
+
click.echo("Note: First deploy downloads the model and may take a few minutes.")
|
|
1415
|
+
|
|
1416
|
+
# Set model and GPU in environment for Modal to pick up
|
|
1417
|
+
env = os.environ.copy()
|
|
1418
|
+
env["OKB_LLM_MODEL"] = model
|
|
1419
|
+
env["OKB_MODAL_GPU"] = gpu
|
|
1420
|
+
|
|
1390
1421
|
result = subprocess.run(
|
|
1391
1422
|
["modal", "deploy", str(llm_path)],
|
|
1392
1423
|
cwd=llm_path.parent,
|
|
1424
|
+
env=env,
|
|
1393
1425
|
)
|
|
1394
1426
|
sys.exit(result.returncode)
|
|
1395
1427
|
|
|
1396
1428
|
|
|
1429
|
+
# =============================================================================
|
|
1430
|
+
# Enrich commands
|
|
1431
|
+
# =============================================================================
|
|
1432
|
+
|
|
1433
|
+
|
|
1434
|
+
@main.group()
|
|
1435
|
+
def enrich():
|
|
1436
|
+
"""LLM-based document enrichment (extract TODOs and entities)."""
|
|
1437
|
+
pass
|
|
1438
|
+
|
|
1439
|
+
|
|
1440
|
+
@enrich.command("run")
|
|
1441
|
+
@click.option("--db", "database", default=None, help="Database to enrich")
|
|
1442
|
+
@click.option("--source-type", default=None, help="Filter by source type")
|
|
1443
|
+
@click.option("--project", default=None, help="Filter by project")
|
|
1444
|
+
@click.option("--query", default=None, help="Semantic search query to filter documents")
|
|
1445
|
+
@click.option("--path-pattern", default=None, help="SQL LIKE pattern for source_path")
|
|
1446
|
+
@click.option(
|
|
1447
|
+
"--all", "enrich_all", is_flag=True, help="Re-enrich all documents (ignore enriched_at)"
|
|
1448
|
+
)
|
|
1449
|
+
@click.option("--dry-run", is_flag=True, help="Show what would be enriched without executing")
|
|
1450
|
+
@click.option("--limit", default=100, help="Maximum documents to process")
|
|
1451
|
+
@click.option("--workers", default=None, type=int, help="Parallel workers (default: docs/5, min 1)")
|
|
1452
|
+
@click.pass_context
|
|
1453
|
+
def enrich_run(
|
|
1454
|
+
ctx,
|
|
1455
|
+
database: str | None,
|
|
1456
|
+
source_type: str | None,
|
|
1457
|
+
project: str | None,
|
|
1458
|
+
query: str | None,
|
|
1459
|
+
path_pattern: str | None,
|
|
1460
|
+
enrich_all: bool,
|
|
1461
|
+
dry_run: bool,
|
|
1462
|
+
limit: int,
|
|
1463
|
+
workers: int | None,
|
|
1464
|
+
):
|
|
1465
|
+
"""Run enrichment on documents to extract TODOs and entities.
|
|
1466
|
+
|
|
1467
|
+
By default, only processes documents that haven't been enriched yet.
|
|
1468
|
+
Use --all to re-enrich all documents (e.g., after changing enrichment config).
|
|
1469
|
+
|
|
1470
|
+
Examples:
|
|
1471
|
+
|
|
1472
|
+
okb enrich run # Enrich un-enriched documents
|
|
1473
|
+
|
|
1474
|
+
okb enrich run --dry-run # Show what would be enriched
|
|
1475
|
+
|
|
1476
|
+
okb enrich run --all # Re-enrich everything
|
|
1477
|
+
|
|
1478
|
+
okb enrich run --source-type markdown # Only markdown files
|
|
1479
|
+
|
|
1480
|
+
okb enrich run --query "meeting notes" # Filter by semantic search
|
|
1481
|
+
|
|
1482
|
+
okb enrich run --path-pattern '%myrepo%' # Filter by source path
|
|
1483
|
+
|
|
1484
|
+
okb enrich run --workers 8 # Use 8 parallel workers
|
|
1485
|
+
"""
|
|
1486
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
1487
|
+
|
|
1488
|
+
from .llm import get_llm
|
|
1489
|
+
from .llm.enrich import EnrichmentConfig, get_unenriched_documents, process_enrichment
|
|
1490
|
+
|
|
1491
|
+
# Check LLM is configured before doing any work
|
|
1492
|
+
if get_llm() is None:
|
|
1493
|
+
click.echo("Error: No LLM provider configured.", err=True)
|
|
1494
|
+
click.echo("", err=True)
|
|
1495
|
+
click.echo("Enrichment requires an LLM to extract TODOs and entities.", err=True)
|
|
1496
|
+
click.echo("Set ANTHROPIC_API_KEY or configure in ~/.config/okb/config.yaml:", err=True)
|
|
1497
|
+
click.echo("", err=True)
|
|
1498
|
+
click.echo(" llm:", err=True)
|
|
1499
|
+
click.echo(" provider: claude", err=True)
|
|
1500
|
+
click.echo(" model: claude-haiku-4-5-20251001", err=True)
|
|
1501
|
+
click.echo("", err=True)
|
|
1502
|
+
click.echo("Run 'okb llm status' to check configuration.", err=True)
|
|
1503
|
+
ctx.exit(1)
|
|
1504
|
+
|
|
1505
|
+
db_name = database or ctx.obj.get("database")
|
|
1506
|
+
db_cfg = config.get_database(db_name)
|
|
1507
|
+
|
|
1508
|
+
# Get enrichment version for re-enrichment check
|
|
1509
|
+
enrichment_version = config.enrichment_version if enrich_all else None
|
|
1510
|
+
|
|
1511
|
+
click.echo(f"Scanning database '{db_cfg.name}' for documents to enrich...")
|
|
1512
|
+
if dry_run:
|
|
1513
|
+
click.echo("(dry run - no changes will be made)")
|
|
1514
|
+
|
|
1515
|
+
docs = get_unenriched_documents(
|
|
1516
|
+
db_url=db_cfg.url,
|
|
1517
|
+
source_type=source_type,
|
|
1518
|
+
project=project,
|
|
1519
|
+
query=query,
|
|
1520
|
+
path_pattern=path_pattern,
|
|
1521
|
+
enrichment_version=enrichment_version,
|
|
1522
|
+
limit=limit,
|
|
1523
|
+
)
|
|
1524
|
+
|
|
1525
|
+
if not docs:
|
|
1526
|
+
click.echo("No documents need enrichment.")
|
|
1527
|
+
return
|
|
1528
|
+
|
|
1529
|
+
click.echo(f"Found {len(docs)} documents to enrich")
|
|
1530
|
+
|
|
1531
|
+
if dry_run:
|
|
1532
|
+
for doc in docs[:20]:
|
|
1533
|
+
click.echo(f" - {doc['title']} ({doc['source_type']})")
|
|
1534
|
+
if len(docs) > 20:
|
|
1535
|
+
click.echo(f" ... and {len(docs) - 20} more")
|
|
1536
|
+
return
|
|
1537
|
+
|
|
1538
|
+
# Calculate workers if not specified: floor(docs/5), minimum 1
|
|
1539
|
+
if workers is None:
|
|
1540
|
+
workers = max(1, len(docs) // 5)
|
|
1541
|
+
|
|
1542
|
+
# Build config
|
|
1543
|
+
enrich_config = EnrichmentConfig.from_config(
|
|
1544
|
+
{
|
|
1545
|
+
"enabled": config.enrichment_enabled,
|
|
1546
|
+
"version": config.enrichment_version,
|
|
1547
|
+
"extract_todos": config.enrichment_extract_todos,
|
|
1548
|
+
"extract_entities": config.enrichment_extract_entities,
|
|
1549
|
+
"auto_create_todos": config.enrichment_auto_create_todos,
|
|
1550
|
+
"auto_create_entities": config.enrichment_auto_create_entities,
|
|
1551
|
+
"min_confidence_todo": config.enrichment_min_confidence_todo,
|
|
1552
|
+
"min_confidence_entity": config.enrichment_min_confidence_entity,
|
|
1553
|
+
}
|
|
1554
|
+
)
|
|
1555
|
+
|
|
1556
|
+
total_todos = 0
|
|
1557
|
+
total_entities_pending = 0
|
|
1558
|
+
total_entities_created = 0
|
|
1559
|
+
completed = 0
|
|
1560
|
+
errors = 0
|
|
1561
|
+
|
|
1562
|
+
def enrich_one(doc: dict) -> tuple[dict, dict | None, str | None]:
|
|
1563
|
+
"""Process a single document. Returns (doc, stats, error)."""
|
|
1564
|
+
proj = doc["metadata"].get("project") if doc["metadata"] else None
|
|
1565
|
+
try:
|
|
1566
|
+
stats = process_enrichment(
|
|
1567
|
+
document_id=str(doc["id"]),
|
|
1568
|
+
source_path=doc["source_path"],
|
|
1569
|
+
title=doc["title"],
|
|
1570
|
+
content=doc["content"],
|
|
1571
|
+
source_type=doc["source_type"],
|
|
1572
|
+
db_url=db_cfg.url,
|
|
1573
|
+
config=enrich_config,
|
|
1574
|
+
project=proj,
|
|
1575
|
+
)
|
|
1576
|
+
return doc, stats, None
|
|
1577
|
+
except Exception as e:
|
|
1578
|
+
return doc, None, str(e)
|
|
1579
|
+
|
|
1580
|
+
click.echo(f"Processing with {workers} parallel workers...")
|
|
1581
|
+
|
|
1582
|
+
with ThreadPoolExecutor(max_workers=workers) as executor:
|
|
1583
|
+
futures = {executor.submit(enrich_one, doc): doc for doc in docs}
|
|
1584
|
+
|
|
1585
|
+
for future in as_completed(futures):
|
|
1586
|
+
doc, stats, error = future.result()
|
|
1587
|
+
completed += 1
|
|
1588
|
+
title = doc["title"][:40] if doc["title"] else "Untitled"
|
|
1589
|
+
|
|
1590
|
+
if error:
|
|
1591
|
+
errors += 1
|
|
1592
|
+
click.echo(f"[{completed}/{len(docs)}] {title}... -> error: {error[:50]}")
|
|
1593
|
+
continue
|
|
1594
|
+
|
|
1595
|
+
total_todos += stats["todos_created"]
|
|
1596
|
+
total_entities_pending += stats["entities_pending"]
|
|
1597
|
+
total_entities_created += stats["entities_created"]
|
|
1598
|
+
|
|
1599
|
+
parts = []
|
|
1600
|
+
if stats["todos_created"]:
|
|
1601
|
+
parts.append(f"{stats['todos_created']} TODOs")
|
|
1602
|
+
if stats["entities_pending"]:
|
|
1603
|
+
parts.append(f"{stats['entities_pending']} pending")
|
|
1604
|
+
if stats["entities_created"]:
|
|
1605
|
+
parts.append(f"{stats['entities_created']} entities")
|
|
1606
|
+
if parts:
|
|
1607
|
+
click.echo(f"[{completed}/{len(docs)}] {title}... -> {', '.join(parts)}")
|
|
1608
|
+
else:
|
|
1609
|
+
click.echo(f"[{completed}/{len(docs)}] {title}... -> nothing extracted")
|
|
1610
|
+
|
|
1611
|
+
click.echo("")
|
|
1612
|
+
click.echo("Summary:")
|
|
1613
|
+
click.echo(f" Documents processed: {len(docs)}")
|
|
1614
|
+
if errors:
|
|
1615
|
+
click.echo(f" Errors: {errors}")
|
|
1616
|
+
click.echo(f" TODOs created: {total_todos}")
|
|
1617
|
+
click.echo(f" Entities pending review: {total_entities_pending}")
|
|
1618
|
+
click.echo(f" Entities auto-created: {total_entities_created}")
|
|
1619
|
+
|
|
1620
|
+
|
|
1621
|
+
@enrich.command("pending")
|
|
1622
|
+
@click.option("--db", "database", default=None, help="Database to check")
|
|
1623
|
+
@click.option("--type", "entity_type", default=None, help="Filter by entity type")
|
|
1624
|
+
@click.option("--limit", default=50, help="Maximum results")
|
|
1625
|
+
@click.pass_context
|
|
1626
|
+
def enrich_pending(ctx, database: str | None, entity_type: str | None, limit: int):
|
|
1627
|
+
"""List pending entity suggestions awaiting review.
|
|
1628
|
+
|
|
1629
|
+
Shows entities extracted from documents that need approval before
|
|
1630
|
+
becoming searchable. Use 'okb enrich approve' or 'okb enrich reject'
|
|
1631
|
+
to process them.
|
|
1632
|
+
"""
|
|
1633
|
+
from .llm.enrich import list_pending_entities
|
|
1634
|
+
|
|
1635
|
+
db_name = database or ctx.obj.get("database")
|
|
1636
|
+
db_cfg = config.get_database(db_name)
|
|
1637
|
+
|
|
1638
|
+
entities = list_pending_entities(db_cfg.url, entity_type=entity_type, limit=limit)
|
|
1639
|
+
|
|
1640
|
+
if not entities:
|
|
1641
|
+
click.echo("No pending entity suggestions.")
|
|
1642
|
+
return
|
|
1643
|
+
|
|
1644
|
+
click.echo(f"Pending entities ({len(entities)}):\n")
|
|
1645
|
+
for e in entities:
|
|
1646
|
+
confidence = e.get("confidence", 0)
|
|
1647
|
+
confidence_str = f" ({confidence:.0%})" if confidence else ""
|
|
1648
|
+
click.echo(f" [{e['entity_type']}] {e['entity_name']}{confidence_str}")
|
|
1649
|
+
click.echo(f" ID: {e['id']}")
|
|
1650
|
+
if e.get("description"):
|
|
1651
|
+
desc = (
|
|
1652
|
+
e["description"][:60] + "..."
|
|
1653
|
+
if len(e.get("description", "")) > 60
|
|
1654
|
+
else e["description"]
|
|
1655
|
+
)
|
|
1656
|
+
click.echo(f" {desc}")
|
|
1657
|
+
if e.get("aliases"):
|
|
1658
|
+
click.echo(f" Aliases: {', '.join(e['aliases'][:3])}")
|
|
1659
|
+
click.echo(f" Source: {e['source_title']}")
|
|
1660
|
+
click.echo("")
|
|
1661
|
+
|
|
1662
|
+
click.echo("Use 'okb enrich approve <id>' or 'okb enrich reject <id>' to process.")
|
|
1663
|
+
|
|
1664
|
+
|
|
1665
|
+
@enrich.command("approve")
|
|
1666
|
+
@click.argument("pending_id")
|
|
1667
|
+
@click.option("--db", "database", default=None, help="Database")
|
|
1668
|
+
@click.option("--local", is_flag=True, help="Use local CPU embedding instead of Modal")
|
|
1669
|
+
@click.pass_context
|
|
1670
|
+
def enrich_approve(ctx, pending_id: str, database: str | None, local: bool):
|
|
1671
|
+
"""Approve a pending entity, creating it as a searchable document."""
|
|
1672
|
+
from .llm.enrich import approve_entity
|
|
1673
|
+
|
|
1674
|
+
db_name = database or ctx.obj.get("database")
|
|
1675
|
+
db_cfg = config.get_database(db_name)
|
|
1676
|
+
|
|
1677
|
+
source_path = approve_entity(db_cfg.url, pending_id, use_modal=not local)
|
|
1678
|
+
if source_path:
|
|
1679
|
+
click.echo(f"Entity approved and created: {source_path}")
|
|
1680
|
+
else:
|
|
1681
|
+
click.echo("Failed to approve entity. ID may be invalid or already processed.", err=True)
|
|
1682
|
+
sys.exit(1)
|
|
1683
|
+
|
|
1684
|
+
|
|
1685
|
+
@enrich.command("reject")
|
|
1686
|
+
@click.argument("pending_id")
|
|
1687
|
+
@click.option("--db", "database", default=None, help="Database")
|
|
1688
|
+
@click.pass_context
|
|
1689
|
+
def enrich_reject(ctx, pending_id: str, database: str | None):
|
|
1690
|
+
"""Reject a pending entity suggestion."""
|
|
1691
|
+
from .llm.enrich import reject_entity
|
|
1692
|
+
|
|
1693
|
+
db_name = database or ctx.obj.get("database")
|
|
1694
|
+
db_cfg = config.get_database(db_name)
|
|
1695
|
+
|
|
1696
|
+
if reject_entity(db_cfg.url, pending_id):
|
|
1697
|
+
click.echo("Entity rejected.")
|
|
1698
|
+
else:
|
|
1699
|
+
click.echo("Failed to reject entity. ID may be invalid or already processed.", err=True)
|
|
1700
|
+
sys.exit(1)
|
|
1701
|
+
|
|
1702
|
+
|
|
1703
|
+
@enrich.command("analyze")
|
|
1704
|
+
@click.option("--db", "database", default=None, help="Database to analyze")
|
|
1705
|
+
@click.option("--project", default=None, help="Analyze specific project only")
|
|
1706
|
+
@click.option("--sample-size", default=15, help="Number of documents to sample")
|
|
1707
|
+
@click.option("--no-update", is_flag=True, help="Don't update database metadata")
|
|
1708
|
+
@click.option("--stats-only", is_flag=True, help="Show stats without LLM analysis")
|
|
1709
|
+
@click.pass_context
|
|
1710
|
+
def enrich_analyze(
|
|
1711
|
+
ctx,
|
|
1712
|
+
database: str | None,
|
|
1713
|
+
project: str | None,
|
|
1714
|
+
sample_size: int,
|
|
1715
|
+
no_update: bool,
|
|
1716
|
+
stats_only: bool,
|
|
1717
|
+
):
|
|
1718
|
+
"""Analyze knowledge base and update description/topics.
|
|
1719
|
+
|
|
1720
|
+
Uses entity aggregation and document sampling to understand the overall
|
|
1721
|
+
content and themes in the knowledge base. Generates a description and
|
|
1722
|
+
topic keywords using LLM analysis.
|
|
1723
|
+
|
|
1724
|
+
Examples:
|
|
1725
|
+
|
|
1726
|
+
okb enrich analyze # Analyze entire database
|
|
1727
|
+
|
|
1728
|
+
okb enrich analyze --stats-only # Show stats without LLM call
|
|
1729
|
+
|
|
1730
|
+
okb enrich analyze --project myproject # Analyze specific project
|
|
1731
|
+
|
|
1732
|
+
okb enrich analyze --no-update # Analyze without updating metadata
|
|
1733
|
+
"""
|
|
1734
|
+
from .llm.analyze import (
|
|
1735
|
+
analyze_database,
|
|
1736
|
+
get_content_stats,
|
|
1737
|
+
get_entity_summary,
|
|
1738
|
+
)
|
|
1739
|
+
|
|
1740
|
+
db_name = database or ctx.obj.get("database")
|
|
1741
|
+
db_cfg = config.get_database(db_name)
|
|
1742
|
+
|
|
1743
|
+
scope = f"project '{project}'" if project else f"database '{db_cfg.name}'"
|
|
1744
|
+
click.echo(f"Analyzing {scope}...\n")
|
|
1745
|
+
|
|
1746
|
+
# Always get stats
|
|
1747
|
+
stats = get_content_stats(db_cfg.url, project)
|
|
1748
|
+
entities = get_entity_summary(db_cfg.url, project, limit=20)
|
|
1749
|
+
|
|
1750
|
+
# Show stats
|
|
1751
|
+
click.echo("Content Statistics:")
|
|
1752
|
+
click.echo(f" Documents: {stats['total_documents']:,}")
|
|
1753
|
+
click.echo(f" Tokens: ~{stats['total_tokens']:,}")
|
|
1754
|
+
if stats["source_types"]:
|
|
1755
|
+
sorted_types = sorted(stats["source_types"].items(), key=lambda x: -x[1])
|
|
1756
|
+
types_parts = [f"{t}: {c}" for t, c in sorted_types]
|
|
1757
|
+
# Break into multiple lines if many types
|
|
1758
|
+
if len(types_parts) > 4:
|
|
1759
|
+
click.echo(" Source types:")
|
|
1760
|
+
for tp in types_parts:
|
|
1761
|
+
click.echo(f" {tp}")
|
|
1762
|
+
else:
|
|
1763
|
+
click.echo(f" Source types: {', '.join(types_parts)}")
|
|
1764
|
+
if stats["projects"]:
|
|
1765
|
+
click.echo(f" Projects: {', '.join(stats['projects'])}")
|
|
1766
|
+
if stats["date_range"]["earliest"]:
|
|
1767
|
+
earliest = stats["date_range"]["earliest"]
|
|
1768
|
+
latest = stats["date_range"]["latest"]
|
|
1769
|
+
click.echo(f" Date range: {earliest} to {latest}")
|
|
1770
|
+
|
|
1771
|
+
click.echo("")
|
|
1772
|
+
|
|
1773
|
+
# Show top entities
|
|
1774
|
+
if entities:
|
|
1775
|
+
click.echo("Top Entities (by mentions):")
|
|
1776
|
+
for i, e in enumerate(entities[:10], 1):
|
|
1777
|
+
name, etype = e["name"], e["type"]
|
|
1778
|
+
refs, docs = e["ref_count"], e["doc_count"]
|
|
1779
|
+
click.echo(f" {i}. {name} ({etype}) - {refs} mentions in {docs} docs")
|
|
1780
|
+
click.echo("")
|
|
1781
|
+
else:
|
|
1782
|
+
click.echo("No entities extracted yet.")
|
|
1783
|
+
click.echo("Run 'okb enrich run' to extract entities from documents.\n")
|
|
1784
|
+
|
|
1785
|
+
if stats_only:
|
|
1786
|
+
return
|
|
1787
|
+
|
|
1788
|
+
# Check LLM is configured
|
|
1789
|
+
from .llm import get_llm
|
|
1790
|
+
|
|
1791
|
+
if get_llm() is None:
|
|
1792
|
+
click.echo("Error: No LLM provider configured.", err=True)
|
|
1793
|
+
click.echo("", err=True)
|
|
1794
|
+
click.echo("Analysis requires an LLM to generate description and topics.", err=True)
|
|
1795
|
+
click.echo("Set ANTHROPIC_API_KEY or configure in ~/.config/okb/config.yaml:", err=True)
|
|
1796
|
+
click.echo("", err=True)
|
|
1797
|
+
click.echo(" llm:", err=True)
|
|
1798
|
+
click.echo(" provider: claude", err=True)
|
|
1799
|
+
click.echo("", err=True)
|
|
1800
|
+
click.echo("Use --stats-only to see statistics without LLM analysis.", err=True)
|
|
1801
|
+
ctx.exit(1)
|
|
1802
|
+
|
|
1803
|
+
click.echo(f"Sampling {sample_size} documents for analysis...")
|
|
1804
|
+
click.echo("Generating description and topics...")
|
|
1805
|
+
click.echo("")
|
|
1806
|
+
|
|
1807
|
+
try:
|
|
1808
|
+
result = analyze_database(
|
|
1809
|
+
db_url=db_cfg.url,
|
|
1810
|
+
project=project,
|
|
1811
|
+
sample_size=sample_size,
|
|
1812
|
+
auto_update=not no_update,
|
|
1813
|
+
)
|
|
1814
|
+
|
|
1815
|
+
click.echo("Analysis Complete:")
|
|
1816
|
+
click.echo(f" Description: {result.description}")
|
|
1817
|
+
click.echo(f" Topics: {', '.join(result.topics)}")
|
|
1818
|
+
|
|
1819
|
+
if not no_update:
|
|
1820
|
+
click.echo("")
|
|
1821
|
+
click.echo("Updated database metadata.")
|
|
1822
|
+
else:
|
|
1823
|
+
click.echo("")
|
|
1824
|
+
click.echo("(metadata not updated - use without --no-update to save)")
|
|
1825
|
+
|
|
1826
|
+
except Exception as e:
|
|
1827
|
+
click.echo(f"Error during analysis: {e}", err=True)
|
|
1828
|
+
ctx.exit(1)
|
|
1829
|
+
|
|
1830
|
+
|
|
1831
|
+
@enrich.command("consolidate")
|
|
1832
|
+
@click.option("--db", "database", default=None, help="Database to consolidate")
|
|
1833
|
+
@click.option("--duplicates/--no-duplicates", "detect_duplicates", default=True,
|
|
1834
|
+
help="Detect duplicate entities")
|
|
1835
|
+
@click.option("--cross-doc/--no-cross-doc", "detect_cross_doc", default=True,
|
|
1836
|
+
help="Detect cross-document entities")
|
|
1837
|
+
@click.option("--clusters/--no-clusters", "build_clusters", default=True,
|
|
1838
|
+
help="Build topic clusters")
|
|
1839
|
+
@click.option("--relationships/--no-relationships", "extract_relationships", default=True,
|
|
1840
|
+
help="Extract entity relationships")
|
|
1841
|
+
@click.option("--dry-run", is_flag=True, help="Show what would be found without creating proposals")
|
|
1842
|
+
@click.pass_context
|
|
1843
|
+
def enrich_consolidate(
|
|
1844
|
+
ctx,
|
|
1845
|
+
database: str | None,
|
|
1846
|
+
detect_duplicates: bool,
|
|
1847
|
+
detect_cross_doc: bool,
|
|
1848
|
+
build_clusters: bool,
|
|
1849
|
+
extract_relationships: bool,
|
|
1850
|
+
dry_run: bool,
|
|
1851
|
+
):
|
|
1852
|
+
"""Run entity consolidation pipeline.
|
|
1853
|
+
|
|
1854
|
+
Detects duplicate entities, cross-document mentions, builds topic clusters,
|
|
1855
|
+
and extracts entity relationships. Creates pending proposals for review
|
|
1856
|
+
rather than auto-applying changes.
|
|
1857
|
+
|
|
1858
|
+
Examples:
|
|
1859
|
+
|
|
1860
|
+
okb enrich consolidate # Run full consolidation
|
|
1861
|
+
|
|
1862
|
+
okb enrich consolidate --dry-run # Show what would be found
|
|
1863
|
+
|
|
1864
|
+
okb enrich consolidate --no-clusters # Skip clustering
|
|
1865
|
+
|
|
1866
|
+
okb enrich consolidate --duplicates --no-cross-doc --no-clusters --no-relationships
|
|
1867
|
+
"""
|
|
1868
|
+
from .llm import get_llm
|
|
1869
|
+
from .llm.consolidate import format_consolidation_result, run_consolidation
|
|
1870
|
+
|
|
1871
|
+
# Check LLM is configured if needed
|
|
1872
|
+
if get_llm() is None:
|
|
1873
|
+
click.echo("Error: No LLM provider configured.", err=True)
|
|
1874
|
+
click.echo("Consolidation requires an LLM for deduplication and clustering.", err=True)
|
|
1875
|
+
click.echo("Set ANTHROPIC_API_KEY or configure in ~/.config/okb/config.yaml", err=True)
|
|
1876
|
+
ctx.exit(1)
|
|
1877
|
+
|
|
1878
|
+
db_name = database or ctx.obj.get("database")
|
|
1879
|
+
db_cfg = config.get_database(db_name)
|
|
1880
|
+
|
|
1881
|
+
click.echo(f"Running consolidation on database '{db_cfg.name}'...")
|
|
1882
|
+
if dry_run:
|
|
1883
|
+
click.echo("(dry run - no proposals will be created)")
|
|
1884
|
+
|
|
1885
|
+
result = run_consolidation(
|
|
1886
|
+
db_url=db_cfg.url,
|
|
1887
|
+
detect_duplicates=detect_duplicates,
|
|
1888
|
+
detect_cross_doc=detect_cross_doc,
|
|
1889
|
+
build_clusters=build_clusters,
|
|
1890
|
+
extract_relationships=extract_relationships,
|
|
1891
|
+
dry_run=dry_run,
|
|
1892
|
+
)
|
|
1893
|
+
|
|
1894
|
+
# Format and display result
|
|
1895
|
+
output = format_consolidation_result(result)
|
|
1896
|
+
click.echo("")
|
|
1897
|
+
click.echo(output)
|
|
1898
|
+
|
|
1899
|
+
if not dry_run and (result.duplicates_found > 0 or result.cross_doc_candidates > 0):
|
|
1900
|
+
click.echo("")
|
|
1901
|
+
click.echo("Use 'okb enrich merge-proposals' to review pending merges.")
|
|
1902
|
+
|
|
1903
|
+
|
|
1904
|
+
@enrich.command("merge-proposals")
|
|
1905
|
+
@click.option("--db", "database", default=None, help="Database to check")
|
|
1906
|
+
@click.option("--limit", default=50, help="Maximum results")
|
|
1907
|
+
@click.pass_context
|
|
1908
|
+
def enrich_merge_proposals(ctx, database: str | None, limit: int):
|
|
1909
|
+
"""List pending entity merge proposals.
|
|
1910
|
+
|
|
1911
|
+
Shows duplicate entities and cross-document mentions awaiting review.
|
|
1912
|
+
Use 'okb enrich approve-merge' or 'okb enrich reject-merge' to process.
|
|
1913
|
+
"""
|
|
1914
|
+
from .llm.extractors.dedup import list_pending_merges
|
|
1915
|
+
|
|
1916
|
+
db_name = database or ctx.obj.get("database")
|
|
1917
|
+
db_cfg = config.get_database(db_name)
|
|
1918
|
+
|
|
1919
|
+
merges = list_pending_merges(db_cfg.url, limit=limit)
|
|
1920
|
+
|
|
1921
|
+
if not merges:
|
|
1922
|
+
click.echo("No pending merge proposals.")
|
|
1923
|
+
return
|
|
1924
|
+
|
|
1925
|
+
click.echo(f"Pending merge proposals ({len(merges)}):\n")
|
|
1926
|
+
for m in merges:
|
|
1927
|
+
confidence = m.get("confidence", 0)
|
|
1928
|
+
confidence_str = f" ({confidence:.0%})" if confidence else ""
|
|
1929
|
+
click.echo(f" {m['canonical_name']} <- {m['duplicate_name']}{confidence_str}")
|
|
1930
|
+
click.echo(f" ID: {m['id']}")
|
|
1931
|
+
click.echo(f" Reason: {m.get('reason', 'similarity')}")
|
|
1932
|
+
click.echo("")
|
|
1933
|
+
|
|
1934
|
+
click.echo("Use 'okb enrich approve-merge <id>' or 'okb enrich reject-merge <id>' to process.")
|
|
1935
|
+
|
|
1936
|
+
|
|
1937
|
+
@enrich.command("approve-merge")
|
|
1938
|
+
@click.argument("merge_id")
|
|
1939
|
+
@click.option("--db", "database", default=None, help="Database")
|
|
1940
|
+
@click.pass_context
|
|
1941
|
+
def enrich_approve_merge(ctx, merge_id: str, database: str | None):
|
|
1942
|
+
"""Approve a pending entity merge.
|
|
1943
|
+
|
|
1944
|
+
Merges the duplicate entity into the canonical entity:
|
|
1945
|
+
- Redirects all entity references from duplicate to canonical
|
|
1946
|
+
- Adds duplicate's name as an alias for canonical
|
|
1947
|
+
- Deletes the duplicate entity document
|
|
1948
|
+
"""
|
|
1949
|
+
from .llm.extractors.dedup import approve_merge
|
|
1950
|
+
|
|
1951
|
+
db_name = database or ctx.obj.get("database")
|
|
1952
|
+
db_cfg = config.get_database(db_name)
|
|
1953
|
+
|
|
1954
|
+
if approve_merge(db_cfg.url, merge_id):
|
|
1955
|
+
click.echo("Merge approved and executed.")
|
|
1956
|
+
else:
|
|
1957
|
+
click.echo("Failed to approve merge. ID may be invalid or already processed.", err=True)
|
|
1958
|
+
sys.exit(1)
|
|
1959
|
+
|
|
1960
|
+
|
|
1961
|
+
@enrich.command("reject-merge")
|
|
1962
|
+
@click.argument("merge_id")
|
|
1963
|
+
@click.option("--db", "database", default=None, help="Database")
|
|
1964
|
+
@click.pass_context
|
|
1965
|
+
def enrich_reject_merge(ctx, merge_id: str, database: str | None):
|
|
1966
|
+
"""Reject a pending entity merge proposal."""
|
|
1967
|
+
from .llm.extractors.dedup import reject_merge
|
|
1968
|
+
|
|
1969
|
+
db_name = database or ctx.obj.get("database")
|
|
1970
|
+
db_cfg = config.get_database(db_name)
|
|
1971
|
+
|
|
1972
|
+
if reject_merge(db_cfg.url, merge_id):
|
|
1973
|
+
click.echo("Merge rejected.")
|
|
1974
|
+
else:
|
|
1975
|
+
click.echo("Failed to reject merge. ID may be invalid or already processed.", err=True)
|
|
1976
|
+
sys.exit(1)
|
|
1977
|
+
|
|
1978
|
+
|
|
1979
|
+
@enrich.command("clusters")
@click.option("--db", "database", default=None, help="Database to check")
@click.option("--limit", default=20, help="Maximum clusters to show")
@click.pass_context
def enrich_clusters(ctx, database: str | None, limit: int):
    """List topic clusters.

    Shows groups of related entities and documents organized by theme.
    """
    from .llm.consolidate import get_topic_clusters

    # Explicit --db beats the group-level default.
    selected = database or ctx.obj.get("database")
    cfg = config.get_database(selected)

    found = get_topic_clusters(cfg.url, limit=limit)

    # Nothing to show: hint at the command that produces clusters.
    if not found:
        click.echo("No topic clusters found.")
        click.echo("Run 'okb enrich consolidate' to generate clusters.")
        return

    click.echo(f"Topic clusters ({len(found)}):\n")
    for cluster in found:
        click.echo(f" {cluster['name']}")
        description = cluster.get("description")
        if description:
            # Truncate long descriptions for a compact listing.
            shown = description if len(description) <= 70 else description[:70] + "..."
            click.echo(f" {shown}")
        click.echo(f" Members: {cluster['member_count']} entities/documents")
        sample = cluster.get("sample_members")
        if sample:
            click.echo(f" Examples: {', '.join(sample[:5])}")
        click.echo("")
|
|
2011
|
+
|
|
2012
|
+
|
|
2013
|
+
@enrich.command("relationships")
@click.option("--db", "database", default=None, help="Database to check")
@click.option("--entity", "entity_name", default=None, help="Filter to specific entity")
@click.option("--type", "relationship_type", default=None,
              help="Filter by relationship type (works_for, uses, belongs_to, related_to)")
@click.option("--limit", default=50, help="Maximum results")
@click.pass_context
def enrich_relationships(
    ctx,
    database: str | None,
    entity_name: str | None,
    relationship_type: str | None,
    limit: int,
):
    """List entity relationships.

    Shows connections between entities (person→org, tech→project, etc.).

    Examples:

        okb enrich relationships                    # All relationships

        okb enrich relationships --entity "Django"  # Filter to one entity

        okb enrich relationships --type works_for   # Filter by type
    """
    from .llm.consolidate import get_entity_relationships

    # Explicit --db beats the group-level default.
    selected = database or ctx.obj.get("database")
    cfg = config.get_database(selected)

    rels = get_entity_relationships(
        cfg.url,
        entity_name=entity_name,
        relationship_type=relationship_type,
        limit=limit,
    )

    # Nothing matched: tailor the message to whether a filter was active.
    if not rels:
        if entity_name:
            click.echo(f"No relationships found for entity '{entity_name}'.")
        else:
            click.echo("No relationships found.")
        click.echo("Run 'okb enrich consolidate' to extract relationships.")
        return

    click.echo(f"Entity relationships ({len(rels)}):\n")
    for rel in rels:
        score = rel.get("confidence", 0)
        suffix = f" ({score:.0%})" if score else ""
        click.echo(f" {rel['source_name']} --[{rel['relationship_type']}]--> {rel['target_name']}{suffix}")
        proof = rel.get("evidence")
        if proof:
            # Keep evidence snippets short in the listing.
            snippet = proof if len(proof) <= 60 else proof[:60] + "..."
            click.echo(f" Evidence: {snippet}")
        click.echo("")
|
|
2068
|
+
|
|
2069
|
+
|
|
2070
|
+
@enrich.command("all")
@click.option("--db", "database", default=None, help="Database to enrich")
@click.option("--source-type", default=None, help="Filter by source type")
@click.option("--project", default=None, help="Filter by project")
@click.option("--query", default=None, help="Semantic search query to filter documents")
@click.option("--path-pattern", default=None, help="SQL LIKE pattern for source_path")
@click.option("--limit", default=100, help="Maximum documents to process")
@click.option("--workers", default=None, type=int, help="Parallel workers (default: docs/5, min 1)")
@click.option("--dry-run", is_flag=True, help="Show what would be done without executing")
@click.option("--skip-consolidate", is_flag=True, help="Skip consolidation phase")
@click.option("--duplicates/--no-duplicates", "detect_duplicates", default=True,
              help="Detect duplicate entities during consolidation")
@click.option("--clusters/--no-clusters", "build_clusters", default=True,
              help="Build topic clusters during consolidation")
@click.option("--relationships/--no-relationships", "extract_relationships", default=True,
              help="Extract entity relationships during consolidation")
@click.pass_context
def enrich_all(
    ctx,
    database: str | None,
    source_type: str | None,
    project: str | None,
    query: str | None,
    path_pattern: str | None,
    limit: int,
    workers: int | None,
    dry_run: bool,
    skip_consolidate: bool,
    detect_duplicates: bool,
    build_clusters: bool,
    extract_relationships: bool,
):
    """Run full enrichment pipeline: extraction + consolidation.

    Combines 'enrich run' and 'enrich consolidate' in one command for
    one-shot enrichment of documents.

    Examples:

        okb enrich all                      # Run full pipeline

        okb enrich all --dry-run            # Preview what would happen

        okb enrich all --skip-consolidate   # Run extraction only

        okb enrich all --source-type markdown  # Filter to markdown files

        okb enrich all --no-clusters        # Skip cluster building
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    from .llm import get_llm
    from .llm.consolidate import format_consolidation_result, run_consolidation
    from .llm.enrich import EnrichmentConfig, get_unenriched_documents, process_enrichment

    # Check LLM is configured; the pipeline cannot run without a provider.
    if get_llm() is None:
        click.echo("Error: No LLM provider configured.", err=True)
        click.echo("Set ANTHROPIC_API_KEY or configure in ~/.config/okb/config.yaml", err=True)
        ctx.exit(1)

    # Resolve the target database: explicit --db wins over the group default.
    db_name = database or ctx.obj.get("database")
    db_cfg = config.get_database(db_name)

    # Phase 1: Enrichment — per-document TODO/entity extraction.
    click.echo("=== Phase 1: Enrichment ===")
    click.echo(f"Scanning database '{db_cfg.name}' for documents to enrich...")
    if dry_run:
        click.echo("(dry run - no changes will be made)")

    # All filters (source type, project, semantic query, path pattern) are
    # applied in the DB query so only candidate documents come back.
    docs = get_unenriched_documents(
        db_url=db_cfg.url,
        source_type=source_type,
        project=project,
        query=query,
        path_pattern=path_pattern,
        limit=limit,
    )

    # Running totals across all processed documents.
    total_todos = 0
    total_entities_pending = 0
    total_entities_created = 0

    if not docs:
        click.echo("No documents need enrichment.")
    else:
        click.echo(f"Found {len(docs)} documents to enrich")

        if dry_run:
            # Preview only: list up to 20 candidate documents, no processing.
            for doc in docs[:20]:
                click.echo(f" - {doc['title']} ({doc['source_type']})")
            if len(docs) > 20:
                click.echo(f" ... and {len(docs) - 20} more")
        else:
            # Build config from the global enrichment settings.
            enrich_config = EnrichmentConfig.from_config(
                {
                    "enabled": config.enrichment_enabled,
                    "version": config.enrichment_version,
                    "extract_todos": config.enrichment_extract_todos,
                    "extract_entities": config.enrichment_extract_entities,
                    "auto_create_todos": config.enrichment_auto_create_todos,
                    "auto_create_entities": config.enrichment_auto_create_entities,
                    "min_confidence_todo": config.enrichment_min_confidence_todo,
                    "min_confidence_entity": config.enrichment_min_confidence_entity,
                }
            )

            # Calculate workers: roughly one worker per 5 documents, min 1.
            if workers is None:
                workers = max(1, len(docs) // 5)

            completed = 0
            errors = 0

            def enrich_one(doc: dict) -> tuple[dict, dict | None, str | None]:
                # Worker body: enrich a single document. Returns
                # (doc, stats, None) on success or (doc, None, error) on
                # failure so the collector loop never raises.
                proj = doc["metadata"].get("project") if doc["metadata"] else None
                try:
                    stats = process_enrichment(
                        document_id=str(doc["id"]),
                        source_path=doc["source_path"],
                        title=doc["title"],
                        content=doc["content"],
                        source_type=doc["source_type"],
                        db_url=db_cfg.url,
                        config=enrich_config,
                        project=proj,
                    )
                    return doc, stats, None
                except Exception as e:
                    return doc, None, str(e)

            click.echo(f"Processing with {workers} parallel workers...")

            with ThreadPoolExecutor(max_workers=workers) as executor:
                futures = {executor.submit(enrich_one, doc): doc for doc in docs}

                # Report results in completion order, not submission order.
                for future in as_completed(futures):
                    doc, stats, error = future.result()
                    completed += 1
                    title = doc["title"][:40] if doc["title"] else "Untitled"

                    if error:
                        errors += 1
                        click.echo(f"[{completed}/{len(docs)}] {title}... -> error: {error[:50]}")
                        continue

                    total_todos += stats["todos_created"]
                    total_entities_pending += stats["entities_pending"]
                    total_entities_created += stats["entities_created"]

                    # Compose a compact per-document result summary.
                    parts = []
                    if stats["todos_created"]:
                        parts.append(f"{stats['todos_created']} TODOs")
                    if stats["entities_pending"]:
                        parts.append(f"{stats['entities_pending']} pending")
                    if stats["entities_created"]:
                        parts.append(f"{stats['entities_created']} entities")
                    if parts:
                        click.echo(f"[{completed}/{len(docs)}] {title}... -> {', '.join(parts)}")
                    else:
                        click.echo(f"[{completed}/{len(docs)}] {title}... -> nothing extracted")

            click.echo("")
            click.echo("Enrichment summary:")
            click.echo(f" Documents processed: {len(docs)}")
            if errors:
                click.echo(f" Errors: {errors}")
            click.echo(f" TODOs created: {total_todos}")
            click.echo(f" Entities pending review: {total_entities_pending}")
            click.echo(f" Entities auto-created: {total_entities_created}")

    # Phase 2: Consolidation — cross-document dedup/clusters/relationships.
    if skip_consolidate:
        click.echo("")
        click.echo("Skipping consolidation (--skip-consolidate)")
        return

    click.echo("")
    click.echo("=== Phase 2: Consolidation ===")

    # NOTE(review): detect_cross_doc is always True here — presumably
    # intentional for the "all" pipeline; confirm against run_consolidation.
    result = run_consolidation(
        db_url=db_cfg.url,
        detect_duplicates=detect_duplicates,
        detect_cross_doc=True,
        build_clusters=build_clusters,
        extract_relationships=extract_relationships,
        dry_run=dry_run,
    )

    output = format_consolidation_result(result)
    click.echo(output)

    # Nudge the user toward review when consolidation produced pending work.
    if not dry_run and (result.duplicates_found > 0 or result.cross_doc_candidates > 0):
        click.echo("")
        click.echo("Use 'okb enrich review' to review pending entities and merges.")
|
|
2266
|
+
|
|
2267
|
+
|
|
2268
|
+
@enrich.command("review")
@click.option("--db", "database", default=None, help="Database to review")
@click.option("--entities-only", is_flag=True, help="Only review pending entities")
@click.option("--merges-only", is_flag=True, help="Only review pending merges")
@click.option("--local", is_flag=True, help="Use local CPU embedding instead of Modal")
@click.option("--wait/--no-wait", default=True, help="Wait for embeddings to complete")
@click.pass_context
def enrich_review(
    ctx, database: str | None, entities_only: bool, merges_only: bool, local: bool, wait: bool
):
    """Interactive review of pending entities and merge proposals.

    Loops through pending items with approve/reject prompts.
    Press Q to quit early - remaining items stay pending for later.

    Entity approvals run asynchronously - you can continue reviewing while
    embeddings are generated. Use --no-wait to exit immediately after reviewing.

    Examples:

        okb enrich review                  # Review all pending items

        okb enrich review --entities-only  # Only review entities

        okb enrich review --merges-only    # Only review merges

        okb enrich review --local          # Use local CPU embedding

        okb enrich review --no-wait        # Don't wait for embeddings
    """

    from .llm.enrich import (
        approve_entity_async,
        list_pending_entities,
        reject_entity,
        shutdown_executor,
    )
    from .llm.extractors.dedup import approve_merge, list_pending_merges, reject_merge

    # Resolve the target database: explicit --db wins over the group default.
    db_name = database or ctx.obj.get("database")
    db_cfg = config.get_database(db_name)
    use_modal = not local  # --local flips embedding to local CPU instead of Modal

    # Get pending items (a --*-only flag empties the other list).
    entities = [] if merges_only else list_pending_entities(db_cfg.url, limit=100)
    merges = [] if entities_only else list_pending_merges(db_cfg.url, limit=100)

    if not entities and not merges:
        click.echo("No pending items to review.")
        return

    click.echo(f"Pending: {len(entities)} entities, {len(merges)} merges")
    click.echo("")

    # Counters for the final summary.
    approved = 0
    rejected = 0
    skipped = 0

    # Track async approval futures
    pending_futures: list[tuple] = []  # (future, entity_name)

    # Review entities. `choice` is read after the loop to detect an early quit,
    # so it is initialized here in case the entity loop never runs.
    choice = None
    if entities and not merges_only:
        for i, e in enumerate(entities, 1):
            # Check for completed futures and show background progress.
            done_count = sum(1 for f, _ in pending_futures if f.done())
            if pending_futures and done_count > 0:
                total = len(pending_futures)
                click.echo(click.style(f" ({done_count}/{total} embeddings done)", dim=True))

            click.echo(click.style(f"=== Entity Review [{i}/{len(entities)}] ===", bold=True))
            click.echo(f"Name: {click.style(e['entity_name'], fg='cyan')}")
            click.echo(f"Type: {e['entity_type']}")
            confidence = e.get("confidence", 0)
            if confidence:
                click.echo(f"Confidence: {confidence:.0%}")
            if e.get("description"):
                d = e["description"]
                # Truncate long descriptions for the prompt display.
                desc = d[:80] + "..." if len(d) > 80 else d
                click.echo(f"Description: {desc}")
            if e.get("aliases"):
                click.echo(f"Aliases: {', '.join(e['aliases'][:5])}")
            click.echo(f"Source: {e['source_title']}")
            click.echo("")

            # Accept either case; normalize to upper for dispatch.
            choice = click.prompt(
                "[A]pprove [R]eject [S]kip [Q]uit",
                type=click.Choice(["A", "R", "S", "Q", "a", "r", "s", "q"]),
                show_choices=False,
            ).upper()

            if choice == "Q":
                click.echo("Quitting review...")
                break
            elif choice == "A":
                # Submit async approval; embedding happens in the background.
                future = approve_entity_async(db_cfg.url, str(e["id"]), use_modal)
                pending_futures.append((future, e["entity_name"]))
                click.echo(click.style("⏳ Queued for approval", fg="cyan"))
                approved += 1
            elif choice == "R":
                if reject_entity(db_cfg.url, str(e["id"])):
                    click.echo(click.style("✗ Rejected", fg="yellow"))
                    rejected += 1
                else:
                    click.echo(click.style("✗ Failed to reject", fg="red"))
            else:
                # "S" (skip) — item stays pending for a later review session.
                click.echo("Skipped")
                skipped += 1

            click.echo("")
        else:
            # Completed all entities, continue to merges
            pass

    # Review merges (only if we didn't quit early)
    if merges and not entities_only and (not entities or choice != "Q"):
        for i, m in enumerate(merges, 1):
            click.echo(click.style(f"=== Merge Review [{i}/{len(merges)}] ===", bold=True))
            cname = click.style(m["canonical_name"], fg="cyan")
            ctype = m.get("canonical_type", "unknown")
            click.echo(f"Canonical: {cname} ({ctype})")
            dname = click.style(m["duplicate_name"], fg="yellow")
            dtype = m.get("duplicate_type", "unknown")
            click.echo(f"Duplicate: {dname} ({dtype})")
            confidence = m.get("confidence", 0)
            if confidence:
                click.echo(f"Confidence: {confidence:.0%}")
            click.echo(f"Reason: {m.get('reason', 'similarity')}")
            click.echo("")

            choice = click.prompt(
                "[A]pprove [R]eject [S]kip [Q]uit",
                type=click.Choice(["A", "R", "S", "Q", "a", "r", "s", "q"]),
                show_choices=False,
            ).upper()

            if choice == "Q":
                click.echo("Quitting review...")
                break
            elif choice == "A":
                # Merge approval is synchronous (unlike entity approval).
                if approve_merge(db_cfg.url, str(m["id"])):
                    click.echo(click.style("✓ Merged", fg="green"))
                    approved += 1
                else:
                    click.echo(click.style("✗ Failed to merge", fg="red"))
            elif choice == "R":
                if reject_merge(db_cfg.url, str(m["id"])):
                    click.echo(click.style("✗ Rejected", fg="yellow"))
                    rejected += 1
                else:
                    click.echo(click.style("✗ Failed to reject", fg="red"))
            else:
                click.echo("Skipped")
                skipped += 1

            click.echo("")

    # Wait for pending approvals if requested (--wait, the default).
    if pending_futures:
        if wait:
            click.echo(f"Waiting for {len(pending_futures)} pending approvals...")
            succeeded = 0
            failed = 0
            for future, name in pending_futures:
                try:
                    result = future.result(timeout=120)
                    if result:
                        click.echo(click.style(f" ✓ {name}", fg="green"))
                        succeeded += 1
                    else:
                        click.echo(click.style(f" ✗ {name} failed", fg="red"))
                        failed += 1
                except Exception as e:
                    # Timeout or worker error — count as failed, keep going.
                    click.echo(click.style(f" ✗ {name}: {e}", fg="red"))
                    failed += 1
            click.echo(f"Embeddings: {succeeded} succeeded, {failed} failed")
        else:
            # --no-wait: just report how many are still running.
            done_count = sum(1 for f, _ in pending_futures if f.done())
            pending_count = len(pending_futures) - done_count
            if pending_count > 0:
                click.echo(f"{pending_count} embeddings still processing in background...")

    # Cleanup executor (non-blocking when --no-wait was given).
    shutdown_executor(wait=wait)

    # Summary
    click.echo("")
    click.echo(click.style("Review complete:", bold=True))
    click.echo(f" {click.style(str(approved), fg='green')} approved")
    click.echo(f" {click.style(str(rejected), fg='yellow')} rejected")
    click.echo(f" {skipped} skipped")
|
|
2462
|
+
|
|
2463
|
+
|
|
1397
2464
|
# Allow running the CLI directly (python -m / script execution).
if __name__ == "__main__":
    main()
|