okb 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- okb/cli.py +1209 -16
- okb/config.py +122 -4
- okb/http_server.py +208 -2
- okb/llm/analyze.py +524 -0
- okb/llm/consolidate.py +685 -0
- okb/llm/enrich.py +723 -0
- okb/llm/extractors/__init__.py +13 -0
- okb/llm/extractors/base.py +44 -0
- okb/llm/extractors/cross_doc.py +478 -0
- okb/llm/extractors/dedup.py +499 -0
- okb/llm/extractors/entity.py +369 -0
- okb/llm/extractors/todo.py +149 -0
- okb/llm/providers.py +9 -6
- okb/mcp_server.py +1279 -12
- okb/migrations/0008.enrichment.sql +46 -0
- okb/migrations/0009.entity-consolidation.sql +120 -0
- okb/migrations/0010.token-id.sql +7 -0
- okb/modal_llm.py +26 -8
- okb/plugins/sources/__init__.py +2 -1
- okb/plugins/sources/dropbox_paper.py +44 -9
- okb/plugins/sources/github.py +5 -5
- okb/plugins/sources/todoist.py +254 -0
- okb/tokens.py +25 -3
- {okb-1.0.0.dist-info → okb-1.1.0.dist-info}/METADATA +119 -68
- okb-1.1.0.dist-info/RECORD +49 -0
- {okb-1.0.0.dist-info → okb-1.1.0.dist-info}/entry_points.txt +1 -0
- okb-1.0.0.dist-info/RECORD +0 -36
- {okb-1.0.0.dist-info → okb-1.1.0.dist-info}/WHEEL +0 -0
okb/mcp_server.py
CHANGED
|
@@ -4,14 +4,14 @@ MCP Server for Knowledge Base.
|
|
|
4
4
|
Exposes semantic search to Claude Code via the Model Context Protocol.
|
|
5
5
|
|
|
6
6
|
Usage:
|
|
7
|
-
|
|
7
|
+
okb serve
|
|
8
8
|
|
|
9
|
-
Configure in Claude Code (
|
|
9
|
+
Configure in Claude Code (see https://docs.anthropic.com/en/docs/claude-code):
|
|
10
10
|
{
|
|
11
11
|
"mcpServers": {
|
|
12
12
|
"knowledge-base": {
|
|
13
|
-
"command": "
|
|
14
|
-
"args": ["
|
|
13
|
+
"command": "okb",
|
|
14
|
+
"args": ["serve"]
|
|
15
15
|
}
|
|
16
16
|
}
|
|
17
17
|
}
|
|
@@ -313,6 +313,69 @@ class KnowledgeBase:
|
|
|
313
313
|
""").fetchall()
|
|
314
314
|
return [r["project"] for r in results]
|
|
315
315
|
|
|
316
|
+
def get_project_stats(self) -> list[dict]:
    """Return per-project document statistics for consolidation review.

    Documents whose metadata has no ``project`` value are excluded. Each
    result dict has the keys ``project``, ``doc_count`` and ``source_types``;
    rows are ordered by document count (descending), then project name.
    """
    stats_query = """
        SELECT
            metadata->>'project' as project,
            COUNT(*) as doc_count,
            array_agg(DISTINCT source_type) as source_types
        FROM documents
        WHERE metadata->>'project' IS NOT NULL
        GROUP BY metadata->>'project'
        ORDER BY doc_count DESC, project
    """
    rows = self.get_connection().execute(stats_query).fetchall()
    return list(map(dict, rows))
|
|
330
|
+
|
|
331
|
+
def list_documents_by_project(self, project: str, limit: int = 100) -> list[dict]:
    """Return up to ``limit`` documents whose metadata project equals ``project``.

    Each result dict carries ``source_path``, ``title`` and ``source_type``;
    results are ordered by title.
    """
    sql = """SELECT source_path, title, source_type FROM documents
           WHERE metadata->>'project' = %s ORDER BY title LIMIT %s"""
    cursor = self.get_connection().execute(sql, (project, limit))
    return [dict(row) for row in cursor.fetchall()]
|
|
340
|
+
|
|
341
|
+
def rename_project(self, old_name: str, new_name: str) -> int:
    """Rename a project on every document that currently carries it.

    Args:
        old_name: Project name to match against ``metadata->>'project'``.
        new_name: Project name to store in its place.

    Returns:
        The number of documents updated (the statement's ``rowcount``).
    """
    import json

    conn = self.get_connection()
    result = conn.execute(
        """
        UPDATE documents
        SET metadata = jsonb_set(metadata, '{project}', %s::jsonb)
        WHERE metadata->>'project' = %s
        """,
        # json.dumps escapes quotes, backslashes and control characters, so any
        # name becomes a valid JSON string literal for the ::jsonb cast.
        # ensure_ascii=False keeps non-ASCII names verbatim.
        (json.dumps(new_name, ensure_ascii=False), old_name),
    )
    conn.commit()
    return result.rowcount
|
|
354
|
+
|
|
355
|
+
def set_document_project(self, source_path: str, project: str | None) -> bool:
|
|
356
|
+
"""Set or clear the project for a single document."""
|
|
357
|
+
conn = self.get_connection()
|
|
358
|
+
if project:
|
|
359
|
+
result = conn.execute(
|
|
360
|
+
"""
|
|
361
|
+
UPDATE documents
|
|
362
|
+
SET metadata = jsonb_set(metadata, '{project}', %s::jsonb)
|
|
363
|
+
WHERE source_path = %s
|
|
364
|
+
""",
|
|
365
|
+
(f'"{project}"', source_path),
|
|
366
|
+
)
|
|
367
|
+
else:
|
|
368
|
+
result = conn.execute(
|
|
369
|
+
"""
|
|
370
|
+
UPDATE documents
|
|
371
|
+
SET metadata = metadata - 'project'
|
|
372
|
+
WHERE source_path = %s
|
|
373
|
+
""",
|
|
374
|
+
(source_path,),
|
|
375
|
+
)
|
|
376
|
+
conn.commit()
|
|
377
|
+
return result.rowcount > 0
|
|
378
|
+
|
|
316
379
|
def get_document(self, source_path: str) -> dict | None:
|
|
317
380
|
"""Get full document content by path."""
|
|
318
381
|
conn = self.get_connection()
|
|
@@ -435,10 +498,7 @@ class KnowledgeBase:
|
|
|
435
498
|
}
|
|
436
499
|
|
|
437
500
|
def delete_knowledge(self, source_path: str) -> bool:
|
|
438
|
-
"""Delete a
|
|
439
|
-
if not source_path.startswith("claude://"):
|
|
440
|
-
return False
|
|
441
|
-
|
|
501
|
+
"""Delete a document by source path."""
|
|
442
502
|
conn = self.get_connection()
|
|
443
503
|
result = conn.execute(
|
|
444
504
|
"DELETE FROM documents WHERE source_path = %s RETURNING id",
|
|
@@ -676,6 +736,571 @@ class KnowledgeBase:
|
|
|
676
736
|
return [dict(r) for r in results]
|
|
677
737
|
|
|
678
738
|
|
|
739
|
+
def _get_sync_state(conn, source_name: str, db_name: str):
    """Load the stored sync state for a source/database pair.

    Returns a ``SyncState`` built from the matching ``sync_state`` row, or
    ``None`` when no row exists. A falsy ``extra`` column becomes ``{}``.
    """
    from .plugins.base import SyncState

    row = conn.execute(
        """SELECT last_sync, cursor, extra FROM sync_state
           WHERE source_name = %s AND database_name = %s""",
        (source_name, db_name),
    ).fetchone()

    if not row:
        return None

    extra = row["extra"] or {}
    return SyncState(last_sync=row["last_sync"], cursor=row["cursor"], extra=extra)
|
|
756
|
+
|
|
757
|
+
|
|
758
|
+
def _save_sync_state(conn, source_name: str, db_name: str, state):
    """Insert or update the ``sync_state`` row for a source/database pair.

    ``state.extra`` is serialized with ``json.dumps``; ``updated_at`` is set
    to ``NOW()`` by the statement. The connection is committed afterwards.
    """
    import json

    upsert_sql = """INSERT INTO sync_state (source_name, database_name, last_sync, cursor, extra, updated_at)
           VALUES (%s, %s, %s, %s, %s, NOW())
           ON CONFLICT (source_name, database_name)
           DO UPDATE SET last_sync = EXCLUDED.last_sync,
                         cursor = EXCLUDED.cursor,
                         extra = EXCLUDED.extra,
                         updated_at = NOW()"""
    values = (
        source_name,
        db_name,
        state.last_sync,
        state.cursor,
        json.dumps(state.extra),
    )
    conn.execute(upsert_sql, values)
    conn.commit()
|
|
773
|
+
|
|
774
|
+
|
|
775
|
+
def _run_sync(
    db_url: str,
    sources: list[str],
    sync_all: bool = False,
    full: bool = False,
    doc_ids: list[str] | None = None,
) -> str:
    """Sync the requested API sources and return a per-source text report.

    Args:
        db_url: Database URL used for the ingester and the sync-state connection.
        sources: Source names to sync; ignored when ``sync_all`` is true.
        sync_all: Sync every source from ``config.list_enabled_sources()``.
        full: Skip loading stored sync state, so the source is fetched with
            ``None`` as its state.
        doc_ids: Optional document IDs injected into each source's config
            under the ``doc_ids`` key.

    Returns:
        One line per source describing its outcome. When neither ``sources``
        nor ``sync_all`` is given, a listing of installed sources instead.
    """
    from psycopg.rows import dict_row

    from .ingest import Ingester
    from .plugins.registry import PluginRegistry

    # Determine which sources to sync
    if sync_all:
        source_names = config.list_enabled_sources()
    elif sources:
        source_names = list(sources)
    else:
        # Nothing was requested: describe what could be synced instead.
        installed = PluginRegistry.list_sources()
        configured = config.list_enabled_sources()
        lines = ["Available API sources:"]
        for name in installed:
            status = "enabled" if name in configured else "disabled"
            lines.append(f" - {name} ({status})")
        if not installed:
            lines.append(" (none installed)")
        return "\n".join(lines)

    if not source_names:
        return "No sources to sync."

    # Sync-state rows are keyed by the configured database name (taken from
    # config, not parsed out of db_url).
    db_name = config.get_database().name

    results = []
    ingester = Ingester(db_url, use_modal=True)

    with psycopg.connect(db_url, row_factory=dict_row) as conn:
        for source_name in source_names:
            # Get the plugin
            source = PluginRegistry.get_source(source_name)
            if source is None:
                results.append(f"{source_name}: not found")
                continue

            # Get and resolve config
            source_cfg = config.get_source_config(source_name)
            if source_cfg is None:
                results.append(f"{source_name}: not configured or disabled")
                continue

            # Inject doc_ids if provided (for sources that support it)
            if doc_ids:
                source_cfg = {**source_cfg, "doc_ids": doc_ids}

            try:
                source.configure(source_cfg)
            except Exception as e:
                results.append(f"{source_name}: config error - {e}")
                continue

            # Get sync state (unless full)
            state = None if full else _get_sync_state(conn, source_name, db_name)

            try:
                documents, new_state = source.fetch(state)
            except Exception as e:
                results.append(f"{source_name}: fetch error - {e}")
                continue

            if documents:
                try:
                    ingester.ingest_documents(documents)
                except Exception as e:
                    # Report the failure like configure/fetch errors and skip
                    # saving state, so the next run retries these documents
                    # and the remaining sources are still synced.
                    results.append(f"{source_name}: ingest error - {e}")
                    continue
                results.append(f"{source_name}: synced {len(documents)} documents")
            else:
                results.append(f"{source_name}: no new documents")

            # Save state
            _save_sync_state(conn, source_name, db_name, new_state)

    return "\n".join(results)
|
|
857
|
+
|
|
858
|
+
|
|
859
|
+
def _run_rescan(
    db_url: str,
    dry_run: bool = False,
    delete_missing: bool = False,
) -> str:
    """Run a rescan of indexed files and return a text summary.

    Args:
        db_url: Database URL handed to ``Rescanner``.
        dry_run: Passed through to ``rescan``; also adds a "(dry run ...)" line.
        delete_missing: Passed through to ``rescan``.

    Returns:
        A multi-line summary of updated, deleted, missing, unchanged and
        errored files, or ``"No indexed files found."`` when the rescan
        reports nothing in any category.
    """
    from .rescan import Rescanner

    def _preview(items, limit):
        """Format the first ``limit`` items plus an "... and N more" line."""
        shown = [f" - {item}" for item in items[:limit]]
        if len(items) > limit:
            shown.append(f" ... and {len(items) - limit} more")
        return shown

    rescanner = Rescanner(db_url, use_modal=True)
    result = rescanner.rescan(dry_run=dry_run, delete_missing=delete_missing, verbose=False)

    # The "Unchanged" line below is always emitted, so the empty case has to
    # be detected from the result itself rather than from an empty line list.
    if not (
        result.updated
        or result.deleted
        or result.missing
        or result.errors
        or result.unchanged
    ):
        return "No indexed files found."

    lines = []
    if dry_run:
        lines.append("(dry run - no changes made)")

    if result.updated:
        lines.append(f"Updated: {len(result.updated)} files")
        lines.extend(_preview(result.updated, 5))

    if result.deleted:
        lines.append(f"Deleted: {len(result.deleted)} files")

    if result.missing:
        lines.append(f"Missing (not deleted): {len(result.missing)} files")
        lines.extend(_preview(result.missing, 5))

    lines.append(f"Unchanged: {result.unchanged} files")

    if result.errors:
        lines.append(f"Errors: {len(result.errors)}")
        # Each error entry is a (path, error) pair.
        for path, error in result.errors[:3]:
            lines.append(f" - {path}: {error}")
        if len(result.errors) > 3:
            lines.append(f" ... and {len(result.errors) - 3} more")

    return "\n".join(lines)
|
|
901
|
+
|
|
902
|
+
|
|
903
|
+
def _list_sync_sources(db_url: str, db_name: str) -> str:
    """Describe installed sync sources with enabled status and last sync time.

    Last-sync times are read from ``sync_state`` for ``db_name``; if that
    lookup fails for any reason every source is reported as "never" synced.
    """
    import psycopg
    from psycopg.rows import dict_row

    from .plugins.registry import PluginRegistry

    installed = PluginRegistry.list_sources()
    enabled = set(config.list_enabled_sources())

    if not installed:
        return "No API sync sources installed."

    # Best effort: the listing is still useful without sync timestamps.
    last_syncs = {}
    try:
        with psycopg.connect(db_url, row_factory=dict_row) as conn:
            rows = conn.execute(
                """SELECT source_name, last_sync FROM sync_state WHERE database_name = %s""",
                (db_name,),
            ).fetchall()
        last_syncs = {row["source_name"]: row["last_sync"] for row in rows}
    except Exception:
        pass  # Database may not be accessible

    lines = ["## API Sync Sources\n"]

    for name in sorted(installed):
        source = PluginRegistry.get_source(name)
        status = "enabled" if name in enabled else "disabled"
        source_type = source.source_type if source else "unknown"

        synced_at = last_syncs.get(name)
        last_sync_str = format_relative_time(synced_at.isoformat()) if synced_at else "never"

        lines += [
            f"- **{name}** ({status}) - {source_type}",
            f" Last sync: {last_sync_str}",
        ]

    return "\n".join(lines)
|
|
945
|
+
|
|
946
|
+
|
|
947
|
+
def _enrich_document(
    db_url: str,
    source_path: str,
    extract_todos: bool = True,
    extract_entities: bool = True,
    auto_create_entities: bool = False,
    use_modal: bool = True,
) -> str:
    """Run LLM enrichment on one document and return a text summary.

    Returns an error string when no LLM is configured or when no document
    with ``source_path`` exists. TODO auto-creation is always enabled in the
    enrichment config; entity auto-creation follows ``auto_create_entities``.
    """
    from psycopg.rows import dict_row

    from .llm import get_llm
    from .llm.enrich import EnrichmentConfig, process_enrichment

    # Enrichment cannot run without an LLM provider.
    if get_llm() is None:
        return (
            "Error: No LLM provider configured. "
            "Enrichment requires an LLM. Set ANTHROPIC_API_KEY or configure llm.provider in config."
        )

    lookup_sql = (
        "SELECT id, source_path, title, content, source_type, metadata "
        "FROM documents WHERE source_path = %s"
    )
    with psycopg.connect(db_url, row_factory=dict_row) as conn:
        doc = conn.execute(lookup_sql, (source_path,)).fetchone()

    if not doc:
        return f"Document not found: {source_path}"

    enrich_config = EnrichmentConfig(
        extract_todos=extract_todos,
        extract_entities=extract_entities,
        auto_create_todos=True,
        auto_create_entities=auto_create_entities,
    )

    metadata = doc["metadata"]
    project = metadata.get("project") if metadata else None

    stats = process_enrichment(
        document_id=str(doc["id"]),
        source_path=doc["source_path"],
        title=doc["title"],
        content=doc["content"],
        source_type=doc["source_type"],
        db_url=db_url,
        config=enrich_config,
        project=project,
        use_modal=use_modal,
    )

    lines = [f"Enriched: {source_path}"]
    labelled_counts = (
        ("todos_created", " TODOs created"),
        ("entities_pending", " Entities pending review"),
        ("entities_created", " Entities created"),
    )
    for key, label in labelled_counts:
        if stats[key]:
            lines.append(f"{label}: {stats[key]}")
    if not any(stats.values()):
        lines.append(" No TODOs or entities extracted")

    return "\n".join(lines)
|
|
1009
|
+
|
|
1010
|
+
|
|
1011
|
+
def _list_pending_entities(
    db_url: str,
    entity_type: str | None = None,
    limit: int = 20,
) -> str:
    """Format pending entity suggestions as a Markdown list.

    Returns a fixed message when there are no pending suggestions.
    """
    from .llm.enrich import list_pending_entities

    pending = list_pending_entities(db_url, entity_type=entity_type, limit=limit)
    if not pending:
        return "No pending entity suggestions."

    lines = ["## Pending Entities\n"]
    for entity in pending:
        confidence = entity.get("confidence", 0)
        # A falsy confidence (missing, None or 0) is simply not shown.
        suffix = f" ({confidence:.0%})" if confidence else ""
        lines.append(f"- **{entity['entity_name']}** ({entity['entity_type']}){suffix}")
        lines.append(f" ID: `{entity['id']}`")
        if entity.get("description"):
            lines.append(f" {entity['description']}")
        if entity.get("aliases"):
            lines.append(f" Aliases: {', '.join(entity['aliases'])}")
        lines.append(f" Source: {entity['source_title']}")
        lines.append("")

    lines.append("\nUse `approve_entity` or `reject_entity` with the entity ID.")
    return "\n".join(lines)
|
|
1039
|
+
|
|
1040
|
+
|
|
1041
|
+
def _approve_entity(
    db_url: str, pending_id: str, use_modal: bool = True, run_async: bool = False
) -> str:
    """Approve a pending entity and return a status message.

    Args:
        db_url: Database URL
        pending_id: ID of the pending entity
        use_modal: If True, use Modal GPU for embedding; else local CPU
        run_async: If True, return immediately while embedding happens in background

    Returns:
        A confirmation containing the created entity's source path, a
        "queued" notice when ``run_async`` is true, or a failure message when
        ``approve_entity`` returns a falsy value.
    """
    from .llm.enrich import approve_entity, approve_entity_async

    if run_async:
        # Fire and forget: the returned handle is deliberately not awaited or
        # kept, so the caller gets an answer right away.
        approve_entity_async(db_url, pending_id, use_modal)
        return (
            f"Entity approval queued (ID: {pending_id}). "
            "Embedding is being generated in the background. "
            "The entity will be searchable once complete."
        )

    source_path = approve_entity(db_url, pending_id, use_modal)
    if source_path:
        return f"Entity approved and created: `{source_path}`"
    return "Failed to approve entity. ID may be invalid or already processed."
|
|
1067
|
+
|
|
1068
|
+
|
|
1069
|
+
def _reject_entity(db_url: str, pending_id: str) -> str:
    """Reject a pending entity suggestion and report the outcome as text."""
    from .llm.enrich import reject_entity

    rejected = reject_entity(db_url, pending_id)
    if not rejected:
        return "Failed to reject entity. ID may be invalid or already processed."
    return "Entity rejected."
|
|
1076
|
+
|
|
1077
|
+
|
|
1078
|
+
def _analyze_knowledge_base(
    db_url: str,
    project: str | None = None,
    sample_size: int = 15,
    auto_update: bool = True,
) -> str:
    """Analyze the knowledge base with the LLM and return formatted text.

    Returns an error string when no LLM is configured, or when analysis or
    formatting raises.
    """
    from .llm import get_llm
    from .llm.analyze import analyze_database, format_analysis_result

    # Analysis cannot run without an LLM provider.
    if get_llm() is None:
        return (
            "Error: No LLM provider configured. "
            "Analysis requires an LLM. Set ANTHROPIC_API_KEY or configure llm.provider in config."
        )

    analysis_kwargs = {
        "db_url": db_url,
        "project": project,
        "sample_size": sample_size,
        "auto_update": auto_update,
    }
    try:
        analysis = analyze_database(**analysis_kwargs)
        return format_analysis_result(analysis)
    except Exception as exc:
        return f"Error analyzing knowledge base: {exc}"
|
|
1105
|
+
|
|
1106
|
+
|
|
1107
|
+
def _find_entity_duplicates(
    db_url: str,
    similarity_threshold: float = 0.85,
    entity_type: str | None = None,
    use_llm: bool = True,
) -> str:
    """Find likely duplicate entities and return a Markdown report.

    Besides formatting, this calls ``create_pending_merge`` for every pair
    found and lists the merge ID whenever one is returned.
    """
    from .llm.extractors.dedup import create_pending_merge, find_duplicate_entities

    pairs = find_duplicate_entities(
        db_url,
        similarity_threshold=similarity_threshold,
        use_llm=use_llm,
        entity_type=entity_type,
    )
    if not pairs:
        return "No potential duplicate entities found."

    lines = ["## Potential Duplicate Entities\n"]
    for pair in pairs:
        lines.append(f"- **{pair.canonical_name}** ↔ **{pair.duplicate_name}**")
        lines.append(f" Confidence: {pair.confidence:.0%} ({pair.reason})")
        lines.append(f" Types: {pair.canonical_type} / {pair.duplicate_type}")

        # Queue the pair for review so it can be approved or rejected by ID.
        merge_id = create_pending_merge(db_url, pair)
        if merge_id:
            lines.append(f" Pending merge ID: `{merge_id}`")
        lines.append("")

    lines += [
        f"\nFound {len(pairs)} potential duplicates.",
        "Use `approve_merge` or `reject_merge` with merge IDs to process.",
    ]
    return "\n".join(lines)
|
|
1141
|
+
|
|
1142
|
+
|
|
1143
|
+
def _merge_entities(db_url: str, canonical_path: str, duplicate_path: str) -> str:
    """Merge the entity at ``duplicate_path`` into the one at ``canonical_path``.

    Args:
        db_url: Database URL.
        canonical_path: ``source_path`` of the entity document to keep.
        duplicate_path: ``source_path`` of the entity document to merge away.

    Returns:
        A success summary, or an ``"Error: ..."`` string when the two paths
        are identical, either entity is not found, or ``execute_merge``
        returns a falsy value.
    """
    from psycopg.rows import dict_row

    from .llm.extractors.dedup import execute_merge

    # Identical paths resolve to the same row; never hand execute_merge the
    # same ID as both canonical and duplicate.
    if canonical_path == duplicate_path:
        return f"Error: Cannot merge an entity with itself: {canonical_path}"

    entity_sql = "SELECT id, title FROM documents WHERE source_path = %s AND source_type = 'entity'"
    with psycopg.connect(db_url, row_factory=dict_row) as conn:
        # Get entity IDs from paths
        canonical = conn.execute(entity_sql, (canonical_path,)).fetchone()
        duplicate = conn.execute(entity_sql, (duplicate_path,)).fetchone()

    if not canonical:
        return f"Error: Canonical entity not found: {canonical_path}"
    if not duplicate:
        return f"Error: Duplicate entity not found: {duplicate_path}"

    if execute_merge(db_url, str(canonical["id"]), str(duplicate["id"])):
        return (
            f"Merge successful:\n"
            f"- Kept: {canonical['title']} ({canonical_path})\n"
            f"- Merged: {duplicate['title']} (deleted, added as alias)"
        )
    return "Error: Merge failed."
|
|
1172
|
+
|
|
1173
|
+
|
|
1174
|
+
def _list_pending_merges(db_url: str, limit: int = 50) -> str:
    """Format pending entity merges as a Markdown list.

    Returns a fixed message when there are no pending merges.
    """
    from .llm.extractors.dedup import list_pending_merges

    pending = list_pending_merges(db_url, limit=limit)
    if not pending:
        return "No pending entity merges."

    lines = ["## Pending Entity Merges\n"]
    for merge in pending:
        lines.append(f"- **{merge['canonical_name']}** ← {merge['duplicate_name']}")
        lines.append(f" ID: `{merge['id']}`")
        lines.append(f" Confidence: {merge['confidence']:.0%} ({merge['reason']})")
        lines.append("")

    lines += [
        f"\n{len(pending)} pending merges.",
        "Use `approve_merge` or `reject_merge` with IDs to process.",
    ]
    return "\n".join(lines)
|
|
1193
|
+
|
|
1194
|
+
|
|
1195
|
+
def _approve_merge(db_url: str, merge_id: str) -> str:
    """Approve and execute a pending merge, reporting the outcome as text."""
    from .llm.extractors.dedup import approve_merge

    approved = approve_merge(db_url, merge_id)
    if not approved:
        return "Error: Failed to approve merge. ID may be invalid or already processed."
    return "Merge approved and executed."
|
|
1202
|
+
|
|
1203
|
+
|
|
1204
|
+
def _reject_merge(db_url: str, merge_id: str) -> str:
    """Reject a pending merge, reporting the outcome as text."""
    from .llm.extractors.dedup import reject_merge

    rejected = reject_merge(db_url, merge_id)
    if not rejected:
        return "Error: Failed to reject merge. ID may be invalid or already processed."
    return "Merge rejected."
|
|
1211
|
+
|
|
1212
|
+
|
|
1213
|
+
def _get_topic_clusters(db_url: str, limit: int = 20) -> str:
    """Format topic clusters as Markdown sections.

    Each section shows the cluster name, its description when present, the
    member count and the titles of up to five members flagged ``is_entity``.
    """
    from .llm.consolidate import get_topic_clusters

    clusters = get_topic_clusters(db_url, limit=limit)
    if not clusters:
        return "No topic clusters found. Run `run_consolidation` to create clusters."

    lines = ["## Topic Clusters\n"]
    for cluster in clusters:
        lines.append(f"### {cluster['name']}")
        if cluster.get("description"):
            lines.append(cluster["description"])
        lines.append(f"Members: {cluster['member_count']}")

        # Only entity members are listed, capped at the first five.
        entity_members = [
            member for member in cluster.get("members", []) if member.get("is_entity")
        ]
        if entity_members:
            titles = ", ".join(member["title"] for member in entity_members[:5])
            lines.append(f"Entities: {titles}")
        lines.append("")

    return "\n".join(lines)
|
|
1237
|
+
|
|
1238
|
+
|
|
1239
|
+
def _get_entity_relationships(
    db_url: str,
    entity_name: str | None = None,
    relationship_type: str | None = None,
    limit: int = 50,
) -> str:
    """Format entity relationships as a Markdown list.

    Args:
        db_url: Database URL.
        entity_name: Optional entity name passed through as a filter.
        relationship_type: Optional relationship type passed through as a filter.
        limit: Maximum number of relationships requested.

    Returns:
        One bullet per relationship (source, type, target, confidence) with
        an optional context line, or a "none found" message.
    """
    from .llm.consolidate import get_entity_relationships

    relationships = get_entity_relationships(
        db_url, entity_name=entity_name, relationship_type=relationship_type, limit=limit
    )

    if not relationships:
        if entity_name:
            return f"No relationships found involving '{entity_name}'."
        return "No entity relationships found. Run `run_consolidation` to extract relationships."

    lines = ["## Entity Relationships\n"]
    for r in relationships:
        source = r["source"]["name"]
        target = r["target"]["name"]
        rel_type = r["type"]
        # A present-but-None confidence would make the percent format raise,
        # so treat any falsy value as 0.
        confidence = r.get("confidence") or 0

        lines.append(f"- **{source}** → *{rel_type}* → **{target}** ({confidence:.0%})")
        if r.get("context"):
            lines.append(f" {r['context']}")

    return "\n".join(lines)
|
|
1269
|
+
|
|
1270
|
+
|
|
1271
|
+
def _run_consolidation(
    db_url: str,
    detect_duplicates: bool = True,
    detect_cross_doc: bool = True,
    build_clusters: bool = True,
    extract_relationships: bool = True,
    dry_run: bool = False,
) -> str:
    """Run the consolidation pipeline and return its formatted result.

    Returns an error string instead when no LLM is configured and any of the
    cross-doc, clustering or relationship phases is requested. The
    auto-merge threshold comes from ``config.consolidation_auto_merge_threshold``.
    """
    from .llm import get_llm
    from .llm.consolidate import format_consolidation_result, run_consolidation

    # Only these phases need an LLM; duplicate detection alone may proceed.
    llm_missing = get_llm() is None
    needs_llm = detect_cross_doc or build_clusters or extract_relationships
    if llm_missing and needs_llm:
        return (
            "Error: No LLM provider configured. "
            "Consolidation requires an LLM for cross-doc detection, clustering, and relationships. "
            "Set ANTHROPIC_API_KEY or configure llm.provider in config."
        )

    outcome = run_consolidation(
        db_url,
        detect_duplicates=detect_duplicates,
        detect_cross_doc=detect_cross_doc,
        build_clusters=build_clusters,
        extract_relationships=extract_relationships,
        auto_merge_threshold=config.consolidation_auto_merge_threshold,
        dry_run=dry_run,
    )
    return format_consolidation_result(outcome)
|
|
1302
|
+
|
|
1303
|
+
|
|
679
1304
|
def build_server_instructions(db_config) -> str | None:
|
|
680
1305
|
"""Build server instructions from database config and LLM metadata."""
|
|
681
1306
|
parts = []
|
|
@@ -828,6 +1453,78 @@ async def list_tools() -> list[Tool]:
|
|
|
828
1453
|
"properties": {},
|
|
829
1454
|
},
|
|
830
1455
|
),
|
|
1456
|
+
Tool(
|
|
1457
|
+
name="get_project_stats",
|
|
1458
|
+
description=(
|
|
1459
|
+
"Get projects with document counts. Use this to identify projects that should "
|
|
1460
|
+
"be consolidated (similar names, typos, etc.)."
|
|
1461
|
+
),
|
|
1462
|
+
inputSchema={
|
|
1463
|
+
"type": "object",
|
|
1464
|
+
"properties": {},
|
|
1465
|
+
},
|
|
1466
|
+
),
|
|
1467
|
+
Tool(
|
|
1468
|
+
name="list_documents_by_project",
|
|
1469
|
+
description="List all documents belonging to a specific project.",
|
|
1470
|
+
inputSchema={
|
|
1471
|
+
"type": "object",
|
|
1472
|
+
"properties": {
|
|
1473
|
+
"project": {
|
|
1474
|
+
"type": "string",
|
|
1475
|
+
"description": "Project name to list documents for",
|
|
1476
|
+
},
|
|
1477
|
+
"limit": {
|
|
1478
|
+
"type": "integer",
|
|
1479
|
+
"description": "Maximum documents to return (default: 100)",
|
|
1480
|
+
"default": 100,
|
|
1481
|
+
},
|
|
1482
|
+
},
|
|
1483
|
+
"required": ["project"],
|
|
1484
|
+
},
|
|
1485
|
+
),
|
|
1486
|
+
Tool(
|
|
1487
|
+
name="rename_project",
|
|
1488
|
+
description=(
|
|
1489
|
+
"Rename a project, updating all documents. Use for consolidating similar "
|
|
1490
|
+
"project names (e.g., 'my-app' and 'MyApp' -> 'my-app'). Requires write permission."
|
|
1491
|
+
),
|
|
1492
|
+
inputSchema={
|
|
1493
|
+
"type": "object",
|
|
1494
|
+
"properties": {
|
|
1495
|
+
"old_name": {
|
|
1496
|
+
"type": "string",
|
|
1497
|
+
"description": "Current project name to rename",
|
|
1498
|
+
},
|
|
1499
|
+
"new_name": {
|
|
1500
|
+
"type": "string",
|
|
1501
|
+
"description": "New project name",
|
|
1502
|
+
},
|
|
1503
|
+
},
|
|
1504
|
+
"required": ["old_name", "new_name"],
|
|
1505
|
+
},
|
|
1506
|
+
),
|
|
1507
|
+
Tool(
|
|
1508
|
+
name="set_document_project",
|
|
1509
|
+
description=(
|
|
1510
|
+
"Set or clear the project for a single document. Use to fix incorrectly "
|
|
1511
|
+
"categorized documents. Requires write permission."
|
|
1512
|
+
),
|
|
1513
|
+
inputSchema={
|
|
1514
|
+
"type": "object",
|
|
1515
|
+
"properties": {
|
|
1516
|
+
"source_path": {
|
|
1517
|
+
"type": "string",
|
|
1518
|
+
"description": "Path of the document to update",
|
|
1519
|
+
},
|
|
1520
|
+
"project": {
|
|
1521
|
+
"type": "string",
|
|
1522
|
+
"description": "New project name (omit or null to clear project)",
|
|
1523
|
+
},
|
|
1524
|
+
},
|
|
1525
|
+
"required": ["source_path"],
|
|
1526
|
+
},
|
|
1527
|
+
),
|
|
831
1528
|
Tool(
|
|
832
1529
|
name="recent_documents",
|
|
833
1530
|
description="Get recently indexed or updated documents.",
|
|
@@ -877,8 +1574,8 @@ async def list_tools() -> list[Tool]:
|
|
|
877
1574
|
Tool(
|
|
878
1575
|
name="delete_knowledge",
|
|
879
1576
|
description=(
|
|
880
|
-
"Delete a
|
|
881
|
-
"
|
|
1577
|
+
"Delete a document from the knowledge base by its source path. "
|
|
1578
|
+
"Works for any document type. Requires write permission."
|
|
882
1579
|
),
|
|
883
1580
|
inputSchema={
|
|
884
1581
|
"type": "object",
|
|
@@ -1025,6 +1722,396 @@ async def list_tools() -> list[Tool]:
|
|
|
1025
1722
|
"required": ["title"],
|
|
1026
1723
|
},
|
|
1027
1724
|
),
|
|
1725
|
+
Tool(
|
|
1726
|
+
name="trigger_sync",
|
|
1727
|
+
description=(
|
|
1728
|
+
"Trigger sync of API sources (Todoist, GitHub, Dropbox Paper, etc.). "
|
|
1729
|
+
"Fetches new/updated content from external services. Requires write permission."
|
|
1730
|
+
),
|
|
1731
|
+
inputSchema={
|
|
1732
|
+
"type": "object",
|
|
1733
|
+
"properties": {
|
|
1734
|
+
"sources": {
|
|
1735
|
+
"type": "array",
|
|
1736
|
+
"items": {"type": "string"},
|
|
1737
|
+
"description": (
|
|
1738
|
+
"List of source names to sync (e.g., ['todoist', 'github']). "
|
|
1739
|
+
"If empty and 'all' is false, returns list of available sources."
|
|
1740
|
+
),
|
|
1741
|
+
},
|
|
1742
|
+
"all": {
|
|
1743
|
+
"type": "boolean",
|
|
1744
|
+
"default": False,
|
|
1745
|
+
"description": "Sync all enabled sources",
|
|
1746
|
+
},
|
|
1747
|
+
"full": {
|
|
1748
|
+
"type": "boolean",
|
|
1749
|
+
"default": False,
|
|
1750
|
+
"description": "Ignore incremental state and do full resync",
|
|
1751
|
+
},
|
|
1752
|
+
"doc_ids": {
|
|
1753
|
+
"type": "array",
|
|
1754
|
+
"items": {"type": "string"},
|
|
1755
|
+
"description": (
|
|
1756
|
+
"Specific document IDs to sync (for dropbox-paper). "
|
|
1757
|
+
"If provided, only these documents are synced."
|
|
1758
|
+
),
|
|
1759
|
+
},
|
|
1760
|
+
},
|
|
1761
|
+
},
|
|
1762
|
+
),
|
|
1763
|
+
Tool(
|
|
1764
|
+
name="trigger_rescan",
|
|
1765
|
+
description=(
|
|
1766
|
+
"Check indexed files for changes and re-ingest stale ones. "
|
|
1767
|
+
"Compares stored modification times with current filesystem. "
|
|
1768
|
+
"Requires write permission."
|
|
1769
|
+
),
|
|
1770
|
+
inputSchema={
|
|
1771
|
+
"type": "object",
|
|
1772
|
+
"properties": {
|
|
1773
|
+
"dry_run": {
|
|
1774
|
+
"type": "boolean",
|
|
1775
|
+
"default": False,
|
|
1776
|
+
"description": "Only report what would change, don't actually re-ingest",
|
|
1777
|
+
},
|
|
1778
|
+
"delete_missing": {
|
|
1779
|
+
"type": "boolean",
|
|
1780
|
+
"default": False,
|
|
1781
|
+
"description": "Remove documents for files that no longer exist",
|
|
1782
|
+
},
|
|
1783
|
+
},
|
|
1784
|
+
},
|
|
1785
|
+
),
|
|
1786
|
+
Tool(
|
|
1787
|
+
name="list_sync_sources",
|
|
1788
|
+
description=(
|
|
1789
|
+
"List available API sync sources (Todoist, GitHub, Dropbox Paper, etc.) "
|
|
1790
|
+
"with their enabled/disabled status and last sync time. "
|
|
1791
|
+
"Use this to see what external data sources can be synced."
|
|
1792
|
+
),
|
|
1793
|
+
inputSchema={
|
|
1794
|
+
"type": "object",
|
|
1795
|
+
"properties": {},
|
|
1796
|
+
},
|
|
1797
|
+
),
|
|
1798
|
+
Tool(
|
|
1799
|
+
name="enrich_document",
|
|
1800
|
+
description=(
|
|
1801
|
+
"Run LLM enrichment on a document to extract TODOs and entities. "
|
|
1802
|
+
"TODOs are created as separate documents, entities go to pending review. "
|
|
1803
|
+
"Requires write permission."
|
|
1804
|
+
),
|
|
1805
|
+
inputSchema={
|
|
1806
|
+
"type": "object",
|
|
1807
|
+
"properties": {
|
|
1808
|
+
"source_path": {
|
|
1809
|
+
"type": "string",
|
|
1810
|
+
"description": "Path of the document to enrich",
|
|
1811
|
+
},
|
|
1812
|
+
"extract_todos": {
|
|
1813
|
+
"type": "boolean",
|
|
1814
|
+
"default": True,
|
|
1815
|
+
"description": "Whether to extract TODOs",
|
|
1816
|
+
},
|
|
1817
|
+
"extract_entities": {
|
|
1818
|
+
"type": "boolean",
|
|
1819
|
+
"default": True,
|
|
1820
|
+
"description": "Whether to extract entities",
|
|
1821
|
+
},
|
|
1822
|
+
"auto_create_entities": {
|
|
1823
|
+
"type": "boolean",
|
|
1824
|
+
"default": False,
|
|
1825
|
+
"description": "Auto-create entities instead of pending review",
|
|
1826
|
+
},
|
|
1827
|
+
"use_local": {
|
|
1828
|
+
"type": "boolean",
|
|
1829
|
+
"default": False,
|
|
1830
|
+
"description": "Use local CPU embedding instead of Modal GPU",
|
|
1831
|
+
},
|
|
1832
|
+
},
|
|
1833
|
+
"required": ["source_path"],
|
|
1834
|
+
},
|
|
1835
|
+
),
|
|
1836
|
+
Tool(
|
|
1837
|
+
name="list_pending_entities",
|
|
1838
|
+
description=(
|
|
1839
|
+
"List entity suggestions awaiting review. "
|
|
1840
|
+
"Entities are extracted from documents but need approval before becoming searchable. "
|
|
1841
|
+
"Use approve_entity or reject_entity to process them."
|
|
1842
|
+
),
|
|
1843
|
+
inputSchema={
|
|
1844
|
+
"type": "object",
|
|
1845
|
+
"properties": {
|
|
1846
|
+
"entity_type": {
|
|
1847
|
+
"type": "string",
|
|
1848
|
+
"enum": ["person", "project", "technology", "concept", "organization"],
|
|
1849
|
+
"description": "Filter by entity type (optional)",
|
|
1850
|
+
},
|
|
1851
|
+
"limit": {
|
|
1852
|
+
"type": "integer",
|
|
1853
|
+
"default": 20,
|
|
1854
|
+
"description": "Maximum results",
|
|
1855
|
+
},
|
|
1856
|
+
},
|
|
1857
|
+
},
|
|
1858
|
+
),
|
|
1859
|
+
Tool(
|
|
1860
|
+
name="approve_entity",
|
|
1861
|
+
description=(
|
|
1862
|
+
"Approve a pending entity, creating it as a searchable document. "
|
|
1863
|
+
"The entity will be linked to its source document(s). "
|
|
1864
|
+
"Use async=true to return immediately while embedding runs in background. "
|
|
1865
|
+
"Requires write permission."
|
|
1866
|
+
),
|
|
1867
|
+
inputSchema={
|
|
1868
|
+
"type": "object",
|
|
1869
|
+
"properties": {
|
|
1870
|
+
"pending_id": {
|
|
1871
|
+
"type": "string",
|
|
1872
|
+
"description": "ID of the pending entity to approve",
|
|
1873
|
+
},
|
|
1874
|
+
"async": {
|
|
1875
|
+
"type": "boolean",
|
|
1876
|
+
"default": False,
|
|
1877
|
+
"description": (
|
|
1878
|
+
"If true, return immediately while embedding runs in background. "
|
|
1879
|
+
"Useful to avoid blocking when approving multiple entities."
|
|
1880
|
+
),
|
|
1881
|
+
},
|
|
1882
|
+
"use_local": {
|
|
1883
|
+
"type": "boolean",
|
|
1884
|
+
"default": False,
|
|
1885
|
+
"description": "Use local CPU embedding instead of Modal GPU",
|
|
1886
|
+
},
|
|
1887
|
+
},
|
|
1888
|
+
"required": ["pending_id"],
|
|
1889
|
+
},
|
|
1890
|
+
),
|
|
1891
|
+
Tool(
|
|
1892
|
+
name="reject_entity",
|
|
1893
|
+
description=(
|
|
1894
|
+
"Reject a pending entity suggestion. "
|
|
1895
|
+
"The entity will be marked as rejected and not shown again. "
|
|
1896
|
+
"Requires write permission."
|
|
1897
|
+
),
|
|
1898
|
+
inputSchema={
|
|
1899
|
+
"type": "object",
|
|
1900
|
+
"properties": {
|
|
1901
|
+
"pending_id": {
|
|
1902
|
+
"type": "string",
|
|
1903
|
+
"description": "ID of the pending entity to reject",
|
|
1904
|
+
},
|
|
1905
|
+
},
|
|
1906
|
+
"required": ["pending_id"],
|
|
1907
|
+
},
|
|
1908
|
+
),
|
|
1909
|
+
Tool(
|
|
1910
|
+
name="analyze_knowledge_base",
|
|
1911
|
+
description=(
|
|
1912
|
+
"Analyze the knowledge base to generate or update its description and topics. "
|
|
1913
|
+
"Uses entity data and document samples to understand themes and content. "
|
|
1914
|
+
"Results are stored in database_metadata for future sessions. "
|
|
1915
|
+
"Requires write permission."
|
|
1916
|
+
),
|
|
1917
|
+
inputSchema={
|
|
1918
|
+
"type": "object",
|
|
1919
|
+
"properties": {
|
|
1920
|
+
"project": {
|
|
1921
|
+
"type": "string",
|
|
1922
|
+
"description": "Analyze only a specific project (optional)",
|
|
1923
|
+
},
|
|
1924
|
+
"sample_size": {
|
|
1925
|
+
"type": "integer",
|
|
1926
|
+
"description": "Number of documents to sample (default: 15)",
|
|
1927
|
+
"default": 15,
|
|
1928
|
+
},
|
|
1929
|
+
"auto_update": {
|
|
1930
|
+
"type": "boolean",
|
|
1931
|
+
"description": "Update database metadata with results (default: true)",
|
|
1932
|
+
"default": True,
|
|
1933
|
+
},
|
|
1934
|
+
},
|
|
1935
|
+
},
|
|
1936
|
+
),
|
|
1937
|
+
Tool(
|
|
1938
|
+
name="find_entity_duplicates",
|
|
1939
|
+
description=(
|
|
1940
|
+
"Scan for potential duplicate entities using embedding similarity and LLM. "
|
|
1941
|
+
"Returns pairs of entities that may refer to the same thing. "
|
|
1942
|
+
"Use merge_entities or list_pending_merges to act on results."
|
|
1943
|
+
),
|
|
1944
|
+
inputSchema={
|
|
1945
|
+
"type": "object",
|
|
1946
|
+
"properties": {
|
|
1947
|
+
"similarity_threshold": {
|
|
1948
|
+
"type": "number",
|
|
1949
|
+
"description": "Minimum similarity to consider duplicates (default: 0.85)",
|
|
1950
|
+
"default": 0.85,
|
|
1951
|
+
},
|
|
1952
|
+
"entity_type": {
|
|
1953
|
+
"type": "string",
|
|
1954
|
+
"enum": ["person", "project", "technology", "concept", "organization"],
|
|
1955
|
+
"description": "Filter to specific entity type (optional)",
|
|
1956
|
+
},
|
|
1957
|
+
"use_llm": {
|
|
1958
|
+
"type": "boolean",
|
|
1959
|
+
"description": "Use LLM for batch duplicate detection (default: true)",
|
|
1960
|
+
"default": True,
|
|
1961
|
+
},
|
|
1962
|
+
},
|
|
1963
|
+
},
|
|
1964
|
+
),
|
|
1965
|
+
Tool(
|
|
1966
|
+
name="merge_entities",
|
|
1967
|
+
description=(
|
|
1968
|
+
"Merge two entities: redirect refs from duplicate to canonical, "
|
|
1969
|
+
"add duplicate's name as alias, delete duplicate. "
|
|
1970
|
+
"Requires write permission."
|
|
1971
|
+
),
|
|
1972
|
+
inputSchema={
|
|
1973
|
+
"type": "object",
|
|
1974
|
+
"properties": {
|
|
1975
|
+
"canonical_path": {
|
|
1976
|
+
"type": "string",
|
|
1977
|
+
"description": "Source path of the entity to keep",
|
|
1978
|
+
},
|
|
1979
|
+
"duplicate_path": {
|
|
1980
|
+
"type": "string",
|
|
1981
|
+
"description": "Source path of the entity to merge into canonical",
|
|
1982
|
+
},
|
|
1983
|
+
},
|
|
1984
|
+
"required": ["canonical_path", "duplicate_path"],
|
|
1985
|
+
},
|
|
1986
|
+
),
|
|
1987
|
+
Tool(
|
|
1988
|
+
name="list_pending_merges",
|
|
1989
|
+
description=(
|
|
1990
|
+
"List pending entity merge proposals awaiting approval. "
|
|
1991
|
+
"Created by find_entity_duplicates or run_consolidation."
|
|
1992
|
+
),
|
|
1993
|
+
inputSchema={
|
|
1994
|
+
"type": "object",
|
|
1995
|
+
"properties": {
|
|
1996
|
+
"limit": {
|
|
1997
|
+
"type": "integer",
|
|
1998
|
+
"description": "Maximum results (default: 50)",
|
|
1999
|
+
"default": 50,
|
|
2000
|
+
},
|
|
2001
|
+
},
|
|
2002
|
+
},
|
|
2003
|
+
),
|
|
2004
|
+
Tool(
|
|
2005
|
+
name="approve_merge",
|
|
2006
|
+
description=(
|
|
2007
|
+
"Approve a pending entity merge. Executes the merge: "
|
|
2008
|
+
"redirects refs, adds alias, deletes duplicate. "
|
|
2009
|
+
"Requires write permission."
|
|
2010
|
+
),
|
|
2011
|
+
inputSchema={
|
|
2012
|
+
"type": "object",
|
|
2013
|
+
"properties": {
|
|
2014
|
+
"merge_id": {
|
|
2015
|
+
"type": "string",
|
|
2016
|
+
"description": "ID of the pending merge to approve",
|
|
2017
|
+
},
|
|
2018
|
+
},
|
|
2019
|
+
"required": ["merge_id"],
|
|
2020
|
+
},
|
|
2021
|
+
),
|
|
2022
|
+
Tool(
|
|
2023
|
+
name="reject_merge",
|
|
2024
|
+
description=(
|
|
2025
|
+
"Reject a pending entity merge proposal. "
|
|
2026
|
+
"Requires write permission."
|
|
2027
|
+
),
|
|
2028
|
+
inputSchema={
|
|
2029
|
+
"type": "object",
|
|
2030
|
+
"properties": {
|
|
2031
|
+
"merge_id": {
|
|
2032
|
+
"type": "string",
|
|
2033
|
+
"description": "ID of the pending merge to reject",
|
|
2034
|
+
},
|
|
2035
|
+
},
|
|
2036
|
+
"required": ["merge_id"],
|
|
2037
|
+
},
|
|
2038
|
+
),
|
|
2039
|
+
Tool(
|
|
2040
|
+
name="get_topic_clusters",
|
|
2041
|
+
description=(
|
|
2042
|
+
"Get topic clusters - groups of related entities and documents. "
|
|
2043
|
+
"Clusters are created by run_consolidation."
|
|
2044
|
+
),
|
|
2045
|
+
inputSchema={
|
|
2046
|
+
"type": "object",
|
|
2047
|
+
"properties": {
|
|
2048
|
+
"limit": {
|
|
2049
|
+
"type": "integer",
|
|
2050
|
+
"description": "Maximum clusters to return (default: 20)",
|
|
2051
|
+
"default": 20,
|
|
2052
|
+
},
|
|
2053
|
+
},
|
|
2054
|
+
},
|
|
2055
|
+
),
|
|
2056
|
+
Tool(
|
|
2057
|
+
name="get_entity_relationships",
|
|
2058
|
+
description=(
|
|
2059
|
+
"Get relationships between entities (works_for, uses, belongs_to, related_to). "
|
|
2060
|
+
"Relationships are extracted by run_consolidation."
|
|
2061
|
+
),
|
|
2062
|
+
inputSchema={
|
|
2063
|
+
"type": "object",
|
|
2064
|
+
"properties": {
|
|
2065
|
+
"entity_name": {
|
|
2066
|
+
"type": "string",
|
|
2067
|
+
"description": "Filter to relationships involving this entity (optional)",
|
|
2068
|
+
},
|
|
2069
|
+
"limit": {
|
|
2070
|
+
"type": "integer",
|
|
2071
|
+
"description": "Maximum results (default: 50)",
|
|
2072
|
+
"default": 50,
|
|
2073
|
+
},
|
|
2074
|
+
},
|
|
2075
|
+
},
|
|
2076
|
+
),
|
|
2077
|
+
Tool(
|
|
2078
|
+
name="run_consolidation",
|
|
2079
|
+
description=(
|
|
2080
|
+
"Run full entity consolidation pipeline: duplicate detection, "
|
|
2081
|
+
"cross-document entity detection, topic clustering, relationship extraction. "
|
|
2082
|
+
"Creates pending proposals for review. Requires write permission."
|
|
2083
|
+
),
|
|
2084
|
+
inputSchema={
|
|
2085
|
+
"type": "object",
|
|
2086
|
+
"properties": {
|
|
2087
|
+
"detect_duplicates": {
|
|
2088
|
+
"type": "boolean",
|
|
2089
|
+
"description": "Run duplicate entity detection (default: true)",
|
|
2090
|
+
"default": True,
|
|
2091
|
+
},
|
|
2092
|
+
"detect_cross_doc": {
|
|
2093
|
+
"type": "boolean",
|
|
2094
|
+
"description": "Run cross-document entity detection (default: true)",
|
|
2095
|
+
"default": True,
|
|
2096
|
+
},
|
|
2097
|
+
"build_clusters": {
|
|
2098
|
+
"type": "boolean",
|
|
2099
|
+
"description": "Build topic clusters (default: true)",
|
|
2100
|
+
"default": True,
|
|
2101
|
+
},
|
|
2102
|
+
"extract_relationships": {
|
|
2103
|
+
"type": "boolean",
|
|
2104
|
+
"description": "Extract entity relationships (default: true)",
|
|
2105
|
+
"default": True,
|
|
2106
|
+
},
|
|
2107
|
+
"dry_run": {
|
|
2108
|
+
"type": "boolean",
|
|
2109
|
+
"description": "Report what would happen without making changes",
|
|
2110
|
+
"default": False,
|
|
2111
|
+
},
|
|
2112
|
+
},
|
|
2113
|
+
},
|
|
2114
|
+
),
|
|
1028
2115
|
]
|
|
1029
2116
|
|
|
1030
2117
|
|
|
@@ -1187,6 +2274,63 @@ async def call_tool(name: str, arguments: dict[str, Any]) -> CallToolResult:
|
|
|
1187
2274
|
]
|
|
1188
2275
|
)
|
|
1189
2276
|
|
|
2277
|
+
elif name == "get_project_stats":
|
|
2278
|
+
stats = kb.get_project_stats()
|
|
2279
|
+
if not stats:
|
|
2280
|
+
return CallToolResult(content=[TextContent(type="text", text="No projects found.")])
|
|
2281
|
+
output = ["## Project Statistics\n"]
|
|
2282
|
+
output.append("| Project | Documents | Source Types |")
|
|
2283
|
+
output.append("|---------|-----------|--------------|")
|
|
2284
|
+
for s in stats:
|
|
2285
|
+
types = ", ".join(s["source_types"]) if s["source_types"] else "-"
|
|
2286
|
+
output.append(f"| {s['project']} | {s['doc_count']} | {types} |")
|
|
2287
|
+
return CallToolResult(content=[TextContent(type="text", text="\n".join(output))])
|
|
2288
|
+
|
|
2289
|
+
elif name == "list_documents_by_project":
|
|
2290
|
+
project = arguments["project"]
|
|
2291
|
+
limit = arguments.get("limit", 100)
|
|
2292
|
+
docs = kb.list_documents_by_project(project, limit)
|
|
2293
|
+
if not docs:
|
|
2294
|
+
msg = f"No documents found for project '{project}'."
|
|
2295
|
+
return CallToolResult(content=[TextContent(type="text", text=msg)])
|
|
2296
|
+
output = [f"## Documents in '{project}' ({len(docs)} documents)\n"]
|
|
2297
|
+
for d in docs:
|
|
2298
|
+
output.append(f"- **{d['title'] or d['source_path']}** ({d['source_type']})")
|
|
2299
|
+
output.append(f" - `{d['source_path']}`")
|
|
2300
|
+
return CallToolResult(content=[TextContent(type="text", text="\n".join(output))])
|
|
2301
|
+
|
|
2302
|
+
elif name == "rename_project":
|
|
2303
|
+
old_name = arguments["old_name"]
|
|
2304
|
+
new_name = arguments["new_name"]
|
|
2305
|
+
if old_name == new_name:
|
|
2306
|
+
return CallToolResult(
|
|
2307
|
+
content=[TextContent(type="text", text="Old and new names are the same.")]
|
|
2308
|
+
)
|
|
2309
|
+
count = kb.rename_project(old_name, new_name)
|
|
2310
|
+
if count == 0:
|
|
2311
|
+
return CallToolResult(
|
|
2312
|
+
content=[TextContent(type="text", text=f"No documents found with project '{old_name}'.")]
|
|
2313
|
+
)
|
|
2314
|
+
return CallToolResult(
|
|
2315
|
+
content=[TextContent(type="text", text=f"Renamed project '{old_name}' to '{new_name}' ({count} documents updated).")]
|
|
2316
|
+
)
|
|
2317
|
+
|
|
2318
|
+
elif name == "set_document_project":
|
|
2319
|
+
source_path = arguments["source_path"]
|
|
2320
|
+
project = arguments.get("project")
|
|
2321
|
+
success = kb.set_document_project(source_path, project)
|
|
2322
|
+
if not success:
|
|
2323
|
+
return CallToolResult(
|
|
2324
|
+
content=[TextContent(type="text", text=f"Document not found: {source_path}")]
|
|
2325
|
+
)
|
|
2326
|
+
if project:
|
|
2327
|
+
return CallToolResult(
|
|
2328
|
+
content=[TextContent(type="text", text=f"Set project to '{project}' for {source_path}")]
|
|
2329
|
+
)
|
|
2330
|
+
return CallToolResult(
|
|
2331
|
+
content=[TextContent(type="text", text=f"Cleared project for {source_path}")]
|
|
2332
|
+
)
|
|
2333
|
+
|
|
1190
2334
|
elif name == "recent_documents":
|
|
1191
2335
|
docs = kb.get_recent_documents(arguments.get("limit", 10))
|
|
1192
2336
|
if not docs:
|
|
@@ -1242,13 +2386,13 @@ async def call_tool(name: str, arguments: dict[str, Any]) -> CallToolResult:
|
|
|
1242
2386
|
deleted = kb.delete_knowledge(arguments["source_path"])
|
|
1243
2387
|
if deleted:
|
|
1244
2388
|
return CallToolResult(
|
|
1245
|
-
content=[TextContent(type="text", text="
|
|
2389
|
+
content=[TextContent(type="text", text="Document deleted.")]
|
|
1246
2390
|
)
|
|
1247
2391
|
return CallToolResult(
|
|
1248
2392
|
content=[
|
|
1249
2393
|
TextContent(
|
|
1250
2394
|
type="text",
|
|
1251
|
-
text="Could not delete.
|
|
2395
|
+
text="Could not delete. Document not found.",
|
|
1252
2396
|
)
|
|
1253
2397
|
]
|
|
1254
2398
|
)
|
|
@@ -1358,6 +2502,129 @@ async def call_tool(name: str, arguments: dict[str, Any]) -> CallToolResult:
|
|
|
1358
2502
|
parts.append(f"- Due: {result['due_date']}")
|
|
1359
2503
|
return CallToolResult(content=[TextContent(type="text", text="\n".join(parts))])
|
|
1360
2504
|
|
|
2505
|
+
elif name == "trigger_sync":
|
|
2506
|
+
result = _run_sync(
|
|
2507
|
+
kb.db_url,
|
|
2508
|
+
sources=arguments.get("sources", []),
|
|
2509
|
+
sync_all=arguments.get("all", False),
|
|
2510
|
+
full=arguments.get("full", False),
|
|
2511
|
+
doc_ids=arguments.get("doc_ids"),
|
|
2512
|
+
)
|
|
2513
|
+
return CallToolResult(content=[TextContent(type="text", text=result)])
|
|
2514
|
+
|
|
2515
|
+
elif name == "trigger_rescan":
|
|
2516
|
+
result = _run_rescan(
|
|
2517
|
+
kb.db_url,
|
|
2518
|
+
dry_run=arguments.get("dry_run", False),
|
|
2519
|
+
delete_missing=arguments.get("delete_missing", False),
|
|
2520
|
+
)
|
|
2521
|
+
return CallToolResult(content=[TextContent(type="text", text=result)])
|
|
2522
|
+
|
|
2523
|
+
elif name == "list_sync_sources":
|
|
2524
|
+
db_name = config.get_database().name
|
|
2525
|
+
result = _list_sync_sources(kb.db_url, db_name)
|
|
2526
|
+
return CallToolResult(content=[TextContent(type="text", text=result)])
|
|
2527
|
+
|
|
2528
|
+
elif name == "enrich_document":
|
|
2529
|
+
result = _enrich_document(
|
|
2530
|
+
kb.db_url,
|
|
2531
|
+
source_path=arguments["source_path"],
|
|
2532
|
+
extract_todos=arguments.get("extract_todos", True),
|
|
2533
|
+
extract_entities=arguments.get("extract_entities", True),
|
|
2534
|
+
auto_create_entities=arguments.get("auto_create_entities", False),
|
|
2535
|
+
use_modal=not arguments.get("use_local", False),
|
|
2536
|
+
)
|
|
2537
|
+
return CallToolResult(content=[TextContent(type="text", text=result)])
|
|
2538
|
+
|
|
2539
|
+
elif name == "list_pending_entities":
|
|
2540
|
+
result = _list_pending_entities(
|
|
2541
|
+
kb.db_url,
|
|
2542
|
+
entity_type=arguments.get("entity_type"),
|
|
2543
|
+
limit=arguments.get("limit", 20),
|
|
2544
|
+
)
|
|
2545
|
+
return CallToolResult(content=[TextContent(type="text", text=result)])
|
|
2546
|
+
|
|
2547
|
+
elif name == "approve_entity":
|
|
2548
|
+
result = _approve_entity(
|
|
2549
|
+
kb.db_url,
|
|
2550
|
+
arguments["pending_id"],
|
|
2551
|
+
use_modal=not arguments.get("use_local", False),
|
|
2552
|
+
run_async=arguments.get("async", False),
|
|
2553
|
+
)
|
|
2554
|
+
return CallToolResult(content=[TextContent(type="text", text=result)])
|
|
2555
|
+
|
|
2556
|
+
elif name == "reject_entity":
|
|
2557
|
+
result = _reject_entity(kb.db_url, arguments["pending_id"])
|
|
2558
|
+
return CallToolResult(content=[TextContent(type="text", text=result)])
|
|
2559
|
+
|
|
2560
|
+
elif name == "analyze_knowledge_base":
|
|
2561
|
+
result = _analyze_knowledge_base(
|
|
2562
|
+
kb.db_url,
|
|
2563
|
+
project=arguments.get("project"),
|
|
2564
|
+
sample_size=arguments.get("sample_size", 15),
|
|
2565
|
+
auto_update=arguments.get("auto_update", True),
|
|
2566
|
+
)
|
|
2567
|
+
return CallToolResult(content=[TextContent(type="text", text=result)])
|
|
2568
|
+
|
|
2569
|
+
elif name == "find_entity_duplicates":
|
|
2570
|
+
result = _find_entity_duplicates(
|
|
2571
|
+
kb.db_url,
|
|
2572
|
+
similarity_threshold=arguments.get("similarity_threshold", 0.85),
|
|
2573
|
+
entity_type=arguments.get("entity_type"),
|
|
2574
|
+
use_llm=arguments.get("use_llm", True),
|
|
2575
|
+
)
|
|
2576
|
+
return CallToolResult(content=[TextContent(type="text", text=result)])
|
|
2577
|
+
|
|
2578
|
+
elif name == "merge_entities":
|
|
2579
|
+
result = _merge_entities(
|
|
2580
|
+
kb.db_url,
|
|
2581
|
+
canonical_path=arguments["canonical_path"],
|
|
2582
|
+
duplicate_path=arguments["duplicate_path"],
|
|
2583
|
+
)
|
|
2584
|
+
return CallToolResult(content=[TextContent(type="text", text=result)])
|
|
2585
|
+
|
|
2586
|
+
elif name == "list_pending_merges":
|
|
2587
|
+
result = _list_pending_merges(
|
|
2588
|
+
kb.db_url,
|
|
2589
|
+
limit=arguments.get("limit", 50),
|
|
2590
|
+
)
|
|
2591
|
+
return CallToolResult(content=[TextContent(type="text", text=result)])
|
|
2592
|
+
|
|
2593
|
+
elif name == "approve_merge":
|
|
2594
|
+
result = _approve_merge(kb.db_url, arguments["merge_id"])
|
|
2595
|
+
return CallToolResult(content=[TextContent(type="text", text=result)])
|
|
2596
|
+
|
|
2597
|
+
elif name == "reject_merge":
|
|
2598
|
+
result = _reject_merge(kb.db_url, arguments["merge_id"])
|
|
2599
|
+
return CallToolResult(content=[TextContent(type="text", text=result)])
|
|
2600
|
+
|
|
2601
|
+
elif name == "get_topic_clusters":
|
|
2602
|
+
result = _get_topic_clusters(
|
|
2603
|
+
kb.db_url,
|
|
2604
|
+
limit=arguments.get("limit", 20),
|
|
2605
|
+
)
|
|
2606
|
+
return CallToolResult(content=[TextContent(type="text", text=result)])
|
|
2607
|
+
|
|
2608
|
+
elif name == "get_entity_relationships":
|
|
2609
|
+
result = _get_entity_relationships(
|
|
2610
|
+
kb.db_url,
|
|
2611
|
+
entity_name=arguments.get("entity_name"),
|
|
2612
|
+
relationship_type=arguments.get("relationship_type"),
|
|
2613
|
+
limit=arguments.get("limit", 50),
|
|
2614
|
+
)
|
|
2615
|
+
return CallToolResult(content=[TextContent(type="text", text=result)])
|
|
2616
|
+
|
|
2617
|
+
elif name == "run_consolidation":
|
|
2618
|
+
result = _run_consolidation(
|
|
2619
|
+
kb.db_url,
|
|
2620
|
+
detect_duplicates=arguments.get("detect_duplicates", True),
|
|
2621
|
+
detect_cross_doc=arguments.get("detect_cross_doc", True),
|
|
2622
|
+
build_clusters=arguments.get("build_clusters", True),
|
|
2623
|
+
extract_relationships=arguments.get("extract_relationships", True),
|
|
2624
|
+
dry_run=arguments.get("dry_run", False),
|
|
2625
|
+
)
|
|
2626
|
+
return CallToolResult(content=[TextContent(type="text", text=result)])
|
|
2627
|
+
|
|
1361
2628
|
else:
|
|
1362
2629
|
return CallToolResult(content=[TextContent(type="text", text=f"Unknown tool: {name}")])
|
|
1363
2630
|
|