okb 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- okb/cli.py +1209 -16
- okb/config.py +122 -4
- okb/http_server.py +208 -2
- okb/llm/analyze.py +524 -0
- okb/llm/consolidate.py +685 -0
- okb/llm/enrich.py +723 -0
- okb/llm/extractors/__init__.py +13 -0
- okb/llm/extractors/base.py +44 -0
- okb/llm/extractors/cross_doc.py +478 -0
- okb/llm/extractors/dedup.py +499 -0
- okb/llm/extractors/entity.py +369 -0
- okb/llm/extractors/todo.py +149 -0
- okb/llm/providers.py +9 -6
- okb/mcp_server.py +1279 -12
- okb/migrations/0008.enrichment.sql +46 -0
- okb/migrations/0009.entity-consolidation.sql +120 -0
- okb/migrations/0010.token-id.sql +7 -0
- okb/modal_llm.py +26 -8
- okb/plugins/sources/__init__.py +2 -1
- okb/plugins/sources/dropbox_paper.py +44 -9
- okb/plugins/sources/github.py +5 -5
- okb/plugins/sources/todoist.py +254 -0
- okb/tokens.py +25 -3
- {okb-1.0.0.dist-info → okb-1.1.0.dist-info}/METADATA +119 -68
- okb-1.1.0.dist-info/RECORD +49 -0
- {okb-1.0.0.dist-info → okb-1.1.0.dist-info}/entry_points.txt +1 -0
- okb-1.0.0.dist-info/RECORD +0 -36
- {okb-1.0.0.dist-info → okb-1.1.0.dist-info}/WHEEL +0 -0
okb/llm/enrich.py
ADDED
@@ -0,0 +1,723 @@
"""Document enrichment orchestration - extract TODOs and entities from documents."""

from __future__ import annotations

import hashlib
import sys
import uuid
from concurrent.futures import Future, ThreadPoolExecutor
from dataclasses import dataclass, field
from typing import Any

import psycopg
from psycopg.rows import dict_row

from .extractors import (
    EnrichmentResult,
    ExtractedEntity,
    ExtractedTodo,
    extract_entities,
    extract_todos,
)
from .extractors.entity import entity_source_path

# Global thread pool for async embedding operations
_executor: ThreadPoolExecutor | None = None


def _get_executor() -> ThreadPoolExecutor:
    """Get or create the global thread pool for async operations."""
    global _executor
    if _executor is None:
        _executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="enrich-embed")
    return _executor


def shutdown_executor(wait: bool = True) -> None:
    """Shutdown the global executor. Call at end of session."""
    global _executor
    if _executor is not None:
        _executor.shutdown(wait=wait)
        _executor = None


def _get_embedder(use_modal: bool = False):
    """Get embedder based on configuration.

    Args:
        use_modal: If True, try to use Modal GPU embedder; fall back to local on failure

    Returns:
        Embedder object with embed_document(text) method
    """
    if use_modal:
        try:
            import modal

            modal_embedder = modal.Cls.from_name("knowledge-embedder", "Embedder")()

            class ModalEmbedderWrapper:
                """Wrapper to provide consistent interface."""

                def __init__(self, embedder):
                    self._embedder = embedder

                def embed_document(self, text: str) -> list[float]:
                    return self._embedder.embed_single.remote(text, is_query=False)

            return ModalEmbedderWrapper(modal_embedder)
        except Exception as e:
            print(f"Modal unavailable ({e}), using local CPU embedding", file=sys.stderr)

    # Fall back to local embedder
    from ..local_embedder import embed_document

    class LocalEmbedderWrapper:
        """Wrapper to provide consistent interface."""

        def embed_document(self, text: str) -> list[float]:
            return embed_document(text)

    return LocalEmbedderWrapper()


@dataclass
class EnrichmentConfig:
    """Configuration for document enrichment."""

    enabled: bool = True
    version: int = 1

    # What to extract
    extract_todos: bool = True
    extract_entities: bool = True

    # Auto-create behavior
    auto_create_todos: bool = True
    auto_create_entities: bool = False  # Entities go to pending by default

    # Confidence thresholds
    min_confidence_todo: float = 0.7
    min_confidence_entity: float = 0.8

    # Source types to auto-enrich during ingest
    auto_enrich_source_types: set[str] = field(
        default_factory=lambda: {"markdown", "org", "text"}
    )

    @classmethod
    def from_config(cls, cfg: dict) -> EnrichmentConfig:
        """Create from config dict."""
        auto_enrich = cfg.get("auto_enrich", {})
        auto_enrich_types = {k for k, v in auto_enrich.items() if v}

        default_types = {"markdown", "org", "text"}
        return cls(
            enabled=cfg.get("enabled", True),
            version=cfg.get("version", 1),
            extract_todos=cfg.get("extract_todos", True),
            extract_entities=cfg.get("extract_entities", True),
            auto_create_todos=cfg.get("auto_create_todos", True),
            auto_create_entities=cfg.get("auto_create_entities", False),
            min_confidence_todo=cfg.get("min_confidence_todo", 0.7),
            min_confidence_entity=cfg.get("min_confidence_entity", 0.8),
            auto_enrich_source_types=auto_enrich_types if auto_enrich_types else default_types,
        )


def enrich_document(
    content: str,
    title: str,
    source_type: str,
    config: EnrichmentConfig | None = None,
) -> EnrichmentResult:
    """Enrich a single document with extracted TODOs and entities.

    Args:
        content: Document content
        title: Document title
        source_type: Source type of the document
        config: Enrichment configuration

    Returns:
        EnrichmentResult with extracted TODOs and entities
    """
    if config is None:
        config = EnrichmentConfig()

    result = EnrichmentResult()

    if config.extract_todos:
        result.todos = extract_todos(
            content=content,
            title=title,
            source_type=source_type,
            min_confidence=config.min_confidence_todo,
        )

    if config.extract_entities:
        result.entities = extract_entities(
            content=content,
            title=title,
            source_type=source_type,
            min_confidence=config.min_confidence_entity,
        )

    return result


def _create_todo_document(
    todo: ExtractedTodo,
    parent_source_path: str,
    parent_title: str,
    db_url: str,
    project: str | None = None,
    use_modal: bool = False,
) -> str | None:
    """Create a TODO document from an extracted TODO.

    Returns the source_path of the created document, or None if creation failed.
    """
    embedder = _get_embedder(use_modal)

    # Generate unique source path
    todo_id = str(uuid.uuid4())[:8]
    source_path = f"{parent_source_path}::todo/{todo_id}"

    # Build content
    content = todo.title
    if todo.content:
        content += f"\n\n{todo.content}"
    if todo.source_context:
        content += f"\n\n---\nExtracted from: {todo.source_context}"

    # Build metadata
    metadata: dict[str, Any] = {
        "source": "enrichment",
        "parent_document": parent_source_path,
        "parent_title": parent_title,
    }
    if project:
        metadata["project"] = project
    if todo.assignee:
        metadata["assignee"] = todo.assignee

    # Content hash
    content_hash = hashlib.sha256(content.encode()).hexdigest()[:16]

    # Build embedding text
    embedding_parts = [f"TODO: {todo.title}"]
    if project:
        embedding_parts.append(f"Project: {project}")
    if todo.content:
        embedding_parts.append(f"Details: {todo.content}")
    embedding_text = "\n".join(embedding_parts)

    embedding = embedder.embed_document(embedding_text)

    with psycopg.connect(db_url, row_factory=dict_row) as conn:
        try:
            doc_id = conn.execute(
                """
                INSERT INTO documents (
                    source_path, source_type, title, content, metadata, content_hash,
                    status, priority, due_date
                )
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (content_hash) DO NOTHING
                RETURNING id
                """,
                (
                    source_path,
                    "enriched-todo",
                    todo.title,
                    content,
                    psycopg.types.json.Json(metadata),
                    content_hash,
                    "pending",
                    todo.priority,
                    todo.due_date,
                ),
            ).fetchone()

            if doc_id is None:
                return None

            # Insert chunk
            token_count = len(content) // 4
            conn.execute(
                """
                INSERT INTO chunks (document_id, chunk_index, content, embedding_text, embedding, token_count, metadata)
                VALUES (%s, %s, %s, %s, %s, %s, %s)
                """,
                (
                    doc_id["id"],
                    0,
                    content,
                    embedding_text,
                    embedding,
                    token_count,
                    psycopg.types.json.Json({}),
                ),
            )
            conn.commit()
            return source_path
        except Exception as e:
            print(f"Error creating TODO document: {e}", file=sys.stderr)
            return None


def _create_pending_entity(
    entity: ExtractedEntity,
    source_document_id: str,
    db_url: str,
) -> str | None:
    """Create a pending entity suggestion.

    Returns the pending entity ID, or None if creation failed.
    """
    with psycopg.connect(db_url, row_factory=dict_row) as conn:
        try:
            result = conn.execute(
                """
                INSERT INTO pending_entities (
                    source_document_id, entity_name, entity_type, aliases,
                    description, mentions, confidence, status
                )
                VALUES (%s, %s, %s, %s, %s, %s, %s, 'pending')
                RETURNING id
                """,
                (
                    source_document_id,
                    entity.name,
                    entity.entity_type,
                    psycopg.types.json.Json(entity.aliases),
                    entity.description,
                    psycopg.types.json.Json(entity.mentions),
                    entity.confidence,
                ),
            ).fetchone()
            conn.commit()
            return str(result["id"]) if result else None
        except Exception as e:
            print(f"Error creating pending entity: {e}", file=sys.stderr)
            return None


def _create_entity_document(
    entity: ExtractedEntity,
    source_document_id: str,
    db_url: str,
    use_modal: bool = False,
) -> str | None:
    """Create an entity document and add entity_refs.

    Returns the source_path of the created/existing entity document.
    """
    embedder = _get_embedder(use_modal)

    source_path = entity_source_path(entity.entity_type, entity.name)

    # Build content
    content_parts = [f"# {entity.name}"]
    content_parts.append(f"Type: {entity.entity_type}")
    if entity.aliases:
        content_parts.append(f"Also known as: {', '.join(entity.aliases)}")
    if entity.description:
        content_parts.append(f"\n{entity.description}")
    content = "\n".join(content_parts)

    # Build metadata
    metadata = {
        "source": "enrichment",
        "entity_type": entity.entity_type,
        "aliases": entity.aliases,
    }

    content_hash = hashlib.sha256(content.encode()).hexdigest()[:16]

    # Build embedding text
    embedding_parts = [f"Entity: {entity.name}", f"Type: {entity.entity_type}"]
    if entity.aliases:
        embedding_parts.append(f"Aliases: {', '.join(entity.aliases)}")
    if entity.description:
        embedding_parts.append(entity.description)
    embedding_text = "\n".join(embedding_parts)

    with psycopg.connect(db_url, row_factory=dict_row) as conn:
        try:
            # Check if entity document already exists
            existing = conn.execute(
                "SELECT id FROM documents WHERE source_path = %s",
                (source_path,),
            ).fetchone()

            if existing:
                entity_doc_id = existing["id"]
                # Update existing document
                conn.execute(
                    """
                    UPDATE documents SET
                        content = %s,
                        metadata = %s,
                        updated_at = NOW()
                    WHERE id = %s
                    """,
                    (content, psycopg.types.json.Json(metadata), entity_doc_id),
                )
            else:
                # Create new entity document
                embedding = embedder.embed_document(embedding_text)

                result = conn.execute(
                    """
                    INSERT INTO documents (
                        source_path, source_type, title, content, metadata, content_hash
                    )
                    VALUES (%s, %s, %s, %s, %s, %s)
                    RETURNING id
                    """,
                    (
                        source_path,
                        "entity",
                        entity.name,
                        content,
                        psycopg.types.json.Json(metadata),
                        content_hash,
                    ),
                ).fetchone()

                entity_doc_id = result["id"]

                # Insert chunk
                token_count = len(content) // 4
                conn.execute(
                    """
                    INSERT INTO chunks (document_id, chunk_index, content, embedding_text, embedding, token_count, metadata)
                    VALUES (%s, %s, %s, %s, %s, %s, %s)
                    """,
                    (
                        entity_doc_id,
                        0,
                        content,
                        embedding_text,
                        embedding,
                        token_count,
                        psycopg.types.json.Json({}),
                    ),
                )

            # Add entity reference linking entity to source document
            for mention in entity.mentions or [entity.name]:
                conn.execute(
                    """
                    INSERT INTO entity_refs (entity_id, document_id, mention_text, confidence)
                    VALUES (%s, %s, %s, %s)
                    ON CONFLICT (entity_id, document_id, mention_text) DO NOTHING
                    """,
                    (entity_doc_id, source_document_id, mention[:500], entity.confidence),
                )

            conn.commit()
            return source_path
        except Exception as e:
            print(f"Error creating entity document: {e}", file=sys.stderr)
            return None


def process_enrichment(
    document_id: str,
    source_path: str,
    title: str,
    content: str,
    source_type: str,
    db_url: str,
    config: EnrichmentConfig | None = None,
    project: str | None = None,
    use_modal: bool = False,
) -> dict:
    """Run enrichment on a document and store results.

    Args:
        document_id: UUID of the document
        source_path: Source path of the document
        title: Document title
        content: Document content
        source_type: Type of document
        db_url: Database URL
        config: Enrichment configuration
        project: Project name for TODOs
        use_modal: If True, use Modal GPU for embedding; else local CPU

    Returns:
        Dict with counts: {todos_created, entities_pending, entities_created}
    """
    if config is None:
        config = EnrichmentConfig()

    result = enrich_document(content, title, source_type, config)

    stats = {
        "todos_created": 0,
        "entities_pending": 0,
        "entities_created": 0,
    }

    # Process TODOs
    if result.todos and config.auto_create_todos:
        for todo in result.todos:
            if _create_todo_document(todo, source_path, title, db_url, project, use_modal):
                stats["todos_created"] += 1

    # Process entities
    for entity in result.entities:
        if config.auto_create_entities:
            if _create_entity_document(entity, document_id, db_url, use_modal):
                stats["entities_created"] += 1
        else:
            if _create_pending_entity(entity, document_id, db_url):
                stats["entities_pending"] += 1

    # Mark document as enriched
    with psycopg.connect(db_url) as conn:
        conn.execute(
            """
            UPDATE documents
            SET enriched_at = NOW(), enrichment_version = %s
            WHERE id = %s
            """,
            (config.version, document_id),
        )
        conn.commit()

    return stats


def get_unenriched_documents(
    db_url: str,
    source_type: str | None = None,
    project: str | None = None,
    query: str | None = None,
    path_pattern: str | None = None,
    enrichment_version: int | None = None,
    limit: int = 100,
) -> list[dict]:
    """Get documents that need enrichment.

    Args:
        db_url: Database URL
        source_type: Filter by source type
        project: Filter by project
        query: Semantic search query to filter documents
        path_pattern: SQL LIKE pattern to filter source_path (e.g., '%myrepo%')
        enrichment_version: Only include docs with older version (for re-enrichment)
        limit: Maximum documents to return

    Returns:
        List of document dicts with id, source_path, title, content, source_type
    """
    from ..local_embedder import embed_query

    with psycopg.connect(db_url, row_factory=dict_row) as conn:
        sql = """
            SELECT d.id, d.source_path, d.title, d.content, d.source_type, d.metadata
            FROM documents d
            WHERE (d.enriched_at IS NULL
        """
        params: list[Any] = []

        if enrichment_version is not None:
            sql += " OR d.enrichment_version < %s"
            params.append(enrichment_version)

        sql += ")"

        if source_type:
            sql += " AND d.source_type = %s"
            params.append(source_type)

        if project:
            sql += " AND d.metadata->>'project' = %s"
            params.append(project)

        if path_pattern:
            sql += " AND d.source_path LIKE %s"
            params.append(path_pattern)

        # Exclude already-derived documents (escape % for psycopg)
        sql += " AND d.source_path NOT LIKE '%%::todo/%%'"
        sql += " AND d.source_path NOT LIKE 'okb://entity/%%'"
        sql += " AND d.source_path NOT LIKE 'claude://%%'"

        if query:
            # Use semantic search to filter
            from pgvector.psycopg import register_vector

            register_vector(conn)
            embedding = embed_query(query)

            # Use GROUP BY to aggregate chunk similarities per document
            sql = """
                SELECT d.id, d.source_path, d.title, d.content, d.source_type, d.metadata,
                       MIN(c.embedding <=> %s::vector) as distance
                FROM documents d
                JOIN chunks c ON c.document_id = d.id
                WHERE (d.enriched_at IS NULL
            """
            params = [embedding]

            if enrichment_version is not None:
                sql += " OR d.enrichment_version < %s"
                params.append(enrichment_version)

            sql += ")"
            sql += " AND 1 - (c.embedding <=> %s::vector) > 0.3"
            params.append(embedding)

            if source_type:
                sql += " AND d.source_type = %s"
                params.append(source_type)

            if project:
                sql += " AND d.metadata->>'project' = %s"
                params.append(project)

            if path_pattern:
                sql += " AND d.source_path LIKE %s"
                params.append(path_pattern)

            sql += " AND d.source_path NOT LIKE '%%::todo/%%'"
            sql += " AND d.source_path NOT LIKE 'okb://entity/%%'"
            sql += " AND d.source_path NOT LIKE 'claude://%%'"

            sql += " GROUP BY d.id, d.source_path, d.title, d.content, d.source_type, d.metadata"
            sql += " ORDER BY distance"

        sql += f" LIMIT {limit}"

        results = conn.execute(sql, params).fetchall()
        return [dict(r) for r in results]


def list_pending_entities(
    db_url: str,
    entity_type: str | None = None,
    limit: int = 50,
) -> list[dict]:
    """List pending entity suggestions.

    Returns list of dicts with: id, entity_name, entity_type, aliases, description,
    mentions, confidence, source_path, source_title, created_at
    """
    with psycopg.connect(db_url, row_factory=dict_row) as conn:
        sql = """
            SELECT
                pe.id, pe.entity_name, pe.entity_type, pe.aliases, pe.description,
                pe.mentions, pe.confidence, pe.created_at,
                d.source_path as source_path, d.title as source_title
            FROM pending_entities pe
            JOIN documents d ON d.id = pe.source_document_id
            WHERE pe.status = 'pending'
        """
        params: list[Any] = []

        if entity_type:
            sql += " AND pe.entity_type = %s"
            params.append(entity_type)

        sql += " ORDER BY pe.confidence DESC, pe.created_at DESC LIMIT %s"
        params.append(limit)

        results = conn.execute(sql, params).fetchall()
        return [dict(r) for r in results]


def approve_entity(db_url: str, pending_id: str, use_modal: bool = False) -> str | None:
    """Approve a pending entity, creating the entity document.

    Args:
        db_url: Database URL
        pending_id: ID of the pending entity to approve
        use_modal: If True, use Modal GPU for embedding; else local CPU

    Returns the entity source_path, or None if failed.
    """
    with psycopg.connect(db_url, row_factory=dict_row) as conn:
        # Get pending entity
        pe = conn.execute(
            """
            SELECT pe.*, d.id as source_doc_id
            FROM pending_entities pe
            JOIN documents d ON d.id = pe.source_document_id
            WHERE pe.id = %s AND pe.status = 'pending'
            """,
            (pending_id,),
        ).fetchone()

        if not pe:
            return None

        # Create entity from pending
        entity = ExtractedEntity(
            name=pe["entity_name"],
            entity_type=pe["entity_type"],
            aliases=pe["aliases"] or [],
            description=pe["description"],
            mentions=pe["mentions"] or [],
            confidence=pe["confidence"] or 0.8,
        )

        source_path = _create_entity_document(entity, pe["source_doc_id"], db_url, use_modal)

        if source_path:
            # Mark as approved
            conn.execute(
                """
                UPDATE pending_entities
                SET status = 'approved', reviewed_at = NOW()
                WHERE id = %s
                """,
                (pending_id,),
            )
            conn.commit()

        return source_path


def approve_entity_async(
    db_url: str, pending_id: str, use_modal: bool = False
) -> Future[str | None]:
    """Approve a pending entity asynchronously.

    The embedding and document creation happens in a background thread.
    Returns a Future that can be awaited or checked later.

    Args:
        db_url: Database URL
        pending_id: ID of the pending entity to approve
        use_modal: If True, use Modal GPU for embedding; else local CPU

    Returns:
        Future that resolves to the entity source_path, or None if failed.
    """
    executor = _get_executor()
    return executor.submit(approve_entity, db_url, pending_id, use_modal)


def reject_entity(db_url: str, pending_id: str) -> bool:
    """Reject a pending entity.

    Returns True if rejected, False if not found.
    """
    with psycopg.connect(db_url) as conn:
        result = conn.execute(
            """
            UPDATE pending_entities
            SET status = 'rejected', reviewed_at = NOW()
            WHERE id = %s AND status = 'pending'
            RETURNING id
            """,
            (pending_id,),
        ).fetchone()
        conn.commit()
        return result is not None
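
The two wrapper classes inside _get_embedder share a single embed_document(text) surface, so every helper in the module stays agnostic about whether the vector came from the Modal GPU class or the local CPU model. A minimal sketch of that call pattern (_get_embedder is module-private and imported here only for illustration; the sample text is a placeholder, and the Modal app name "knowledge-embedder" is the one the code above looks up):

from okb.llm.enrich import _get_embedder

# Tries the Modal deployment first; on any failure it prints a warning to
# stderr and silently falls back to the local CPU embedder.
embedder = _get_embedder(use_modal=True)
vector = embedder.embed_document("Quarterly planning notes")
print(len(vector))  # dimensionality depends on the configured embedding model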
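
EnrichmentConfig.from_config maps a plain config dict onto the dataclass, with the auto_enrich sub-table collapsing to the set of source types whose flag is truthy (falling back to the markdown/org/text defaults when the sub-table is empty). A sketch of that round trip, using only keys the method above reads; the values are illustrative:

from okb.llm.enrich import EnrichmentConfig

cfg = EnrichmentConfig.from_config(
    {
        "extract_todos": True,
        "auto_create_entities": False,
        "min_confidence_entity": 0.85,
        # Only truthy entries survive into auto_enrich_source_types.
        "auto_enrich": {"markdown": True, "org": False},
    }
)
print(cfg.auto_enrich_source_types)  # {'markdown'}
print(cfg.min_confidence_todo)       # 0.7 (default retained)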
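
End to end, process_enrichment extracts TODOs and entities, writes the derived rows, and stamps enriched_at/enrichment_version on the source document. A usage sketch under assumed inputs (the connection string, UUID, and file path are placeholders, not values from this release):

from okb.llm.enrich import process_enrichment

stats = process_enrichment(
    document_id="6f1c9c1e-...",           # placeholder UUID from the documents table
    source_path="notes/2024-06-sync.md",  # placeholder path
    title="June sync notes",
    content=open("notes/2024-06-sync.md").read(),
    source_type="markdown",
    db_url="postgresql://localhost/okb",  # placeholder connection string
)
# e.g. {'todos_created': 2, 'entities_pending': 3, 'entities_created': 0}
print(stats)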
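
For backfills, the natural pairing is get_unenriched_documents feeding process_enrichment, since the former already excludes derived ::todo/, okb://entity/, and claude:// paths. A sketch of that loop (filters, limit, and connection string are placeholders):

from okb.llm.enrich import get_unenriched_documents, process_enrichment

db_url = "postgresql://localhost/okb"  # placeholder
for doc in get_unenriched_documents(db_url, source_type="markdown", limit=25):
    stats = process_enrichment(
        document_id=str(doc["id"]),
        source_path=doc["source_path"],
        title=doc["title"],
        content=doc["content"],
        source_type=doc["source_type"],
        db_url=db_url,
    )
    print(doc["source_path"], stats)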
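
Pending entities sit in review until approved or rejected; approve_entity_async pushes the embedding and document creation onto the module's two-worker pool so interactive callers are not blocked. A sketch of a review loop (the 0.9 auto-approve cutoff and connection string are arbitrary illustrations, not behavior shipped in this release):

from okb.llm.enrich import (
    approve_entity_async,
    list_pending_entities,
    reject_entity,
    shutdown_executor,
)

db_url = "postgresql://localhost/okb"  # placeholder
futures = []
for pe in list_pending_entities(db_url, limit=20):
    if (pe["confidence"] or 0) >= 0.9:  # arbitrary cutoff for illustration
        futures.append(approve_entity_async(db_url, str(pe["id"])))
    else:
        reject_entity(db_url, str(pe["id"]))

for f in futures:
    print(f.result())  # entity source_path, or None on failure

shutdown_executor()  # drain the background pool at end of session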