okb 1.1.0a0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- okb/cli.py +1083 -16
- okb/config.py +122 -4
- okb/http_server.py +293 -90
- okb/llm/analyze.py +524 -0
- okb/llm/consolidate.py +685 -0
- okb/llm/enrich.py +723 -0
- okb/llm/extractors/__init__.py +13 -0
- okb/llm/extractors/base.py +44 -0
- okb/llm/extractors/cross_doc.py +478 -0
- okb/llm/extractors/dedup.py +499 -0
- okb/llm/extractors/entity.py +369 -0
- okb/llm/extractors/todo.py +149 -0
- okb/llm/providers.py +9 -6
- okb/mcp_server.py +1036 -12
- okb/migrations/0008.enrichment.sql +46 -0
- okb/migrations/0009.entity-consolidation.sql +120 -0
- okb/migrations/0010.token-id.sql +7 -0
- okb/modal_llm.py +26 -8
- okb/plugins/sources/github.py +5 -5
- okb/tokens.py +25 -3
- {okb-1.1.0a0.dist-info → okb-1.1.1.dist-info}/METADATA +91 -8
- {okb-1.1.0a0.dist-info → okb-1.1.1.dist-info}/RECORD +24 -12
- {okb-1.1.0a0.dist-info → okb-1.1.1.dist-info}/WHEEL +0 -0
- {okb-1.1.0a0.dist-info → okb-1.1.1.dist-info}/entry_points.txt +0 -0
okb/llm/extractors/__init__.py
@@ -0,0 +1,13 @@
+"""Extractors for LLM-based document enrichment."""
+
+from .base import EnrichmentResult, ExtractedEntity, ExtractedTodo
+from .entity import extract_entities
+from .todo import extract_todos
+
+__all__ = [
+    "ExtractedTodo",
+    "ExtractedEntity",
+    "EnrichmentResult",
+    "extract_todos",
+    "extract_entities",
+]
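
The package root above re-exports the result dataclasses along with the single-document extractors. A minimal sketch of the intended import surface follows; extract_todos and extract_entities live in okb/llm/extractors/todo.py and entity.py, which are not part of this hunk, so the call shape shown here is an assumption rather than the released signature.

    # Assumed usage only: the real signatures of extract_todos/extract_entities
    # are defined in todo.py and entity.py (not shown in this diff hunk).
    from okb.llm.extractors import ExtractedTodo, extract_todos

    todos: list[ExtractedTodo] = extract_todos("TODO: review the 0009 migration before release")
    for todo in todos:
        print(todo.title, todo.priority, todo.confidence)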
okb/llm/extractors/base.py
@@ -0,0 +1,44 @@
+"""Base types for document enrichment extractors."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from datetime import datetime
+
+
+@dataclass
+class ExtractedTodo:
+    """A TODO item extracted from document content."""
+
+    title: str
+    content: str | None = None
+    due_date: datetime | None = None
+    priority: int | None = None  # 1-5, 1=highest
+    assignee: str | None = None
+    confidence: float = 1.0
+    source_context: str | None = None  # Text snippet where TODO was found
+
+
+@dataclass
+class ExtractedEntity:
+    """An entity extracted from document content."""
+
+    name: str
+    entity_type: str  # person, project, technology, concept, organization
+    aliases: list[str] = field(default_factory=list)
+    description: str | None = None
+    mentions: list[str] = field(default_factory=list)  # Context snippets
+    confidence: float = 1.0
+
+
+@dataclass
+class EnrichmentResult:
+    """Results from document enrichment."""
+
+    todos: list[ExtractedTodo] = field(default_factory=list)
+    entities: list[ExtractedEntity] = field(default_factory=list)
+
+    @property
+    def has_extractions(self) -> bool:
+        """Check if any extractions were made."""
+        return bool(self.todos or self.entities)
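
For orientation, a minimal hand-built sketch of how these result types compose (in the package they are produced by the extractors rather than constructed directly):

    from okb.llm.extractors import EnrichmentResult, ExtractedEntity, ExtractedTodo

    # Hand-constructed purely to illustrate the dataclasses above.
    result = EnrichmentResult(
        todos=[ExtractedTodo(title="Follow up on the release notes", priority=2)],
        entities=[ExtractedEntity(name="PostgreSQL", entity_type="technology", confidence=0.9)],
    )
    if result.has_extractions:
        print(f"{len(result.todos)} todos, {len(result.entities)} entities")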
okb/llm/extractors/cross_doc.py
@@ -0,0 +1,478 @@
+"""Cross-document entity detection - find mentions appearing in multiple documents."""
+
+from __future__ import annotations
+
+import json
+import re
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+from typing import Any
+
+import psycopg
+from psycopg.rows import dict_row
+
+from .base import ExtractedEntity
+
+
+@dataclass
+class CrossDocCandidate:
+    """A potential entity found across multiple documents."""
+
+    text: str  # Normalized mention text
+    document_ids: list[str]
+    document_count: int
+    sample_contexts: list[str]  # Sample text contexts where it appears
+    suggested_type: str | None = None
+    confidence: float = 0.0
+
+
+CLASSIFY_SYSTEM = """\
+You classify text mentions as named entities. You MUST respond with ONLY valid JSON, no other text.
+
+Entity types: person, project, technology, concept, organization, not_entity
+
+Required JSON format:
+{"classifications":[{"text":"Django","type":"technology","confidence":0.9}]}
+"""
+
+CLASSIFY_USER = """\
+Classify these mentions as entities. Reply with ONLY JSON, no explanation.
+
+{mentions}
+
+JSON format: {{"classifications":[{{"text":"...","type":"...","confidence":0.9}}]}}
+Types: person, project, technology, concept, organization, not_entity
+"""
+
+
+def find_cross_document_entities(
+    db_url: str,
+    min_documents: int = 3,
+    limit: int = 100,
+) -> list[CrossDocCandidate]:
+    """Find text mentions appearing in multiple documents but not extracted as entities.
+
+    Args:
+        db_url: Database URL
+        min_documents: Minimum documents a mention must appear in
+        limit: Maximum candidates to return
+
+    Returns:
+        List of CrossDocCandidate objects
+    """
+    candidates: list[CrossDocCandidate] = []
+
+    with psycopg.connect(db_url, row_factory=dict_row) as conn:
+        # Get existing entity names and aliases to exclude
+        existing_names = set()
+
+        # Get entity titles
+        results = conn.execute(
+            "SELECT LOWER(title) as name FROM documents WHERE source_type = 'entity'"
+        ).fetchall()
+        existing_names.update(r["name"] for r in results)
+
+        # Get aliases
+        results = conn.execute("SELECT LOWER(alias_text) as name FROM entity_aliases").fetchall()
+        existing_names.update(r["name"] for r in results)
+
+        # Get already-detected candidates
+        results = conn.execute(
+            "SELECT LOWER(text) as name FROM cross_doc_entity_candidates"
+        ).fetchall()
+        existing_detected = {r["name"] for r in results}
+
+        # Get documents (exclude derived documents)
+        docs = conn.execute(
+            """
+            SELECT id, content
+            FROM documents
+            WHERE source_path NOT LIKE '%%::todo/%%'
+              AND source_path NOT LIKE 'okb://entity/%%'
+              AND source_path NOT LIKE 'claude://%%'
+              AND content IS NOT NULL
+            LIMIT 1000
+            """
+        ).fetchall()
+
+        # Extract noun phrases and track document occurrences
+        mention_docs: dict[str, set[str]] = defaultdict(set)
+        mention_contexts: dict[str, list[str]] = defaultdict(list)
+
+        for doc in docs:
+            doc_id = str(doc["id"])
+            content = doc["content"]
+            phrases = _extract_noun_phrases(content)
+
+            for phrase, context in phrases:
+                normalized = phrase.lower().strip()
+                # Skip if too short, too long, or already exists
+                if len(normalized) < 2 or len(normalized) > 50:
+                    continue
+                if normalized in existing_names or normalized in existing_detected:
+                    continue
+                # Skip common words
+                if normalized in COMMON_WORDS:
+                    continue
+
+                mention_docs[normalized].add(doc_id)
+                if len(mention_contexts[normalized]) < 3:
+                    mention_contexts[normalized].append(context[:200])
+
+    # Filter to mentions appearing in min_documents
+    for text, doc_ids in mention_docs.items():
+        if len(doc_ids) >= min_documents:
+            candidates.append(
+                CrossDocCandidate(
+                    text=text,
+                    document_ids=list(doc_ids),
+                    document_count=len(doc_ids),
+                    sample_contexts=mention_contexts[text],
+                )
+            )
+
+    # Sort by document count and limit
+    candidates.sort(key=lambda c: c.document_count, reverse=True)
+    candidates = candidates[:limit]
+
+    return candidates
+
+
+def _extract_noun_phrases(text: str) -> list[tuple[str, str]]:
+    """Extract potential noun phrases from text using simple heuristics.
+
+    Returns list of (phrase, context) tuples.
+    """
+    phrases = []
+
+    # Pattern for capitalized phrases (likely proper nouns)
+    # Matches: "Amazon Web Services", "John Smith", "React.js"
+    cap_pattern = r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*(?:\.[a-z]+)?)"
+
+    # Pattern for tech terms with special chars
+    # Matches: "C++", "Node.js", "OpenAI", "PostgreSQL"
+    tech_pattern = r"\b([A-Z][a-zA-Z]+(?:\.[a-z]+|\+\+)?)\b"
+
+    # Combined patterns
+    for pattern in [cap_pattern, tech_pattern]:
+        for match in re.finditer(pattern, text):
+            phrase = match.group(1)
+            # Get surrounding context
+            start = max(0, match.start() - 50)
+            end = min(len(text), match.end() + 50)
+            context = text[start:end]
+            phrases.append((phrase, context))
+
+    return phrases
+
+
+# Common words to exclude (not named entities)
+COMMON_WORDS = {
+    # Articles, prepositions, conjunctions
+    "the", "this", "that", "with", "from", "have", "been", "were", "will",
+    "would", "could", "should", "these", "those", "then", "than", "when",
+    "where", "what", "which", "while", "other", "some", "most", "many",
+    "more", "each", "every", "both", "after", "before", "about", "over",
+    "under", "again", "once", "here", "there", "also", "just", "only",
+    "even", "still", "well", "back", "such", "very", "much", "into", "onto",
+    "upon", "within", "without", "between", "among", "through", "during",
+    "because", "since", "until", "unless", "although", "though", "however",
+    "therefore", "thus", "hence", "meanwhile", "otherwise", "instead",
+    "if", "for", "or", "and", "but", "nor", "yet", "so", "as", "at", "by",
+    "to", "in", "on", "of", "up", "no", "not", "any", "all", "few",
+    # Common verbs
+    "make", "made", "like", "need", "want", "take", "give", "find", "keep",
+    "put", "set", "get", "let", "say", "see", "use", "used", "using", "add",
+    "added", "adding", "run", "running", "try", "tried", "call", "called",
+    "show", "shown", "showing", "check", "checked", "checking", "include",
+    "included", "including", "contain", "contains", "provide", "provides",
+    "allow", "allows", "enable", "enables", "support", "supports", "handle",
+    "handles", "generate", "generated", "generating", "test", "tested",
+    "testing", "build", "building", "deploy", "deploying", "move", "moved",
+    "send", "sent", "receive", "received", "pass", "passed", "fail", "failed",
+    "complete", "completed", "finish", "finished", "done",
+    # Common nouns (generic)
+    "user", "users", "data", "file", "files", "code", "time", "work", "way",
+    "case", "cases", "point", "points", "part", "parts", "place", "thing",
+    "things", "name", "names", "number", "numbers", "type", "types", "list",
+    "lists", "line", "lines", "note", "notes", "example", "examples",
+    "section", "sections", "chapter", "page", "pages", "document", "documents",
+    "item", "items", "entry", "entries", "record", "records", "row", "rows",
+    "column", "columns", "table", "tables", "field", "fields", "form", "forms",
+    "view", "views", "model", "models", "schema", "index", "key", "keys",
+    "token", "tokens", "id", "ids", "url", "urls", "path", "paths",
+    # Programming terms (generic)
+    "function", "functions", "method", "methods", "class", "classes",
+    "object", "objects", "value", "values", "result", "results", "error",
+    "errors", "issue", "issues", "problem", "problems", "solution", "solutions",
+    "system", "systems", "process", "processes", "service", "services",
+    "server", "servers", "client", "clients", "request", "requests",
+    "response", "responses", "query", "queries", "update", "updates",
+    "delete", "deletes", "create", "creates", "read", "reads", "write",
+    "writes", "input", "inputs", "output", "outputs", "return", "returns",
+    "exception", "exceptions", "warning", "warnings", "message", "messages",
+    "event", "events", "action", "actions", "task", "tasks", "job", "jobs",
+    "api", "apis", "json", "xml", "html", "css", "sql", "http", "https",
+    "config", "configs", "setting", "settings", "option", "options",
+    "param", "params", "parameter", "parameters", "argument", "arguments",
+    "variable", "variables", "constant", "constants", "property", "properties",
+    "attribute", "attributes", "module", "modules", "package", "packages",
+    "library", "libraries", "framework", "frameworks", "tool", "tools",
+    "script", "scripts", "command", "commands", "handler", "handlers",
+    "callback", "callbacks", "hook", "hooks", "plugin", "plugins",
+    "context", "contexts", "state", "states", "status", "priority",
+    "level", "levels", "mode", "modes", "flag", "flags", "tag", "tags",
+    # Ordinals and quantifiers
+    "start", "end", "first", "last", "next", "previous", "begin", "final",
+    "new", "old", "good", "bad", "high", "low", "top", "bottom", "left",
+    "right", "true", "false", "yes", "none", "null", "undefined", "empty",
+    # Adjectives
+    "default", "custom", "main", "base", "core", "common", "standard",
+    "general", "specific", "local", "remote", "public", "private",
+    "internal", "external", "simple", "basic", "advanced", "current",
+    "available", "required", "optional", "important", "different", "same",
+    "similar", "related", "following", "above", "below", "existing",
+    "valid", "invalid", "active", "inactive", "enabled", "disabled",
+    "visible", "hidden", "open", "closed", "full", "partial", "total",
+    "rest", "remaining", "other", "another", "single", "multiple",
+    # Days and months
+    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday",
+    "sunday", "january", "february", "march", "april", "may", "june",
+    "july", "august", "september", "october", "november", "december",
+    "today", "tomorrow", "yesterday", "week", "month", "year",
+}
+
+
+def classify_candidates(
+    candidates: list[CrossDocCandidate],
+    db_url: str | None = None,
+    batch_size: int = 25,
+) -> list[ExtractedEntity]:
+    """Use LLM to classify cross-document candidates as entities.
+
+    Args:
+        candidates: List of candidates to classify
+        db_url: Database URL (for caching)
+        batch_size: Max candidates per LLM call (default 25 to avoid prompt length issues)
+
+    Returns:
+        List of ExtractedEntity objects for valid entities
+    """
+    if not candidates:
+        return []
+
+    from .. import complete
+
+    all_entities = []
+    candidate_map = {c.text.lower(): c for c in candidates}
+
+    # Process in batches to avoid prompt length issues with smaller models
+    for i in range(0, len(candidates), batch_size):
+        batch = candidates[i : i + batch_size]
+
+        # Build prompt for this batch
+        mention_lines = []
+        for c in batch:
+            mention_lines.append(f'- "{c.text}" (in {c.document_count} docs)')
+
+        prompt = CLASSIFY_USER.format(mentions="\n".join(mention_lines))
+
+        response = complete(prompt, system=CLASSIFY_SYSTEM, max_tokens=2048, use_cache=True)
+
+        if response is None:
+            continue
+
+        # Parse response
+        try:
+            content = response.content.strip()
+            if content.startswith("```"):
+                lines = content.split("\n")
+                content = "\n".join(lines[1:-1] if lines[-1] == "```" else lines[1:])
+            # Fix common LLM JSON errors: leading zeros (00.9 -> 0.9), trailing commas
+            content = re.sub(r":0+(\d)", r":\1", content)  # 00.9 -> 0.9
+            content = re.sub(r",\s*([}\]])", r"\1", content)  # trailing commas
+            data = json.loads(content)
+        except json.JSONDecodeError:
+            continue
+
+        # Handle both {"classifications": [...]} and direct list formats
+        if isinstance(data, list):
+            classifications = data
+        else:
+            classifications = data.get("classifications", [])
+
+        for cls in classifications:
+            text = cls.get("text", "")
+            entity_type = cls.get("type", "")
+            confidence = cls.get("confidence", 0.5)
+
+            if entity_type == "not_entity" or not entity_type:
+                continue
+
+            candidate = candidate_map.get(text.lower())
+            if candidate:
+                # Update candidate with classification
+                candidate.suggested_type = entity_type
+                candidate.confidence = confidence
+
+                all_entities.append(
+                    ExtractedEntity(
+                        name=text,
+                        entity_type=entity_type,
+                        confidence=confidence,
+                        mentions=candidate.sample_contexts[:3],
+                    )
+                )
+
+    return all_entities
+
+
+def store_candidates(db_url: str, candidates: list[CrossDocCandidate]) -> int:
+    """Store cross-document candidates in database.
+
+    Args:
+        db_url: Database URL
+        candidates: Candidates to store
+
+    Returns:
+        Number of candidates stored
+    """
+    stored = 0
+
+    with psycopg.connect(db_url) as conn:
+        for c in candidates:
+            try:
+                conn.execute(
+                    """
+                    INSERT INTO cross_doc_entity_candidates
+                        (text, document_ids, document_count, sample_contexts,
+                         suggested_type, confidence, status)
+                    VALUES (%s, %s, %s, %s, %s, %s, 'pending')
+                    ON CONFLICT (text) DO UPDATE SET
+                        document_ids = EXCLUDED.document_ids,
+                        document_count = EXCLUDED.document_count,
+                        sample_contexts = EXCLUDED.sample_contexts,
+                        suggested_type = EXCLUDED.suggested_type,
+                        confidence = EXCLUDED.confidence
+                    """,
+                    (
+                        c.text,
+                        c.document_ids,
+                        c.document_count,
+                        psycopg.types.json.Json(c.sample_contexts),
+                        c.suggested_type,
+                        c.confidence,
+                    ),
+                )
+                stored += 1
+            except Exception:
+                pass
+        conn.commit()
+
+    return stored
+
+
+def list_cross_doc_candidates(
+    db_url: str,
+    status: str = "pending",
+    limit: int = 50,
+) -> list[dict]:
+    """List cross-document entity candidates.
+
+    Returns list of dicts with candidate details.
+    """
+    with psycopg.connect(db_url, row_factory=dict_row) as conn:
+        results = conn.execute(
+            """
+            SELECT id, text, document_count, sample_contexts,
+                   suggested_type, confidence, status, created_at
+            FROM cross_doc_entity_candidates
+            WHERE status = %s
+            ORDER BY document_count DESC, confidence DESC
+            LIMIT %s
+            """,
+            (status, limit),
+        ).fetchall()
+        return [dict(r) for r in results]
+
+
+def approve_cross_doc_candidate(db_url: str, candidate_id: str) -> str | None:
+    """Approve a cross-doc candidate, creating it as a pending entity.
+
+    Returns the pending entity ID, or None if failed.
+    """
+    with psycopg.connect(db_url, row_factory=dict_row) as conn:
+        # Get candidate
+        candidate = conn.execute(
+            """
+            SELECT text, document_ids, sample_contexts, suggested_type, confidence
+            FROM cross_doc_entity_candidates
+            WHERE id = %s AND status = 'pending'
+            """,
+            (candidate_id,),
+        ).fetchone()
+
+        if not candidate:
+            return None
+
+        # Get first document as source
+        doc_ids = candidate["document_ids"]
+        if not doc_ids:
+            return None
+
+        source_doc = conn.execute(
+            "SELECT id FROM documents WHERE id = %s",
+            (doc_ids[0],),
+        ).fetchone()
+
+        if not source_doc:
+            return None
+
+        # Create pending entity
+        result = conn.execute(
+            """
+            INSERT INTO pending_entities
+                (source_document_id, entity_name, entity_type, mentions, confidence, status)
+            VALUES (%s, %s, %s, %s, %s, 'pending')
+            RETURNING id
+            """,
+            (
+                source_doc["id"],
+                candidate["text"],
+                candidate["suggested_type"] or "concept",
+                psycopg.types.json.Json(candidate["sample_contexts"]),
+                candidate["confidence"],
+            ),
+        ).fetchone()
+
+        # Mark candidate as approved
+        conn.execute(
+            """
+            UPDATE cross_doc_entity_candidates
+            SET status = 'approved', reviewed_at = NOW()
+            WHERE id = %s
+            """,
+            (candidate_id,),
+        )
+
+        conn.commit()
+        return str(result["id"]) if result else None
+
+
+def reject_cross_doc_candidate(db_url: str, candidate_id: str) -> bool:
+    """Reject a cross-doc candidate.
+
+    Returns True if successful.
+    """
+    with psycopg.connect(db_url) as conn:
+        result = conn.execute(
+            """
+            UPDATE cross_doc_entity_candidates
+            SET status = 'rejected', reviewed_at = NOW()
+            WHERE id = %s AND status = 'pending'
+            RETURNING id
+            """,
+            (candidate_id,),
+        ).fetchone()
+        conn.commit()
+        return result is not None