okb-1.1.0-py3-none-any.whl → okb-1.1.0a0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- okb/cli.py +16 -1083
- okb/config.py +4 -122
- okb/http_server.py +2 -163
- okb/llm/providers.py +6 -9
- okb/mcp_server.py +12 -1036
- okb/modal_llm.py +8 -26
- okb/plugins/sources/github.py +5 -5
- okb/tokens.py +3 -25
- {okb-1.1.0.dist-info → okb-1.1.0a0.dist-info}/METADATA +6 -83
- {okb-1.1.0.dist-info → okb-1.1.0a0.dist-info}/RECORD +12 -24
- okb/llm/analyze.py +0 -524
- okb/llm/consolidate.py +0 -685
- okb/llm/enrich.py +0 -723
- okb/llm/extractors/__init__.py +0 -13
- okb/llm/extractors/base.py +0 -44
- okb/llm/extractors/cross_doc.py +0 -478
- okb/llm/extractors/dedup.py +0 -499
- okb/llm/extractors/entity.py +0 -369
- okb/llm/extractors/todo.py +0 -149
- okb/migrations/0008.enrichment.sql +0 -46
- okb/migrations/0009.entity-consolidation.sql +0 -120
- okb/migrations/0010.token-id.sql +0 -7
- {okb-1.1.0.dist-info → okb-1.1.0a0.dist-info}/WHEEL +0 -0
- {okb-1.1.0.dist-info → okb-1.1.0a0.dist-info}/entry_points.txt +0 -0
okb/llm/extractors/entity.py
DELETED

```diff
@@ -1,369 +0,0 @@
-"""Entity extraction from document content using LLM."""
-
-from __future__ import annotations
-
-import json
-import re
-
-from .base import ExtractedEntity
-
-ENTITY_SYSTEM_PROMPT = """\
-You are an expert at identifying named entities in text for a PERSONAL knowledge base.
-Extract only entities that are specific to the author's context - things an LLM wouldn't know about.
-
-Entity types to extract:
-- person: People the author knows, works with, or references (colleagues, contacts, clients)
-- project: Specific named projects/products/codebases (e.g., "Acme Dashboard", "customer-portal")
-  NOT git branches, environments, or workflow stages
-- technology: ONLY obscure/niche tools or internal systems - NOT well-known technologies
-- organization: Specific companies, teams, clients the author works with
-
-DO NOT extract:
-- Well-known technologies: JSON, HTTP, SQL, Python, JavaScript, Docker, AWS, PostgreSQL, React, etc.
-  (The LLM already knows these - they add no value to a personal knowledge base)
-- Code symbols: function names, method calls, variables, class names
-- Generic terms: "user", "data", "system", "database", "API", "server", "client"
-- Git branches/workflow terms: main, master, develop, release, staging, production, feature, hotfix
-- Generic process terms: deploy, build, test, migration, setup, config
-- Environment names: dev, prod, qa, uat, local
-- Issue or bug descriptions - those are documents, not entities
-- Famous people, major companies (Google, Microsoft, etc.) unless contextually relevant to author
-
-ONLY extract entities that would help answer "Who/what is X?" where X is specific to this person.
-
-For each entity found, extract:
-- name: The canonical name (proper noun)
-- entity_type: One of: person, project, technology, organization
-- aliases: Other names/abbreviations (optional)
-- description: Brief description based on context (optional)
-- mentions: Text snippets where entity appears (max 3)
-- confidence: How confident you are (0.0-1.0)
-
-Return JSON array. Return empty array [] if no context-specific entities found.
-"""
-
-ENTITY_USER_PROMPT = """\
-Document title: {title}
-Source type: {source_type}
-
-Content:
-{content}
-
-Extract named entities as JSON array.
-"""
-
-
-def extract_entities(
-    content: str,
-    title: str,
-    source_type: str | None = None,
-    min_confidence: float = 0.8,
-) -> list[ExtractedEntity]:
-    """Extract entities from document content using LLM.
-
-    Args:
-        content: Document content to analyze
-        title: Document title for context
-        source_type: Type of document (optional)
-        min_confidence: Minimum confidence threshold (0-1)
-
-    Returns:
-        List of extracted entities
-    """
-    from .. import complete
-
-    # Truncate content if too long
-    if len(content) > 20000:
-        content = content[:20000] + "\n\n[... content truncated ...]"
-
-    prompt = ENTITY_USER_PROMPT.format(
-        title=title,
-        source_type=source_type or "unknown",
-        content=content,
-    )
-
-    response = complete(
-        prompt=prompt,
-        system=ENTITY_SYSTEM_PROMPT,
-        max_tokens=2048,
-        use_cache=True,
-    )
-
-    if response is None:
-        return []
-
-    return _parse_entity_response(response.content, min_confidence, title)
-
-
-def _looks_like_code(name: str) -> bool:
-    """Check if entity name looks like code."""
-    # Contains parentheses (function calls)
-    if "(" in name or ")" in name:
-        return True
-    # Snake_case with underscores (likely variable/function)
-    if "_" in name and name.islower():
-        return True
-    # Starts with lowercase and contains dots (method chain)
-    if name and name[0].islower() and "." in name:
-        return True
-    # CamelCase starting with lowercase (variable/method name)
-    if name and name[0].islower() and any(c.isupper() for c in name):
-        return True
-    return False
-
-
-# Well-known technologies that add no value to a personal knowledge base
-COMMON_TECHNOLOGIES = frozenset(
-    s.lower()
-    for s in [
-        # Data formats
-        "JSON",
-        "XML",
-        "YAML",
-        "CSV",
-        "HTML",
-        "CSS",
-        "Markdown",
-        # Protocols
-        "HTTP",
-        "HTTPS",
-        "REST",
-        "GraphQL",
-        "WebSocket",
-        "TCP",
-        "UDP",
-        "SSH",
-        "FTP",
-        "SMTP",
-        # Languages
-        "Python",
-        "JavaScript",
-        "TypeScript",
-        "Java",
-        "Go",
-        "Rust",
-        "C",
-        "C++",
-        "Ruby",
-        "PHP",
-        "Swift",
-        "Kotlin",
-        "Scala",
-        "Bash",
-        "Shell",
-        "SQL",
-        "Lua",
-        # Major frameworks/tools
-        "React",
-        "Vue",
-        "Angular",
-        "Node.js",
-        "Django",
-        "Flask",
-        "FastAPI",
-        "Rails",
-        "Spring",
-        "Express",
-        "Next.js",
-        # Databases
-        "PostgreSQL",
-        "MySQL",
-        "MongoDB",
-        "Redis",
-        "SQLite",
-        "Elasticsearch",
-        "DynamoDB",
-        # Cloud/infra
-        "AWS",
-        "Azure",
-        "GCP",
-        "Docker",
-        "Kubernetes",
-        "Linux",
-        "Windows",
-        "macOS",
-        "Nginx",
-        "Apache",
-        # Tools
-        "Git",
-        "GitHub",
-        "GitLab",
-        "npm",
-        "pip",
-        "Webpack",
-        "VS Code",
-        "Vim",
-        "Emacs",
-    ]
-)
-
-# Generic git/workflow/environment terms that are not context-specific
-GENERIC_TERMS = frozenset(
-    s.lower()
-    for s in [
-        # Git branches
-        "main",
-        "master",
-        "develop",
-        "development",
-        "release",
-        "staging",
-        "production",
-        "feature",
-        "hotfix",
-        "bugfix",
-        # Environments
-        "dev",
-        "prod",
-        "test",
-        "qa",
-        "uat",
-        "local",
-        "sandbox",
-        # Workflow/process terms
-        "deploy",
-        "build",
-        "migration",
-        "setup",
-        "config",
-        "configuration",
-        "rollback",
-        "rollout",
-        # Generic architectural terms
-        "frontend",
-        "backend",
-        "api",
-        "service",
-        "server",
-        "client",
-        "app",
-        "application",
-        "module",
-        "component",
-        "library",
-        "package",
-        "plugin",
-        "extension",
-        # Generic data terms
-        "database",
-        "cache",
-        "queue",
-        "worker",
-        "scheduler",
-        "cron",
-    ]
-)
-
-
-def _parse_entity_response(
-    response_text: str, min_confidence: float, title: str = ""
-) -> list[ExtractedEntity]:
-    """Parse LLM response into ExtractedEntity objects."""
-    # Try to extract JSON from response
-    json_match = re.search(r"\[.*\]", response_text, re.DOTALL)
-    if not json_match:
-        return []
-
-    try:
-        entities_data = json.loads(json_match.group())
-    except json.JSONDecodeError:
-        return []
-
-    if not isinstance(entities_data, list):
-        return []
-
-    valid_types = {"person", "project", "technology", "organization"}
-    entities = []
-
-    for item in entities_data:
-        if not isinstance(item, dict):
-            continue
-
-        name = item.get("name", "").strip()
-        entity_type = item.get("entity_type")
-
-        if not name or not isinstance(name, str):
-            continue
-        if not entity_type or entity_type not in valid_types:
-            continue
-
-        # Filter: too short or too long
-        if len(name) < 3 or len(name) > 80:
-            continue
-
-        # Filter: looks like code
-        if _looks_like_code(name):
-            continue
-
-        # Filter: well-known technologies (LLM already knows these)
-        if name.lower() in COMMON_TECHNOLOGIES:
-            continue
-
-        # Filter: generic git/workflow/environment terms
-        if name.lower() in GENERIC_TERMS:
-            continue
-
-        # Filter: matches document title (source shouldn't be extracted as entity)
-        if title and name.lower() == title.lower():
-            continue
-
-        # Get confidence (default to 0.85 if not specified)
-        confidence = item.get("confidence", 0.85)
-        if not isinstance(confidence, int | float):
-            confidence = 0.85
-
-        if confidence < min_confidence:
-            continue
-
-        # Parse aliases
-        aliases = item.get("aliases", [])
-        if not isinstance(aliases, list):
-            aliases = []
-        aliases = [a for a in aliases if isinstance(a, str)]
-
-        # Parse mentions
-        mentions = item.get("mentions", [])
-        if not isinstance(mentions, list):
-            mentions = []
-        mentions = [m for m in mentions if isinstance(m, str)][:3]
-
-        entities.append(
-            ExtractedEntity(
-                name=name,
-                entity_type=entity_type,
-                aliases=aliases,
-                description=item.get("description"),
-                mentions=mentions,
-                confidence=float(confidence),
-            )
-        )
-
-    return entities
-
-
-def normalize_entity_name(name: str) -> str:
-    """Normalize entity name for deduplication and URL generation.
-
-    Examples:
-        "John Smith" -> "john-smith"
-        "AWS (Amazon Web Services)" -> "aws-amazon-web-services"
-        "React.js" -> "react-js"
-    """
-    # Lowercase
-    normalized = name.lower()
-    # Replace non-alphanumeric with spaces
-    normalized = re.sub(r"[^a-z0-9\s]", " ", normalized)
-    # Collapse whitespace and replace with hyphens
-    normalized = re.sub(r"\s+", "-", normalized.strip())
-    # Remove leading/trailing hyphens
-    normalized = normalized.strip("-")
-    return normalized
-
-
-def entity_source_path(entity_type: str, name: str) -> str:
-    """Generate source_path for an entity document.
-
-    Format: okb://entity/{type}/{normalized-name}
-    """
-    normalized = normalize_entity_name(name)
-    return f"okb://entity/{entity_type}/{normalized}"
```
okb/llm/extractors/todo.py
DELETED

```diff
@@ -1,149 +0,0 @@
-"""TODO extraction from document content using LLM."""
-
-from __future__ import annotations
-
-import json
-import re
-from datetime import UTC, datetime
-
-from .base import ExtractedTodo
-
-TODO_SYSTEM_PROMPT = """\
-You are an expert at identifying action items and tasks in text.
-Extract TODO items from the given document content.
-
-Look for:
-- Explicit markers: TODO, FIXME, HACK, XXX, ACTION
-- Action phrases: "need to", "should", "must", "have to", "action item"
-- Deadlines and commitments: "by Friday", "before the meeting", "this week"
-- Questions implying needed work: "What about X?", "How do we handle Y?"
-- Incomplete items marked for follow-up
-
-For each TODO found, extract:
-- title: A concise description of the task (imperative form: "Fix the bug", not "The bug needs fixing")
-- content: Additional context or details (optional)
-- due_date: If a deadline is mentioned, in ISO format YYYY-MM-DD (optional)
-- priority: 1=urgent, 2=high, 3=normal, 4=low, 5=someday (optional)
-- assignee: Person responsible if mentioned (optional)
-- source_context: The exact text snippet where this TODO was found
-
-Return JSON array of extracted TODOs. Return empty array [] if none found.
-Be conservative - only extract clear action items, not vague mentions.
-"""
-
-TODO_USER_PROMPT = """\
-Document title: {title}
-Source type: {source_type}
-
-Content:
-{content}
-
-Extract all TODO items from this content as JSON array.
-"""
-
-
-def extract_todos(
-    content: str,
-    title: str,
-    source_type: str,
-    min_confidence: float = 0.7,
-) -> list[ExtractedTodo]:
-    """Extract TODO items from document content using LLM.
-
-    Args:
-        content: Document content to analyze
-        title: Document title for context
-        source_type: Type of document (markdown, code, org, etc.)
-        min_confidence: Minimum confidence threshold (0-1)
-
-    Returns:
-        List of extracted TODO items
-    """
-    from .. import complete
-
-    # Truncate content if too long (keep first ~20k chars for context)
-    if len(content) > 20000:
-        content = content[:20000] + "\n\n[... content truncated ...]"
-
-    prompt = TODO_USER_PROMPT.format(
-        title=title,
-        source_type=source_type,
-        content=content,
-    )
-
-    response = complete(
-        prompt=prompt,
-        system=TODO_SYSTEM_PROMPT,
-        max_tokens=2048,
-        use_cache=True,
-    )
-
-    if response is None:
-        return []
-
-    return _parse_todo_response(response.content, min_confidence)
-
-
-def _parse_todo_response(response_text: str, min_confidence: float) -> list[ExtractedTodo]:
-    """Parse LLM response into ExtractedTodo objects."""
-    # Try to extract JSON from response
-    json_match = re.search(r"\[.*\]", response_text, re.DOTALL)
-    if not json_match:
-        return []
-
-    try:
-        todos_data = json.loads(json_match.group())
-    except json.JSONDecodeError:
-        return []
-
-    if not isinstance(todos_data, list):
-        return []
-
-    todos = []
-    for item in todos_data:
-        if not isinstance(item, dict):
-            continue
-
-        title = item.get("title")
-        if not title or not isinstance(title, str):
-            continue
-
-        # Parse due_date if present
-        due_date = None
-        if due_str := item.get("due_date"):
-            try:
-                due_date = datetime.fromisoformat(due_str).replace(tzinfo=UTC)
-            except (ValueError, TypeError):
-                pass
-
-        # Parse priority
-        priority = None
-        if p := item.get("priority"):
-            try:
-                priority = int(p)
-                if priority < 1 or priority > 5:
-                    priority = None
-            except (ValueError, TypeError):
-                pass
-
-        # Get confidence (default to 0.8 if not specified)
-        confidence = item.get("confidence", 0.8)
-        if not isinstance(confidence, (int, float)):
-            confidence = 0.8
-
-        if confidence < min_confidence:
-            continue
-
-        todos.append(
-            ExtractedTodo(
-                title=title.strip(),
-                content=item.get("content"),
-                due_date=due_date,
-                priority=priority,
-                assignee=item.get("assignee"),
-                confidence=float(confidence),
-                source_context=item.get("source_context"),
-            )
-        )
-
-    return todos
```
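
The TODO extractor followed the same complete-then-parse pattern. A sketch under the same assumptions (installed 1.1.0 wheel, configured LLM backend; the checklist text is invented):

```python
# Hedged usage sketch of the removed okb.llm.extractors.todo module;
# the extract_todos signature is taken from the deleted source above.
from okb.llm.extractors.todo import extract_todos

checklist = (
    "TODO: rotate the staging API keys by 2024-06-14.\n"
    "Marta to confirm the cutover window before the Friday meeting."
)

todos = extract_todos(
    content=checklist,
    title="ops checklist",
    source_type="markdown",
    min_confidence=0.7,
)

for todo in todos:
    # due_date, when present, is parsed into a timezone-aware UTC datetime
    print(todo.title, todo.due_date, todo.priority, todo.assignee)
```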
okb/migrations/0008.enrichment.sql
DELETED

```diff
@@ -1,46 +0,0 @@
--- LLM enrichment for document annotation (TODOs and entities)
--- depends: 0006.llm-cache
-
--- Track enrichment state on documents
-ALTER TABLE documents ADD COLUMN IF NOT EXISTS enriched_at TIMESTAMPTZ;
-ALTER TABLE documents ADD COLUMN IF NOT EXISTS enrichment_version INTEGER;
-
--- Index for "needs enrichment" queries
-CREATE INDEX IF NOT EXISTS idx_documents_needs_enrichment
-    ON documents(enriched_at) WHERE enriched_at IS NULL;
-
--- Pending entity suggestions (before approval)
-CREATE TABLE IF NOT EXISTS pending_entities (
-    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-    source_document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
-    entity_name TEXT NOT NULL,
-    entity_type TEXT NOT NULL, -- person, project, technology, concept, organization
-    aliases JSONB DEFAULT '[]',
-    description TEXT,
-    mentions JSONB DEFAULT '[]', -- Context snippets from source document
-    confidence REAL,
-    status TEXT DEFAULT 'pending', -- pending, approved, rejected
-    created_at TIMESTAMPTZ DEFAULT NOW(),
-    reviewed_at TIMESTAMPTZ
-);
-
-CREATE INDEX IF NOT EXISTS idx_pending_entities_status ON pending_entities(status);
-CREATE INDEX IF NOT EXISTS idx_pending_entities_source ON pending_entities(source_document_id);
-CREATE INDEX IF NOT EXISTS idx_pending_entities_type ON pending_entities(entity_type);
-
--- Entity references (links entity documents to source documents)
--- When an entity is approved, it becomes a document with source_path like okb://entity/person/john-smith
--- This table tracks which documents mention each entity
-CREATE TABLE IF NOT EXISTS entity_refs (
-    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-    entity_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
-    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
-    mention_text TEXT NOT NULL,
-    context TEXT, -- Surrounding text for context
-    confidence REAL,
-    created_at TIMESTAMPTZ DEFAULT NOW(),
-    UNIQUE(entity_id, document_id, mention_text)
-);
-
-CREATE INDEX IF NOT EXISTS idx_entity_refs_entity ON entity_refs(entity_id);
-CREATE INDEX IF NOT EXISTS idx_entity_refs_document ON entity_refs(document_id);
```
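
Reverting 0008 also removes the query shape its partial index served. A sketch of the "needs enrichment" scan, assuming a psycopg 3 connection to the okb database (the connection string and limit are placeholders, not taken from the package):

```python
# Hypothetical "needs enrichment" scan over the dropped 0008 schema.
# idx_documents_needs_enrichment was a partial index on enriched_at IS NULL,
# so this predicate stays cheap even as the documents table grows.
import psycopg

with psycopg.connect("postgresql://localhost/okb") as conn:
    rows = conn.execute(
        "SELECT id FROM documents WHERE enriched_at IS NULL LIMIT 100"
    ).fetchall()
    print(f"{len(rows)} documents awaiting enrichment")
```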
okb/migrations/0009.entity-consolidation.sql
DELETED

```diff
@@ -1,120 +0,0 @@
--- Entity consolidation: deduplication, cross-doc detection, clustering, relationships
--- depends: 0008.enrichment
-
--- Canonical mappings: alias text -> entity document
--- Used for deduplication and alias resolution
-CREATE TABLE IF NOT EXISTS entity_aliases (
-    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-    alias_text TEXT NOT NULL,
-    entity_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
-    confidence REAL, -- How confident we are this alias belongs to entity
-    source TEXT DEFAULT 'manual', -- 'manual', 'merge', 'extraction'
-    created_at TIMESTAMPTZ DEFAULT NOW(),
-    UNIQUE(alias_text, entity_id)
-);
-
-CREATE INDEX IF NOT EXISTS idx_entity_aliases_text ON entity_aliases(LOWER(alias_text));
-CREATE INDEX IF NOT EXISTS idx_entity_aliases_entity ON entity_aliases(entity_id);
-
--- Proposed entity merges awaiting user confirmation
-CREATE TABLE IF NOT EXISTS pending_entity_merges (
-    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-    canonical_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
-    duplicate_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
-    confidence REAL NOT NULL, -- How confident we are these are the same
-    reason TEXT, -- Why we think they're the same ("embedding_similarity", "alias_match", "llm")
-    detected_at TIMESTAMPTZ DEFAULT NOW(),
-    status TEXT DEFAULT 'pending', -- 'pending', 'approved', 'rejected'
-    reviewed_at TIMESTAMPTZ,
-    UNIQUE(canonical_id, duplicate_id)
-);
-
-CREATE INDEX IF NOT EXISTS idx_pending_merges_status ON pending_entity_merges(status);
-CREATE INDEX IF NOT EXISTS idx_pending_merges_confidence ON pending_entity_merges(confidence DESC);
-
--- Entity-to-entity relationships
-CREATE TABLE IF NOT EXISTS entity_relationships (
-    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-    source_entity_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
-    target_entity_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
-    relationship_type TEXT NOT NULL, -- 'works_for', 'uses', 'belongs_to', 'related_to'
-    confidence REAL,
-    source TEXT DEFAULT 'extraction', -- 'extraction', 'manual'
-    context TEXT, -- Supporting context for the relationship
-    created_at TIMESTAMPTZ DEFAULT NOW(),
-    UNIQUE(source_entity_id, target_entity_id, relationship_type)
-);
-
-CREATE INDEX IF NOT EXISTS idx_entity_rel_source ON entity_relationships(source_entity_id);
-CREATE INDEX IF NOT EXISTS idx_entity_rel_target ON entity_relationships(target_entity_id);
-CREATE INDEX IF NOT EXISTS idx_entity_rel_type ON entity_relationships(relationship_type);
-
--- Topic clusters group related entities and documents
-CREATE TABLE IF NOT EXISTS topic_clusters (
-    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-    name TEXT NOT NULL,
-    description TEXT,
-    centroid vector(768), -- Cluster centroid embedding
-    member_count INTEGER DEFAULT 0,
-    created_at TIMESTAMPTZ DEFAULT NOW(),
-    updated_at TIMESTAMPTZ DEFAULT NOW()
-);
-
-CREATE INDEX IF NOT EXISTS idx_topic_clusters_centroid ON topic_clusters
-    USING hnsw (centroid vector_cosine_ops);
-
--- Cluster membership: entities and documents can belong to clusters
-CREATE TABLE IF NOT EXISTS topic_cluster_members (
-    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-    cluster_id UUID NOT NULL REFERENCES topic_clusters(id) ON DELETE CASCADE,
-    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
-    distance REAL, -- Distance from cluster centroid
-    is_entity BOOLEAN DEFAULT FALSE, -- True if document is an entity
-    added_at TIMESTAMPTZ DEFAULT NOW(),
-    UNIQUE(cluster_id, document_id)
-);
-
-CREATE INDEX IF NOT EXISTS idx_cluster_members_cluster ON topic_cluster_members(cluster_id);
-CREATE INDEX IF NOT EXISTS idx_cluster_members_document ON topic_cluster_members(document_id);
-
--- Proposed cluster merges awaiting confirmation
-CREATE TABLE IF NOT EXISTS pending_cluster_merges (
-    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-    primary_cluster_id UUID NOT NULL REFERENCES topic_clusters(id) ON DELETE CASCADE,
-    secondary_cluster_id UUID NOT NULL REFERENCES topic_clusters(id) ON DELETE CASCADE,
-    similarity REAL NOT NULL, -- How similar the clusters are
-    status TEXT DEFAULT 'pending', -- 'pending', 'approved', 'rejected'
-    detected_at TIMESTAMPTZ DEFAULT NOW(),
-    reviewed_at TIMESTAMPTZ,
-    UNIQUE(primary_cluster_id, secondary_cluster_id)
-);
-
-CREATE INDEX IF NOT EXISTS idx_pending_cluster_merges_status ON pending_cluster_merges(status);
-
--- Cross-document entity candidates: detected mentions not yet extracted as entities
-CREATE TABLE IF NOT EXISTS cross_doc_entity_candidates (
-    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-    text TEXT NOT NULL, -- The mention text (normalized)
-    document_ids UUID[] NOT NULL, -- Array of document IDs containing this mention
-    document_count INTEGER NOT NULL, -- Number of documents (for quick filtering)
-    sample_contexts JSONB DEFAULT '[]', -- Sample text contexts where it appears
-    suggested_type TEXT, -- Suggested entity type
-    confidence REAL,
-    status TEXT DEFAULT 'pending', -- 'pending', 'approved', 'rejected', 'exists'
-    created_at TIMESTAMPTZ DEFAULT NOW(),
-    reviewed_at TIMESTAMPTZ,
-    UNIQUE(text)
-);
-
-CREATE INDEX IF NOT EXISTS idx_cross_doc_status ON cross_doc_entity_candidates(status);
-CREATE INDEX IF NOT EXISTS idx_cross_doc_count ON cross_doc_entity_candidates(document_count DESC);
-
--- Track consolidation runs
-CREATE TABLE IF NOT EXISTS consolidation_runs (
-    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-    run_type TEXT NOT NULL, -- 'dedup', 'cross_doc', 'cluster', 'relationship', 'full'
-    started_at TIMESTAMPTZ DEFAULT NOW(),
-    completed_at TIMESTAMPTZ,
-    stats JSONB DEFAULT '{}', -- Run statistics
-    error TEXT -- Error message if failed
-);
```
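
The review workflow implied by 0009's status columns goes away with it. A hypothetical reviewer action against the dropped schema, again via psycopg 3; the table and column names come from the migration above, but the workflow itself is inferred:

```python
# Hypothetical reviewer action against the dropped pending_entity_merges
# table: approve the single highest-confidence pending merge.
import psycopg

with psycopg.connect("postgresql://localhost/okb") as conn:
    conn.execute(
        """
        UPDATE pending_entity_merges
           SET status = 'approved', reviewed_at = NOW()
         WHERE id = (SELECT id FROM pending_entity_merges
                      WHERE status = 'pending'
                      ORDER BY confidence DESC
                      LIMIT 1)
        """
    )
    conn.commit()
```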
okb/migrations/0010.token-id.sql
DELETED

{okb-1.1.0.dist-info → okb-1.1.0a0.dist-info}/WHEEL
File without changes

{okb-1.1.0.dist-info → okb-1.1.0a0.dist-info}/entry_points.txt
File without changes