okb 1.1.0a0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff shows the contents of publicly released versions of the package as published to its public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
@@ -0,0 +1,13 @@
+"""Extractors for LLM-based document enrichment."""
+
+from .base import EnrichmentResult, ExtractedEntity, ExtractedTodo
+from .entity import extract_entities
+from .todo import extract_todos
+
+__all__ = [
+    "ExtractedTodo",
+    "ExtractedEntity",
+    "EnrichmentResult",
+    "extract_todos",
+    "extract_entities",
+]
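For orientation, this first added file re-exports the result types next to the two extractor entry points, so callers need a single import. A hedged usage sketch (the `okb.extractors` import path and the exact `extract_todos`/`extract_entities` signatures are assumptions; the diff shows neither the file paths nor those two modules):

```python
# Hypothetical usage; the import path and extractor signatures are assumptions,
# since the diff omits file names and the entity/todo modules themselves.
from okb.extractors import EnrichmentResult, extract_entities, extract_todos

text = "TODO: ship the Q3 report. Discussed the migration with John Smith."
result = EnrichmentResult(todos=extract_todos(text), entities=extract_entities(text))
if result.has_extractions:
    print(f"{len(result.todos)} todos, {len(result.entities)} entities")
```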
@@ -0,0 +1,44 @@
+"""Base types for document enrichment extractors."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from datetime import datetime
+
+
+@dataclass
+class ExtractedTodo:
+    """A TODO item extracted from document content."""
+
+    title: str
+    content: str | None = None
+    due_date: datetime | None = None
+    priority: int | None = None  # 1-5, 1=highest
+    assignee: str | None = None
+    confidence: float = 1.0
+    source_context: str | None = None  # Text snippet where TODO was found
+
+
+@dataclass
+class ExtractedEntity:
+    """An entity extracted from document content."""
+
+    name: str
+    entity_type: str  # person, project, technology, concept, organization
+    aliases: list[str] = field(default_factory=list)
+    description: str | None = None
+    mentions: list[str] = field(default_factory=list)  # Context snippets
+    confidence: float = 1.0
+
+
+@dataclass
+class EnrichmentResult:
+    """Results from document enrichment."""
+
+    todos: list[ExtractedTodo] = field(default_factory=list)
+    entities: list[ExtractedEntity] = field(default_factory=list)
+
+    @property
+    def has_extractions(self) -> bool:
+        """Check if any extractions were made."""
+        return bool(self.todos or self.entities)
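These are plain dataclasses, so results can be built and inspected directly; `has_extractions` is simply a truthiness check over the two lists. A small sketch using only what this hunk defines:

```python
from datetime import datetime

todo = ExtractedTodo(
    title="Ship the Q3 report",
    due_date=datetime(2025, 9, 30),
    priority=1,  # 1-5, 1 = highest
    confidence=0.8,
    source_context="TODO: ship the Q3 report by end of September",
)
entity = ExtractedEntity(name="PostgreSQL", entity_type="technology", confidence=0.9)

result = EnrichmentResult(todos=[todo], entities=[entity])
assert result.has_extractions            # at least one list is non-empty
assert not EnrichmentResult().has_extractions
```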
@@ -0,0 +1,478 @@
+"""Cross-document entity detection - find mentions appearing in multiple documents."""
+
+from __future__ import annotations
+
+import json
+import re
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+from typing import Any
+
+import psycopg
+from psycopg.rows import dict_row
+
+from .base import ExtractedEntity
+
+
+@dataclass
+class CrossDocCandidate:
+    """A potential entity found across multiple documents."""
+
+    text: str  # Normalized mention text
+    document_ids: list[str]
+    document_count: int
+    sample_contexts: list[str]  # Sample text contexts where it appears
+    suggested_type: str | None = None
+    confidence: float = 0.0
+
+
+CLASSIFY_SYSTEM = """\
+You classify text mentions as named entities. You MUST respond with ONLY valid JSON, no other text.
+
+Entity types: person, project, technology, concept, organization, not_entity
+
+Required JSON format:
+{"classifications":[{"text":"Django","type":"technology","confidence":0.9}]}
+"""
+
+CLASSIFY_USER = """\
+Classify these mentions as entities. Reply with ONLY JSON, no explanation.
+
+{mentions}
+
+JSON format: {{"classifications":[{{"text":"...","type":"...","confidence":0.9}}]}}
+Types: person, project, technology, concept, organization, not_entity
+"""
+
+
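To make the prompt contract concrete, here is roughly how `classify_candidates` (further down) renders `CLASSIFY_USER`, and what a conforming reply looks like; the mention lines are illustrative:

```python
# Illustrative rendering of the user prompt; the mention lines are made up.
mentions = "\n".join([
    '- "PostgreSQL" (in 12 docs)',
    '- "weekly sync" (in 5 docs)',
])
prompt = CLASSIFY_USER.format(mentions=mentions)

# A conforming model reply is a single JSON object, e.g.:
# {"classifications":[
#   {"text":"PostgreSQL","type":"technology","confidence":0.95},
#   {"text":"weekly sync","type":"not_entity","confidence":0.8}
# ]}
```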
49
+ db_url: str,
50
+ min_documents: int = 3,
51
+ limit: int = 100,
52
+ ) -> list[CrossDocCandidate]:
53
+ """Find text mentions appearing in multiple documents but not extracted as entities.
54
+
55
+ Args:
56
+ db_url: Database URL
57
+ min_documents: Minimum documents a mention must appear in
58
+ limit: Maximum candidates to return
59
+
60
+ Returns:
61
+ List of CrossDocCandidate objects
62
+ """
63
+ candidates: list[CrossDocCandidate] = []
64
+
65
+ with psycopg.connect(db_url, row_factory=dict_row) as conn:
66
+ # Get existing entity names and aliases to exclude
67
+ existing_names = set()
68
+
69
+ # Get entity titles
70
+ results = conn.execute(
71
+ "SELECT LOWER(title) as name FROM documents WHERE source_type = 'entity'"
72
+ ).fetchall()
73
+ existing_names.update(r["name"] for r in results)
74
+
75
+ # Get aliases
76
+ results = conn.execute("SELECT LOWER(alias_text) as name FROM entity_aliases").fetchall()
77
+ existing_names.update(r["name"] for r in results)
78
+
79
+ # Get already-detected candidates
80
+ results = conn.execute(
81
+ "SELECT LOWER(text) as name FROM cross_doc_entity_candidates"
82
+ ).fetchall()
83
+ existing_detected = {r["name"] for r in results}
84
+
85
+ # Get documents (exclude derived documents)
86
+ docs = conn.execute(
87
+ """
88
+ SELECT id, content
89
+ FROM documents
90
+ WHERE source_path NOT LIKE '%%::todo/%%'
91
+ AND source_path NOT LIKE 'okb://entity/%%'
92
+ AND source_path NOT LIKE 'claude://%%'
93
+ AND content IS NOT NULL
94
+ LIMIT 1000
95
+ """
96
+ ).fetchall()
97
+
98
+ # Extract noun phrases and track document occurrences
99
+ mention_docs: dict[str, set[str]] = defaultdict(set)
100
+ mention_contexts: dict[str, list[str]] = defaultdict(list)
101
+
102
+ for doc in docs:
103
+ doc_id = str(doc["id"])
104
+ content = doc["content"]
105
+ phrases = _extract_noun_phrases(content)
106
+
107
+ for phrase, context in phrases:
108
+ normalized = phrase.lower().strip()
109
+ # Skip if too short, too long, or already exists
110
+ if len(normalized) < 2 or len(normalized) > 50:
111
+ continue
112
+ if normalized in existing_names or normalized in existing_detected:
113
+ continue
114
+ # Skip common words
115
+ if normalized in COMMON_WORDS:
116
+ continue
117
+
118
+ mention_docs[normalized].add(doc_id)
119
+ if len(mention_contexts[normalized]) < 3:
120
+ mention_contexts[normalized].append(context[:200])
121
+
122
+ # Filter to mentions appearing in min_documents
123
+ for text, doc_ids in mention_docs.items():
124
+ if len(doc_ids) >= min_documents:
125
+ candidates.append(
126
+ CrossDocCandidate(
127
+ text=text,
128
+ document_ids=list(doc_ids),
129
+ document_count=len(doc_ids),
130
+ sample_contexts=mention_contexts[text],
131
+ )
132
+ )
133
+
134
+ # Sort by document count and limit
135
+ candidates.sort(key=lambda c: c.document_count, reverse=True)
136
+ candidates = candidates[:limit]
137
+
138
+ return candidates
139
+
140
+
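Calling the detector is a single pass over the document table; a brief sketch (the connection string is a placeholder):

```python
# Placeholder DSN; candidates come back sorted by how many documents mention them.
candidates = find_cross_document_entities(
    "postgresql://localhost/okb",
    min_documents=3,  # mention must recur in at least 3 documents
    limit=100,
)
for c in candidates[:5]:
    print(c.text, c.document_count, c.sample_contexts[:1])
```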
+def _extract_noun_phrases(text: str) -> list[tuple[str, str]]:
+    """Extract potential noun phrases from text using simple heuristics.
+
+    Returns list of (phrase, context) tuples.
+    """
+    phrases = []
+
+    # Pattern for capitalized phrases (likely proper nouns)
+    # Matches: "Amazon Web Services", "John Smith", "React.js"
+    cap_pattern = r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*(?:\.[a-z]+)?)"
+
+    # Pattern for tech terms with special chars
+    # Matches: "C++", "Node.js", "OpenAI", "PostgreSQL"
+    tech_pattern = r"\b([A-Z][a-zA-Z]+(?:\.[a-z]+|\+\+)?)\b"
+
+    # Combined patterns
+    for pattern in [cap_pattern, tech_pattern]:
+        for match in re.finditer(pattern, text):
+            phrase = match.group(1)
+            # Get surrounding context
+            start = max(0, match.start() - 50)
+            end = min(len(text), match.end() + 50)
+            context = text[start:end]
+            phrases.append((phrase, context))
+
+    return phrases
+
+
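The two patterns are deliberately loose; they over-generate and rely on the COMMON_WORDS filter, the cross-document threshold, and the LLM pass to prune the noise. On a short sentence they behave like this:

```python
import re

cap_pattern = r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*(?:\.[a-z]+)?)"
tech_pattern = r"\b([A-Z][a-zA-Z]+(?:\.[a-z]+|\+\+)?)\b"

text = "Met with John Smith about moving billing to Amazon Web Services and React.js"
print([m.group(1) for m in re.finditer(cap_pattern, text)])
# ['Met', 'John Smith', 'Amazon Web Services', 'React.js']
print([m.group(1) for m in re.finditer(tech_pattern, text)])
# ['Met', 'John', 'Smith', 'Amazon', 'Web', 'Services', 'React.js']
```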
+# Common words to exclude (not named entities)
+COMMON_WORDS = {
+    # Articles, prepositions, conjunctions
+    "the", "this", "that", "with", "from", "have", "been", "were", "will",
+    "would", "could", "should", "these", "those", "then", "than", "when",
+    "where", "what", "which", "while", "other", "some", "most", "many",
+    "more", "each", "every", "both", "after", "before", "about", "over",
+    "under", "again", "once", "here", "there", "also", "just", "only",
+    "even", "still", "well", "back", "such", "very", "much", "into", "onto",
+    "upon", "within", "without", "between", "among", "through", "during",
+    "because", "since", "until", "unless", "although", "though", "however",
+    "therefore", "thus", "hence", "meanwhile", "otherwise", "instead",
+    "if", "for", "or", "and", "but", "nor", "yet", "so", "as", "at", "by",
+    "to", "in", "on", "of", "up", "no", "not", "any", "all", "few",
+    # Common verbs
+    "make", "made", "like", "need", "want", "take", "give", "find", "keep",
+    "put", "set", "get", "let", "say", "see", "use", "used", "using", "add",
+    "added", "adding", "run", "running", "try", "tried", "call", "called",
+    "show", "shown", "showing", "check", "checked", "checking", "include",
+    "included", "including", "contain", "contains", "provide", "provides",
+    "allow", "allows", "enable", "enables", "support", "supports", "handle",
+    "handles", "generate", "generated", "generating", "test", "tested",
+    "testing", "build", "building", "deploy", "deploying", "move", "moved",
+    "send", "sent", "receive", "received", "pass", "passed", "fail", "failed",
+    "complete", "completed", "finish", "finished", "done",
+    # Common nouns (generic)
+    "user", "users", "data", "file", "files", "code", "time", "work", "way",
+    "case", "cases", "point", "points", "part", "parts", "place", "thing",
+    "things", "name", "names", "number", "numbers", "type", "types", "list",
+    "lists", "line", "lines", "note", "notes", "example", "examples",
+    "section", "sections", "chapter", "page", "pages", "document", "documents",
+    "item", "items", "entry", "entries", "record", "records", "row", "rows",
+    "column", "columns", "table", "tables", "field", "fields", "form", "forms",
+    "view", "views", "model", "models", "schema", "index", "key", "keys",
+    "token", "tokens", "id", "ids", "url", "urls", "path", "paths",
+    # Programming terms (generic)
+    "function", "functions", "method", "methods", "class", "classes",
+    "object", "objects", "value", "values", "result", "results", "error",
+    "errors", "issue", "issues", "problem", "problems", "solution", "solutions",
+    "system", "systems", "process", "processes", "service", "services",
+    "server", "servers", "client", "clients", "request", "requests",
+    "response", "responses", "query", "queries", "update", "updates",
+    "delete", "deletes", "create", "creates", "read", "reads", "write",
+    "writes", "input", "inputs", "output", "outputs", "return", "returns",
+    "exception", "exceptions", "warning", "warnings", "message", "messages",
+    "event", "events", "action", "actions", "task", "tasks", "job", "jobs",
+    "api", "apis", "json", "xml", "html", "css", "sql", "http", "https",
+    "config", "configs", "setting", "settings", "option", "options",
+    "param", "params", "parameter", "parameters", "argument", "arguments",
+    "variable", "variables", "constant", "constants", "property", "properties",
+    "attribute", "attributes", "module", "modules", "package", "packages",
+    "library", "libraries", "framework", "frameworks", "tool", "tools",
+    "script", "scripts", "command", "commands", "handler", "handlers",
+    "callback", "callbacks", "hook", "hooks", "plugin", "plugins",
+    "context", "contexts", "state", "states", "status", "priority",
+    "level", "levels", "mode", "modes", "flag", "flags", "tag", "tags",
+    # Ordinals and quantifiers
+    "start", "end", "first", "last", "next", "previous", "begin", "final",
+    "new", "old", "good", "bad", "high", "low", "top", "bottom", "left",
+    "right", "true", "false", "yes", "none", "null", "undefined", "empty",
+    # Adjectives
+    "default", "custom", "main", "base", "core", "common", "standard",
+    "general", "specific", "local", "remote", "public", "private",
+    "internal", "external", "simple", "basic", "advanced", "current",
+    "available", "required", "optional", "important", "different", "same",
+    "similar", "related", "following", "above", "below", "existing",
+    "valid", "invalid", "active", "inactive", "enabled", "disabled",
+    "visible", "hidden", "open", "closed", "full", "partial", "total",
+    "rest", "remaining", "other", "another", "single", "multiple",
+    # Days and months
+    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday",
+    "sunday", "january", "february", "march", "april", "may", "june",
+    "july", "august", "september", "october", "november", "december",
+    "today", "tomorrow", "yesterday", "week", "month", "year",
+}
+
+
+def classify_candidates(
+    candidates: list[CrossDocCandidate],
+    db_url: str | None = None,
+    batch_size: int = 25,
+) -> list[ExtractedEntity]:
+    """Use LLM to classify cross-document candidates as entities.
+
+    Args:
+        candidates: List of candidates to classify
+        db_url: Database URL (for caching)
+        batch_size: Max candidates per LLM call (default 25 to avoid prompt length issues)
+
+    Returns:
+        List of ExtractedEntity objects for valid entities
+    """
+    if not candidates:
+        return []
+
+    from .. import complete
+
+    all_entities = []
+    candidate_map = {c.text.lower(): c for c in candidates}
+
+    # Process in batches to avoid prompt length issues with smaller models
+    for i in range(0, len(candidates), batch_size):
+        batch = candidates[i : i + batch_size]
+
+        # Build prompt for this batch
+        mention_lines = []
+        for c in batch:
+            mention_lines.append(f'- "{c.text}" (in {c.document_count} docs)')
+
+        prompt = CLASSIFY_USER.format(mentions="\n".join(mention_lines))
+
+        response = complete(prompt, system=CLASSIFY_SYSTEM, max_tokens=2048, use_cache=True)
+
+        if response is None:
+            continue
+
+        # Parse response
+        try:
+            content = response.content.strip()
+            if content.startswith("```"):
+                lines = content.split("\n")
+                content = "\n".join(lines[1:-1] if lines[-1] == "```" else lines[1:])
+            # Fix common LLM JSON errors: leading zeros (00.9 -> 0.9), trailing commas
+            content = re.sub(r":0+(\d)", r":\1", content)  # 00.9 -> 0.9
+            content = re.sub(r",\s*([}\]])", r"\1", content)  # trailing commas
+            data = json.loads(content)
+        except json.JSONDecodeError:
+            continue
+
+        # Handle both {"classifications": [...]} and direct list formats
+        if isinstance(data, list):
+            classifications = data
+        else:
+            classifications = data.get("classifications", [])
+
+        for cls in classifications:
+            text = cls.get("text", "")
+            entity_type = cls.get("type", "")
+            confidence = cls.get("confidence", 0.5)
+
+            if entity_type == "not_entity" or not entity_type:
+                continue
+
+            candidate = candidate_map.get(text.lower())
+            if candidate:
+                # Update candidate with classification
+                candidate.suggested_type = entity_type
+                candidate.confidence = confidence
+
+                all_entities.append(
+                    ExtractedEntity(
+                        name=text,
+                        entity_type=entity_type,
+                        confidence=confidence,
+                        mentions=candidate.sample_contexts[:3],
+                    )
+                )
+
+    return all_entities
+
+
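Before `json.loads`, the parser strips a Markdown code fence if present and then applies two regex repairs for frequent small-model mistakes: zero-padded numbers and trailing commas. The two substitutions in isolation:

```python
import json
import re

raw = '{"classifications":[{"text":"Django","type":"technology","confidence":00.9},]}'
fixed = re.sub(r":0+(\d)", r":\1", raw)       # "confidence":00.9 -> "confidence":0.9
fixed = re.sub(r",\s*([}\]])", r"\1", fixed)  # drop the trailing comma before ]
print(json.loads(fixed)["classifications"][0]["confidence"])  # 0.9
```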
+def store_candidates(db_url: str, candidates: list[CrossDocCandidate]) -> int:
+    """Store cross-document candidates in database.
+
+    Args:
+        db_url: Database URL
+        candidates: Candidates to store
+
+    Returns:
+        Number of candidates stored
+    """
+    stored = 0
+
+    with psycopg.connect(db_url) as conn:
+        for c in candidates:
+            try:
+                conn.execute(
+                    """
+                    INSERT INTO cross_doc_entity_candidates
+                    (text, document_ids, document_count, sample_contexts,
+                     suggested_type, confidence, status)
+                    VALUES (%s, %s, %s, %s, %s, %s, 'pending')
+                    ON CONFLICT (text) DO UPDATE SET
+                        document_ids = EXCLUDED.document_ids,
+                        document_count = EXCLUDED.document_count,
+                        sample_contexts = EXCLUDED.sample_contexts,
+                        suggested_type = EXCLUDED.suggested_type,
+                        confidence = EXCLUDED.confidence
+                    """,
+                    (
+                        c.text,
+                        c.document_ids,
+                        c.document_count,
+                        psycopg.types.json.Json(c.sample_contexts),
+                        c.suggested_type,
+                        c.confidence,
+                    ),
+                )
+                stored += 1
+            except Exception:
+                pass
+        conn.commit()
+
+    return stored
+
+
+def list_cross_doc_candidates(
+    db_url: str,
+    status: str = "pending",
+    limit: int = 50,
+) -> list[dict]:
+    """List cross-document entity candidates.
+
+    Returns list of dicts with candidate details.
+    """
+    with psycopg.connect(db_url, row_factory=dict_row) as conn:
+        results = conn.execute(
+            """
+            SELECT id, text, document_count, sample_contexts,
+                   suggested_type, confidence, status, created_at
+            FROM cross_doc_entity_candidates
+            WHERE status = %s
+            ORDER BY document_count DESC, confidence DESC
+            LIMIT %s
+            """,
+            (status, limit),
+        ).fetchall()
+        return [dict(r) for r in results]
+
+
+def approve_cross_doc_candidate(db_url: str, candidate_id: str) -> str | None:
+    """Approve a cross-doc candidate, creating it as a pending entity.
+
+    Returns the pending entity ID, or None if failed.
+    """
+    with psycopg.connect(db_url, row_factory=dict_row) as conn:
+        # Get candidate
+        candidate = conn.execute(
+            """
+            SELECT text, document_ids, sample_contexts, suggested_type, confidence
+            FROM cross_doc_entity_candidates
+            WHERE id = %s AND status = 'pending'
+            """,
+            (candidate_id,),
+        ).fetchone()
+
+        if not candidate:
+            return None
+
+        # Get first document as source
+        doc_ids = candidate["document_ids"]
+        if not doc_ids:
+            return None
+
+        source_doc = conn.execute(
+            "SELECT id FROM documents WHERE id = %s",
+            (doc_ids[0],),
+        ).fetchone()
+
+        if not source_doc:
+            return None
+
+        # Create pending entity
+        result = conn.execute(
+            """
+            INSERT INTO pending_entities
+            (source_document_id, entity_name, entity_type, mentions, confidence, status)
+            VALUES (%s, %s, %s, %s, %s, 'pending')
+            RETURNING id
+            """,
+            (
+                source_doc["id"],
+                candidate["text"],
+                candidate["suggested_type"] or "concept",
+                psycopg.types.json.Json(candidate["sample_contexts"]),
+                candidate["confidence"],
+            ),
+        ).fetchone()
+
+        # Mark candidate as approved
+        conn.execute(
+            """
+            UPDATE cross_doc_entity_candidates
+            SET status = 'approved', reviewed_at = NOW()
+            WHERE id = %s
+            """,
+            (candidate_id,),
+        )
+
+        conn.commit()
+        return str(result["id"]) if result else None
+
+
+def reject_cross_doc_candidate(db_url: str, candidate_id: str) -> bool:
+    """Reject a cross-doc candidate.
+
+    Returns True if successful.
+    """
+    with psycopg.connect(db_url) as conn:
+        result = conn.execute(
+            """
+            UPDATE cross_doc_entity_candidates
+            SET status = 'rejected', reviewed_at = NOW()
+            WHERE id = %s AND status = 'pending'
+            RETURNING id
+            """,
+            (candidate_id,),
+        ).fetchone()
+        conn.commit()
+        return result is not None
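Taken together, the new module is a small review pipeline: detect recurring mentions, classify them with the LLM, persist them as pending candidates, then approve or reject them from the review list. A hedged end-to-end sketch (placeholder DSN; the import path is left out since the diff omits file names):

```python
# End-to-end sketch of the candidate review flow defined above.
DB_URL = "postgresql://localhost/okb"  # placeholder

candidates = find_cross_document_entities(DB_URL, min_documents=3, limit=100)
classify_candidates(candidates, db_url=DB_URL)  # fills suggested_type/confidence in place
store_candidates(DB_URL, candidates)

for row in list_cross_doc_candidates(DB_URL, status="pending", limit=10):
    if row["suggested_type"] and row["confidence"] >= 0.8:
        approve_cross_doc_candidate(DB_URL, str(row["id"]))
    else:
        reject_cross_doc_candidate(DB_URL, str(row["id"]))
```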