okb 1.1.0a0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,499 @@
+ """Entity deduplication - find and merge duplicate entities."""
+
+ from __future__ import annotations
+
+ import json
+ from dataclasses import dataclass
+ from typing import Any
+
+ import psycopg
+ from psycopg.rows import dict_row
+
+
+ @dataclass
+ class EntityMergePair:
+     """A pair of entities that may be duplicates."""
+
+     canonical_id: str
+     canonical_name: str
+     canonical_type: str
+     duplicate_id: str
+     duplicate_name: str
+     duplicate_type: str
+     confidence: float
+     reason: str  # "embedding_similarity", "alias_match", "llm"
+
+
+ DEDUP_SYSTEM_PROMPT = """\
+ You are an expert at identifying duplicate entities. Given a list of entity names and types,
+ identify groups that refer to the same real-world entity.
+
+ Consider:
+ - Abbreviations: "AWS" and "Amazon Web Services" are the same
+ - Spelling variations: "React.js", "ReactJS", "React" are the same
+ - Case differences: "Python" and "python" are the same
+ - With/without suffixes: "Google Inc" and "Google" are the same
+
+ Return ONLY valid JSON with this structure:
+ {
+   "merge_groups": [
+     {
+       "canonical": "Full/preferred name",
+       "duplicates": ["alias1", "alias2"],
+       "confidence": 0.95,
+       "reason": "Brief explanation"
+     }
+   ]
+ }
+
+ If no duplicates are found, return: {"merge_groups": []}
+ """
+
+ DEDUP_USER_PROMPT = """\
+ Analyze these entities for duplicates:
+
+ {entity_list}
+
+ Group any entities that refer to the same thing.
+ """
+
+
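To make the contract concrete, here is how the user prompt gets rendered and what a well-formed reply looks like; the entity names below are illustrative only, not from the package:

    # Hypothetical rendering of the user prompt for two entities.
    entity_list = "\n".join([
        "- Amazon Web Services (organization)",
        "- AWS (organization)",
    ])
    prompt = DEDUP_USER_PROMPT.format(entity_list=entity_list)

    # A conforming model reply, per DEDUP_SYSTEM_PROMPT:
    # {"merge_groups": [{"canonical": "Amazon Web Services",
    #                    "duplicates": ["AWS"],
    #                    "confidence": 0.97,
    #                    "reason": "Standard abbreviation"}]}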
+ def find_duplicate_entities(
+     db_url: str,
+     similarity_threshold: float = 0.85,
+     use_llm: bool = True,
+     entity_type: str | None = None,
+     limit: int = 100,
+ ) -> list[EntityMergePair]:
+     """Find potential duplicate entities using embedding similarity and LLM.
+
+     Args:
+         db_url: Database URL
+         similarity_threshold: Minimum cosine similarity to consider as duplicate
+         use_llm: Whether to use LLM for batch deduplication
+         entity_type: Filter to specific entity type
+         limit: Maximum entities to analyze
+
+     Returns:
+         List of EntityMergePair objects representing potential duplicates
+     """
+     pairs: list[EntityMergePair] = []
+
+     # Phase 1: Embedding similarity
+     embedding_pairs = _find_by_embedding_similarity(
+         db_url, similarity_threshold, entity_type, limit
+     )
+     pairs.extend(embedding_pairs)
+
+     # Phase 2: Alias matching
+     alias_pairs = _find_by_alias_match(db_url, entity_type)
+     # Don't add if already found by embedding (track both orientations)
+     existing = {(p.canonical_id, p.duplicate_id) for p in pairs}
+     existing.update({(p.duplicate_id, p.canonical_id) for p in pairs})
+     for p in alias_pairs:
+         if (p.canonical_id, p.duplicate_id) not in existing:
+             pairs.append(p)
+             existing.add((p.canonical_id, p.duplicate_id))
+             existing.add((p.duplicate_id, p.canonical_id))
+
+     # Phase 3: LLM batch identification
+     if use_llm:
+         llm_pairs = _find_by_llm(db_url, entity_type, limit)
+         for p in llm_pairs:
+             if (p.canonical_id, p.duplicate_id) not in existing:
+                 pairs.append(p)
+                 existing.add((p.canonical_id, p.duplicate_id))
+                 existing.add((p.duplicate_id, p.canonical_id))
+
+     return pairs
+
+
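A minimal usage sketch of the three-phase pipeline; the DSN is a placeholder:

    pairs = find_duplicate_entities(
        "postgresql://localhost/okb",  # placeholder connection string
        similarity_threshold=0.9,
        use_llm=False,  # embedding and alias phases only
    )
    for p in pairs:
        print(f"{p.duplicate_name} -> {p.canonical_name} ({p.reason}, {p.confidence:.2f})")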
+ def _find_by_embedding_similarity(
+     db_url: str,
+     threshold: float,
+     entity_type: str | None,
+     limit: int,
+ ) -> list[EntityMergePair]:
+     """Find duplicates by comparing entity embeddings."""
+     pairs = []
+
+     with psycopg.connect(db_url, row_factory=dict_row) as conn:
+         from pgvector.psycopg import register_vector
+
+         register_vector(conn)
+
+         # Get entity documents with embeddings
+         sql = """
+             SELECT d.id, d.title, d.metadata->>'entity_type' as entity_type,
+                    (SELECT embedding FROM chunks WHERE document_id = d.id LIMIT 1) as embedding
+             FROM documents d
+             WHERE d.source_type = 'entity'
+         """
+         params: list[Any] = []
+
+         if entity_type:
+             sql += " AND d.metadata->>'entity_type' = %s"
+             params.append(entity_type)
+
+         sql += " LIMIT %s"
+         params.append(limit)
+
+         entities = conn.execute(sql, params).fetchall()
+
+         # Compare each pair
+         for i, e1 in enumerate(entities):
+             if e1["embedding"] is None:
+                 continue
+             for e2 in entities[i + 1 :]:
+                 if e2["embedding"] is None:
+                     continue
+
+                 # Calculate similarity
+                 result = conn.execute(
+                     "SELECT 1 - (%s::vector <=> %s::vector) as similarity",
+                     (e1["embedding"], e2["embedding"]),
+                 ).fetchone()
+                 similarity = result["similarity"]
+
+                 if similarity >= threshold:
+                     # Prefer longer/more complete name as canonical
+                     if len(e1["title"]) >= len(e2["title"]):
+                         canonical, duplicate = e1, e2
+                     else:
+                         canonical, duplicate = e2, e1
+
+                     pairs.append(
+                         EntityMergePair(
+                             canonical_id=str(canonical["id"]),
+                             canonical_name=canonical["title"],
+                             canonical_type=canonical["entity_type"] or "unknown",
+                             duplicate_id=str(duplicate["id"]),
+                             duplicate_name=duplicate["title"],
+                             duplicate_type=duplicate["entity_type"] or "unknown",
+                             confidence=similarity,
+                             reason="embedding_similarity",
+                         )
+                     )
+
+     return pairs
+
+
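The <=> operator is pgvector's cosine-distance operator, so 1 - distance is the cosine similarity the threshold is compared against. The same quantity can be reproduced off-database with numpy; a sketch, not part of the module:

    import numpy as np

    def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
        # Equivalent to SQL's 1 - (a <=> b) in pgvector
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    a = np.array([1.0, 0.0, 1.0])
    b = np.array([0.9, 0.1, 1.1])
    print(cosine_similarity(a, b))  # ~0.99, above the default 0.85 threshold

Note that the nested loop issues one similarity query per pair, i.e. O(n^2) round trips for n entities; tolerable at the default limit of 100, but worth folding into a single self-join query beyond that.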
+ def _find_by_alias_match(
+     db_url: str,
+     entity_type: str | None,
+ ) -> list[EntityMergePair]:
+     """Find duplicates where one entity's name matches another's alias."""
+     pairs = []
+
+     with psycopg.connect(db_url, row_factory=dict_row) as conn:
+         # Get entities with aliases
+         sql = """
+             SELECT d.id, d.title, d.metadata->>'entity_type' as entity_type,
+                    d.metadata->'aliases' as aliases
+             FROM documents d
+             WHERE d.source_type = 'entity'
+               AND d.metadata->'aliases' IS NOT NULL
+         """
+         params: list[Any] = []
+
+         if entity_type:
+             sql += " AND d.metadata->>'entity_type' = %s"
+             params.append(entity_type)
+
+         entities = conn.execute(sql, params).fetchall()
+
+         # Build alias -> entity mapping
+         alias_map: dict[str, dict] = {}
+         for e in entities:
+             aliases = e["aliases"] if isinstance(e["aliases"], list) else []
+             for alias in aliases:
+                 if isinstance(alias, str):
+                     normalized = alias.lower().strip()
+                     alias_map[normalized] = e
+
+         # Check if any entity name matches another's alias
+         for e in entities:
+             normalized_name = e["title"].lower().strip()
+             if normalized_name in alias_map:
+                 other = alias_map[normalized_name]
+                 if other["id"] != e["id"]:
+                     # Prefer the one with more aliases as canonical
+                     e_aliases = e["aliases"] if isinstance(e["aliases"], list) else []
+                     o_aliases = other["aliases"] if isinstance(other["aliases"], list) else []
+                     if len(o_aliases) >= len(e_aliases):
+                         canonical, duplicate = other, e
+                     else:
+                         canonical, duplicate = e, other
+
+                     pairs.append(
+                         EntityMergePair(
+                             canonical_id=str(canonical["id"]),
+                             canonical_name=canonical["title"],
+                             canonical_type=canonical["entity_type"] or "unknown",
+                             duplicate_id=str(duplicate["id"]),
+                             duplicate_name=duplicate["title"],
+                             duplicate_type=duplicate["entity_type"] or "unknown",
+                             confidence=0.9,
+                             reason="alias_match",
+                         )
+                     )
+
+     return pairs
+
+
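This pass assumes each entity document's metadata JSONB carries an aliases array next to entity_type, roughly like the following; an inferred example, not taken from the package:

    metadata = {
        "entity_type": "technology",
        "aliases": ["ReactJS", "React.js"],  # compared lowercased and stripped
    }
    # An entity titled "ReactJS" elsewhere in documents would then be
    # paired with this one by _find_by_alias_match at confidence 0.9.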
+ def _find_by_llm(
+     db_url: str,
+     entity_type: str | None,
+     limit: int,
+ ) -> list[EntityMergePair]:
+     """Use LLM to identify duplicate entities in batch."""
+     from .. import complete
+
+     pairs = []
+
+     with psycopg.connect(db_url, row_factory=dict_row) as conn:
+         sql = """
+             SELECT d.id, d.title, d.metadata->>'entity_type' as entity_type
+             FROM documents d
+             WHERE d.source_type = 'entity'
+         """
+         params: list[Any] = []
+
+         if entity_type:
+             sql += " AND d.metadata->>'entity_type' = %s"
+             params.append(entity_type)
+
+         sql += " ORDER BY d.title LIMIT %s"
+         params.append(limit)
+
+         entities = conn.execute(sql, params).fetchall()
+
+     if len(entities) < 2:
+         return []
+
+     # Build entity list for prompt
+     entity_lines = []
+     entity_map = {}
+     for e in entities:
+         entity_lines.append(f"- {e['title']} ({e['entity_type']})")
+         entity_map[e["title"].lower()] = e
+
+     prompt = DEDUP_USER_PROMPT.format(entity_list="\n".join(entity_lines))
+
+     response = complete(prompt, system=DEDUP_SYSTEM_PROMPT, max_tokens=2048, use_cache=True)
+
+     if response is None:
+         return []
+
+     # Parse response (tolerating a Markdown code fence around the JSON)
+     try:
+         content = response.content.strip()
+         if content.startswith("```"):
+             lines = content.split("\n")
+             content = "\n".join(lines[1:-1] if lines[-1] == "```" else lines[1:])
+         data = json.loads(content)
+     except json.JSONDecodeError:
+         return []
+
+     merge_groups = data.get("merge_groups", [])
+     for group in merge_groups:
+         canonical_name = group.get("canonical", "")
+         duplicates = group.get("duplicates", [])
+         confidence = group.get("confidence", 0.8)
+
+         canonical = entity_map.get(canonical_name.lower())
+         if not canonical:
+             continue
+
+         for dup_name in duplicates:
+             dup = entity_map.get(dup_name.lower())
+             if dup and dup["id"] != canonical["id"]:
+                 pairs.append(
+                     EntityMergePair(
+                         canonical_id=str(canonical["id"]),
+                         canonical_name=canonical["title"],
+                         canonical_type=canonical["entity_type"] or "unknown",
+                         duplicate_id=str(dup["id"]),
+                         duplicate_name=dup["title"],
+                         duplicate_type=dup["entity_type"] or "unknown",
+                         confidence=confidence,
+                         reason="llm",
+                     )
+                 )
+
+     return pairs
+
+
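The fence-stripping logic above accepts replies with or without a Markdown code fence. A hypothetical reply that parses cleanly:

    # Built with explicit newlines so the example survives copy-paste:
    reply = (
        "```json\n"
        '{"merge_groups": [{"canonical": "PostgreSQL",'
        ' "duplicates": ["postgres"],'
        ' "confidence": 0.93, "reason": "Common lowercase nickname"}]}\n'
        "```"
    )
    # Fence stripping drops the first and last lines, json.loads parses the
    # rest, and lowercased title lookups in entity_map resolve both
    # "PostgreSQL" and "postgres" back to their database rows.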
+ def create_pending_merge(db_url: str, pair: EntityMergePair) -> str | None:
+     """Create a pending merge proposal.
+
+     Returns the merge ID, or None if the pair already exists or the insert fails.
+     """
+     with psycopg.connect(db_url, row_factory=dict_row) as conn:
+         try:
+             result = conn.execute(
+                 """
+                 INSERT INTO pending_entity_merges
+                     (canonical_id, duplicate_id, confidence, reason, status)
+                 VALUES (%s, %s, %s, %s, 'pending')
+                 ON CONFLICT (canonical_id, duplicate_id) DO NOTHING
+                 RETURNING id
+                 """,
+                 (pair.canonical_id, pair.duplicate_id, pair.confidence, pair.reason),
+             ).fetchone()
+             conn.commit()
+             return str(result["id"]) if result else None
+         except Exception:
+             # Roll back so the context manager can't commit a partial transaction
+             conn.rollback()
+             return None
+
+
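The review-queue functions imply a table shaped roughly like this. The DDL below is reconstructed from the column references and the ON CONFLICT target in this module, not copied from the package's migrations, so treat every type and default as an assumption:

    # Inferred, hypothetical shape of the review table:
    PENDING_MERGES_DDL = """
    CREATE TABLE IF NOT EXISTS pending_entity_merges (
        id            uuid PRIMARY KEY DEFAULT gen_random_uuid(),
        canonical_id  uuid NOT NULL,
        duplicate_id  uuid NOT NULL,
        confidence    real NOT NULL,
        reason        text NOT NULL,
        status        text NOT NULL DEFAULT 'pending',
        detected_at   timestamptz NOT NULL DEFAULT now(),
        reviewed_at   timestamptz,
        UNIQUE (canonical_id, duplicate_id)
    )
    """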
+ def execute_merge(db_url: str, canonical_id: str, duplicate_id: str) -> bool:
+     """Execute a merge: redirect refs from duplicate to canonical, add alias, delete duplicate.
+
+     Args:
+         db_url: Database URL
+         canonical_id: ID of the entity to keep
+         duplicate_id: ID of the entity to merge into canonical
+
+     Returns:
+         True if merge succeeded, False otherwise
+     """
+     with psycopg.connect(db_url, row_factory=dict_row) as conn:
+         try:
+             # Get duplicate info for alias
+             duplicate = conn.execute(
+                 "SELECT title FROM documents WHERE id = %s AND source_type = 'entity'",
+                 (duplicate_id,),
+             ).fetchone()
+
+             if not duplicate:
+                 return False
+
+             # 1. Redirect all entity_refs from duplicate to canonical
+             conn.execute(
+                 """
+                 UPDATE entity_refs
+                 SET entity_id = %s
+                 WHERE entity_id = %s
+                 """,
+                 (canonical_id, duplicate_id),
+             )
+
+             # 2. Add duplicate's name as alias of canonical
+             conn.execute(
+                 """
+                 INSERT INTO entity_aliases (alias_text, entity_id, confidence, source)
+                 VALUES (%s, %s, 1.0, 'merge')
+                 ON CONFLICT (alias_text, entity_id) DO NOTHING
+                 """,
+                 (duplicate["title"], canonical_id),
+             )
+
+             # 3. Also copy any existing aliases from duplicate to canonical
+             conn.execute(
+                 """
+                 INSERT INTO entity_aliases (alias_text, entity_id, confidence, source)
+                 SELECT alias_text, %s, confidence, 'merge'
+                 FROM entity_aliases WHERE entity_id = %s
+                 ON CONFLICT (alias_text, entity_id) DO NOTHING
+                 """,
+                 (canonical_id, duplicate_id),
+             )
+
+             # 4. Delete duplicate entity document (cascades to chunks)
+             conn.execute(
+                 "DELETE FROM documents WHERE id = %s",
+                 (duplicate_id,),
+             )
+
+             conn.commit()
+             return True
+
+         except Exception:
+             # Roll back so all four steps stay atomic: none apply unless all do
+             conn.rollback()
+             return False
+
+
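For pipelines that trust high-confidence matches, the detection and merge halves compose directly. A sketch with an illustrative cutoff and placeholder DSN:

    DB = "postgresql://localhost/okb"  # placeholder DSN
    AUTO_MERGE_AT = 0.97  # illustrative; tune against reviewed outcomes

    for p in find_duplicate_entities(DB):
        if p.confidence >= AUTO_MERGE_AT:
            execute_merge(DB, p.canonical_id, p.duplicate_id)
        else:
            create_pending_merge(DB, p)  # queue the rest for human review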
+ def approve_merge(db_url: str, merge_id: str) -> bool:
+     """Approve and execute a pending merge.
+
+     Returns True if successful.
+     """
+     with psycopg.connect(db_url, row_factory=dict_row) as conn:
+         # Get merge details
+         merge = conn.execute(
+             """
+             SELECT canonical_id, duplicate_id
+             FROM pending_entity_merges
+             WHERE id = %s AND status = 'pending'
+             """,
+             (merge_id,),
+         ).fetchone()
+
+         if not merge:
+             return False
+
+         # Execute the merge (it opens its own connection)
+         if execute_merge(db_url, str(merge["canonical_id"]), str(merge["duplicate_id"])):
+             # Mark as approved
+             conn.execute(
+                 """
+                 UPDATE pending_entity_merges
+                 SET status = 'approved', reviewed_at = NOW()
+                 WHERE id = %s
+                 """,
+                 (merge_id,),
+             )
+             conn.commit()
+             return True
+
+     return False
+
+
+ def reject_merge(db_url: str, merge_id: str) -> bool:
+     """Reject a pending merge.
+
+     Returns True if successful.
+     """
+     with psycopg.connect(db_url) as conn:
+         result = conn.execute(
+             """
+             UPDATE pending_entity_merges
+             SET status = 'rejected', reviewed_at = NOW()
+             WHERE id = %s AND status = 'pending'
+             RETURNING id
+             """,
+             (merge_id,),
+         ).fetchone()
+         conn.commit()
+         return result is not None
+
+
+ def list_pending_merges(
+     db_url: str,
+     limit: int = 50,
+ ) -> list[dict]:
+     """List pending entity merge proposals.
+
+     Returns list of dicts with merge details.
+     """
+     with psycopg.connect(db_url, row_factory=dict_row) as conn:
+         results = conn.execute(
+             """
+             SELECT
+                 m.id, m.confidence, m.reason, m.detected_at,
+                 c.id as canonical_id, c.title as canonical_name,
+                 c.metadata->>'entity_type' as canonical_type,
+                 d.id as duplicate_id, d.title as duplicate_name,
+                 d.metadata->>'entity_type' as duplicate_type
+             FROM pending_entity_merges m
+             JOIN documents c ON c.id = m.canonical_id
+             JOIN documents d ON d.id = m.duplicate_id
+             WHERE m.status = 'pending'
+             ORDER BY m.confidence DESC, m.detected_at DESC
+             LIMIT %s
+             """,
+             (limit,),
+         ).fetchall()
+         return [dict(r) for r in results]
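
Putting the review workflow together, a minimal loop over the pending queue might look like this; a sketch only, with a placeholder DSN:

    DB = "postgresql://localhost/okb"  # placeholder DSN

    for m in list_pending_merges(DB, limit=20):
        print(f"{m['duplicate_name']} -> {m['canonical_name']} "
              f"({m['reason']}, {m['confidence']:.2f})")
        answer = input("merge? [y/N] ").strip().lower()
        if answer == "y":
            approve_merge(DB, str(m["id"]))   # executes the merge, marks approved
        else:
            reject_merge(DB, str(m["id"]))    # marks rejected, entities untouched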