mcal-ai 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1466 @@
1
+ """
2
+ Unified Graph Extractor
3
+
4
+ Single-pass extraction that captures all memory types in one LLM call:
5
+ - Entities (WHO, WHAT)
6
+ - Relationships (CONNECTIONS)
7
+ - Decisions (WHY)
8
+ - Next Actions (NEXT-STEPS)
9
+
10
+ Design Goals:
11
+ - Reduce from 6 LLM calls to 1-2 calls
12
+ - Target: 50-80x overhead vs 220x current
13
+ - Maintain same memory quality
14
+
15
+ Token Optimization Strategies:
16
+ 1. Single comprehensive extraction prompt
17
+ 2. Compact JSON schema (short keys)
18
+ 3. Reference-based relationships (use IDs, not full text)
19
+ 4. Delta extraction for multi-turn (only process new messages)
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import base64
25
+ import json
26
+ import hashlib
27
+ import logging
28
+ from datetime import datetime, timezone
29
+ from typing import Optional, Protocol
30
+ from dataclasses import dataclass, field
31
+ from enum import Enum
32
+
33
+ from pydantic import BaseModel, Field
34
+
35
+
36
+ def _utc_now() -> datetime:
37
+ """Return current UTC time (timezone-aware)."""
38
+ return datetime.now(timezone.utc)
39
+
40
+
41
# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
42
+
43
+
44
+ # =============================================================================
45
+ # Compact Data Models (Optimized for Token Efficiency)
46
+ # =============================================================================
47
+
48
class NodeType(str, Enum):
    """Node types in the unified graph.

    Values are single characters so serialized graphs stay compact
    (they appear under the "t" key produced by GraphNode.to_dict()).
    """
    PERSON = "P"        # WHO - people, users, stakeholders
    THING = "T"         # WHAT - products, tools, technologies
    CONCEPT = "C"       # WHAT - ideas, approaches, methodologies
    GOAL = "G"          # WHAT - objectives, targets
    DECISION = "D"      # WHY - choices made with rationale
    ACTION = "A"        # NEXT - tasks, next steps
56
+
57
+
58
class DeduplicationStrategy(str, Enum):
    """Strategy used by UnifiedGraph to detect duplicate nodes on insert."""
    EXACT_ONLY = "exact"        # Only exact label matches (case-insensitive)
    SEMANTIC = "semantic"       # Use embedding similarity only
    HYBRID = "hybrid"           # Exact first, then semantic (default)
    NONE = "none"               # No deduplication
64
+
65
+
66
@dataclass
class DeduplicationStats:
    """Counters describing how node deduplication behaved.

    nodes_added / nodes_merged record add_node() outcomes; exact_matches /
    semantic_matches break merges down by how the duplicate was detected.
    """
    nodes_added: int = 0
    nodes_merged: int = 0
    exact_matches: int = 0
    semantic_matches: int = 0

    @property
    def total_operations(self) -> int:
        """Total node insertions attempted (added plus merged)."""
        return self.nodes_merged + self.nodes_added

    @property
    def dedup_ratio(self) -> float:
        """Fraction of operations that merged into an existing node; 0.0 when idle."""
        total = self.total_operations
        return self.nodes_merged / total if total else 0.0

    def reset(self) -> None:
        """Zero every counter."""
        self.nodes_added = self.nodes_merged = 0
        self.exact_matches = self.semantic_matches = 0
91
+
92
+
93
class EdgeType(str, Enum):
    """Relationship types; the compact string values appear under the
    "r" key in serialized edges (GraphEdge.to_dict())."""
    USES = "uses"               # Person/Goal uses Thing
    WANTS = "wants"             # Person wants Goal
    CHOSE = "chose"             # Person/Goal chose Decision
    OVER = "over"               # Decision over Alternative (with pros/cons attrs)
    BECAUSE = "because"         # Decision because Reason (with source/confidence attrs)
    ENABLES = "enables"         # Thing/Decision enables Goal
    BLOCKS = "blocks"           # Thing blocks Goal
    NEXT = "next"               # Current state leads to Action
    RELATES = "relates"         # General relationship
    PART_OF = "part_of"         # Hierarchical containment
    # NEW in Unified Deep v2
    DEPENDS_ON = "depends_on"   # Task/Goal A requires B to complete first
    CONFLICTS = "conflicts"     # Goal X conflicts with Goal Y
    SUPERSEDES = "supersedes"   # Decision D2 replaces/updates D1
109
+
110
+
111
@dataclass
class GraphNode:
    """
    Compact graph node keyed by a short ID such as "P1", "T2", "D3".

    Serialization uses single-letter keys ("t", "l", "a", "e") to keep
    persisted graphs small. The embedding is raw Float16 bytes in memory
    and base64 text when serialized (Issue #50): 8x compression with no
    measured loss in search accuracy.
    """
    id: str                             # Short ID: "P1", "T2", "D3"
    type: NodeType
    label: str                          # Short label: "PostgreSQL", "fraud detection"
    attrs: dict = field(default_factory=dict)   # Optional attributes
    embedding: Optional[bytes] = None   # Float16 binary embedding (384 dim = 768 bytes)

    def to_dict(self) -> dict:
        """Serialize to the compact single-letter-key dict form."""
        payload = {"id": self.id, "t": self.type.value, "l": self.label}
        if self.attrs:
            payload["a"] = self.attrs
        if self.embedding:
            # Binary embedding travels as base64 ASCII inside JSON.
            payload["e"] = base64.b64encode(self.embedding).decode('ascii')
        return payload

    @classmethod
    def from_dict(cls, d: dict) -> "GraphNode":
        """Inverse of to_dict(); optional keys ("a", "e") may be absent."""
        return cls(
            id=d["id"],
            type=NodeType(d["t"]),
            label=d["l"],
            attrs=d.get("a", {}),
            embedding=base64.b64decode(d["e"]) if "e" in d else None,
        )
150
+
151
+
152
@dataclass
class GraphEdge:
    """Compact directed edge: src --rel--> dst, with optional attributes."""
    src: str        # Source node ID
    dst: str        # Destination node ID
    rel: EdgeType   # Relationship type
    attrs: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize using single-letter keys ("s", "d", "r", "a")."""
        payload = {"s": self.src, "d": self.dst, "r": self.rel.value}
        if self.attrs:
            payload["a"] = self.attrs
        return payload

    @classmethod
    def from_dict(cls, d: dict) -> "GraphEdge":
        """Inverse of to_dict(); "a" may be absent."""
        return cls(
            src=d["s"],
            dst=d["d"],
            rel=EdgeType(d["r"]),
            attrs=d.get("a", {}),
        )
176
+
177
+
178
@dataclass
class UnifiedGraph:
    """
    Compact unified graph containing all memory types.

    This single structure replaces:
    - IntentGraph (goals, tasks)
    - DecisionTrail (decisions, alternatives, rationale)
    - Facts (entities, relationships)
    - NextActions (follow-ups)

    Deduplication (Issue #52):
    - Hybrid strategy: exact match first, then semantic similarity
    - Configurable threshold (default 0.80)
    - Stats tracking for debugging
    """
    # node_id -> node
    nodes: dict[str, GraphNode] = field(default_factory=dict)
    # Directed edges; duplicates are not prevented by add_edge()
    edges: list[GraphEdge] = field(default_factory=list)
    # Free-form graph-level metadata, serialized under key "m"
    metadata: dict = field(default_factory=dict)

    # Deduplication configuration (adjust via configure_dedup())
    _dedup_strategy: DeduplicationStrategy = field(default=DeduplicationStrategy.HYBRID)
    _similarity_threshold: float = field(default=0.80)
    _dedup_stats: DeduplicationStats = field(default_factory=DeduplicationStats)

    # Internal indexes (rebuilt on demand)
    # normalized_label -> [node_ids]; maintained by _index_node_label()/_rebuild_label_index()
    _label_index: dict[str, list[str]] = field(default_factory=dict)
    # Lazy-built by build_vector_index(); None until the first build
    _vector_index: Optional["VectorIndex"] = field(default=None)
206
+
207
+ def __post_init__(self):
208
+ """Initialize internal indexes."""
209
+ # Rebuild label index from existing nodes
210
+ self._rebuild_label_index()
211
+
212
+ def _rebuild_label_index(self) -> None:
213
+ """Rebuild the label index from current nodes."""
214
+ self._label_index = {}
215
+ for node in self.nodes.values():
216
+ self._index_node_label(node)
217
+
218
+ def _index_node_label(self, node: GraphNode) -> None:
219
+ """Add node to label index."""
220
+ normalized = self._normalize_label(node.label)
221
+ if normalized not in self._label_index:
222
+ self._label_index[normalized] = []
223
+ if node.id not in self._label_index[normalized]:
224
+ self._label_index[normalized].append(node.id)
225
+
226
+ @staticmethod
227
+ def _normalize_label(label: str) -> str:
228
+ """Normalize label for exact matching (lowercase, stripped)."""
229
+ return label.lower().strip()
230
+
231
+ # ==========================================================================
232
+ # Deduplication Methods (Issue #52)
233
+ # ==========================================================================
234
+
235
    def find_duplicate(
        self,
        node: GraphNode,
        strategy: Optional[DeduplicationStrategy] = None
    ) -> Optional[str]:
        """
        Find an existing node that duplicates the given one.

        A candidate only counts as a duplicate when its NodeType matches,
        so e.g. a THING and a CONCEPT with the same label stay distinct.

        Args:
            node: Node to check for duplicates
            strategy: Override default strategy (optional)

        Returns:
            node_id of existing duplicate, or None
        """
        strategy = strategy or self._dedup_strategy

        if strategy == DeduplicationStrategy.NONE:
            return None

        # Phase 1: Exact match (O(1) hash lookup on the normalized label)
        if strategy in (DeduplicationStrategy.EXACT_ONLY, DeduplicationStrategy.HYBRID):
            normalized = self._normalize_label(node.label)
            if normalized in self._label_index:
                for existing_id in self._label_index[normalized]:
                    existing = self.nodes.get(existing_id)
                    # Same normalized label AND same type -> duplicate
                    if existing and existing.type == node.type:
                        return existing_id

        # Phase 2: Semantic match (requires an embedding on the node AND a
        # previously built vector index; silently skipped otherwise)
        if strategy in (DeduplicationStrategy.SEMANTIC, DeduplicationStrategy.HYBRID):
            if node.embedding and self._vector_index:
                # k=3: only the closest few neighbors are plausible duplicates
                results = self._vector_index.search(node.embedding, k=3)
                for existing_id, score in results:
                    if score >= self._similarity_threshold:
                        existing = self.nodes.get(existing_id)
                        if existing and existing.type == node.type:
                            return existing_id

        return None
275
+
276
+ def _merge_node(self, existing_id: str, new_node: GraphNode) -> None:
277
+ """
278
+ Merge new node data into existing node.
279
+
280
+ - Merges attributes (new values override existing)
281
+ - Updates embedding if new one is provided
282
+ - Preserves original ID and type
283
+ """
284
+ existing = self.nodes[existing_id]
285
+
286
+ # Merge attributes (new values override)
287
+ for key, value in new_node.attrs.items():
288
+ existing.attrs[key] = value
289
+
290
+ # Update embedding if new one is provided
291
+ if new_node.embedding:
292
+ existing.embedding = new_node.embedding
293
+ # Update vector index
294
+ if self._vector_index:
295
+ # Note: FAISS doesn't support update, so we accept stale index
296
+ # It will be rebuilt on next build_vector_index() call
297
+ pass
298
+
299
    def add_node(
        self,
        node: GraphNode,
        deduplicate: bool = True,
        generate_embedding: bool = False
    ) -> str:
        """
        Add node with optional deduplication.

        When a duplicate is found, the incoming node is folded into the
        existing one (_merge_node) and the incoming node's own ID is
        discarded - always use the returned ID.

        Args:
            node: Node to add
            deduplicate: Whether to check for duplicates (default True)
            generate_embedding: Whether to generate embedding if missing

        Returns:
            Actual node ID used (may be existing duplicate's ID)
        """
        # Generate embedding if requested and missing (lazy import keeps the
        # embedding dependency out of the module import path)
        if generate_embedding and node.embedding is None:
            from .embeddings import EmbeddingService
            embedder = EmbeddingService()
            node.embedding = embedder.embed_node(node)

        # Check for duplicates
        if deduplicate and self._dedup_strategy != DeduplicationStrategy.NONE:
            existing_id = self.find_duplicate(node)
            if existing_id:
                # Merge into existing node
                self._merge_node(existing_id, node)
                self._dedup_stats.nodes_merged += 1
                # Track match type: if the matched node sits in the label
                # bucket for this label, it was an exact match; otherwise the
                # match must have come from the semantic phase.
                normalized = self._normalize_label(node.label)
                if normalized in self._label_index and existing_id in self._label_index.get(normalized, []):
                    self._dedup_stats.exact_matches += 1
                else:
                    self._dedup_stats.semantic_matches += 1
                return existing_id

        # Add as new node
        self.nodes[node.id] = node
        self._index_node_label(node)
        self._dedup_stats.nodes_added += 1

        # Add to vector index if one has already been built (otherwise the
        # embedding is picked up by the next build_vector_index() call)
        if node.embedding and self._vector_index:
            self._vector_index.add(node.id, node.embedding)

        return node.id
347
+
348
+ @property
349
+ def dedup_stats(self) -> DeduplicationStats:
350
+ """Get deduplication statistics."""
351
+ return self._dedup_stats
352
+
353
+ def configure_dedup(
354
+ self,
355
+ strategy: DeduplicationStrategy = DeduplicationStrategy.HYBRID,
356
+ threshold: float = 0.80
357
+ ) -> None:
358
+ """
359
+ Configure deduplication settings.
360
+
361
+ Args:
362
+ strategy: Deduplication strategy
363
+ threshold: Similarity threshold for semantic matching (0.0-1.0)
364
+ """
365
+ self._dedup_strategy = strategy
366
+ self._similarity_threshold = threshold
367
+
368
+ def add_edge(self, edge: GraphEdge) -> None:
369
+ """Add edge."""
370
+ self.edges.append(edge)
371
+
372
+ def merge(self, other: "UnifiedGraph") -> "UnifiedGraph":
373
+ """Merge another graph into this one, deduplicating nodes."""
374
+ # Merge nodes (by label similarity)
375
+ label_to_id = {n.label.lower(): n.id for n in self.nodes.values()}
376
+ id_mapping = {} # Map other IDs to our IDs
377
+
378
+ for node in other.nodes.values():
379
+ lower_label = node.label.lower()
380
+ if lower_label in label_to_id:
381
+ # Node exists, map to existing ID
382
+ id_mapping[node.id] = label_to_id[lower_label]
383
+ else:
384
+ # New node, generate new ID
385
+ new_id = self._next_id(node.type)
386
+ id_mapping[node.id] = new_id
387
+ new_node = GraphNode(
388
+ id=new_id,
389
+ type=node.type,
390
+ label=node.label,
391
+ attrs=node.attrs
392
+ )
393
+ self.add_node(new_node)
394
+ label_to_id[lower_label] = new_id
395
+
396
+ # Merge edges with remapped IDs
397
+ existing_edges = {(e.src, e.dst, e.rel) for e in self.edges}
398
+ for edge in other.edges:
399
+ new_src = id_mapping.get(edge.src, edge.src)
400
+ new_dst = id_mapping.get(edge.dst, edge.dst)
401
+ edge_key = (new_src, new_dst, edge.rel)
402
+ if edge_key not in existing_edges:
403
+ self.edges.append(GraphEdge(
404
+ src=new_src,
405
+ dst=new_dst,
406
+ rel=edge.rel,
407
+ attrs=edge.attrs
408
+ ))
409
+ existing_edges.add(edge_key)
410
+
411
+ return self
412
+
413
+ def _next_id(self, node_type: NodeType) -> str:
414
+ """Generate next ID for node type."""
415
+ prefix = node_type.value
416
+ existing = [n.id for n in self.nodes.values() if n.id.startswith(prefix)]
417
+ if not existing:
418
+ return f"{prefix}1"
419
+ max_num = max(int(nid[1:]) for nid in existing if nid[1:].isdigit())
420
+ return f"{prefix}{max_num + 1}"
421
+
422
+ def to_dict(self) -> dict:
423
+ """Serialize to compact dict."""
424
+ return {
425
+ "n": [n.to_dict() for n in self.nodes.values()],
426
+ "e": [e.to_dict() for e in self.edges],
427
+ "m": self.metadata
428
+ }
429
+
430
+ @classmethod
431
+ def from_dict(cls, d: dict) -> "UnifiedGraph":
432
+ """Deserialize from dict."""
433
+ graph = cls()
434
+ for nd in d.get("n", []):
435
+ node = GraphNode.from_dict(nd)
436
+ graph.nodes[node.id] = node
437
+ for ed in d.get("e", []):
438
+ graph.edges.append(GraphEdge.from_dict(ed))
439
+ graph.metadata = d.get("m", {})
440
+ # Rebuild label index for deduplication support
441
+ graph._rebuild_label_index()
442
+ return graph
443
+
444
+ # ==========================================================================
445
+ # Query Methods
446
+ # ==========================================================================
447
+
448
+ def get_entities(self) -> list[GraphNode]:
449
+ """Get WHO and WHAT nodes."""
450
+ return [n for n in self.nodes.values()
451
+ if n.type in (NodeType.PERSON, NodeType.THING, NodeType.CONCEPT)]
452
+
453
+ def get_goals(self) -> list[GraphNode]:
454
+ """Get GOAL nodes."""
455
+ return [n for n in self.nodes.values() if n.type == NodeType.GOAL]
456
+
457
+ def get_decisions(self) -> list[GraphNode]:
458
+ """Get WHY nodes (decisions)."""
459
+ return [n for n in self.nodes.values() if n.type == NodeType.DECISION]
460
+
461
+ def get_actions(self) -> list[GraphNode]:
462
+ """Get NEXT-STEP nodes."""
463
+ return [n for n in self.nodes.values() if n.type == NodeType.ACTION]
464
+
465
+ def get_relationships_for(self, node_id: str) -> list[GraphEdge]:
466
+ """Get all edges involving a node."""
467
+ return [e for e in self.edges if e.src == node_id or e.dst == node_id]
468
+
469
+ def get_decision_context(self, decision_id: str) -> dict:
470
+ """
471
+ Get full context for a decision (LEGACY - use get_decision_detail for Unified Deep).
472
+ """
473
+ decision = self.nodes.get(decision_id)
474
+ if not decision or decision.type != NodeType.DECISION:
475
+ return {}
476
+
477
+ context = {
478
+ "decision": decision.label,
479
+ "alternatives": [],
480
+ "reasons": [],
481
+ "enables": []
482
+ }
483
+
484
+ for edge in self.edges:
485
+ if edge.src == decision_id:
486
+ if edge.rel == EdgeType.OVER:
487
+ alt_node = self.nodes.get(edge.dst)
488
+ if alt_node:
489
+ context["alternatives"].append({
490
+ "option": alt_node.label,
491
+ "rejected_because": edge.attrs.get("reason", "")
492
+ })
493
+ elif edge.rel == EdgeType.BECAUSE:
494
+ reason_node = self.nodes.get(edge.dst)
495
+ if reason_node:
496
+ context["reasons"].append(reason_node.label)
497
+ elif edge.rel == EdgeType.ENABLES:
498
+ goal_node = self.nodes.get(edge.dst)
499
+ if goal_node:
500
+ context["enables"].append(goal_node.label)
501
+
502
+ return context
503
+
504
+ # ==========================================================================
505
+ # Unified Deep Query Methods (Rich Attribute Access)
506
+ # ==========================================================================
507
+
508
    def get_decision_detail(self, decision_id: str) -> dict:
        """
        Get FULL decision detail with Unified Deep rich attributes.

        Returns complete decision context including:
        - rationale (full WHY text)
        - confidence score
        - context (situation)
        - trade_offs (list of gained/sacrificed)
        - alternatives with pros/cons/rejection_reason
        - evidence/reasons with source and confidence

        Returns {} when decision_id is unknown or not a DECISION node.
        This is the Unified Deep version of get_decision_context().
        """
        decision = self.nodes.get(decision_id)
        if not decision or decision.type != NodeType.DECISION:
            return {}

        # Base decision info from node attrs (defaults keep the shape stable
        # even for decisions extracted without the rich attributes)
        detail = {
            "id": decision_id,
            "decision": decision.label,
            "rationale": decision.attrs.get("rationale", ""),
            "confidence": decision.attrs.get("confidence", 0.0),
            "context": decision.attrs.get("context", ""),
            "trade_offs": decision.attrs.get("trade_offs", []),
            "turn_ref": decision.attrs.get("turn_ref", []),
            "alternatives": [],
            "reasons": [],
            "enables": [],
            "supersedes": None,
        }

        # Collect rich edge data from edges leaving this decision; edges with
        # a missing destination node are skipped silently.
        for edge in self.edges:
            if edge.src == decision_id:
                if edge.rel == EdgeType.OVER:
                    alt_node = self.nodes.get(edge.dst)
                    if alt_node:
                        detail["alternatives"].append({
                            "option": alt_node.label,
                            "pros": edge.attrs.get("pros", []),
                            "cons": edge.attrs.get("cons", []),
                            "rejection_reason": edge.attrs.get("rejection_reason", "")
                        })
                elif edge.rel == EdgeType.BECAUSE:
                    reason_node = self.nodes.get(edge.dst)
                    if reason_node:
                        detail["reasons"].append({
                            "claim": reason_node.label,
                            "source": edge.attrs.get("source", "inferred"),
                            "confidence": edge.attrs.get("confidence", 0.5),
                            "turn_ref": edge.attrs.get("turn_ref", "")
                        })
                elif edge.rel == EdgeType.ENABLES:
                    goal_node = self.nodes.get(edge.dst)
                    if goal_node:
                        detail["enables"].append(goal_node.label)
                elif edge.rel == EdgeType.SUPERSEDES:
                    # Only a single superseded decision is kept; a later
                    # SUPERSEDES edge overwrites an earlier one.
                    old_decision = self.nodes.get(edge.dst)
                    if old_decision:
                        detail["supersedes"] = old_decision.label

        return detail
572
+
573
    def get_goal_hierarchy(self) -> dict:
        """
        Get hierarchical goal structure with Unified Deep attributes.

        Returns goals bucketed by their "goal_type" attr into
        "missions", "goals" (the default bucket), and "tasks", each entry
        carrying status, confidence, and edge-derived depends_on /
        enables / blocks label lists.
        """
        hierarchy = {
            "missions": [],
            "goals": [],
            "tasks": [],
        }

        for node in self.nodes.values():
            if node.type != NodeType.GOAL:
                continue

            # Attr defaults keep the entry shape stable for sparsely
            # attributed goals.
            goal_info = {
                "id": node.id,
                "content": node.label,
                "goal_type": node.attrs.get("goal_type", "goal"),
                "status": node.attrs.get("status", "active"),
                "confidence": node.attrs.get("confidence", 0.5),
                "turn_ref": node.attrs.get("turn_ref", []),
                "depends_on": [],
                "enables": [],
                "blocks": [],
            }

            # Collect dependencies from outgoing edges.
            # NOTE(review): this scans all edges once per goal (O(goals * edges));
            # fine at current graph sizes, revisit if graphs grow large.
            for edge in self.edges:
                if edge.src == node.id:
                    target = self.nodes.get(edge.dst)
                    if target:
                        if edge.rel == EdgeType.DEPENDS_ON:
                            goal_info["depends_on"].append(target.label)
                        elif edge.rel == EdgeType.ENABLES:
                            goal_info["enables"].append(target.label)
                        elif edge.rel == EdgeType.BLOCKS:
                            goal_info["blocks"].append(target.label)

            # Categorize by goal_type; anything that is not "mission" or
            # "task" lands in the generic "goals" bucket.
            goal_type = goal_info["goal_type"]
            if goal_type == "mission":
                hierarchy["missions"].append(goal_info)
            elif goal_type == "task":
                hierarchy["tasks"].append(goal_info)
            else:
                hierarchy["goals"].append(goal_info)

        return hierarchy
624
+
625
+ def get_active_goals(self) -> list[dict]:
626
+ """Get only active/pending goals with full attributes."""
627
+ active_statuses = ("active", "pending")
628
+ return [
629
+ {
630
+ "id": node.id,
631
+ "content": node.label,
632
+ "goal_type": node.attrs.get("goal_type", "goal"),
633
+ "status": node.attrs.get("status", "active"),
634
+ "confidence": node.attrs.get("confidence", 0.5),
635
+ }
636
+ for node in self.nodes.values()
637
+ if node.type == NodeType.GOAL
638
+ and node.attrs.get("status", "active") in active_statuses
639
+ ]
640
+
641
+ def get_all_decisions_with_detail(self) -> list[dict]:
642
+ """Get all decisions with full Unified Deep detail."""
643
+ return [
644
+ self.get_decision_detail(node.id)
645
+ for node in self.nodes.values()
646
+ if node.type == NodeType.DECISION
647
+ ]
648
+
649
    def search(
        self,
        query: str,
        limit: int = 10,
        include_types: Optional[list[NodeType]] = None
    ) -> list[dict]:
        """
        Search unified graph for nodes matching query.

        Uses keyword-based matching with relevance scoring:
        - Exact substring match on label: score 1.0
        - Word overlap with label: up to 0.7 (scaled by overlap fraction)
        - Match inside a string attribute: 0.6
        - Match inside a list attribute item: 0.5
        - Related to a matched node via an edge: 0.4

        Args:
            query: Search query text
            limit: Maximum results to return
            include_types: Optional filter for specific node types

        Returns:
            List of dicts with node info and relevance score, sorted by score
        """
        if not query or not self.nodes:
            return []

        query_lower = query.lower()
        query_words = set(query_lower.split())

        results = []
        matched_ids = set()

        # Pass 1: score each node directly against the query
        for node in self.nodes.values():
            # Filter by type if specified
            if include_types and node.type not in include_types:
                continue

            label_lower = node.label.lower()
            label_words = set(label_lower.split())

            score = 0.0

            # Exact substring match
            if query_lower in label_lower:
                score = 1.0
            # Word overlap
            elif query_words & label_words:
                overlap = len(query_words & label_words) / len(query_words)
                score = 0.7 * overlap
            # Check attributes for matches
            elif node.attrs:
                # NOTE(review): the inner `break` only exits the list-item
                # scan, so after a list match (0.5) the attribute loop keeps
                # going and a later string attribute can raise the score to
                # 0.6 - confirm this is intended.
                for key, val in node.attrs.items():
                    if isinstance(val, str) and query_lower in val.lower():
                        score = 0.6
                        break
                    elif isinstance(val, list):
                        for item in val:
                            if isinstance(item, str) and query_lower in item.lower():
                                score = 0.5
                                break

            if score > 0:
                result = {
                    "id": node.id,
                    "type": node.type.name.lower(),
                    "content": node.label,
                    "score": score,
                    "attributes": node.attrs,
                }
                results.append(result)
                matched_ids.add(node.id)

        # Pass 2: pull in one-hop neighbors of matched nodes at a low score.
        # matched_ids is snapshot-copied per edge (list(...)) because it is
        # mutated inside the loop as related nodes are added.
        for edge in self.edges:
            for matched_id in list(matched_ids):
                related_id = None
                if edge.src == matched_id and edge.dst not in matched_ids:
                    related_id = edge.dst
                elif edge.dst == matched_id and edge.src not in matched_ids:
                    related_id = edge.src

                if related_id and related_id not in matched_ids:
                    related_node = self.nodes.get(related_id)
                    if related_node:
                        # Type filter applies to related nodes as well
                        if include_types and related_node.type not in include_types:
                            continue
                        results.append({
                            "id": related_node.id,
                            "type": related_node.type.name.lower(),
                            "content": related_node.label,
                            "score": 0.4,
                            "attributes": related_node.attrs,
                            "related_to": matched_id,
                        })
                        matched_ids.add(related_id)

        # Sort by score descending, take top limit
        results.sort(key=lambda x: x["score"], reverse=True)
        return results[:limit]
748
+
749
+ def summary(self) -> dict:
750
+ """Get graph summary statistics."""
751
+ type_counts = {}
752
+ for node in self.nodes.values():
753
+ type_counts[node.type.value] = type_counts.get(node.type.value, 0) + 1
754
+
755
+ return {
756
+ "total_nodes": len(self.nodes),
757
+ "total_edges": len(self.edges),
758
+ "by_type": type_counts,
759
+ "entities": len(self.get_entities()),
760
+ "goals": len(self.get_goals()),
761
+ "decisions": len(self.get_decisions()),
762
+ "actions": len(self.get_actions())
763
+ }
764
+
765
+ # =========================================================================
766
+ # Semantic Search (Issue #51)
767
+ # =========================================================================
768
+
769
+ def build_vector_index(self) -> "VectorIndex":
770
+ """
771
+ Build FAISS vector index from node embeddings.
772
+
773
+ Called lazily on first semantic search. Very fast (<0.2ms for 1000 nodes).
774
+ Also stores the index internally for deduplication semantic matching.
775
+
776
+ Returns:
777
+ VectorIndex populated with all node embeddings
778
+ """
779
+ from .vector_index import VectorIndex
780
+
781
+ index = VectorIndex()
782
+ nodes_with_embeddings = [
783
+ (node_id, node.embedding)
784
+ for node_id, node in self.nodes.items()
785
+ if node.embedding is not None
786
+ ]
787
+
788
+ if nodes_with_embeddings:
789
+ index.add_batch(nodes_with_embeddings)
790
+
791
+ # Store internally for deduplication
792
+ self._vector_index = index
793
+
794
+ return index
795
+
796
    def semantic_search(
        self,
        query: str,
        k: int = 10,
        node_types: Optional[list[NodeType]] = None,
        min_score: float = 0.0
    ) -> list[tuple[GraphNode, float]]:
        """
        Search graph nodes by semantic similarity.

        Uses FAISS vector search on node embeddings for fast, relevant results.
        The vector index is rebuilt on every call (cheap; see
        build_vector_index), so results always reflect the current nodes.

        Args:
            query: Natural language search query
            k: Maximum number of results
            node_types: Optional filter for specific node types (e.g., [NodeType.GOAL])
            min_score: Minimum similarity score threshold (0.0 to 1.0)

        Returns:
            List of (GraphNode, similarity_score) tuples, sorted by score descending

        Example:
            # Find decisions about databases
            results = graph.semantic_search("database choice", k=5,
                                            node_types=[NodeType.DECISION])
            for node, score in results:
                print(f"{node.label}: {score:.3f}")
        """
        from .embeddings import EmbeddingService

        # Embed query
        service = EmbeddingService()
        query_embedding = service.embed_text(query)

        # Build index (very fast, <0.2ms for 1000 nodes)
        index = self.build_vector_index()

        # No node has an embedding -> nothing to search
        if len(index) == 0:
            return []

        # Over-fetch (3x) so that type filtering below can still fill k results
        fetch_k = k * 3 if node_types else k
        raw_results = index.search(query_embedding, fetch_k)

        # Filter and collect results, preserving the index's score ordering
        results = []
        for node_id, score in raw_results:
            if score < min_score:
                continue

            node = self.nodes.get(node_id)
            if node is None:
                continue

            if node_types and node.type not in node_types:
                continue

            results.append((node, score))

            if len(results) >= k:
                break

        return results
859
+
860
+ def search_goals(
861
+ self,
862
+ query: str,
863
+ k: int = 5,
864
+ min_score: float = 0.0
865
+ ) -> list[tuple[GraphNode, float]]:
866
+ """
867
+ Find goals related to query.
868
+
869
+ Args:
870
+ query: Search query
871
+ k: Max results
872
+ min_score: Minimum similarity threshold
873
+
874
+ Returns:
875
+ List of (goal_node, score) tuples
876
+ """
877
+ return self.semantic_search(query, k, node_types=[NodeType.GOAL], min_score=min_score)
878
+
879
+ def search_decisions(
880
+ self,
881
+ query: str,
882
+ k: int = 5,
883
+ min_score: float = 0.0
884
+ ) -> list[tuple[GraphNode, float]]:
885
+ """
886
+ Find decisions related to query.
887
+
888
+ Useful for "why" queries - understanding past choices.
889
+
890
+ Args:
891
+ query: Search query
892
+ k: Max results
893
+ min_score: Minimum similarity threshold
894
+
895
+ Returns:
896
+ List of (decision_node, score) tuples
897
+ """
898
+ return self.semantic_search(query, k, node_types=[NodeType.DECISION], min_score=min_score)
899
+
900
+ def search_things(
901
+ self,
902
+ query: str,
903
+ k: int = 5,
904
+ min_score: float = 0.0
905
+ ) -> list[tuple[GraphNode, float]]:
906
+ """
907
+ Find things (technologies, tools, products) related to query.
908
+
909
+ Args:
910
+ query: Search query
911
+ k: Max results
912
+ min_score: Minimum similarity threshold
913
+
914
+ Returns:
915
+ List of (thing_node, score) tuples
916
+ """
917
+ return self.semantic_search(query, k, node_types=[NodeType.THING], min_score=min_score)
918
+
919
+ def search_all_types(
920
+ self,
921
+ query: str,
922
+ k_per_type: int = 3,
923
+ min_score: float = 0.0
924
+ ) -> dict[str, list[tuple[GraphNode, float]]]:
925
+ """
926
+ Search across all node types, returning top results for each.
927
+
928
+ Useful for comprehensive context retrieval.
929
+
930
+ Args:
931
+ query: Search query
932
+ k_per_type: Max results per node type
933
+ min_score: Minimum similarity threshold
934
+
935
+ Returns:
936
+ Dict mapping type name to list of (node, score) tuples
937
+ """
938
+ return {
939
+ "goals": self.search_goals(query, k_per_type, min_score),
940
+ "decisions": self.search_decisions(query, k_per_type, min_score),
941
+ "things": self.search_things(query, k_per_type, min_score),
942
+ "concepts": self.semantic_search(query, k_per_type, [NodeType.CONCEPT], min_score),
943
+ "actions": self.semantic_search(query, k_per_type, [NodeType.ACTION], min_score),
944
+ "persons": self.semantic_search(query, k_per_type, [NodeType.PERSON], min_score),
945
+ }
946
+
947
+
948
+ # =============================================================================
949
+ # LLM Protocol
950
+ # =============================================================================
951
+
952
class LLMClient(Protocol):
    """Structural protocol for LLM client implementations.

    Any object with a matching async `complete` method satisfies this
    protocol; no inheritance is required.
    """

    async def complete(self, prompt: str, system: Optional[str] = None) -> str:
        """Generate a completion for the given prompt.

        Args:
            prompt: The user/content prompt to complete.
            system: Optional system instruction.

        Returns:
            The model's raw text completion.
        """
        ...
958
+
959
+
960
+ # =============================================================================
961
+ # Unified Extraction Prompt (Token-Optimized)
962
+ # =============================================================================
963
+
964
# System prompt for the single-pass "Unified Deep" extraction. The short
# node/edge type codes here must stay in sync with NodeType and EdgeType.
# This is a runtime string sent to the LLM - edit with care.
UNIFIED_EXTRACTION_SYSTEM = """You extract structured knowledge graphs with FULL REASONING DEPTH.

Output a JSON graph with:
- n: nodes (entities, goals, decisions, actions) - with rich attributes
- e: edges (relationships) - with rich attributes

Node types: P=Person, T=Thing, C=Concept, G=Goal, D=Decision, A=Action
Edge types: uses, wants, chose, over, because, enables, blocks, next, relates, part_of, depends_on, conflicts, supersedes

CRITICAL RULES FOR UNIFIED DEEP:
1. Extract EVERY technology/tool/framework/product by exact name
2. GOAL nodes MUST have attrs: goal_type (mission|goal|task), status (active|completed|blocked|pending), confidence (0.0-1.0)
3. DECISION nodes MUST have attrs: rationale (full WHY text), confidence, context, trade_offs list
4. OVER edges (alternatives) MUST have attrs: pros list, cons list, rejection_reason
5. BECAUSE edges MUST have attrs: source (user_stated|inferred|external), confidence
6. Capture alternatives even if rejected - use 'over' edge with full pros/cons
7. Use depends_on for task dependencies, conflicts for incompatible goals, supersedes for updated decisions"""
981
+
982
# User prompt template for full extraction. The only format placeholder is
# {conversation}; all literal braces in the JSON example are doubled ({{ }})
# so str.format() leaves them intact. Parsed by UnifiedExtractor via the
# compact keys: n/e for node/edge lists, id/t/l/a and s/d/r/a per item.
UNIFIED_EXTRACTION_PROMPT = """Extract knowledge graph with FULL REASONING DEPTH from this conversation:

{conversation}

Output JSON with RICH ATTRIBUTES:
{{
  "n": [
    {{"id": "P1", "t": "P", "l": "user"}},
    {{"id": "G1", "t": "G", "l": "build REST API", "a": {{
      "goal_type": "goal",
      "status": "active",
      "confidence": 0.9,
      "turn_ref": ["turn_1"]
    }}}},
    {{"id": "D1", "t": "D", "l": "use FastAPI", "a": {{
      "rationale": "FastAPI provides better async performance and automatic OpenAPI docs",
      "confidence": 0.85,
      "context": "Choosing web framework for REST API",
      "trade_offs": [{{"gained": "performance + auto-docs", "sacrificed": "Django ecosystem"}}],
      "turn_ref": ["turn_2", "turn_3"]
    }}}},
    {{"id": "T1", "t": "T", "l": "FastAPI"}},
    {{"id": "T2", "t": "T", "l": "Django"}},
    {{"id": "C1", "t": "C", "l": "async performance"}},
    {{"id": "C2", "t": "C", "l": "automatic OpenAPI docs"}},
    {{"id": "A1", "t": "A", "l": "set up project"}}
  ],
  "e": [
    {{"s": "P1", "d": "G1", "r": "wants"}},
    {{"s": "D1", "d": "T1", "r": "chose"}},
    {{"s": "D1", "d": "T2", "r": "over", "a": {{
      "pros": ["mature ecosystem", "built-in admin", "ORM"],
      "cons": ["sync by default", "heavier weight"],
      "rejection_reason": "Need async performance for high-throughput API"
    }}}},
    {{"s": "D1", "d": "C1", "r": "because", "a": {{
      "source": "user_stated",
      "confidence": 0.9,
      "turn_ref": "turn_2"
    }}}},
    {{"s": "D1", "d": "C2", "r": "because", "a": {{
      "source": "inferred",
      "confidence": 0.7,
      "turn_ref": "turn_3"
    }}}},
    {{"s": "D1", "d": "G1", "r": "enables"}},
    {{"s": "G1", "d": "A1", "r": "next"}}
  ]
}}

EXTRACTION RULES FOR UNIFIED DEEP:
1. Extract EVERY technology/tool/framework/language by EXACT name
2. GOAL nodes: Always include goal_type, status, confidence in attrs
3. DECISION nodes: Always include rationale (full text), confidence, context, trade_offs
4. OVER edges: Always include pros, cons, rejection_reason attrs
5. BECAUSE edges: Always include source (user_stated/inferred/external), confidence
6. Use depends_on edge when one task requires another
7. Use conflicts edge when goals are mutually exclusive
8. Use supersedes edge when a decision updates/replaces a previous one
9. Include next actions or planned steps
10. Only output valid JSON - no extra text"""
1043
+
1044
+
1045
# User prompt template for incremental (delta) extraction. Placeholders:
# {existing_graph} — compact node/edge listing from _compact_graph_repr();
# {new_messages} — formatted new conversation turns. The response is parsed
# by _parse_delta_response via the add_n / add_e / update_n keys.
DELTA_EXTRACTION_PROMPT = """Analyze new conversation content and extract NEW information to add to the existing knowledge graph.

EXISTING NODES (do NOT repeat these, they already exist):
{existing_graph}

NEW CONVERSATION:
{new_messages}

Extract ONLY genuinely NEW information not already in the graph above.
Output JSON with:
- "add_n": New nodes (entities, goals, decisions, actions) NOT in existing list
- "add_e": New relationships between nodes
- "update_n": Updates to existing node attributes (e.g., status changes)

Format:
{{
  "add_n": [{{"id": "T1", "t": "T", "l": "NewTechnology", "a": {{}}}}],
  "add_e": [{{"s": "G1", "d": "T1", "r": "uses"}}],
  "update_n": [{{"id": "G1", "a": {{"status": "completed"}}}}]
}}

Node types: P=Person, T=Thing, C=Concept, G=Goal, D=Decision, A=Action
Edge types: uses, wants, chose, over, because, enables, blocks, next, relates, part_of, depends_on, conflicts, supersedes

Output valid JSON only - no explanations."""
1070
+
1071
+
1072
+ # =============================================================================
1073
+ # Unified Extractor
1074
+ # =============================================================================
1075
+
1076
class UnifiedExtractor:
    """
    Single-pass extractor that captures all memory types.

    Replaces separate IntentTracker + ReasoningStore with one unified extraction.

    Token Efficiency:
        - Current: 6 calls × ~10K tokens = 60K tokens per conversation
        - Optimized: 1-2 calls × ~5K tokens = 5-10K tokens per conversation
        - Target: 10-20x overhead vs 220x current

    Usage:
        extractor = UnifiedExtractor(llm_client)

        # Full extraction (new conversation)
        graph = await extractor.extract(messages)

        # Delta extraction (continuation)
        graph = await extractor.extract_delta(new_messages, existing_graph)
    """

    def __init__(self, llm_client: LLMClient):
        self.llm = llm_client
        # Maps _cache_key(messages) -> previously extracted graph.
        self._extraction_cache: dict[str, UnifiedGraph] = {}

    def _format_conversation(self, messages: list[dict]) -> str:
        """Format messages compactly for the extraction prompt.

        Renders each message as '<R>: <content>' where R is the upper-cased
        first letter of the role (e.g. U or A). Content over 500 characters
        is truncated to bound prompt size.
        """
        lines = []
        for msg in messages:
            # "or" also guards an explicit empty role string, which would
            # otherwise raise IndexError on [0].
            role = (msg.get("role") or "user")[0].upper()  # U or A
            content = msg.get("content", "")
            # Truncate very long messages
            if len(content) > 500:
                content = content[:500] + "..."
            lines.append(f"{role}: {content}")
        return "\n".join(lines)

    def _cache_key(self, messages: list[dict]) -> str:
        """Generate a short, deterministic cache key for a message list."""
        content = json.dumps(messages, sort_keys=True)
        return hashlib.sha256(content.encode()).hexdigest()[:12]

    async def extract(
        self,
        messages: list[dict],
        use_cache: bool = True
    ) -> UnifiedGraph:
        """
        Extract unified graph from conversation.

        Single LLM call captures:
        - Entities (WHO, WHAT)
        - Goals (objectives)
        - Decisions with rationale (WHY)
        - Relationships (CONNECTIONS)
        - Next actions (NEXT-STEPS)

        Args:
            messages: Conversation messages
            use_cache: Whether to use extraction cache

        Returns:
            UnifiedGraph with all extracted knowledge; empty graph if the
            LLM call or parsing fails (extraction is best-effort).
        """
        # Check cache
        cache_key = self._cache_key(messages)
        if use_cache and cache_key in self._extraction_cache:
            logger.debug(f"Cache hit for extraction: {cache_key}")
            return self._extraction_cache[cache_key]

        # Format conversation
        conv_text = self._format_conversation(messages)

        # Single extraction call
        prompt = UNIFIED_EXTRACTION_PROMPT.format(conversation=conv_text)

        try:
            response = await self.llm.complete(prompt, system=UNIFIED_EXTRACTION_SYSTEM)
            graph = self._parse_response(response)
        except Exception as e:
            # Deliberate best-effort: failures degrade to an empty graph
            # instead of propagating to the caller.
            logger.error(f"Extraction failed: {e}")
            graph = UnifiedGraph()

        # Cache result (including empty graphs, so repeated failures don't
        # re-spend tokens on identical input).
        if use_cache:
            self._extraction_cache[cache_key] = graph

        return graph

    async def extract_delta(
        self,
        new_messages: list[dict],
        existing_graph: UnifiedGraph
    ) -> UnifiedGraph:
        """
        Extract only changes from new messages.

        More token-efficient for multi-turn conversations:
        - Sends compact representation of existing graph
        - Only extracts NEW information
        - Merges delta into existing graph
        - Applies attribute updates to existing nodes

        Args:
            new_messages: New messages to process
            existing_graph: Current graph state (mutated in place)

        Returns:
            Updated UnifiedGraph (unchanged if the LLM call fails)
        """
        # Compact representation of existing graph
        existing_compact = self._compact_graph_repr(existing_graph)
        new_conv = self._format_conversation(new_messages)

        prompt = DELTA_EXTRACTION_PROMPT.format(
            existing_graph=existing_compact,
            new_messages=new_conv
        )

        try:
            response = await self.llm.complete(prompt, system=UNIFIED_EXTRACTION_SYSTEM)
            delta, updates = self._parse_delta_response(response)

            # Merge delta into existing graph
            existing_graph.merge(delta)

            # Apply node updates (Issue #27 fix)
            for update in updates:
                node_id = update["id"]
                if node_id in existing_graph.nodes:
                    existing_graph.nodes[node_id].attrs.update(update["attrs"])
                    logger.debug(f"Updated node {node_id}: {update['attrs']}")

        except Exception as e:
            # Best-effort: the existing graph is returned unmodified.
            logger.error(f"Delta extraction failed: {e}")

        return existing_graph

    def _compact_graph_repr(self, graph: UnifiedGraph) -> str:
        """Create compact string representation of graph for prompt.

        Lists all existing nodes clearly so LLM knows what NOT to add.
        """
        lines = []

        # Nodes by type - clear listing format
        for node_type in NodeType:
            nodes = [n for n in graph.nodes.values() if n.type == node_type]
            if nodes:
                labels = [f"{n.id}:{n.label}" for n in nodes]
                lines.append(f"{node_type.name}: {', '.join(labels)}")

        if not lines:
            lines.append("(empty graph - no existing nodes)")

        # Key edges (limit to most important)
        key_edges = [e for e in graph.edges
                     if e.rel in (EdgeType.CHOSE, EdgeType.BECAUSE, EdgeType.ENABLES)][:10]
        if key_edges:
            edge_strs = [f"{e.src}->{e.rel.value}->{e.dst}" for e in key_edges]
            lines.append(f"KEY_EDGES: {'; '.join(edge_strs)}")

        # NOTE: the original had a second, unreachable duplicate of this
        # return statement; it has been removed.
        return "\n".join(lines)

    @staticmethod
    def _extract_json(response: str) -> Optional[dict]:
        """Locate and parse the outermost {...} span of an LLM response.

        Returns:
            Parsed dict, or None (after logging) when no valid JSON object
            can be recovered from the response text.
        """
        json_start = response.find("{")
        json_end = response.rfind("}") + 1
        if json_start < 0 or json_end <= json_start:
            logger.warning("No JSON found in response")
            return None
        try:
            return json.loads(response[json_start:json_end])
        except json.JSONDecodeError as e:
            logger.error(f"JSON parse error: {e}")
            return None

    @staticmethod
    def _build_node(node_data: dict) -> GraphNode:
        """Construct a GraphNode from compact JSON keys (id/t/l, optional a).

        Raises KeyError on missing keys and ValueError on unknown type codes.
        """
        return GraphNode(
            id=node_data["id"],
            type=NodeType(node_data["t"]),
            label=node_data["l"],
            attrs=node_data.get("a", {})
        )

    @staticmethod
    def _build_edge(edge_data: dict) -> GraphEdge:
        """Construct a GraphEdge from compact JSON keys (s/d/r, optional a).

        Raises KeyError on missing keys and ValueError on unknown edge types.
        """
        return GraphEdge(
            src=edge_data["s"],
            dst=edge_data["d"],
            rel=EdgeType(edge_data["r"]),
            attrs=edge_data.get("a", {})
        )

    def _parse_response(self, response: str) -> UnifiedGraph:
        """Parse full-extraction LLM response into a UnifiedGraph."""
        graph = UnifiedGraph()

        data = self._extract_json(response)
        if data is None:
            return graph

        # Parse nodes (invalid entries are logged and skipped)
        for node_data in data.get("n", []):
            try:
                graph.add_node(self._build_node(node_data))
            except (KeyError, ValueError) as e:
                logger.warning(f"Skipping invalid node: {e}")

        # Parse edges
        for edge_data in data.get("e", []):
            try:
                graph.add_edge(self._build_edge(edge_data))
            except (KeyError, ValueError) as e:
                logger.warning(f"Skipping invalid edge: {e}")

        graph.metadata["extracted_at"] = _utc_now().isoformat()

        return graph

    def _parse_delta_response(self, response: str) -> tuple[UnifiedGraph, list[dict]]:
        """Parse delta extraction response.

        Returns:
            Tuple of (graph with new nodes/edges, list of node updates)
        """
        graph = UnifiedGraph()
        updates: list[dict] = []

        data = self._extract_json(response)
        if data is None:
            return graph, updates

        # Parse new nodes (invalid entries are silently skipped)
        for node_data in data.get("add_n", []):
            try:
                graph.add_node(self._build_node(node_data))
            except (KeyError, ValueError):
                pass

        # Parse new edges
        for edge_data in data.get("add_e", []):
            try:
                graph.add_edge(self._build_edge(edge_data))
            except (KeyError, ValueError):
                pass

        # Parse node updates (Issue #27 fix); entries without an "id" are skipped
        for update_data in data.get("update_n", []):
            if "id" in update_data:
                updates.append({
                    "id": update_data["id"],
                    "attrs": update_data.get("a", {})
                })

        return graph, updates
1347
+
1348
+
1349
+ # =============================================================================
1350
+ # Extraction Statistics
1351
+ # =============================================================================
1352
+
1353
@dataclass
class ExtractionStats:
    """Aggregate counters for monitoring extraction performance."""
    total_extractions: int = 0  # every extraction request, cached or not
    cache_hits: int = 0         # requests answered from the cache
    total_tokens: int = 0       # tokens spent on non-cached extractions
    total_nodes: int = 0
    total_edges: int = 0

    @property
    def cache_hit_rate(self) -> float:
        """Fraction of extractions served from cache (0.0 when none ran)."""
        return self.cache_hits / self.total_extractions if self.total_extractions else 0.0

    @property
    def avg_tokens_per_extraction(self) -> float:
        """Mean token cost over extractions that actually hit the LLM."""
        effective = self.total_extractions - self.cache_hits
        return self.total_tokens / effective if effective else 0.0
1374
+
1375
+
1376
+ # =============================================================================
1377
+ # Helper Functions
1378
+ # =============================================================================
1379
+
1380
def graph_to_memories(graph: UnifiedGraph) -> list[dict]:
    """
    Convert UnifiedGraph to list of memory objects for compatibility.

    Each node becomes one memory dict; edges touching the node are attached
    as relationship entries with an outgoing/incoming direction marker.
    """
    memories = []

    for node in graph.nodes.values():
        relationships = []
        for edge in graph.get_relationships_for(node.id):
            # Determine which end of the edge is the "other" node.
            outgoing = edge.src == node.id
            other = graph.nodes.get(edge.dst if outgoing else edge.src)
            if other is None:
                continue  # dangling edge reference - skip silently
            relationships.append({
                "type": edge.rel.value,
                "target": other.label,
                "direction": "outgoing" if outgoing else "incoming",
            })

        memories.append({
            "id": node.id,
            "type": node.type.name.lower(),
            "content": node.label,
            "attributes": node.attrs,
            "relationships": relationships,
        })

    return memories
1419
+
1420
+
1421
def memories_to_context_string(graph: UnifiedGraph, max_tokens: int = 2000) -> str:
    """
    Convert graph to context string for LLM prompt.

    Prioritizes:
    1. Active goals
    2. Recent decisions with rationale
    3. Key entities
    4. Next actions

    Args:
        graph: Graph to summarize.
        max_tokens: Approximate output budget. Sections are kept in priority
            order and trailing sections are dropped once the budget is
            exhausted (estimated at ~4 characters per token). Previously this
            parameter was accepted but silently ignored.

    Returns:
        Double-newline-separated sections, truncated to fit max_tokens.
    """
    sections = []

    # Goals
    goals = graph.get_goals()
    if goals:
        goal_lines = [f"- {g.label}" for g in goals[:5]]
        sections.append("GOALS:\n" + "\n".join(goal_lines))

    # Decisions with context (reasons + rejected alternatives)
    decisions = graph.get_decisions()
    if decisions:
        decision_lines = []
        for d in decisions[:5]:
            ctx = graph.get_decision_context(d.id)
            line = f"- {d.label}"
            if ctx.get("reasons"):
                line += f" (because: {', '.join(ctx['reasons'][:2])})"
            if ctx.get("alternatives"):
                alts = [a["option"] for a in ctx["alternatives"][:2]]
                line += f" [over: {', '.join(alts)}]"
            decision_lines.append(line)
        sections.append("DECISIONS:\n" + "\n".join(decision_lines))

    # Key entities
    entities = graph.get_entities()
    if entities:
        entity_lines = [f"- {e.label} ({e.type.name})" for e in entities[:10]]
        sections.append("KEY ENTITIES:\n" + "\n".join(entity_lines))

    # Next actions
    actions = graph.get_actions()
    if actions:
        action_lines = [f"- {a.label}" for a in actions[:5]]
        sections.append("NEXT STEPS:\n" + "\n".join(action_lines))

    # Enforce the token budget at section granularity (fix: max_tokens was
    # previously unused). Heuristic: ~4 characters per token.
    budget = max_tokens * 4
    kept: list[str] = []
    used = 0
    for section in sections:
        cost = len(section) + (2 if kept else 0)  # +2 for "\n\n" separator
        if used + cost > budget:
            break
        kept.append(section)
        used += cost

    return "\n\n".join(kept)