py-context-graph 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. decision_graph/__init__.py +8 -0
  2. decision_graph/backends/__init__.py +0 -0
  3. decision_graph/backends/firestore/__init__.py +41 -0
  4. decision_graph/backends/firestore/stores.py +254 -0
  5. decision_graph/backends/memory/__init__.py +30 -0
  6. decision_graph/backends/memory/stores.py +323 -0
  7. decision_graph/clustering_service.py +301 -0
  8. decision_graph/context_graph/__init__.py +0 -0
  9. decision_graph/context_graph/planner.py +102 -0
  10. decision_graph/context_graph/post_processing.py +247 -0
  11. decision_graph/context_graph/registry.py +35 -0
  12. decision_graph/context_graph/service.py +360 -0
  13. decision_graph/context_graph/templates.py +138 -0
  14. decision_graph/context_retrieval.py +298 -0
  15. decision_graph/core/__init__.py +0 -0
  16. decision_graph/core/config.py +44 -0
  17. decision_graph/core/decision_trace_profiles.py +76 -0
  18. decision_graph/core/domain.py +307 -0
  19. decision_graph/core/interfaces.py +160 -0
  20. decision_graph/core/matching.py +383 -0
  21. decision_graph/core/registry.py +35 -0
  22. decision_graph/decision_enrichment.py +22 -0
  23. decision_graph/decision_trace_pipeline.py +293 -0
  24. decision_graph/enrichment_service.py +209 -0
  25. decision_graph/extraction_service.py +50 -0
  26. decision_graph/graph.py +51 -0
  27. decision_graph/ingestion.py +171 -0
  28. decision_graph/llm/__init__.py +3 -0
  29. decision_graph/llm/litellm_adapter.py +63 -0
  30. decision_graph/markdown_chunker.py +50 -0
  31. decision_graph/prompt_loader.py +19 -0
  32. decision_graph/prompts/decision_enrichment.txt +35 -0
  33. decision_graph/prompts/decision_trace.txt +177 -0
  34. decision_graph/py.typed +0 -0
  35. decision_graph/retrieval.py +274 -0
  36. decision_graph/services.py +362 -0
  37. decision_graph/visualization.py +133 -0
  38. py_context_graph-0.1.0.dist-info/METADATA +271 -0
  39. py_context_graph-0.1.0.dist-info/RECORD +41 -0
  40. py_context_graph-0.1.0.dist-info/WHEEL +4 -0
  41. py_context_graph-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,8 @@
1
+ """Decision graph / cross-conversation decision intelligence flow."""
2
+
3
+ from decision_graph.core.config import LLMConfig
4
+ from decision_graph.core.interfaces import LLMAdapter
5
+ from decision_graph.graph import DecisionGraph
6
+ from decision_graph.llm import LiteLLMAdapter
7
+
8
+ __all__ = ["DecisionGraph", "LLMAdapter", "LLMConfig", "LiteLLMAdapter"]
File without changes
@@ -0,0 +1,41 @@
1
+ """Simple Firestore backend for decision_graph.
2
+
3
+ Usage::
4
+
5
+ from google.cloud import firestore
6
+ from decision_graph.backends.firestore import FirestoreBackend
7
+
8
+ client = firestore.Client()
9
+ backend = FirestoreBackend(client=client, collection_prefix="myapp_")
10
+ """
11
+
12
+ from decision_graph.backends.firestore.stores import (
13
+ FirestoreClusterStore,
14
+ FirestoreEnrichmentStore,
15
+ FirestoreLinkStore,
16
+ FirestoreProjectionStore,
17
+ )
18
+ from decision_graph.core.registry import StorageBackend
19
+
20
+
21
class FirestoreBackend(StorageBackend):
    """Standalone Firestore backend — requires only a ``google.cloud.firestore.Client``."""

    def __init__(self, *, client, collection_prefix: str = ""):
        # The caller owns the client; this backend never creates or closes one.
        self._client = client
        self._prefix = collection_prefix

    def _col(self, name: str) -> str:
        # Prefix collection names so multiple apps can share one Firestore project.
        return self._prefix + name

    def enrichment_store(self):
        """Store for per-decision enrichment documents."""
        return FirestoreEnrichmentStore(self._client, self._col("decision_enrichments"))

    def projection_store(self):
        """Store for decision projection documents."""
        return FirestoreProjectionStore(self._client, self._col("decision_projections"))

    def cluster_store(self):
        """Store for decision cluster documents."""
        return FirestoreClusterStore(self._client, self._col("decision_clusters"))

    def link_store(self):
        """Store for decision-to-cluster link documents."""
        return FirestoreLinkStore(self._client, self._col("decision_links"))
@@ -0,0 +1,254 @@
1
+ """Simple Firestore implementations of the decision graph store protocols.
2
+
3
+ No BaseDAO, no Singleton, no encryption. Just a Firestore client and collection names.
4
+ Users provide their own ``google.cloud.firestore.Client`` instance.
5
+ """
6
+
7
+ import time
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+
10
+ from decision_graph.core.interfaces import (
11
+ ClusterStore,
12
+ EnrichmentStore,
13
+ LinkStore,
14
+ ProjectionStore,
15
+ )
16
+
17
+
18
class FirestoreEnrichmentStore(EnrichmentStore):
    """Firestore-backed enrichment store.

    Documents are keyed by decision id inside one collection; the caller
    supplies the ``google.cloud.firestore.Client`` and the collection name.
    """

    def __init__(self, client, collection_name: str):
        self._client = client
        self._collection_name = collection_name

    def _col(self):
        # Resolve the collection lazily so construction stays cheap.
        return self._client.collection(self._collection_name)

    def find_by_id(self, decision_id: str) -> Optional[dict]:
        """Return the enrichment document for *decision_id*, or ``None``."""
        snap = self._col().document(decision_id).get()
        return snap.to_dict() if snap.exists else None

    def find_by_ids(self, ids: List[str]) -> Dict[str, dict]:
        """Batch-fetch documents; ids that do not exist are simply absent."""
        if not ids:
            return {}
        refs = [self._col().document(did) for did in ids]
        result: Dict[str, dict] = {}
        for snap in self._client.get_all(refs):
            if snap.exists:
                result[snap.id] = snap.to_dict()
        return result

    async def find_by_ids_async(self, ids: List[str]) -> Dict[str, dict]:
        """Async facade over :meth:`find_by_ids` (the client call itself blocks)."""
        return self.find_by_ids(ids)

    def save(self, decision_id: str, data: dict) -> None:
        """Create or fully overwrite the document for *decision_id*."""
        self._col().document(decision_id).set(data)

    def upsert(self, decision_id: str, data: dict) -> None:
        """Merge *data* into the existing document (creating it if needed)."""
        self._col().document(decision_id).set(data, merge=True)

    def query(
        self,
        filters: List[Tuple[str, str, Any]],
        order_by: Optional[List[Tuple[str, str]]] = None,
        limit: int = 200,
    ) -> List[dict]:
        """Run a filtered query and return up to *limit* documents as dicts.

        Firestore allows at most one ``in`` clause per query and at most 30
        values inside it, so the first ``in`` filter is chunked across
        several queries.  Any additional ``in`` filters are now applied in
        memory on top of the query results (previously each ``in`` filter
        spawned its own queries independently, so returned rows could
        violate the other ``in`` constraints).
        """
        in_filters = [(f, v) for f, op, v in filters if op == "in"]
        regular_filters = [(f, op, v) for f, op, v in filters if op != "in"]

        if in_filters:
            first_field, first_values = in_filters[0]
            extra_in = in_filters[1:]
            all_rows: List[dict] = []
            for i in range(0, len(first_values), 30):
                chunk = first_values[i : i + 30]
                q = self._col()
                for rf, rop, rv in regular_filters:
                    q = q.where(rf, rop, rv)
                q = q.where(first_field, "in", chunk)
                if order_by:
                    for ob_field, ob_dir in order_by:
                        q = q.order_by(ob_field, direction=ob_dir)
                q = q.limit(limit)
                for doc in q.stream():
                    row = doc.to_dict()
                    # NOTE(review): assumes extra "in" fields are top-level
                    # keys (no dotted paths) — confirm against callers.
                    if all(row.get(f) in (vals or []) for f, vals in extra_in):
                        all_rows.append(row)
            # Chunks of one field are disjoint, so no deduplication is needed.
            return all_rows[:limit]

        q = self._col()
        for f, op, v in filters:
            q = q.where(f, op, v)
        if order_by:
            for ob_field, ob_dir in order_by:
                q = q.order_by(ob_field, direction=ob_dir)
        q = q.limit(limit)
        return [doc.to_dict() for doc in q.stream()]
82
+
83
+
84
class FirestoreProjectionStore(ProjectionStore):
    """Firestore-backed projection store keyed by projection id (``pid``)."""

    def __init__(self, client, collection_name: str):
        self._client = client
        self._collection_name = collection_name

    def _col(self):
        # Resolve the collection lazily so construction stays cheap.
        return self._client.collection(self._collection_name)

    def find_by_id(self, pid: str) -> Optional[dict]:
        """Return the projection document for *pid*, or ``None``."""
        snap = self._col().document(pid).get()
        return snap.to_dict() if snap.exists else None

    def find_by_ids(self, ids: List[str]) -> Dict[str, dict]:
        """Batch-fetch documents; ids that do not exist are simply absent."""
        if not ids:
            return {}
        refs = [self._col().document(pid) for pid in ids]
        result: Dict[str, dict] = {}
        for snap in self._client.get_all(refs):
            if snap.exists:
                result[snap.id] = snap.to_dict()
        return result

    def find_by_conv_ids(self, cids: List[str], proj_type: str) -> List[dict]:
        """Valid projections of *proj_type* whose ``cid`` is in *cids*.

        The ``cid`` membership check runs client-side; was previously
        materializing every document twice via a second ``to_dict()`` call.
        """
        cid_set = set(cids)
        q = self._col().where("proj_type", "==", proj_type).where("valid", "==", True)
        rows = (doc.to_dict() for doc in q.stream())
        return [row for row in rows if row.get("cid") in cid_set]

    async def find_by_filters(
        self,
        *,
        gids: List[str],
        proj_type: str,
        last_n_days: Optional[int] = None,
        limit: Optional[int] = None,
        before_ts: Optional[float] = None,
    ) -> List[dict]:
        """Valid projections of *proj_type* for *gids*, newest first.

        Streams the whole (proj_type, valid) slice and filters ``gid``,
        recency window, and *before_ts* client-side, then sorts by
        ``updated_at`` (falling back to ``created_at``) descending.
        """
        q = self._col().where("proj_type", "==", proj_type).where("valid", "==", True)
        gid_set = set(gids)
        rows = []
        for doc in q.stream():
            data = doc.to_dict()
            if data.get("gid") not in gid_set:
                continue
            rows.append(data)

        def _ts(row: dict) -> float:
            # Recency key: prefer updated_at, fall back to created_at, then 0.
            return row.get("updated_at") or row.get("created_at") or 0

        if last_n_days is not None:
            cutoff = time.time() - (last_n_days * 86400)
            rows = [r for r in rows if _ts(r) >= cutoff]
        if before_ts is not None:
            rows = [r for r in rows if _ts(r) < before_ts]
        rows.sort(key=_ts, reverse=True)
        if limit:
            rows = rows[:limit]
        return rows

    def query(
        self,
        filters: List[Tuple[str, str, Any]],
        order_by: Optional[List[Tuple[str, str]]] = None,
        limit: int = 200,
    ) -> List[dict]:
        """Run a plain filtered/ordered query and return up to *limit* dicts."""
        q = self._col()
        for f, op, v in filters:
            q = q.where(f, op, v)
        if order_by:
            for ob_field, ob_dir in order_by:
                q = q.order_by(ob_field, direction=ob_dir)
        q = q.limit(limit)
        return [doc.to_dict() for doc in q.stream()]

    def invalidate(self, pid: str) -> None:
        """Mark the projection invalid; raises if the document does not exist."""
        self._col().document(pid).update({"valid": False})

    def save(self, *, pid: str, gid: str, cid: str, proj_type: str, projection: dict, msg_ts: int) -> bool:
        """Create the projection if absent; return ``True`` only on creation.

        NOTE(review): the exists-check and write are not atomic — concurrent
        writers could both see "absent" and both write.  ``doc_ref.create``
        would be atomic but raises instead of returning False; confirm which
        contract callers need before changing.
        """
        doc_ref = self._col().document(pid)
        snap = doc_ref.get()
        if snap.exists:
            return False
        doc_ref.set(
            {
                "pid": pid,
                "gid": gid,
                "cid": cid,
                "proj_type": proj_type,
                "projection": projection,
                "created_at": msg_ts,
                "updated_at": msg_ts,
                "valid": True,
            }
        )
        return True

    def update(self, *, pid: str, projection: dict, update_type: str, msg_ts: int) -> dict:
        """Replace the projection payload and bump ``updated_at``; return the fresh doc."""
        doc_ref = self._col().document(pid)
        doc_ref.update({"projection": projection, "updated_at": msg_ts})
        snap = doc_ref.get()
        return snap.to_dict() if snap.exists else {}
181
+
182
+
183
class FirestoreClusterStore(ClusterStore):
    """Firestore-backed cluster store keyed by ``cluster_id``."""

    def __init__(self, client, collection_name: str):
        self._client = client
        self._collection_name = collection_name

    def _col(self):
        # Resolve the collection lazily so construction stays cheap.
        return self._client.collection(self._collection_name)

    def create(self, data: dict) -> str:
        """Write *data* under its ``cluster_id`` and return that id.

        Raises:
            ValueError: if ``cluster_id`` is missing or empty — Firestore
                rejects empty document ids, so fail fast with a clear error
                instead of a cryptic client-side path error.
        """
        cluster_id = data.get("cluster_id", "")
        if not cluster_id:
            raise ValueError("cluster data must include a non-empty 'cluster_id'")
        self._col().document(cluster_id).set(data)
        return cluster_id

    def update(self, cluster_id: str, updates: dict) -> None:
        """Merge *updates* into an existing cluster; raises if it does not exist."""
        self._col().document(cluster_id).update(updates)

    def find_by_id(self, cluster_id: str) -> Optional[dict]:
        """Return the cluster document, or ``None`` when absent."""
        snap = self._col().document(cluster_id).get()
        return snap.to_dict() if snap.exists else None

    def find_by_ids(self, cluster_ids: List[str]) -> List[dict]:
        """Batch-fetch clusters; missing ids are silently skipped."""
        if not cluster_ids:
            return []
        refs = [self._col().document(cid) for cid in cluster_ids]
        return [snap.to_dict() for snap in self._client.get_all(refs) if snap.exists]
208
+
209
+
210
+
211
class FirestoreLinkStore(LinkStore):
    """Firestore-backed link store keyed by ``decision_id``."""

    def __init__(self, client, collection_name: str):
        self._client = client
        self._collection_name = collection_name

    def _col(self):
        # Resolve the collection lazily so construction stays cheap.
        return self._client.collection(self._collection_name)

    def save_batch(self, links: List[dict]) -> int:
        """Write *links* (keyed by ``decision_id``) and return the input count.

        Uses Firestore batched writes, committing every 500 operations (the
        per-batch limit), instead of one network round-trip per link.
        """
        if not links:
            return 0
        batch = self._client.batch()
        pending = 0
        for link in links:
            decision_id = link.get("decision_id", "")
            batch.set(self._col().document(decision_id), link)
            pending += 1
            if pending == 500:
                batch.commit()
                batch = self._client.batch()
                pending = 0
        if pending:
            batch.commit()
        return len(links)

    def find_by_decision_id(self, decision_id: str) -> Optional[dict]:
        """Return the link for *decision_id*, or ``None``."""
        snap = self._col().document(decision_id).get()
        return snap.to_dict() if snap.exists else None

    def find_by_cluster_id(self, cluster_id: str) -> List[dict]:
        """All links pointing at *cluster_id*."""
        q = self._col().where("cluster_id", "==", cluster_id)
        return [doc.to_dict() for doc in q.stream()]

    def find_by_decision_ids(self, decision_ids: List[str]) -> Dict[str, dict]:
        """Batch-fetch links; missing ids are simply absent from the result."""
        if not decision_ids:
            return {}
        refs = [self._col().document(did) for did in decision_ids]
        return {snap.id: snap.to_dict() for snap in self._client.get_all(refs) if snap.exists}

    def find_cluster_ids_by_gids(self, gids: List[str]) -> List[str]:
        """Distinct cluster ids reachable from links whose ``gid`` is in *gids*.

        NOTE(review): streams the entire collection and filters client-side;
        an indexed ``in`` query would be cheaper for large collections.
        """
        if not gids:
            return []
        gid_set = set(gids)
        cluster_ids = set()
        for doc in self._col().stream():
            data = doc.to_dict()
            if data.get("gid") in gid_set and data.get("cluster_id"):
                cluster_ids.add(data["cluster_id"])
        return list(cluster_ids)

    def find_cluster_id_for_decision(self, decision_id: str) -> Optional[str]:
        """Cluster id for one decision, or ``None`` when the link is absent."""
        link = self.find_by_decision_id(decision_id)
        return link.get("cluster_id") if link else None
@@ -0,0 +1,30 @@
1
+ from decision_graph.backends.memory.stores import (
2
+ InMemoryClusterStore,
3
+ InMemoryEnrichmentStore,
4
+ InMemoryLinkStore,
5
+ InMemoryProjectionStore,
6
+ InMemoryVectorIndex,
7
+ )
8
+ from decision_graph.core.registry import StorageBackend
9
+
10
+
11
class InMemoryBackend(StorageBackend):
    """Fully in-memory backend. Useful for testing and as a reference implementation."""

    def __init__(self):
        # One long-lived store per kind; the accessors always hand back the
        # same instance so state is shared across callers.
        self._stores = {
            "enrichment": InMemoryEnrichmentStore(),
            "projection": InMemoryProjectionStore(),
            "cluster": InMemoryClusterStore(),
            "link": InMemoryLinkStore(),
        }

    def enrichment_store(self):
        """The shared in-memory enrichment store."""
        return self._stores["enrichment"]

    def projection_store(self):
        """The shared in-memory projection store."""
        return self._stores["projection"]

    def cluster_store(self):
        """The shared in-memory cluster store."""
        return self._stores["cluster"]

    def link_store(self):
        """The shared in-memory link store."""
        return self._stores["link"]
@@ -0,0 +1,323 @@
1
+ """In-memory implementations of the decision graph store protocols."""
2
+
3
+ import math
4
+ import re
5
+ import time
6
+ from collections import Counter
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+
9
+ import pandas as pd
10
+
11
+ from decision_graph.core.interfaces import (
12
+ ClusterStore,
13
+ EnrichmentStore,
14
+ LinkStore,
15
+ ProjectionStore,
16
+ )
17
+ from decision_graph.ingestion import breakdown_hydrated_clusters
18
+
19
+
20
+ def _get_nested(data: dict, field_path: str) -> Tuple[bool, Any]:
21
+ cur = data
22
+ for part in field_path.split("."):
23
+ if not isinstance(cur, dict) or part not in cur:
24
+ return False, None
25
+ cur = cur[part]
26
+ return True, cur
27
+
28
+
29
+ def _matches_filter(row: dict, field: str, op: str, value: Any) -> bool:
30
+ exists, field_value = _get_nested(row, field)
31
+ if not exists:
32
+ return False
33
+ if op == "==":
34
+ return field_value == value
35
+ if op == "!=":
36
+ return field_value != value
37
+ if op == "in":
38
+ return field_value in (value or [])
39
+ if op == "array_contains":
40
+ return isinstance(field_value, list) and value in field_value
41
+ if op == ">=":
42
+ return field_value >= value
43
+ if op == ">":
44
+ return field_value > value
45
+ if op == "<=":
46
+ return field_value <= value
47
+ if op == "<":
48
+ return field_value < value
49
+ raise ValueError(f"Unsupported filter op: {op}")
50
+
51
+
52
+ def _apply_query(
53
+ data: Dict[str, dict],
54
+ filters: List[Tuple[str, str, Any]],
55
+ order_by: Optional[List[Tuple[str, str]]] = None,
56
+ limit: int = 200,
57
+ ) -> List[dict]:
58
+ rows = list(data.values())
59
+ for field, op, value in filters:
60
+ rows = [r for r in rows if _matches_filter(r, field, op, value)]
61
+ if order_by:
62
+ for ob_field, ob_dir in reversed(order_by):
63
+ reverse = ob_dir.upper() == "DESCENDING"
64
+ rows = sorted(
65
+ rows,
66
+ key=lambda r, f=ob_field: (r.get(f) is None, r.get(f)),
67
+ reverse=reverse,
68
+ )
69
+ return rows[:limit]
70
+
71
+
72
class InMemoryEnrichmentStore(EnrichmentStore):
    """Dict-backed enrichment store; documents are keyed by decision id."""

    def __init__(self):
        self._data: Dict[str, dict] = {}

    def find_by_id(self, decision_id: str) -> Optional[dict]:
        """Return the stored document, or ``None`` when absent."""
        return self._data.get(decision_id)

    def find_by_ids(self, ids: List[str]) -> Dict[str, dict]:
        """Return the subset of *ids* that exist, mapped to their documents."""
        store = self._data
        return {key: store[key] for key in ids if key in store}

    async def find_by_ids_async(self, ids: List[str]) -> Dict[str, dict]:
        """Async wrapper; delegates to the synchronous lookup."""
        return self.find_by_ids(ids)

    def save(self, decision_id: str, data: dict) -> None:
        """Store a shallow copy of *data*, replacing any prior document."""
        self._data[decision_id] = dict(data)

    def upsert(self, decision_id: str, data: dict) -> None:
        """Merge *data* into the existing document in place, creating it if missing."""
        record = self._data.setdefault(decision_id, {})
        record.update(data)

    def query(
        self,
        filters: List[Tuple[str, str, Any]],
        order_by: Optional[List[Tuple[str, str]]] = None,
        limit: int = 200,
    ) -> List[dict]:
        """Filter, sort and cap the stored documents via :func:`_apply_query`."""
        return _apply_query(self._data, filters, order_by, limit)
100
+
101
+
102
class InMemoryProjectionStore(ProjectionStore):
    """Dict-backed projection store; documents are keyed by projection id."""

    def __init__(self):
        self._data: Dict[str, dict] = {}

    @staticmethod
    def _ts(row: dict) -> float:
        # Recency key: prefer updated_at, fall back to created_at, then 0.
        return row.get("updated_at") or row.get("created_at") or 0

    def find_by_ids(self, ids: List[str]) -> Dict[str, dict]:
        """Return the subset of *ids* that exist, mapped to their documents."""
        docs = self._data
        return {key: docs[key] for key in ids if key in docs}

    def find_by_conv_ids(self, cids: List[str], proj_type: str) -> List[dict]:
        """Valid projections of *proj_type* whose ``cid`` is in *cids*."""
        wanted = set(cids)
        matches = []
        for doc in self._data.values():
            if doc.get("proj_type") != proj_type:
                continue
            if not doc.get("valid", True):
                continue
            if doc.get("cid") in wanted:
                matches.append(doc)
        return matches

    async def find_by_filters(
        self,
        *,
        gids: List[str],
        proj_type: str,
        last_n_days: Optional[int] = None,
        limit: Optional[int] = None,
        before_ts: Optional[float] = None,
    ) -> List[dict]:
        """Valid projections of *proj_type* for *gids*, newest first.

        Optional recency window (*last_n_days*), upper time bound
        (*before_ts*), and result cap (*limit*).
        """
        wanted = set(gids)
        rows = [
            doc
            for doc in self._data.values()
            if doc.get("gid") in wanted and doc.get("proj_type") == proj_type and doc.get("valid", True)
        ]
        if last_n_days is not None:
            cutoff = time.time() - last_n_days * 86400
            rows = [r for r in rows if self._ts(r) >= cutoff]
        if before_ts is not None:
            rows = [r for r in rows if self._ts(r) < before_ts]
        rows.sort(key=self._ts, reverse=True)
        # A falsy limit (None or 0) means "no cap", matching prior behavior.
        return rows[:limit] if limit else rows

    def find_by_id(self, pid: str) -> Optional[dict]:
        """Return the stored document, or ``None`` when absent."""
        return self._data.get(pid)

    def query(
        self,
        filters: List[Tuple[str, str, Any]],
        order_by: Optional[List[Tuple[str, str]]] = None,
        limit: int = 200,
    ) -> List[dict]:
        """Filter, sort and cap the stored documents via :func:`_apply_query`."""
        return _apply_query(self._data, filters, order_by, limit)

    def invalidate(self, pid: str) -> None:
        """Mark the projection invalid; unknown ids are ignored."""
        doc = self._data.get(pid)
        if doc is not None:
            doc["valid"] = False

    def save(self, *, pid: str, gid: str, cid: str, proj_type: str, projection: dict, msg_ts: int) -> bool:
        """Create the projection if absent; return ``True`` only on creation."""
        if pid in self._data:
            return False
        self._data[pid] = {
            "pid": pid,
            "gid": gid,
            "cid": cid,
            "proj_type": proj_type,
            "projection": projection,
            "created_at": msg_ts,
            "updated_at": msg_ts,
            "valid": True,
        }
        return True

    def update(self, *, pid: str, projection: dict, update_type: str, msg_ts: int) -> dict:
        """Replace the payload and bump ``updated_at``; returns ``{}`` for unknown ids."""
        doc = self._data.get(pid)
        if doc is None:
            return {}
        doc["projection"] = projection
        doc["updated_at"] = msg_ts
        return doc
177
+
178
+
179
class InMemoryClusterStore(ClusterStore):
    """Dict-backed cluster store keyed by ``cluster_id``."""

    def __init__(self):
        self._data: Dict[str, dict] = {}

    def create(self, data: dict) -> str:
        """Store a shallow copy of *data* under its ``cluster_id`` and return that id."""
        cluster_id = data.get("cluster_id", "")
        self._data[cluster_id] = dict(data)
        return cluster_id

    def update(self, cluster_id: str, updates: dict) -> None:
        """Merge *updates* into an existing cluster; unknown ids are ignored."""
        record = self._data.get(cluster_id)
        if record is not None:
            record.update(updates)

    def find_by_id(self, cluster_id: str) -> Optional[dict]:
        """Return the cluster document, or ``None`` when absent."""
        return self._data.get(cluster_id)

    def find_by_ids(self, cluster_ids: List[str]) -> List[dict]:
        """Return existing clusters in the order their ids were given."""
        docs = self._data
        return [docs[cid] for cid in cluster_ids if cid in docs]
197
+
198
+
199
class InMemoryLinkStore(LinkStore):
    """Dict-backed link store keyed by ``decision_id``."""

    def __init__(self):
        self._data: Dict[str, dict] = {}

    def save_batch(self, links: List[dict]) -> int:
        """Store shallow copies of *links* keyed by decision id; returns the count."""
        for entry in links:
            self._data[entry.get("decision_id", "")] = dict(entry)
        return len(links)

    def find_by_decision_id(self, decision_id: str) -> Optional[dict]:
        """Return the link for *decision_id*, or ``None``."""
        return self._data.get(decision_id)

    def find_by_cluster_id(self, cluster_id: str) -> List[dict]:
        """All links pointing at *cluster_id*."""
        return [link for link in self._data.values() if link.get("cluster_id") == cluster_id]

    def find_by_decision_ids(self, decision_ids: List[str]) -> Dict[str, dict]:
        """Return the subset of *decision_ids* that exist, mapped to their links."""
        links = self._data
        return {did: links[did] for did in decision_ids if did in links}

    def find_cluster_ids_by_gids(self, gids: List[str]) -> List[str]:
        """Distinct cluster ids reachable from links whose ``gid`` is in *gids*."""
        wanted = set(gids)
        found = {
            link["cluster_id"]
            for link in self._data.values()
            if link.get("gid") in wanted and link.get("cluster_id")
        }
        return list(found)

    def find_cluster_id_for_decision(self, decision_id: str) -> Optional[str]:
        """Cluster id for one decision, or ``None`` when the link is absent."""
        link = self._data.get(decision_id)
        return link.get("cluster_id") if link else None
229
+
230
+
231
+ _STOP_WORDS = frozenset(
232
+ "a an the is are was were be been being have has had do does did will would "
233
+ "shall should may might can could i you he she it we they me him her us them "
234
+ "my your his its our their this that these those in on at to for with by from "
235
+ "of and or not no but if so as up out about into over after".split()
236
+ )
237
+
238
+
239
+ def _tokenize(text: str) -> List[str]:
240
+ return [w for w in re.findall(r"[a-z0-9]+", text.lower()) if w not in _STOP_WORDS and len(w) > 1]
241
+
242
+
243
+ def _tfidf_vector(tokens: List[str], idf: Dict[str, float]) -> Dict[str, float]:
244
+ tf = Counter(tokens)
245
+ total = len(tokens) or 1
246
+ return {t: (c / total) * idf.get(t, 1.0) for t, c in tf.items()}
247
+
248
+
249
+ def _cosine(a: Dict[str, float], b: Dict[str, float]) -> float:
250
+ keys = set(a) & set(b)
251
+ if not keys:
252
+ return 0.0
253
+ dot = sum(a[k] * b[k] for k in keys)
254
+ mag_a = math.sqrt(sum(v * v for v in a.values()))
255
+ mag_b = math.sqrt(sum(v * v for v in b.values()))
256
+ if mag_a == 0 or mag_b == 0:
257
+ return 0.0
258
+ return dot / (mag_a * mag_b)
259
+
260
+
261
class InMemoryVectorIndex:
    """TF-IDF cosine similarity vector index. No external dependencies.

    Documents live in a plain list and the IDF table is rebuilt on every
    ``add`` (O(corpus) per insert) — fine for the small in-memory corpora
    this backend targets.
    """

    def __init__(self):
        self._docs: List[Dict[str, Any]] = []
        self._idf: Dict[str, float] = {}

    def add(self, *, pid: str, text: str, gid: str, cid: str):
        """Tokenize *text* and index it under *pid* with its gid/cid tags."""
        tokens = _tokenize(text)
        self._docs.append({"pid": pid, "tokens": tokens, "gid": gid, "cid": cid})
        self._rebuild_idf()

    def _rebuild_idf(self):
        # Smoothed IDF: log((N + 1) / (df + 1)) + 1 keeps weights positive
        # and defined even for terms present in every document.
        n = len(self._docs)
        df: Dict[str, int] = {}
        for doc in self._docs:
            for t in set(doc["tokens"]):
                df[t] = df.get(t, 0) + 1
        self._idf = {t: math.log((n + 1) / (c + 1)) + 1 for t, c in df.items()}

    def get_top_n_matches(self, *, query: str, query_filter: Optional[dict], top_n: int) -> pd.DataFrame:
        """Return the *top_n* most similar documents as a DataFrame.

        *query_filter* may carry a Mongo-style ``{"gid": {"$in": [...]}}``
        restriction.  An explicitly empty ``$in`` list now matches nothing
        (previously it was ignored, so every document matched).  Result
        columns: ``pid``, ``score``, ``gid``, ``cid``; an empty frame is
        returned when there are no documents or no candidates.
        """
        if not self._docs:
            return pd.DataFrame()

        query_vec = _tfidf_vector(_tokenize(query), self._idf)

        allowed_gids = None
        if query_filter and "$in" in (query_filter.get("gid") or {}):
            allowed_gids = set(query_filter["gid"]["$in"])

        results = []
        for doc in self._docs:
            # "is not None" so an empty allow-list filters out everything.
            if allowed_gids is not None and doc["gid"] not in allowed_gids:
                continue
            doc_vec = _tfidf_vector(doc["tokens"], self._idf)
            score = _cosine(query_vec, doc_vec)
            results.append({"pid": doc["pid"], "score": score, "gid": doc["gid"], "cid": doc["cid"]})

        results.sort(key=lambda x: x["score"], reverse=True)
        return pd.DataFrame(results[:top_n]) if results else pd.DataFrame()
302
+
303
+
304
class InMemoryGraphStore:
    """In-memory graph store that materializes hydrated clusters into graph arrays."""

    def __init__(self):
        self._hydrated_clusters: List[Dict[str, Any]] = []
        self._graph_arrays: Dict[str, list] = {}

    def ingest(self, hydrated_clusters: List[Dict[str, Any]]) -> None:
        """Append *hydrated_clusters* and fold their breakdown into the arrays."""
        self._hydrated_clusters += hydrated_clusters
        for key, items in breakdown_hydrated_clusters(hydrated_clusters).items():
            bucket = self._graph_arrays.setdefault(key, [])
            bucket.extend(items)

    @property
    def hydrated_clusters(self) -> List[Dict[str, Any]]:
        """All clusters ingested so far (live reference, not a copy)."""
        return self._hydrated_clusters

    @property
    def graph_arrays(self) -> Dict[str, list]:
        """Materialized graph arrays keyed by array name (live reference)."""
        return self._graph_arrays