agmem 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,7 +12,7 @@ import os
  from pathlib import Path
  from typing import Optional, List, Tuple, Any, Dict
 
- from .objects import ObjectStore, Tree, Commit
+ from .objects import ObjectStore, Tree, Commit, Blob
 
  # Ed25519 via cryptography (optional)
  try:
@@ -239,6 +239,17 @@ def verify_commit(
  stored_sig = (commit.metadata or {}).get("signature")
  if not stored_root:
  return (False, "commit has no merkle_root (unverified)")
+
+ # Verify that blob objects can be loaded successfully (detects tampering in compressed/encrypted content)
+ blob_hashes = _collect_blob_hashes_from_tree(store, commit.tree)
+ for blob_hash in blob_hashes:
+ try:
+ blob = Blob.load(store, blob_hash)
+ if blob is None:
+ return (False, f"blob {blob_hash[:8]} corrupted or missing")
+ except Exception as e:
+ return (False, f"merkle_root mismatch (commit tampered)")
+
  computed_root = build_merkle_root_for_commit(store, commit_hash)
  if not computed_root:
  return (False, "could not build Merkle tree (missing tree/blobs)")
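The new loop above makes verification fail when a referenced blob cannot be loaded (e.g. its compressed or encrypted content is corrupt), instead of only reporting a Merkle-root mismatch. A minimal usage sketch; the full verify_commit signature is truncated in this hunk, so the two-argument call and the (bool, reason) return below are assumptions inferred from its body:

    # Hedged sketch, not from the package: verify_commit is the function patched above
    # (its module path is not shown in this diff); ObjectStore(objects_dir) is taken from
    # the push_to_ipfs hunk further down.
    from pathlib import Path
    from memvcs.core.objects import ObjectStore  # package path assumed from the other files in this diff

    store = ObjectStore(Path("objects"))
    commit_hash = "<full commit hash>"
    ok, reason = verify_commit(store, commit_hash)
    if not ok:
        print("verification failed:", reason)  # e.g. "blob 1a2b3c4d corrupted or missing"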
memvcs/core/distiller.py CHANGED
@@ -35,6 +35,9 @@ class DistillerConfig:
  llm_provider: Optional[str] = None
  llm_model: Optional[str] = None
  create_safety_branch: bool = True
+ use_dp: bool = False
+ dp_epsilon: Optional[float] = None
+ dp_delta: Optional[float] = None
 
 
  @dataclass
@@ -160,13 +163,25 @@ class Distiller:
  except ValueError:
  out_path = self.target_dir / f"consolidated-{ts}.md"
 
+ confidence_score = self.config.extraction_confidence_threshold
+ if (
+ self.config.use_dp
+ and self.config.dp_epsilon is not None
+ and self.config.dp_delta is not None
+ ):
+ from .privacy_budget import add_noise
+
+ confidence_score = add_noise(
+ confidence_score, 0.1, self.config.dp_epsilon, self.config.dp_delta
+ )
+ confidence_score = max(0.0, min(1.0, confidence_score))
  frontmatter = {
  "schema_version": "1.0",
  "last_updated": datetime.utcnow().isoformat() + "Z",
  "source_agent_id": "distiller",
  "memory_type": "semantic",
  "tags": cluster.tags + ["auto-generated", "consolidated"],
- "confidence_score": self.config.extraction_confidence_threshold,
+ "confidence_score": confidence_score,
  }
  body = f"# Consolidated: {cluster.topic}\n\n" + "\n".join(facts)
  if YAML_AVAILABLE:
@@ -266,11 +281,62 @@ class Distiller:
  except Exception:
  pass
 
+ clusters_processed = len(clusters)
+ facts_extracted = facts_count
+ episodes_archived = archived
+ if (
+ self.config.use_dp
+ and self.config.dp_epsilon is not None
+ and self.config.dp_delta is not None
+ ):
+ from .privacy_budget import add_noise
+
+ sensitivity = 1.0
+ clusters_processed = max(
+ 0,
+ int(
+ round(
+ add_noise(
+ float(clusters_processed),
+ sensitivity,
+ self.config.dp_epsilon,
+ self.config.dp_delta,
+ )
+ )
+ ),
+ )
+ facts_extracted = max(
+ 0,
+ int(
+ round(
+ add_noise(
+ float(facts_extracted),
+ sensitivity,
+ self.config.dp_epsilon,
+ self.config.dp_delta,
+ )
+ )
+ ),
+ )
+ episodes_archived = max(
+ 0,
+ int(
+ round(
+ add_noise(
+ float(episodes_archived),
+ sensitivity,
+ self.config.dp_epsilon,
+ self.config.dp_delta,
+ )
+ )
+ ),
+ )
+
  return DistillerResult(
  success=True,
- clusters_processed=len(clusters),
- facts_extracted=facts_count,
- episodes_archived=archived,
+ clusters_processed=clusters_processed,
+ facts_extracted=facts_extracted,
+ episodes_archived=episodes_archived,
  branch_created=branch_name,
  commit_hash=commit_hash,
  message=f"Processed {len(clusters)} clusters, extracted {facts_count} facts",
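The result-count hunks above, and the matching ones in gardener.py and federated.py below, all apply the same pattern: noise the raw value with add_noise(value, sensitivity, epsilon, delta) from .privacy_budget, then round and clamp to a non-negative integer. A condensed sketch of that pattern; the helper below is illustrative and not part of the package, and add_noise's internal mechanism is not shown in this diff:

    from memvcs.core.privacy_budget import add_noise  # module path assumed from the relative import above

    def noisy_count(value: int, epsilon: float, delta: float, sensitivity: float = 1.0) -> int:
        # Noise, round, and clamp exactly as the DistillerResult fields are treated above.
        return max(0, int(round(add_noise(float(value), sensitivity, epsilon, delta))))

    # e.g. clusters_processed = noisy_count(len(clusters), config.dp_epsilon, config.dp_delta)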
memvcs/core/federated.py CHANGED
@@ -5,6 +5,7 @@ Agents share model updates or aggregated summaries instead of raw episodic logs.
  Optional coordinator URL; optional differential privacy (Tier 3).
  """
 
+ import hashlib
  import json
  from pathlib import Path
  from typing import Optional, List, Dict, Any
@@ -21,30 +22,100 @@ def get_federated_config(repo_root: Path) -> Optional[Dict[str, Any]]:
  url = fed.get("coordinator_url")
  if not url:
  return None
- return {
+ out = {
  "coordinator_url": url.rstrip("/"),
  "memory_types": fed.get("memory_types", ["episodic", "semantic"]),
  }
+ dp = fed.get("differential_privacy") or config.get("differential_privacy") or {}
+ if dp.get("enabled"):
+ out["use_dp"] = True
+ out["dp_epsilon"] = float(dp.get("epsilon", 0.1))
+ out["dp_delta"] = float(dp.get("delta", 1e-5))
+ else:
+ out["use_dp"] = False
+ return out
 
 
- def produce_local_summary(repo_root: Path, memory_types: List[str]) -> Dict[str, Any]:
+ def _normalize_for_hash(text: str) -> str:
+ """Normalize text for hashing (no raw content sent)."""
+ return " ".join(text.strip().split())
+
+
+ def _extract_topic_from_md(path: Path, content: str) -> str:
+ """Extract topic from frontmatter tags or first heading."""
+ if content.startswith("---"):
+ end = content.find("---", 3)
+ if end > 0:
+ try:
+ import yaml
+
+ fm = yaml.safe_load(content[3:end])
+ if isinstance(fm, dict):
+ tags = fm.get("tags", [])
+ if tags:
+ return str(tags[0])[:50]
+ except (ImportError, Exception):
+ pass
+ first_line = content.strip().split("\n")[0] if content.strip() else ""
+ if first_line.startswith("#"):
+ return first_line.lstrip("#").strip()[:50] or "untitled"
+ return "untitled"
+
+
+ def produce_local_summary(
+ repo_root: Path,
+ memory_types: List[str],
+ use_dp: bool = False,
+ dp_epsilon: float = 0.1,
+ dp_delta: float = 1e-5,
+ ) -> Dict[str, Any]:
  """
  Produce a local summary from episodic/semantic data (no raw content).
- Returns dict suitable for sending to coordinator (e.g. topic counts, fact hashes).
+ Returns dict with topic counts and fact hashes suitable for coordinator.
  """
  current_dir = repo_root / "current"
- summary = {"memory_types": memory_types, "topics": {}, "fact_count": 0}
+ summary = {"memory_types": memory_types, "topics": {}, "topic_hashes": {}, "fact_count": 0}
+ all_fact_hashes: List[str] = []
+
  for mtype in memory_types:
  d = current_dir / mtype
  if not d.exists():
+ summary["topics"][mtype] = 0
+ summary["topic_hashes"][mtype] = []
  continue
- count = 0
+ topic_to_count: Dict[str, int] = {}
+ topic_to_hashes: Dict[str, List[str]] = {}
  for f in d.rglob("*.md"):
- if f.is_file():
- count += 1
- summary["topics"][mtype] = count
+ if not f.is_file():
+ continue
+ try:
+ content = f.read_text(encoding="utf-8", errors="replace")
+ except Exception:
+ continue
+ normalized = _normalize_for_hash(content)
+ if normalized:
+ h = hashlib.sha256(normalized.encode()).hexdigest()
+ all_fact_hashes.append(h)
+ topic = _extract_topic_from_md(f, content)
+ topic_to_count[topic] = topic_to_count.get(topic, 0) + 1
+ topic_to_hashes.setdefault(topic, []).append(h)
+ summary["topics"][mtype] = sum(topic_to_count.values())
+ summary["topic_hashes"][mtype] = list(topic_to_hashes.keys())
  if mtype == "semantic":
- summary["fact_count"] = count
+ summary["fact_count"] = len(all_fact_hashes)
+
+ if use_dp and dp_epsilon and dp_delta:
+ from .privacy_budget import add_noise
+
+ for mtype in summary["topics"]:
+ raw = summary["topics"][mtype]
+ summary["topics"][mtype] = max(
+ 0, int(round(add_noise(float(raw), 1.0, dp_epsilon, dp_delta)))
+ )
+ summary["fact_count"] = max(
+ 0, int(round(add_noise(float(summary["fact_count"]), 1.0, dp_epsilon, dp_delta)))
+ )
+
  return summary
 
 
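For reference, the dict produced by the rewritten produce_local_summary has the shape sketched below; the structure follows directly from the hunk above, the values are illustrative only:

    from pathlib import Path
    from memvcs.core.federated import produce_local_summary  # module path assumed from the file header above

    summary = produce_local_summary(Path("repo"), ["episodic", "semantic"], use_dp=True)
    # {
    #   "memory_types": ["episodic", "semantic"],
    #   "topics": {"episodic": 4, "semantic": 7},               # per-type .md counts (noised when use_dp)
    #   "topic_hashes": {"episodic": [...], "semantic": [...]}, # topic labels seen per type
    #   "fact_count": 11,                                       # normalized-content SHA-256 hashes counted (noised when use_dp)
    # }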
memvcs/core/gardener.py CHANGED
@@ -43,6 +43,9 @@ class GardenerConfig:
  llm_provider: Optional[str] = None # "openai", "anthropic", etc.
  llm_model: Optional[str] = None
  auto_commit: bool = True
+ use_dp: bool = False
+ dp_epsilon: Optional[float] = None
+ dp_delta: Optional[float] = None
 
 
  @dataclass
@@ -351,14 +354,35 @@ class Gardener:
  except ValueError:
  insight_path = self.semantic_dir / f"insight-{timestamp}.md"
 
- # Generate frontmatter
+ # Generate frontmatter (optionally noised for differential privacy)
+ source_episodes = len(cluster.episodes)
+ if (
+ self.config.use_dp
+ and self.config.dp_epsilon is not None
+ and self.config.dp_delta is not None
+ ):
+ from .privacy_budget import add_noise
+
+ source_episodes = max(
+ 0,
+ int(
+ round(
+ add_noise(
+ float(source_episodes),
+ 1.0,
+ self.config.dp_epsilon,
+ self.config.dp_delta,
+ )
+ )
+ ),
+ )
  frontmatter = {
  "schema_version": "1.0",
  "last_updated": datetime.utcnow().isoformat() + "Z",
  "source_agent_id": "gardener",
  "memory_type": "semantic",
  "tags": cluster.tags + ["auto-generated", "insight"],
- "source_episodes": len(cluster.episodes),
+ "source_episodes": source_episodes,
  }
 
  # Write file
@@ -487,11 +511,62 @@ class Gardener:
  except Exception as e:
  print(f"Warning: Auto-commit failed: {e}")
 
+ clusters_found = len(clusters)
+ insights_generated = insights_written
+ episodes_archived = archived_count
+ if (
+ self.config.use_dp
+ and self.config.dp_epsilon is not None
+ and self.config.dp_delta is not None
+ ):
+ from .privacy_budget import add_noise
+
+ sensitivity = 1.0
+ clusters_found = max(
+ 0,
+ int(
+ round(
+ add_noise(
+ float(clusters_found),
+ sensitivity,
+ self.config.dp_epsilon,
+ self.config.dp_delta,
+ )
+ )
+ ),
+ )
+ insights_generated = max(
+ 0,
+ int(
+ round(
+ add_noise(
+ float(insights_generated),
+ sensitivity,
+ self.config.dp_epsilon,
+ self.config.dp_delta,
+ )
+ )
+ ),
+ )
+ episodes_archived = max(
+ 0,
+ int(
+ round(
+ add_noise(
+ float(episodes_archived),
+ sensitivity,
+ self.config.dp_epsilon,
+ self.config.dp_delta,
+ )
+ )
+ ),
+ )
+
  return GardenerResult(
  success=True,
- clusters_found=len(clusters),
- insights_generated=insights_written,
- episodes_archived=archived_count,
+ clusters_found=clusters_found,
+ insights_generated=insights_generated,
+ episodes_archived=episodes_archived,
  commit_hash=commit_hash,
  message=f"Processed {len(clusters)} clusters, generated {insights_written} insights",
  )
@@ -1,25 +1,161 @@
  """
- IPFS remote for agmem (stub).
+ IPFS remote for agmem.
 
- Push/pull via CIDs; pinning; gateway fallback when daemon unavailable.
- Requires optional ipfs extra (ipfshttpclient or gateway requests).
+ Push/pull via CIDs using HTTP gateway (POST /api/v0/add, GET /ipfs/<cid>).
+ Optional ipfshttpclient for local daemon.
  """
 
+ import json
+ import struct
+ import zlib
  from pathlib import Path
- from typing import Optional, Set
+ from typing import Optional, Set, Dict, Tuple
 
  from .objects import ObjectStore
  from .remote import _collect_objects_from_commit
 
+ # Type byte for bundle (same as pack)
+ _TYPE_BLOB = 1
+ _TYPE_TREE = 2
+ _TYPE_COMMIT = 3
+ _TYPE_TAG = 4
+ _TYPE_TO_BYTE = {"blob": _TYPE_BLOB, "tree": _TYPE_TREE, "commit": _TYPE_COMMIT, "tag": _TYPE_TAG}
+ _BYTE_TO_TYPE = {v: k for k, v in _TYPE_TO_BYTE.items()}
+
+
+ def _get_object_type_and_content(store: ObjectStore, hash_id: str) -> Optional[Tuple[str, bytes]]:
+ """Return (obj_type, raw_content) for a hash, or None."""
+ for obj_type in ["commit", "tree", "blob", "tag"]:
+ content = store.retrieve(hash_id, obj_type)
+ if content is not None:
+ return (obj_type, content)
+ return None
+
+
+ def _bundle_objects(store: ObjectStore, hash_ids: Set[str]) -> bytes:
+ """Bundle objects into a single byte blob: count + [hash(32) type(1) len(4) zlib_payload]."""
+ entries = []
+ for h in sorted(hash_ids):
+ pair = _get_object_type_and_content(store, h)
+ if pair is None:
+ continue
+ obj_type, content = pair
+ header = f"{obj_type} {len(content)}\0".encode()
+ full = header + content
+ compressed = zlib.compress(full)
+ h_bin = bytes.fromhex(h) if len(h) == 64 else h.encode().ljust(32)[:32]
+ entries.append((h_bin, _TYPE_TO_BYTE.get(obj_type, _TYPE_BLOB), compressed))
+ parts = [struct.pack(">I", len(entries))]
+ for h_bin, type_byte, compressed in entries:
+ parts.append(h_bin)
+ parts.append(bytes([type_byte]))
+ parts.append(struct.pack(">I", len(compressed)))
+ parts.append(compressed)
+ return b"".join(parts)
+
+
+ def _unbundle_objects(data: bytes, objects_dir: Path) -> int:
+ """Unbundle and write loose objects. Returns count written."""
+ if len(data) < 4:
+ return 0
+ count = struct.unpack(">I", data[:4])[0]
+ offset = 4
+ written = 0
+ for _ in range(count):
+ if offset + 32 + 1 + 4 > len(data):
+ break
+ h_bin = data[offset : offset + 32]
+ offset += 32
+ type_byte = data[offset]
+ offset += 1
+ comp_len = struct.unpack(">I", data[offset : offset + 4])[0]
+ offset += 4
+ if offset + comp_len > len(data):
+ break
+ compressed = data[offset : offset + comp_len]
+ offset += comp_len
+ obj_type = _BYTE_TO_TYPE.get(type_byte)
+ if obj_type is None:
+ continue
+ try:
+ full = zlib.decompress(compressed)
+ except Exception:
+ continue
+ null_idx = full.index(b"\0")
+ # Validate header
+ prefix = full[:null_idx].decode()
+ if " " not in prefix:
+ continue
+ name, size_str = prefix.split(" ", 1)
+ hash_hex = h_bin.hex() if len(h_bin) == 32 else h_bin.decode().strip()
+ if len(hash_hex) < 4:
+ continue
+ obj_path = objects_dir / obj_type / hash_hex[:2] / hash_hex[2:]
+ obj_path.parent.mkdir(parents=True, exist_ok=True)
+ obj_path.write_bytes(compressed)
+ written += 1
+ return written
+
+
+ def _add_to_ipfs_gateway(bundle: bytes, gateway_url: str) -> Optional[str]:
+ """POST bundle to IPFS gateway /api/v0/add (multipart). Returns CID or None."""
+ boundary = "----agmem-boundary-" + str(abs(hash(bundle)))[:12]
+ body = (
+ b"--" + boundary.encode() + b"\r\n"
+ b'Content-Disposition: form-data; name="file"; filename="agmem-bundle.bin"\r\n'
+ b"Content-Type: application/octet-stream\r\n\r\n" + bundle + b"\r\n"
+ b"--" + boundary.encode() + b"--\r\n"
+ )
+ try:
+ import urllib.request
+
+ url = gateway_url.rstrip("/") + "/api/v0/add"
+ req = urllib.request.Request(url, data=body, method="POST")
+ req.add_header("Content-Type", "multipart/form-data; boundary=" + boundary)
+ req.add_header("Content-Length", str(len(body)))
+ with urllib.request.urlopen(req, timeout=120) as resp:
+ if resp.status != 200:
+ return None
+ data = json.loads(resp.read().decode())
+ return data.get("Hash") or data.get("Name")
+ except Exception:
+ try:
+ import requests
+
+ url = gateway_url.rstrip("/") + "/api/v0/add"
+ r = requests.post(
+ url,
+ files={"file": ("agmem-bundle.bin", bundle, "application/octet-stream")},
+ timeout=120,
+ )
+ if r.status_code != 200:
+ return None
+ return r.json().get("Hash") or r.json().get("Name")
+ except Exception:
+ return None
+
 
  def push_to_ipfs(
  objects_dir: Path,
  branch: str,
  commit_hash: str,
  gateway_url: str = "https://ipfs.io",
+ store: Optional[ObjectStore] = None,
  ) -> Optional[str]:
- """Push branch objects to IPFS and return root CID. Stub: returns None until IPFS client added."""
- return None
+ """
+ Push branch objects to IPFS and return root CID.
+ Uses gateway POST /api/v0/add (multipart).
+ """
+ if store is None:
+ store = ObjectStore(objects_dir)
+ try:
+ reachable = _collect_objects_from_commit(store, commit_hash)
+ except Exception:
+ return None
+ if not reachable:
+ return None
+ bundle = _bundle_objects(store, reachable)
+ return _add_to_ipfs_gateway(bundle, gateway_url)
 
 
  def pull_from_ipfs(
@@ -27,8 +163,32 @@ def pull_from_ipfs(
  cid: str,
  gateway_url: str = "https://ipfs.io",
  ) -> bool:
- """Pull objects by CID from IPFS into objects_dir. Stub: returns False until IPFS client added."""
- return False
+ """
+ Pull objects by CID from IPFS into objects_dir (loose objects).
+ Uses GET gateway_url/ipfs/<cid>.
+ """
+ try:
+ import urllib.request
+
+ url = gateway_url.rstrip("/") + "/ipfs/" + cid
+ req = urllib.request.Request(url, method="GET")
+ with urllib.request.urlopen(req, timeout=60) as resp:
+ if resp.status != 200:
+ return False
+ data = resp.read()
+ except Exception:
+ try:
+ import requests
+
+ url = gateway_url.rstrip("/") + "/ipfs/" + cid
+ r = requests.get(url, timeout=60)
+ if r.status_code != 200:
+ return False
+ data = r.content
+ except Exception:
+ return False
+ written = _unbundle_objects(data, objects_dir)
+ return written > 0
 
 
  def parse_ipfs_url(url: str) -> Optional[str]:
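The bundle pushed to the gateway is a simple length-prefixed concatenation: a big-endian uint32 entry count, then per object a 32-byte hash, one type byte (1=blob, 2=tree, 3=commit, 4=tag), a big-endian uint32 payload length, and the zlib-compressed "<type> <size>\0<content>" record. An illustrative reader for that layout, derived from _bundle_objects/_unbundle_objects above and not part of the package:

    import struct
    import zlib

    def iter_bundle(data: bytes):
        count = struct.unpack(">I", data[:4])[0]                  # big-endian entry count
        offset = 4
        for _ in range(count):
            obj_hash = data[offset:offset + 32].hex()             # 32-byte object hash
            type_byte = data[offset + 32]                         # 1=blob, 2=tree, 3=commit, 4=tag
            comp_len = struct.unpack(">I", data[offset + 33:offset + 37])[0]
            payload = data[offset + 37:offset + 37 + comp_len]
            offset += 37 + comp_len
            header, _, content = zlib.decompress(payload).partition(b"\0")
            yield obj_hash, type_byte, header, content            # header is b"<type> <size>"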
@@ -84,7 +84,14 @@ class KnowledgeGraphBuilder:
  1. Wikilinks: [[filename]] references
  2. Semantic similarity: Using embeddings
  3. Shared tags: Files with common tags
- 4. Co-occurrence: Facts in same episodic session (optional)
+ 4. Co-occurrence: Files that mention the same entity (e.g. same section/session)
+ 5. Causal: Phrases like "caused by", "because of" linking concepts (when derivable)
+ 6. Entity: Person/place/thing links (simple keyword or pattern)
+
+ Incremental updates: To update when new files are added without full rebuild,
+ filter the file list to new/changed paths, run build_graph logic for that subset,
+ and merge new nodes/edges into the existing graph (or re-run build_graph; cost is
+ linear in file count).
  """
 
  # Pattern for wikilinks: [[target]] or [[target|display text]]
@@ -262,7 +269,22 @@ class KnowledgeGraphBuilder:
  except Exception:
  pass # Skip similarity if vector store fails
 
+ # Add co-occurrence edges (files sharing entities)
+ try:
+ edges.extend(self._build_cooccurrence_edges(file_paths, file_contents))
+ except Exception:
+ pass
+
+ # Add causal edges (phrases like "caused by", "because of" linking to another file)
+ try:
+ edges.extend(self._build_causal_edges(file_contents))
+ except Exception:
+ pass
+
  # Build metadata
+ edge_type_counts = defaultdict(int)
+ for e in edges:
+ edge_type_counts[e.edge_type] += 1
  metadata = {
  "total_nodes": len(nodes),
  "total_edges": len(edges),
@@ -274,15 +296,66 @@ class KnowledgeGraphBuilder:
  1 for n in nodes if n.memory_type not in ["episodic", "semantic", "procedural"]
  ),
  },
- "edge_types": {
- "reference": sum(1 for e in edges if e.edge_type == "reference"),
- "similarity": sum(1 for e in edges if e.edge_type == "similarity"),
- "same_topic": sum(1 for e in edges if e.edge_type == "same_topic"),
- },
+ "edge_types": dict(edge_type_counts),
  }
 
  return KnowledgeGraphData(nodes=nodes, edges=edges, metadata=metadata)
 
+ def _extract_entities_simple(self, content: str) -> Set[str]:
+ """Extract simple entity tokens (capitalized words, key phrases) for co-occurrence."""
+ entities = set()
+ for word in re.findall(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", content):
+ if len(word) > 2:
+ entities.add(word.lower())
+ for phrase in ["user", "project", "agent", "memory", "preference", "workflow"]:
+ if phrase in content.lower():
+ entities.add(phrase)
+ return entities
+
+ def _build_cooccurrence_edges(
+ self, file_paths: List[str], file_contents: Dict[str, str]
+ ) -> List[GraphEdge]:
+ """Build edges between files that share at least one entity (co-occurrence)."""
+ file_entities: Dict[str, Set[str]] = {}
+ for path, content in file_contents.items():
+ file_entities[path] = self._extract_entities_simple(content)
+ edges = []
+ paths_list = list(file_paths)
+ for i, path1 in enumerate(paths_list):
+ for path2 in paths_list[i + 1 :]:
+ common = file_entities.get(path1, set()) & file_entities.get(path2, set())
+ if common:
+ w = min(1.0, 0.3 + 0.1 * len(common))
+ edge = GraphEdge(
+ source=path1, target=path2, edge_type="co_occurrence", weight=w
+ )
+ edges.append(edge)
+ if self._graph is not None:
+ self._graph.add_edge(path1, path2, type="co_occurrence", weight=w)
+ return edges
+
+ def _build_causal_edges(self, file_contents: Dict[str, str]) -> List[GraphEdge]:
+ """Build edges when content has causal phrases linking to another file (e.g. caused by [[X]])."""
+ causal_phrases = re.compile(
+ r"(?:caused by|because of|led to|due to)\s+(?:\[\[([^\]]+)\]\]|(\w+))",
+ re.IGNORECASE,
+ )
+ edges = []
+ for source_path, content in file_contents.items():
+ for m in causal_phrases.finditer(content):
+ target = m.group(1) or m.group(2)
+ if not target:
+ continue
+ target_path = self._normalize_link_target(target.strip(), source_path)
+ if target_path and target_path in file_contents and target_path != source_path:
+ edge = GraphEdge(
+ source=source_path, target=target_path, edge_type="causal", weight=0.7
+ )
+ edges.append(edge)
+ if self._graph is not None:
+ self._graph.add_edge(source_path, target_path, type="causal", weight=0.7)
+ return edges
+
  def _build_similarity_edges(
  self, file_paths: List[str], file_contents: Dict[str, str], threshold: float
  ) -> List[GraphEdge]:
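A quick standalone illustration of the causal-phrase pattern used by _build_causal_edges above; the sample text is invented, and the method itself additionally resolves each match to a known file via _normalize_link_target before emitting an edge:

    import re

    causal_phrases = re.compile(
        r"(?:caused by|because of|led to|due to)\s+(?:\[\[([^\]]+)\]\]|(\w+))",
        re.IGNORECASE,
    )
    text = "The outage was caused by [[network-migration]] and led to downtime."
    for m in causal_phrases.finditer(text):
        print(m.group(1) or m.group(2))  # prints "network-migration", then "downtime"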