continuityos 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ """ContinuityOS — durable, hybrid memory + continuity layer for AI agents and humans.
2
+
3
+ Memory: structural (namespaces + keyword FTS) + semantic (vector cosine) recall.
4
+ Continuity: canon, frontiers, open loops, checkpoints, anti-drift doctor, handoff.
5
+ Local-first. No data leaves the machine.
6
+ """
7
+ from .memory import Memory, MemoryItem
8
+ from .continuity import Continuity
9
+ from .agents import Council, Actor
10
+ from .twin import Twin
11
+ from .control import ControlPlane
12
+ from . import fork
13
# Names re-exported as the package's public API.
__all__ = [
    "Memory",
    "MemoryItem",
    "Continuity",
    "Council",
    "Actor",
    "Twin",
    "ControlPlane",
    "fork",
]

# Release version of this package.
__version__ = "0.7.0"
continuityos/agents.py ADDED
@@ -0,0 +1,76 @@
1
+ """Multi-agent council — many agents (and the human) on one ContinuityOS.
2
+
3
+ Generalized from the Continuity OS AGENT_COUNCIL / INTERNAL_AGENTS canon:
4
+ every actor has an authority level (1..5); namespaces have a minimum write
5
+ level; the human operator is Sovereign (5). Internal roles are stable
6
+ attention functions (Archivist / Builder / Critic / Steward), not personas.
7
+ Every memory written through the council is tagged with its author + authority,
8
+ so a swarm can share one memory without overwriting trust.
9
+ """
10
+ from __future__ import annotations
11
+ from dataclasses import dataclass
12
+ from typing import Dict, List, Optional
13
+ from .memory import Memory
14
+
15
# Authority levels, from highest (5) to lowest (1).
SOVEREIGN = 5
STEWARD = 4
BUILDER = 3
CRITIC = 2
READER = 1

# Display name for each authority level.
LEVEL_NAME = {
    SOVEREIGN: "sovereign",
    STEWARD: "steward",
    BUILDER: "builder",
    CRITIC: "critic",
    READER: "reader",
}

# Internal roles = stable attention functions (from INTERNAL_AGENTS canon).
ROLES = {
    "archivist": "Read everything; extract the delta without drowning in raw archive.",
    "builder": "Make the change; turn intent into concrete artifacts.",
    "critic": "Challenge; surface risks, gaps, and false trails before they ship.",
    "steward": "Maintain integrity; enforce canon, anti-drift, and closure.",
}

# Minimum authority level required to WRITE into a namespace.
# Namespaces not listed here fall back to the "default" entry.
NAMESPACE_MIN_WRITE = {
    "canon": SOVEREIGN,  # only the human changes non-negotiable truths
    "frontier": STEWARD,
    "checkpoint": BUILDER,
    "loop": BUILDER,
    "rules": STEWARD,
    "default": CRITIC,  # most namespaces: critic+ may write
}
36
+
37
@dataclass
class Actor:
    """A participant registered with a Council."""

    # Identifier the actor is registered and looked up under.
    name: str
    # Numeric authority level; defaults to the builder level.
    authority: int = BUILDER
    # Free-form role label; defaults to "builder".
    role: str = "builder"
42
+
43
class Council:
    """Front-end over one Memory that gates writes by actor authority.

    Actors are registered by name. `remember` checks the target namespace's
    minimum write level and stamps author, role and authority onto the item
    before handing it to the underlying memory.
    """

    def __init__(self, memory: "Optional[Memory]" = None, db: str = "continuityos.db"):
        # A supplied (truthy) memory wins; otherwise a Memory is opened on `db`.
        self.m = memory or Memory(db)
        # Registered actors keyed by name.
        self.actors: Dict[str, Actor] = {}

    def register(self, name: str, authority: int = BUILDER, role: str = "builder") -> Actor:
        """Create an actor, store it under `name` (replacing any earlier entry) and return it."""
        member = Actor(name=name, authority=int(authority), role=role)
        self.actors[name] = member
        return member

    def can_write(self, actor: str, namespace: str) -> bool:
        """Return True when `actor` is registered and meets the namespace's minimum level."""
        member = self.actors.get(actor)
        if not member:
            return False
        required = NAMESPACE_MIN_WRITE.get(namespace, NAMESPACE_MIN_WRITE["default"])
        return member.authority >= required

    def remember(self, actor: str, text: str, namespace: str = "notes",
                 tags: Optional[List[str]] = None) -> int:
        """Store `text` on behalf of `actor` and return the memory's result.

        Raises:
            PermissionError: if the actor is unknown or lacks write authority
                for `namespace`.
        """
        member = self.actors.get(actor)
        if not member:
            raise PermissionError(f"unknown actor '{actor}'")
        if not self.can_write(actor, namespace):
            required = NAMESPACE_MIN_WRITE.get(namespace, NAMESPACE_MIN_WRITE["default"])
            raise PermissionError(
                f"{actor} (L{member.authority}/{LEVEL_NAME.get(member.authority)}) cannot write "
                f"[{namespace}] — needs L{required}/{LEVEL_NAME.get(required)}")
        # Provenance tags are appended to a fresh list, so the caller's list is untouched.
        provenance = [f"by:{actor}", f"role:{member.role}", f"auth:{member.authority}"]
        all_tags = (tags or []) + provenance
        details = {"author": actor, "authority": member.authority, "role": member.role}
        return self.m.remember(text, namespace=namespace, tags=all_tags, meta=details)

    def roster(self) -> List[Dict]:
        """Return one summary dict per registered actor, in registration order."""
        listing = []
        for member in self.actors.values():
            listing.append({
                "name": member.name,
                "authority": member.authority,
                "level": LEVEL_NAME.get(member.authority),
                "role": member.role,
            })
        return listing
continuityos/api.py ADDED
@@ -0,0 +1,36 @@
1
+ """Tiny stdlib HTTP API (no FastAPI dependency) so `pip install continuityos` stays light.
2
+ POST /remember {text,namespace?,tags?} GET /recall?q=..&k=.. GET /namespaces
3
+ """
4
+ from __future__ import annotations
5
+ import json
6
+ from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
7
+ from urllib.parse import urlparse, parse_qs
8
+ from .memory import Memory
9
+
10
def run(db: str, host: str = "127.0.0.1", port: int = 8077):
    """Serve a Memory store over a minimal JSON HTTP API; blocks forever.

    Routes:
        GET  /recall?q=..&k=..&namespace=..        -> {"hits": [...]}
        GET  /namespaces                           -> {"namespaces": ..., "count": ...}
        GET  / or /health                          -> {"ok": true, ...}
        POST /remember {text, namespace?, tags?}   -> {"id": ...}

    Malformed requests (non-integer `k`, bad Content-Length, invalid JSON,
    non-object body, missing `text`) are answered with a JSON 400 instead of
    raising inside the handler and dropping the connection.

    Args:
        db: path handed to `Memory`.
        host: interface to bind.
        port: TCP port to bind.
    """
    mem = Memory(db)

    class H(BaseHTTPRequestHandler):
        def _j(self, code, obj):
            # Serialize `obj` and send it as a complete JSON response.
            payload = json.dumps(obj, ensure_ascii=False).encode()
            self.send_response(code)
            self.send_header("Content-Type", "application/json; charset=utf-8")
            self.send_header("Content-Length", str(len(payload)))
            self.end_headers()
            self.wfile.write(payload)

        def log_message(self, *a):
            # Suppress the base class's per-request logging.
            pass

        def do_GET(self):
            u = urlparse(self.path)
            qs = parse_qs(u.query)
            if u.path == "/recall":
                q = (qs.get("q") or [""])[0]
                try:
                    k = int((qs.get("k") or ["5"])[0])
                except ValueError:
                    return self._j(400, {"error": "k must be an integer"})
                ns = (qs.get("namespace") or [None])[0]
                return self._j(200, {"hits": [h.to_dict() for h in mem.recall(q, k=k, namespace=ns)]})
            if u.path == "/namespaces":
                return self._j(200, {"namespaces": mem.namespaces(), "count": mem.count()})
            if u.path in ("/", "/health"):
                return self._j(200, {"ok": True, "product": "ContinuityOS", "count": mem.count()})
            self._j(404, {"error": "not found"})

        def do_POST(self):
            try:
                n = int(self.headers.get("Content-Length", 0))
            except ValueError:
                return self._j(400, {"error": "invalid Content-Length"})
            if n < 0:
                # A negative size would make read() wait for end-of-stream.
                return self._j(400, {"error": "invalid Content-Length"})
            try:
                body = json.loads(self.rfile.read(n) or b"{}")
            except ValueError:
                # JSONDecodeError and UnicodeDecodeError are both ValueErrors.
                return self._j(400, {"error": "body must be valid JSON"})
            # Match on the path component only, so a query string does not cause a 404.
            if urlparse(self.path).path == "/remember":
                if not isinstance(body, dict) or "text" not in body:
                    return self._j(400, {"error": "JSON object with 'text' is required"})
                rid = mem.remember(body["text"], namespace=body.get("namespace", "notes"),
                                   tags=body.get("tags"))
                return self._j(200, {"id": rid})
            self._j(404, {"error": "not found"})

    print(f"ContinuityOS API on http://{host}:{port}")
    ThreadingHTTPServer((host, port), H).serve_forever()
continuityos/cli.py ADDED
@@ -0,0 +1,102 @@
1
+ """ContinuityOS CLI.
2
+ Memory: cos remember | recall | namespaces
3
+ Continuity: cos canon | frontier | loop | checkpoint | doctor | handoff
4
+ Twin: cos predict | alignment
5
+ Serve: cos serve (MCP stdio) | cos api (HTTP)
6
+ """
7
+ from __future__ import annotations
8
+ import argparse, os, json, sys
9
+ from .memory import Memory
10
+ from .continuity import Continuity
11
+ from .twin import Twin
12
+
13
+ def _db(a): return a.db or os.path.expanduser("~/.continuityos/memory.db")
14
+
15
def main(argv=None):
    """Parse `cos` command-line arguments and run the selected sub-command.

    Args:
        argv: argument list to parse; None lets argparse read sys.argv.

    Returns:
        The result of the `serve` / `api` entry points for those commands,
        otherwise None. Every other command prints its result to stdout.
    """
    parser = argparse.ArgumentParser(prog="cos", description="ContinuityOS — durable memory + continuity for agents & humans")
    parser.add_argument("--db", default=None)
    sub = parser.add_subparsers(dest="cmd", required=True)

    p_remember = sub.add_parser("remember")
    p_remember.add_argument("text")
    p_remember.add_argument("-n", "--namespace", default="notes")
    p_remember.add_argument("-t", "--tags", default="")

    p_recall = sub.add_parser("recall")
    p_recall.add_argument("query")
    p_recall.add_argument("-k", type=int, default=5)
    p_recall.add_argument("-n", "--namespace", default=None)

    sub.add_parser("namespaces")

    p_canon = sub.add_parser("canon")
    p_canon.add_argument("text", nargs="?")

    p_frontier = sub.add_parser("frontier")
    p_frontier.add_argument("kind", nargs="?", choices=["trunk", "cash", "lab", "parked"])
    p_frontier.add_argument("item", nargs="?")

    p_loop = sub.add_parser("loop")
    p_loop.add_argument("text", nargs="?")
    p_loop.add_argument("--close", type=int, default=None)

    p_checkpoint = sub.add_parser("checkpoint")
    p_checkpoint.add_argument("--summary", required=True)
    p_checkpoint.add_argument("--next", required=True, dest="nxt")
    p_checkpoint.add_argument("--proof", default="")

    sub.add_parser("doctor")
    sub.add_parser("handoff")
    sub.add_parser("boot")

    p_close = sub.add_parser("close")
    p_close.add_argument("--summary", required=True)
    p_close.add_argument("--next", required=True, dest="nxt")
    p_close.add_argument("--proof", default="")

    sub.add_parser("compress")
    sub.add_parser("serve")

    p_api = sub.add_parser("api")
    p_api.add_argument("--host", default="127.0.0.1")
    p_api.add_argument("--port", type=int, default=8077)

    p_predict = sub.add_parser("predict", help="Digital-twin: likely stance on a situation, grounded in recorded rules and precedent")
    p_predict.add_argument("situation")

    p_align = sub.add_parser("alignment", help="Check a proposed action against canon/rules; flags conflicts with non-negotiable rules")
    p_align.add_argument("action")

    args = parser.parse_args(argv)

    # Server commands are dispatched before any Memory is opened here.
    if args.cmd == "serve":
        from . import mcp_server
        # The MCP entry point is handed its arguments through sys.argv.
        sys.argv = ["mcp", "--db", _db(args)]
        return mcp_server.main()
    if args.cmd == "api":
        from . import api
        return api.run(_db(args), args.host, args.port)

    db = _db(args)
    try:
        # Best effort: use the fastembed-backed embedder when it can be built,
        from .embedders import FastEmbedEmbedder
        mem = Memory(db, embedder=FastEmbedEmbedder())
    except Exception:
        # ...and fall back to Memory's default embedder on any failure.
        mem = Memory(db)
    cont = Continuity(memory=mem)
    twin = Twin(memory=mem)

    if args.cmd == "remember":
        tags = [piece.strip() for piece in args.tags.split(",") if piece.strip()]
        stored_id = mem.remember(args.text, namespace=args.namespace, tags=tags)
        print("stored #%d in [%s]" % (stored_id, args.namespace))
    elif args.cmd == "recall":
        for hit in mem.recall(args.query, k=args.k, namespace=args.namespace):
            print("%.3f [%s] %s (%s)" % (hit.score, hit.namespace, hit.text, hit.why))
    elif args.cmd == "namespaces":
        print(json.dumps(mem.namespaces(), ensure_ascii=False, indent=2))
    elif args.cmd == "canon":
        if args.text:
            print("canon #%d" % cont.add_canon(args.text))
        else:
            for row in cont._dump("canon"):
                print("- " + row["text"])
    elif args.cmd == "frontier":
        if args.kind and args.item:
            print("set %s -> %s (#%d)" % (args.kind, args.item, cont.set_frontier(args.kind, args.item)))
        else:
            print(json.dumps(cont.frontiers(), ensure_ascii=False, indent=2))
    elif args.cmd == "loop":
        if args.close is not None:
            cont.close_loop(args.close)
            print("closed loop #%d" % args.close)
        elif args.text:
            print("loop #%d opened" % cont.add_loop(args.text))
        else:
            for loop in cont.open_loops():
                print("[#%d] %s" % (loop["id"], loop["text"]))
    elif args.cmd == "checkpoint":
        print("checkpoint #%d" % cont.checkpoint(summary=args.summary, next_action=args.nxt, proof=args.proof))
    elif args.cmd == "doctor":
        report = cont.doctor()
        print("%s %d/%d" % ("✅ healthy" if report["healthy"] else "⚠ drift", report["passed"], report["total"]))
        for check in report["checks"]:
            print(" %s %s — %s" % ("✓" if check["ok"] else "✗", check["check"], check["detail"]))
    elif args.cmd == "handoff":
        print(cont.handoff())
    elif args.cmd == "boot":
        # start of session: show handoff + doctor (the boot ritual)
        print(cont.handoff())
        print("\n--- doctor ---")
        report = cont.doctor()
        print("%s %d/%d" % ("OK" if report["healthy"] else "DRIFT", report["passed"], report["total"]))
        for check in report["checks"]:
            if not check["ok"]:
                print(" ! %s — %s" % (check["check"], check["detail"]))
    elif args.cmd == "close":
        # end of session: checkpoint + doctor (closure beats branching)
        checkpoint_id = cont.checkpoint(summary=args.summary, next_action=args.nxt, proof=args.proof)
        print("checkpoint #%d" % checkpoint_id)
        report = cont.doctor()
        print("doctor: %s %d/%d" % ("OK" if report["healthy"] else "DRIFT", report["passed"], report["total"]))
    elif args.cmd == "compress":
        # weekly compression: report counts per namespace to spot bloat
        print("namespace sizes (compress candidates):")
        for ns in mem.namespaces():
            print(" %-12s %d" % (ns["namespace"], ns["count"]))
        pending = cont.open_loops()
        print("open loops: %d (close stale ones with: cos loop --close <id>)" % len(pending))
    elif args.cmd == "predict":
        print(json.dumps(twin.predict(args.situation), ensure_ascii=False, indent=2))
    elif args.cmd == "alignment":
        print(json.dumps(twin.alignment(args.action), ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
@@ -0,0 +1,117 @@
1
+ """Continuity layer — ContinuityOS is more than a memory store.
2
+
3
+ A continuity OS keeps the *thread* between sessions (and between versions of you
4
+ and versions of the model): slow truths (canon), live state (frontiers + open
5
+ loops), session checkpoints, anti-drift checks, and handoff packs. All of it is
6
+ just structured memory, so it shares the same local store and hybrid recall.
7
+
8
+ Reserved namespaces: `canon` (non-negotiable truths/rules), `frontier`
9
+ (trunk/cash/lab focus), `loop` (open loops), `checkpoint` (session deltas).
10
+ """
11
+ from __future__ import annotations
12
+ import time, json
13
+ from typing import List, Dict, Any, Optional
14
+ from .memory import Memory
15
+
16
# Frontier kinds accepted by Continuity.set_frontier.
FRONTIER_KINDS = ("trunk", "cash", "lab", "parked")


class Continuity:
    """Continuity operations (canon, frontiers, loops, checkpoints) over one Memory.

    Everything is stored as ordinary memory items in the reserved namespaces
    `canon`, `frontier`, `loop` and `checkpoint`.
    """

    def __init__(self, memory: "Optional[Memory]" = None, db: str = "continuityos.db"):
        # A supplied (truthy) memory wins; otherwise a Memory is opened on `db`.
        self.m = memory or Memory(db)

    # ---- canon: slow truths ----
    def add_canon(self, text: str, tags: Optional[List[str]] = None) -> int:
        """Store `text` in the `canon` namespace and return the memory's result."""
        return self.m.remember(text, namespace="canon", tags=tags or [])

    def canon(self) -> List[Dict[str, Any]]:
        """Return canon items: recall hits as dicts, or a raw dump when recall is empty."""
        hits = [r.to_dict() for r in self.m.recall("", k=100, namespace="canon")]
        return hits or self._dump("canon")

    # ---- frontiers: 1 trunk + 1 cash + 1 lab discipline ----
    def set_frontier(self, kind: str, item: str) -> int:
        """Record `item` as the current frontier of `kind`.

        Raises:
            ValueError: if `kind` (case-insensitive) is not in FRONTIER_KINDS.
        """
        kind = kind.lower()
        if kind not in FRONTIER_KINDS:
            raise ValueError(f"kind must be one of {FRONTIER_KINDS}")
        # Earlier entries are kept; frontiers() picks the newest by meta["ts"].
        return self.m.remember(item, namespace="frontier", tags=[kind],
                               meta={"kind": kind, "ts": time.time()})

    def frontiers(self) -> Dict[str, Optional[str]]:
        """Return {kind: text} using the entry with the highest timestamp per kind.

        Rows with no usable meta fall back to their first tag for the kind
        (or "parked") and to timestamp 0.
        """
        latest: Dict[str, Dict[str, Any]] = {}
        for r in self._dump("frontier"):
            meta = r["meta"] or {}
            k = meta.get("kind") or (r["tags"][0] if r["tags"] else "parked")
            ts = meta.get("ts", 0)
            # Guard the stored row's meta too: it may have decoded to None.
            if k not in latest or ts >= (latest[k]["meta"] or {}).get("ts", 0):
                latest[k] = r
        return {k: row["text"] for k, row in latest.items()}

    # ---- open loops ----
    def add_loop(self, text: str, tags: Optional[List[str]] = None) -> int:
        """Store an open loop (tagged "open") and return the memory's result."""
        return self.m.remember(text, namespace="loop", tags=(tags or []) + ["open"],
                               meta={"open": True, "ts": time.time()})

    def close_loop(self, loop_id: int) -> bool:
        """Close a loop by forgetting the item with id `loop_id`."""
        return self.m.forget(loop_id)

    def open_loops(self) -> List[Dict[str, Any]]:
        """Return every item currently in the `loop` namespace."""
        return self._dump("loop")

    # ---- checkpoints: every session ends with delta + next + proof ----
    def checkpoint(self, summary: str, next_action: str, proof: str = "") -> int:
        """Store a session checkpoint; the PROOF section is omitted when `proof` is empty."""
        text = f"DELTA: {summary} | NEXT: {next_action}" + (f" | PROOF: {proof}" if proof else "")
        return self.m.remember(text, namespace="checkpoint",
                               tags=["checkpoint"], meta={"ts": time.time(),
                               "summary": summary, "next": next_action, "proof": proof})

    def last_checkpoint(self) -> Optional[Dict[str, Any]]:
        """Return the checkpoint with the highest meta["ts"], or None when there are none."""
        rows = sorted(self._dump("checkpoint"), key=lambda r: (r["meta"] or {}).get("ts", 0), reverse=True)
        return rows[0] if rows else None

    # ---- anti-drift doctor ----
    def doctor(self, max_open_loops: int = 7, checkpoint_stale_hours: float = 48) -> Dict[str, Any]:
        """Run the health checks and return {"healthy", "passed", "total", "checks"}.

        Args:
            max_open_loops: largest number of open loops that still passes.
            checkpoint_stale_hours: maximum age of the last checkpoint that still passes.
        """
        fr = self.frontiers()
        loops = self.open_loops()
        last = self.last_checkpoint()
        now = time.time()
        checks = []

        def chk(ok, name, detail):
            checks.append({"ok": bool(ok), "check": name, "detail": detail})

        chk("cash" in fr, "cash_frontier_set", fr.get("cash", "— not set"))
        chk("trunk" in fr, "trunk_set", fr.get("trunk", "— not set"))
        chk(len(loops) <= max_open_loops, "open_loops_bounded", f"{len(loops)} open (max {max_open_loops})")
        if last:
            last_meta = last["meta"] or {}
            # A checkpoint without a timestamp is treated as zero hours old.
            age_h = (now - last_meta.get("ts", now)) / 3600
            chk(age_h <= checkpoint_stale_hours, "checkpoint_fresh", f"{age_h:.1f}h old")
            chk(bool(last_meta.get("proof")), "has_proof", last_meta.get("proof") or "— no proof")
        else:
            chk(False, "checkpoint_fresh", "no checkpoint yet")
            chk(False, "has_proof", "no checkpoint yet")
        # L6 autopoiesis — self-maintenance invariants (system "alive")
        total = self.m.count()
        chk(total > 0, "memory_persists", f"{total} memories")
        chk(len(self._dump("canon")) > 0, "identity_persists", "canon present")
        chk(len(loops) > 0, "purpose_persists", f"{len(loops)} open loop(s)")
        passed = sum(1 for c in checks if c["ok"])
        return {"healthy": passed == len(checks), "passed": passed, "total": len(checks), "checks": checks}

    # ---- handoff pack: context for the next session / agent ----
    def handoff(self) -> str:
        """Render canon (first 12), frontiers, open loops (first 20) and the last checkpoint as text."""
        fr = self.frontiers()
        loops = self.open_loops()
        last = self.last_checkpoint()
        canon = self._dump("canon")
        out = ["# ContinuityOS handoff pack"]
        out.append("\n## Canon (non-negotiable)")
        out += [f"- {c['text']}" for c in canon[:12]] or ["- (none)"]
        out.append("\n## Frontiers")
        out += [f"- {k}: {v}" for k, v in fr.items()] or ["- (none)"]
        out.append("\n## Open loops")
        out += [f"- [#{l['id']}] {l['text']}" for l in loops[:20]] or ["- (none)"]
        out.append("\n## Last checkpoint")
        out.append(f"- {last['text']}" if last else "- (none)")
        return "\n".join(out)

    def _dump(self, namespace: str) -> List[Dict[str, Any]]:
        """Return every stored row of `namespace` as a dict, with tags and meta JSON-decoded."""
        rows = self.m.store.all_with_vecs(namespace=namespace)
        return [{"id": r["id"], "text": r["text"], "namespace": r["namespace"],
                 "tags": json.loads(r["tags"]), "meta": json.loads(r["meta"])} for r in rows]
@@ -0,0 +1,54 @@
1
+ """L5 Control Plane — operator actions over memory (from Twin admin-control-plane spec).
2
+
3
+ corrections, redaction (privacy), rollback to a checkpoint, and a transparency
4
+ export (what is stored about whom). Every control action is itself recorded.
5
+ """
6
+ from __future__ import annotations
7
+ import time, json
8
+ from typing import List, Dict, Any, Optional
9
+ from .memory import Memory
10
+
11
class ControlPlane:
    """Operator actions over a Memory: correct, redact, rollback and export.

    Each mutating action also writes a log entry into the `control` namespace.
    """

    def __init__(self, memory: "Optional[Memory]" = None, db: str = "continuityos.db"):
        # A supplied (truthy) memory wins; otherwise a Memory is opened on `db`.
        self.m = memory or Memory(db)

    def correct(self, item_id: int, new_text: str, namespace: str = "notes") -> int:
        """Supersede a memory: forget the old, store the corrected one, log it.

        The replacement is written to `namespace` and its id is returned.
        """
        # NOTE(review): the looked-up item is never used afterwards; the lookup
        # is kept so the sequence of store calls stays unchanged.
        old = self.m.store.get(item_id)
        self.m.forget(item_id)
        replacement_id = self.m.remember(
            new_text,
            namespace=namespace,
            tags=["corrected"],
            meta={"corrects": item_id},
        )
        self.m.remember(
            f"correction of #{item_id} -> #{replacement_id}",
            namespace="control",
            tags=["control", "correct"],
            meta={"ts": time.time()},
        )
        return replacement_id

    def redact(self, query: str, namespace: Optional[str] = None) -> int:
        """Privacy: delete memories matching a query. Returns count removed.

        Only hits among the top 100 recalled with a score above 0.2 are forgotten.
        """
        removed = 0
        for hit in self.m.recall(query, k=100, namespace=namespace):
            if not hit.score > 0.2:
                continue
            self.m.forget(hit.id)
            removed += 1
        self.m.remember(
            f"redacted {removed} memories matching '{query}'",
            namespace="control",
            tags=["control", "redact"],
            meta={"ts": time.time(), "count": removed},
        )
        return removed

    def rollback(self, checkpoint_id: int) -> Dict[str, Any]:
        """Forget loop/frontier/checkpoint/notes items created after a checkpoint.

        Returns {"ok": False, "error": ...} when the checkpoint id is unknown,
        otherwise {"ok": True, "checkpoint": id, "removed": count}.
        """
        anchor = self.m.store.get(checkpoint_id)
        if not anchor:
            return {"ok": False, "error": "checkpoint not found"}
        cutoff = anchor["created_at"]
        removed = 0
        for ns in ("loop", "frontier", "checkpoint", "notes"):
            for row in self.m.store.all_with_vecs(namespace=ns):
                if row["created_at"] > cutoff:
                    self.m.forget(row["id"])
                    removed += 1
        self.m.remember(
            f"rollback to checkpoint #{checkpoint_id}, removed {removed} newer items",
            namespace="control",
            tags=["control", "rollback"],
            meta={"ts": time.time()},
        )
        return {"ok": True, "checkpoint": checkpoint_id, "removed": removed}

    def export(self) -> Dict[str, Any]:
        """Transparency: what is stored, by namespace (consent / data-subject view)."""
        summary = {
            "namespaces": self.m.namespaces(),
            "total": self.m.count(),
            "note": "All data is local. Use redact() to remove, correct() to fix.",
        }
        return summary
continuityos/dedupe.py ADDED
@@ -0,0 +1,131 @@
1
+ """ContinuityOS memory de-duplication — MinHash + SimHash LSH (pure-python, no deps).
2
+
3
+ Applies guide_lsh_deduplication: collapses near-duplicate memories into CompactDigest
4
+ records so the long-term store doesn't bloat with restated facts. Parameters match the
5
+ guide: MinHash Jaccard >= 0.55 (n_perm=128), SimHash Hamming <= 3 (strict) / <= 4 (soft).
6
+
7
+ Usage:
8
+ from continuityos.dedupe import find_near_duplicates, compact_digest
9
+ groups = find_near_duplicates([(m.id, m.text) for m in all_memories])
10
+ for g in groups: digest = compact_digest(g)
11
+ Read-only by design: returns duplicate groups + digests; the caller decides what to drop.
12
+ """
13
+ from __future__ import annotations
14
+ import re, hashlib
15
+ from typing import List, Tuple, Dict
16
+
17
# Tokenizer: maximal runs of word characters.
_WORD = re.compile(r"\w+", re.UNICODE)

# Default number of MinHash permutations.
N_PERM = 128
# Default minimum estimated Jaccard similarity for a duplicate pair.
JACCARD_THRESH = 0.55
# SimHash Hamming-distance limits (strict / soft).
HAMMING_STRICT = 3
HAMMING_SOFT = 4

# 2**61 - 1, the modulus applied to MinHash hash values.
_MERSENNE = 2 ** 61 - 1
23
+
24
+
25
def _shingles(text: str, k: int = 3) -> set:
    """Return the set of lower-cased k-word shingles of `text`.

    Texts with fewer than `k` words yield the set of their individual words.
    """
    words = [w.lower() for w in _WORD.findall(text)]
    if len(words) < k:
        return set(words)
    windows = set()
    for start in range(len(words) - k + 1):
        windows.add(" ".join(words[start:start + k]))
    return windows
30
+
31
+
32
+ def _h(s: str, seed: int) -> int:
33
+ return int(hashlib.blake2b(s.encode("utf-8"), digest_size=8,
34
+ salt=seed.to_bytes(8, "little")).hexdigest(), 16)
35
+
36
+
37
def minhash(text: str, n_perm: int = N_PERM) -> Tuple[int, ...]:
    """MinHash signature (n_perm permutations).

    Each position holds the minimum salted hash over the text's shingles;
    a text with no shingles gets an all-zero signature.
    """
    shingle_set = _shingles(text)
    if not shingle_set:
        return tuple([0] * n_perm)
    return tuple(
        min(_h(shingle, seed) % _MERSENNE for shingle in shingle_set)
        for seed in range(n_perm)
    )
46
+
47
+
48
def jaccard(a: Tuple[int, ...], b: Tuple[int, ...]) -> float:
    """Estimate Jaccard similarity as the share of matching signature positions.

    Returns 0.0 when either signature is empty. The denominator is len(a).
    """
    if not a or not b:
        return 0.0
    matches = 0
    for left, right in zip(a, b):
        if left == right:
            matches += 1
    return matches / len(a)
52
+
53
+
54
def simhash(text: str, bits: int = 64) -> int:
    """SimHash fingerprint over the text's 2-word shingles.

    Every shingle votes +1 / -1 per bit position; the fingerprint has a 1
    wherever the vote total is positive.
    """
    tally = [0] * bits
    for gram in _shingles(text, k=2):
        digest = _h(gram, 1)
        for pos in range(bits):
            tally[pos] += 1 if (digest >> pos) & 1 else -1
    # The set bits are distinct powers of two, so summing equals OR-ing them.
    return sum(1 << pos for pos in range(bits) if tally[pos] > 0)
66
+
67
+
68
def hamming(a: int, b: int) -> int:
    """Return the number of bit positions in which `a` and `b` differ."""
    differing = a ^ b
    return format(differing, "b").count("1")
70
+
71
+
72
def find_near_duplicates(items: List[Tuple[str, str]],
                         jaccard_thresh: float = JACCARD_THRESH,
                         hamming_thresh: int = HAMMING_STRICT) -> List[List[str]]:
    """items = [(id, text), ...] -> list of duplicate groups (each a list of ids).

    Two-stage gate (guide): SimHash Hamming candidate -> MinHash Jaccard confirm.
    Pairs passing both gates are merged with union-find; only groups with
    more than one member are returned. Every pair of items is compared.
    """
    fingerprints = {item_id: simhash(text) for item_id, text in items}
    signatures = {item_id: minhash(text) for item_id, text in items}
    ids = [item_id for item_id, _ in items]
    parent: Dict[str, str] = {item_id: item_id for item_id in ids}

    def find(node):
        # Walk up to the root, halving the path along the way.
        while parent[node] != node:
            parent[node] = parent[parent[node]]
            node = parent[node]
        return node

    def union(first, second):
        root_first = find(first)
        root_second = find(second)
        if root_first != root_second:
            parent[root_second] = root_first

    for pos, a in enumerate(ids):
        for b in ids[pos + 1:]:
            if hamming(fingerprints[a], fingerprints[b]) > hamming_thresh:
                continue
            if jaccard(signatures[a], signatures[b]) >= jaccard_thresh:
                union(a, b)

    groups: Dict[str, List[str]] = {}
    for item_id in ids:
        groups.setdefault(find(item_id), []).append(item_id)
    return [members for members in groups.values() if len(members) > 1]
102
+
103
+
104
def compact_digest(group_items: List[Tuple[str, str]]) -> dict:
    """Collapse a duplicate group into one CompactDigest (guide format).

    The longest text becomes the core summary (first one wins on ties) and
    every member id is preserved for back-tracing.
    """
    canonical_id, core_text = max(group_items, key=lambda pair: len(pair[1]))
    member_ids = [item_id for item_id, _ in group_items]
    return {
        "core_summary": core_text,
        "canonical_id": canonical_id,
        "back_trace_ids": member_ids,
        "merged_count": len(group_items),
    }
114
+
115
+
116
# NOTE: LSH (MinHash/SimHash) is the NEAR-EXACT layer — it collapses the same fact
# restated with minor edits (punctuation, whitespace, a swapped word). Semantic
# paraphrases ("X causes Y" vs "Y is caused by X") are NOT near-duplicates here by
# design; those are handled by ContinuityOS vector recall. Two cheap, complementary layers.
if __name__ == "__main__":
    # Small self-demo: three restatements of one fact plus two unrelated notes.
    demo = [
        ("m1", "The arena uses GCP spot preemption which causes periodic reboots and bot flapping."),
        ("m2", "The arena uses GCP spot preemption which causes periodic reboots and bot flapping"),
        ("m3", "The arena uses GCP spot-preemption, which causes periodic reboots and bot flapping."),
        ("m4", "Grid trading is market-neutral and works best in flat regimes."),
        ("m5", "Completely unrelated note about coffee and morning routines and tea."),
    ]
    groups = find_near_duplicates(demo)
    print("duplicate groups:", groups)  # expect [['m1','m2','m3']]
    for g in groups:
        members = [(i, t) for i, t in demo if i in g]
        print("digest:", compact_digest(members))
continuityos/embed.py ADDED
@@ -0,0 +1,43 @@
1
+ """Pluggable embeddings.
2
+
3
+ Default `HashingEmbedder` is dependency-free and deterministic: char n-gram
4
+ hashing into a fixed-dim L2-normalized vector. Good enough for local semantic
5
+ recall and fully offline. For production-grade semantics, pass any callable
6
+ that maps str->list[float] (e.g. a sentence-transformers model) as `embedder`.
7
+ """
8
+ from __future__ import annotations
9
+ import math, re, hashlib
10
+ from typing import List
11
+
12
+ _TOKEN = re.compile(r"[\w]+", re.UNICODE)
13
+
14
+ def _stable_hash(s: str) -> int:
15
+ # deterministic across processes/runs (unlike builtin hash() for str)
16
+ return int.from_bytes(hashlib.blake2b(s.encode("utf-8"), digest_size=8).digest(), "big")
17
+
18
def _ngrams(text: str, n: int = 3):
    """Yield the word tokens of `text`, then character n-grams of the re-joined tokens.

    The text is lower-cased first; a falsy `text` is treated as "".
    """
    words = _TOKEN.findall((text or "").lower())
    # word-level tokens
    yield from words
    # char n-grams (morphology / typos / multilingual)
    flat = " ".join(words)
    for start in range(len(flat) - n + 1):
        yield flat[start:start + n]
25
+
26
class HashingEmbedder:
    """Deterministic offline embedder. dim defaults to 256."""

    def __init__(self, dim: int = 256):
        # Length of every vector this embedder returns.
        self.dim = dim

    def __call__(self, text: str) -> List[float]:
        """Hash each n-gram of `text` into a signed bucket and L2-normalize the result."""
        buckets = [0.0] * self.dim
        for gram in _ngrams(text):
            digest = _stable_hash(gram)
            slot = digest % self.dim
            # Bit 16 of the hash picks the sign of this feature's contribution.
            if (digest >> 16) & 1:
                buckets[slot] += 1.0
            else:
                buckets[slot] += -1.0
        # An all-zero vector keeps a divisor of 1.0 and so stays all-zero.
        length = math.sqrt(sum(b * b for b in buckets)) or 1.0
        return [b / length for b in buckets]
40
+
41
def cosine(a: List[float], b: List[float]) -> float:
    """Return the dot product of `a` and `b` over their paired positions."""
    # vectors are L2-normalized -> dot product == cosine
    total = 0
    for left, right in zip(a, b):
        total = total + left * right
    return total
@@ -0,0 +1,47 @@
1
+ """Optional production-grade embedders (pluggable into Memory(embedder=...)).
2
+
3
+ Default stays the dependency-free offline HashingEmbedder. These give real
4
+ semantic recall:
5
+
6
+ pip install "continuityos[fast]" # fastembed: ONNX, no torch, small + fast
7
+ pip install "continuityos[st]" # sentence-transformers: widest model choice
8
+
9
+ from continuityos import Memory
10
+ from continuityos.embedders import FastEmbedEmbedder
11
+ m = Memory("memory.db", embedder=FastEmbedEmbedder())
12
+
13
+ All embedders return L2-normalized vectors so cosine == dot product.
14
+ """
15
+ from __future__ import annotations
16
+ import math
17
+ from typing import List
18
+
19
+ def _l2(v: List[float]) -> List[float]:
20
+ n = math.sqrt(sum(x * x for x in v)) or 1.0
21
+ return [x / n for x in v]
22
+
23
class FastEmbedEmbedder:
    """ONNX embedder via `fastembed` (no torch). Default model bge-small-en-v1.5 (384-dim)."""

    def __init__(self, model: str = "BAAI/bge-small-en-v1.5"):
        """Load the named fastembed model.

        Raises:
            ImportError: if `fastembed` cannot be imported.
        """
        try:
            from fastembed import TextEmbedding
        except Exception as e:
            # Any import-time failure is reported as a missing optional extra.
            raise ImportError('FastEmbedEmbedder needs: pip install "continuityos[fast]"') from e
        self._model = TextEmbedding(model_name=model)

    def __call__(self, text: str) -> List[float]:
        """Embed `text` (falsy -> "") and return the L2-normalized vector as floats."""
        batch = self._model.embed([text or ""])
        # A single text was submitted, so only the first result is taken.
        first = list(next(iter(batch)))
        return _l2([float(x) for x in first])
35
+
36
class SentenceTransformerEmbedder:
    """sentence-transformers embedder. Default all-MiniLM-L6-v2 (384-dim)."""

    def __init__(self, model: str = "all-MiniLM-L6-v2"):
        """Load the named sentence-transformers model.

        Raises:
            ImportError: if `sentence_transformers` cannot be imported.
        """
        try:
            from sentence_transformers import SentenceTransformer
        except Exception as e:
            # Any import-time failure is reported as a missing optional extra.
            raise ImportError('SentenceTransformerEmbedder needs: pip install "continuityos[st]"') from e
        self._model = SentenceTransformer(model)

    def __call__(self, text: str) -> List[float]:
        """Embed `text` (falsy -> "") with normalization requested; return plain floats."""
        encoded = self._model.encode(text or "", normalize_embeddings=True)
        return [float(value) for value in encoded]