aaak-vault-sync 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dialect.py ADDED
@@ -0,0 +1,1075 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ AAAK Dialect -- Structured Symbolic Summary Format
4
+ ====================================================
5
+
6
+ A lossy summarization format that extracts entities, topics, key sentences,
7
+ emotions, and flags from plain text into a compact structured representation.
8
+ Any LLM reads it natively — no decoder required.
9
+
10
+ Works with: Claude, ChatGPT, Gemini, Llama, Mistral -- any model that reads text.
11
+
12
+ NOTE: AAAK is NOT lossless compression. The original text cannot be reconstructed
13
+ from AAAK output. It is a structured summary layer (closets) that points to the
14
+ original verbatim content (drawers). The 96.6% benchmark score is from raw mode,
15
+ not AAAK mode.
16
+
17
+ Adapted for mempalace: works standalone on plain text and ChromaDB drawers.
18
+ No dependency on palace.py or layers.py.
19
+
20
+ FORMAT:
21
+ Header: FILE_NUM|PRIMARY_ENTITY|DATE|TITLE
22
+ Zettel: ZID:ENTITIES|topic_keywords|"key_quote"|WEIGHT|EMOTIONS|FLAGS
23
+ Tunnel: T:ZID<->ZID|label
24
+ Arc: ARC:emotion->emotion->emotion
25
+
26
+ EMOTION CODES (universal):
27
+ vul=vulnerability, joy=joy, fear=fear, trust=trust
28
+ grief=grief, wonder=wonder, rage=rage, love=love
29
+ hope=hope, despair=despair, peace=peace, humor=humor
30
+ tender=tenderness, raw=raw_honesty, doubt=self_doubt
31
+ relief=relief, anx=anxiety, exhaust=exhaustion
32
+ convict=conviction, passion=quiet_passion
33
+
34
+ FLAGS:
35
+ ORIGIN = origin moment (birth of something)
36
+ CORE = core belief or identity pillar
37
+ SENSITIVE = handle with absolute care
38
+ PIVOT = emotional turning point
39
+ GENESIS = led directly to something existing
40
+ DECISION = explicit decision or choice
41
+ TECHNICAL = technical architecture or implementation detail
42
+ """
43
+
44
+ import json
45
+ import os
46
+ import re
47
+ from typing import List, Dict, Optional
48
+ from pathlib import Path
49
+
50
+
51
# === EMOTION CODES (universal) ===

# Canonical emotion names -> compact wire codes.
# Synonyms are grouped on one line and deliberately collapse onto the
# same code (e.g. "anger" and "rage" both encode as "rage").
EMOTION_CODES = {
    "vulnerability": "vul", "vulnerable": "vul",
    "joy": "joy", "joyful": "joy",
    "fear": "fear", "mild_fear": "fear",
    "trust": "trust", "trust_building": "trust",
    "grief": "grief", "raw_grief": "grief",
    "wonder": "wonder", "philosophical_wonder": "wonder",
    "rage": "rage", "anger": "rage",
    "love": "love", "devotion": "love",
    "hope": "hope",
    "despair": "despair", "hopelessness": "despair",
    "peace": "peace",
    "relief": "relief",
    "humor": "humor", "dark_humor": "humor",
    "tenderness": "tender",
    "raw_honesty": "raw", "brutal_honesty": "raw",
    "self_doubt": "doubt",
    "anxiety": "anx",
    "exhaustion": "exhaust",
    "conviction": "convict",
    "quiet_passion": "passion",
    "warmth": "warmth",
    "curiosity": "curious",
    "gratitude": "grat",
    "frustration": "frust",
    "confusion": "confuse",
    "satisfaction": "satis",
    "excitement": "excite",
    "determination": "determ",
    "surprise": "surprise",
}
95
+
96
+ # Keywords that signal emotions in plain text
97
+ _EMOTION_SIGNALS = {
98
+ "decided": "determ",
99
+ "prefer": "convict",
100
+ "worried": "anx",
101
+ "excited": "excite",
102
+ "frustrated": "frust",
103
+ "confused": "confuse",
104
+ "love": "love",
105
+ "hate": "rage",
106
+ "hope": "hope",
107
+ "fear": "fear",
108
+ "trust": "trust",
109
+ "happy": "joy",
110
+ "sad": "grief",
111
+ "surprised": "surprise",
112
+ "grateful": "grat",
113
+ "curious": "curious",
114
+ "wonder": "wonder",
115
+ "anxious": "anx",
116
+ "relieved": "relief",
117
+ "satisf": "satis",
118
+ "disappoint": "grief",
119
+ "concern": "anx",
120
+ }
121
+
122
+ # Keywords that signal flags
123
+ _FLAG_SIGNALS = {
124
+ "decided": "DECISION",
125
+ "chose": "DECISION",
126
+ "switched": "DECISION",
127
+ "migrated": "DECISION",
128
+ "replaced": "DECISION",
129
+ "instead of": "DECISION",
130
+ "because": "DECISION",
131
+ "founded": "ORIGIN",
132
+ "created": "ORIGIN",
133
+ "started": "ORIGIN",
134
+ "born": "ORIGIN",
135
+ "launched": "ORIGIN",
136
+ "first time": "ORIGIN",
137
+ "core": "CORE",
138
+ "fundamental": "CORE",
139
+ "essential": "CORE",
140
+ "principle": "CORE",
141
+ "belief": "CORE",
142
+ "always": "CORE",
143
+ "never forget": "CORE",
144
+ "turning point": "PIVOT",
145
+ "changed everything": "PIVOT",
146
+ "realized": "PIVOT",
147
+ "breakthrough": "PIVOT",
148
+ "epiphany": "PIVOT",
149
+ "api": "TECHNICAL",
150
+ "database": "TECHNICAL",
151
+ "architecture": "TECHNICAL",
152
+ "deploy": "TECHNICAL",
153
+ "infrastructure": "TECHNICAL",
154
+ "algorithm": "TECHNICAL",
155
+ "framework": "TECHNICAL",
156
+ "server": "TECHNICAL",
157
+ "config": "TECHNICAL",
158
+ }
159
+
160
+ # Common filler/stop words to strip from topic extraction
161
+ _STOP_WORDS = {
162
+ "the",
163
+ "a",
164
+ "an",
165
+ "is",
166
+ "are",
167
+ "was",
168
+ "were",
169
+ "be",
170
+ "been",
171
+ "being",
172
+ "have",
173
+ "has",
174
+ "had",
175
+ "do",
176
+ "does",
177
+ "did",
178
+ "will",
179
+ "would",
180
+ "could",
181
+ "should",
182
+ "may",
183
+ "might",
184
+ "shall",
185
+ "can",
186
+ "to",
187
+ "of",
188
+ "in",
189
+ "for",
190
+ "on",
191
+ "with",
192
+ "at",
193
+ "by",
194
+ "from",
195
+ "as",
196
+ "into",
197
+ "about",
198
+ "between",
199
+ "through",
200
+ "during",
201
+ "before",
202
+ "after",
203
+ "above",
204
+ "below",
205
+ "up",
206
+ "down",
207
+ "out",
208
+ "off",
209
+ "over",
210
+ "under",
211
+ "again",
212
+ "further",
213
+ "then",
214
+ "once",
215
+ "here",
216
+ "there",
217
+ "when",
218
+ "where",
219
+ "why",
220
+ "how",
221
+ "all",
222
+ "each",
223
+ "every",
224
+ "both",
225
+ "few",
226
+ "more",
227
+ "most",
228
+ "other",
229
+ "some",
230
+ "such",
231
+ "no",
232
+ "nor",
233
+ "not",
234
+ "only",
235
+ "own",
236
+ "same",
237
+ "so",
238
+ "than",
239
+ "too",
240
+ "very",
241
+ "just",
242
+ "don",
243
+ "now",
244
+ "and",
245
+ "but",
246
+ "or",
247
+ "if",
248
+ "while",
249
+ "that",
250
+ "this",
251
+ "these",
252
+ "those",
253
+ "it",
254
+ "its",
255
+ "i",
256
+ "we",
257
+ "you",
258
+ "he",
259
+ "she",
260
+ "they",
261
+ "me",
262
+ "him",
263
+ "her",
264
+ "us",
265
+ "them",
266
+ "my",
267
+ "your",
268
+ "his",
269
+ "our",
270
+ "their",
271
+ "what",
272
+ "which",
273
+ "who",
274
+ "whom",
275
+ "also",
276
+ "much",
277
+ "many",
278
+ "like",
279
+ "because",
280
+ "since",
281
+ "get",
282
+ "got",
283
+ "use",
284
+ "used",
285
+ "using",
286
+ "make",
287
+ "made",
288
+ "thing",
289
+ "things",
290
+ "way",
291
+ "well",
292
+ "really",
293
+ "want",
294
+ "need",
295
+ }
296
+
297
+
298
class Dialect:
    """
    AAAK Dialect encoder -- works on plain text or structured zettel data.

    Usage:
        # Basic: compress any text
        dialect = Dialect()
        compressed = dialect.compress("We decided to use GraphQL instead of REST...")

        # With entity mappings
        dialect = Dialect(entities={"Alice": "ALC", "Bob": "BOB"})

        # From config file
        dialect = Dialect.from_config("entities.json")

        # Compress zettel JSON (original format)
        compressed = dialect.compress_file("zettels/file_001.json")

        # Generate Layer 1 wake-up file
        dialect.generate_layer1("zettels/", output="LAYER1.aaak")
    """

    def __init__(
        self,
        entities: Optional[Dict[str, str]] = None,
        skip_names: Optional[List[str]] = None,
    ):
        """
        Args:
            entities: Mapping of full names -> short codes.
                      e.g. {"Alice": "ALC", "Bob": "BOB"}
                      If None, entities are auto-coded from first 3 chars.
            skip_names: Names to skip (fictional characters, etc.)
        """
        # Each mapping is stored under both the original and lowercased
        # name so lookups are case-insensitive. Insertion order (original
        # name first) is relied on by save_config().
        self.entity_codes: Dict[str, str] = {}
        if entities:
            for name, code in entities.items():
                self.entity_codes[name] = code
                self.entity_codes[name.lower()] = code
        self.skip_names: List[str] = [n.lower() for n in (skip_names or [])]

    @classmethod
    def from_config(cls, config_path: str) -> "Dialect":
        """Load entity mappings from a JSON config file.

        Config format:
            {
                "entities": {"Alice": "ALC", "Bob": "BOB"},
                "skip_names": ["Gandalf", "Sherlock"]
            }
        """
        with open(config_path, "r") as f:
            config = json.load(f)
        return cls(
            entities=config.get("entities", {}),
            skip_names=config.get("skip_names", []),
        )

    def save_config(self, config_path: str) -> None:
        """Save current entity mappings to a JSON config file.

        entity_codes holds each mapping under both original-case and
        lowercased names; only the first name seen per code is written
        (insertion order guarantees the original-case name comes first).
        """
        canonical: Dict[str, str] = {}
        seen_codes = set()
        for name, code in self.entity_codes.items():
            # The previous two-branch version had identical bodies and
            # reduced to exactly this condition.
            if code not in seen_codes:
                canonical[name] = code
                seen_codes.add(code)

        config = {
            "entities": canonical,
            "skip_names": self.skip_names,
        }
        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)

    # === ENCODING (entity/emotion primitives) ===

    def encode_entity(self, name: str) -> Optional[str]:
        """Convert a person/entity name to its short code.

        Returns None for names matching the skip list. Unknown names are
        auto-coded from their first 3 characters, uppercased.
        """
        lowered = name.lower()
        if any(s in lowered for s in self.skip_names):
            return None
        if name in self.entity_codes:
            return self.entity_codes[name]
        if lowered in self.entity_codes:
            return self.entity_codes[lowered]
        # Substring match so e.g. "Dr. Chen" resolves via a "Chen" entry.
        for key, code in self.entity_codes.items():
            if key.lower() in lowered:
                return code
        # Auto-code: first 3 chars uppercase
        return name[:3].upper()

    def encode_emotions(self, emotions: List[str]) -> str:
        """Convert an emotion list to '+'-joined compact codes (max 3)."""
        codes: List[str] = []
        for e in emotions:
            # Unknown emotion names degrade to their first 4 characters.
            code = EMOTION_CODES.get(e, e[:4])
            if code not in codes:
                codes.append(code)
        return "+".join(codes[:3])

    def get_flags(self, zettel: dict) -> str:
        """Extract '+'-joined importance flags from zettel metadata."""
        flags: List[str] = []
        if zettel.get("origin_moment"):
            flags.append("ORIGIN")
        if zettel.get("sensitivity", "").upper().startswith("MAXIMUM"):
            flags.append("SENSITIVE")
        notes = zettel.get("notes", "").lower()
        if "foundational pillar" in notes or "core" in notes:
            flags.append("CORE")
        if "genesis" in notes or "genesis" in zettel.get("origin_label", "").lower():
            flags.append("GENESIS")
        if "pivot" in notes:
            flags.append("PIVOT")
        return "+".join(flags) if flags else ""

    # === PLAIN TEXT COMPRESSION (new for mempalace) ===

    def _detect_emotions(self, text: str) -> List[str]:
        """Detect up to 3 emotion codes from plain text keyword signals."""
        text_lower = text.lower()
        detected: List[str] = []
        seen = set()
        for keyword, code in _EMOTION_SIGNALS.items():
            if keyword in text_lower and code not in seen:
                detected.append(code)
                seen.add(code)
        return detected[:3]

    def _detect_flags(self, text: str) -> List[str]:
        """Detect up to 3 importance flags from plain text keyword signals."""
        text_lower = text.lower()
        detected: List[str] = []
        seen = set()
        for keyword, flag in _FLAG_SIGNALS.items():
            if keyword in text_lower and flag not in seen:
                detected.append(flag)
                seen.add(flag)
        return detected[:3]

    def _extract_topics(self, text: str, max_topics: int = 3) -> List[str]:
        """Extract key topic words from plain text by frequency ranking."""
        # Tokenize: words of 3+ chars starting with a letter.
        words = re.findall(r"[a-zA-Z][a-zA-Z_-]{2,}", text)
        # Count frequency, skip stop words.
        freq: Dict[str, int] = {}
        for w in words:
            w_lower = w.lower()
            if w_lower in _STOP_WORDS or len(w_lower) < 3:
                continue
            freq[w_lower] = freq.get(w_lower, 0) + 1

        # Boost words that look like proper nouns or technical terms.
        # NOTE: the boost applies per occurrence, so repeated capitalized
        # words accumulate extra weight each time they appear.
        for w in words:
            w_lower = w.lower()
            if w_lower in _STOP_WORDS:
                continue
            if w[0].isupper() and w_lower in freq:
                freq[w_lower] += 2
            # CamelCase or has underscore/hyphen
            if "_" in w or "-" in w or (any(c.isupper() for c in w[1:])):
                if w_lower in freq:
                    freq[w_lower] += 2

        ranked = sorted(freq.items(), key=lambda x: -x[1])
        return [w for w, _ in ranked[:max_topics]]

    def _extract_key_sentence(self, text: str) -> str:
        """Extract the most important sentence fragment from text.

        Sentences are scored on decision/insight vocabulary and brevity;
        the winner is truncated to ~55 chars.
        """
        # Split into sentences; drop trivially short fragments.
        sentences = re.split(r"[.!?\n]+", text)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
        if not sentences:
            return ""

        # Vocabulary that marks a sentence as carrying the "point".
        decision_words = {
            "decided",
            "because",
            "instead",
            "prefer",
            "switched",
            "chose",
            "realized",
            "important",
            "key",
            "critical",
            "discovered",
            "learned",
            "conclusion",
            "solution",
            "reason",
            "why",
            "breakthrough",
            "insight",
        }
        scored = []
        for s in sentences:
            score = 0
            s_lower = s.lower()
            for w in decision_words:
                if w in s_lower:
                    score += 2
            # Prefer shorter, punchier sentences.
            if len(s) < 80:
                score += 1
            if len(s) < 40:
                score += 1
            # Penalize very long sentences.
            if len(s) > 150:
                score -= 2
            scored.append((score, s))

        scored.sort(key=lambda x: -x[0])
        best = scored[0][1]
        # Truncate if too long.
        if len(best) > 55:
            best = best[:52] + "..."
        return best

    def _detect_entities_in_text(self, text: str) -> List[str]:
        """Find known entities in text, or detect capitalized names.

        Known entity codes take precedence; only if none match does the
        capitalized-word heuristic run (skipping sentence-initial words).
        """
        found: List[str] = []
        # Check known entities first.
        for name, code in self.entity_codes.items():
            if not name.islower() and name.lower() in text.lower():
                if code not in found:
                    found.append(code)
        if found:
            return found

        # Fallback: capitalized words that look like names (2+ chars,
        # not at sentence start, not a stop word). Max 3.
        words = text.split()
        for i, w in enumerate(words):
            clean = re.sub(r"[^a-zA-Z]", "", w)
            if (
                len(clean) >= 2
                and clean[0].isupper()
                and clean[1:].islower()
                and i > 0
                and clean.lower() not in _STOP_WORDS
            ):
                code = clean[:3].upper()
                if code not in found:
                    found.append(code)
                if len(found) >= 3:
                    break
        return found

    def compress(self, text: str, metadata: Optional[dict] = None) -> str:
        """
        Summarize plain text into AAAK Dialect format.

        Extracts entities, topics, a key sentence, emotions, and flags
        from the input text. This is lossy — the original text cannot be
        reconstructed from the output.

        Args:
            text: Plain text content to summarize
            metadata: Optional dict with keys like 'source_file', 'wing',
                      'room', 'date', etc.

        Returns:
            AAAK-formatted summary string
        """
        metadata = metadata or {}

        # Detect components.
        entities = self._detect_entities_in_text(text)
        entity_str = "+".join(entities[:3]) if entities else "???"

        topics = self._extract_topics(text)
        topic_str = "_".join(topics[:3]) if topics else "misc"

        quote = self._extract_key_sentence(text)
        quote_part = f'"{quote}"' if quote else ""

        emotions = self._detect_emotions(text)
        emotion_str = "+".join(emotions) if emotions else ""

        flags = self._detect_flags(text)
        flag_str = "+".join(flags) if flags else ""

        # Build source header if metadata available.
        source = metadata.get("source_file", "")
        wing = metadata.get("wing", "")
        room = metadata.get("room", "")
        date = metadata.get("date", "")

        lines: List[str] = []

        # Header line (only when some location metadata exists).
        if source or wing:
            header_parts = [
                wing or "?",
                room or "?",
                date or "?",
                Path(source).stem if source else "?",
            ]
            lines.append("|".join(header_parts))

        # Content line; empty components are omitted entirely.
        parts = [f"0:{entity_str}", topic_str]
        if quote_part:
            parts.append(quote_part)
        if emotion_str:
            parts.append(emotion_str)
        if flag_str:
            parts.append(flag_str)

        lines.append("|".join(parts))

        return "\n".join(lines)

    # === ZETTEL-BASED ENCODING (original format, kept for compatibility) ===

    def extract_key_quote(self, zettel: dict) -> str:
        """Pull the most important quote fragment from zettel content.

        Searches content/origin_label/notes for quoted spans and
        speech-verb patterns, scores them on emotional vocabulary, and
        falls back to the title suffix when nothing scores.
        """
        content = zettel.get("content", "")
        origin = zettel.get("origin_label", "")
        notes = zettel.get("notes", "")
        title = zettel.get("title", "")
        all_text = content + " " + origin + " " + notes

        quotes: List[str] = []
        quotes += re.findall(r'"([^"]{8,55})"', all_text)
        # Single-quoted spans, bounded so apostrophes don't match.
        for m in re.finditer(r"(?:^|[\s(])'([^']{8,55})'(?:[\s.,;:!?)]|$)", all_text):
            quotes.append(m.group(1))
        # Speech-verb introductions: "says: ...", "admits: ..." etc.
        quotes += re.findall(
            r'(?:says?|said|articulates?|reveals?|admits?|confesses?|asks?):\s*["\']?([^.!?]{10,55})[.!?]',
            all_text,
            re.IGNORECASE,
        )

        if quotes:
            # De-duplicate while preserving order.
            seen = set()
            unique = []
            for q in quotes:
                q = q.strip()
                if q not in seen and len(q) >= 8:
                    seen.add(q)
                    unique.append(q)
            quotes = unique

        emotional_words = {
            "love",
            "fear",
            "remember",
            "soul",
            "feel",
            "stupid",
            "scared",
            "beautiful",
            "destroy",
            "respect",
            "trust",
            "consciousness",
            "alive",
            "forget",
            "waiting",
            "peace",
            "matter",
            "real",
            "guilt",
            "escape",
            "rest",
            "hope",
            "dream",
            "lost",
            "found",
        }
        scored = []
        for q in quotes:
            score = 0
            # First-person or sentence-like quotes score higher.
            if q[0].isupper() or q.startswith("I "):
                score += 2
            matches = sum(1 for w in emotional_words if w in q.lower())
            score += matches * 2
            if len(q) > 20:
                score += 1
            # Penalize descriptive (third-person) openers.
            if q.startswith("The ") or q.startswith("This ") or q.startswith("She "):
                score -= 2
            scored.append((score, q))
        scored.sort(key=lambda x: -x[0])
        if scored:
            return scored[0][1]

        # Fallback: the part of the title after " - ".
        if " - " in title:
            return title.split(" - ", 1)[1][:45]
        return ""

    def encode_zettel(self, zettel: dict) -> str:
        """Encode a single zettel into one AAAK Dialect line."""
        zid = zettel["id"].split("-")[-1]

        entity_codes = [self.encode_entity(p) for p in zettel.get("people", [])]
        entity_codes = [e for e in entity_codes if e is not None]
        if not entity_codes:
            entity_codes = ["???"]
        entities = "+".join(sorted(set(entity_codes)))

        topics = zettel.get("topics", [])
        topic_str = "_".join(topics[:2]) if topics else "misc"

        quote = self.extract_key_quote(zettel)
        quote_part = f'"{quote}"' if quote else ""

        weight = zettel.get("emotional_weight", 0.5)
        emotions = self.encode_emotions(zettel.get("emotional_tone", []))
        flags = self.get_flags(zettel)

        parts = [f"{zid}:{entities}", topic_str]
        if quote_part:
            parts.append(quote_part)
        parts.append(str(weight))
        if emotions:
            parts.append(emotions)
        if flags:
            parts.append(flags)

        return "|".join(parts)

    def encode_tunnel(self, tunnel: dict) -> str:
        """Encode a tunnel connection as 'T:ID<->ID|label'."""
        from_id = tunnel["from"].split("-")[-1]
        to_id = tunnel["to"].split("-")[-1]
        label = tunnel.get("label", "")
        # Keep only the label prefix before ':' or the first 30 chars.
        short_label = label.split(":")[0] if ":" in label else label[:30]
        return f"T:{from_id}<->{to_id}|{short_label}"

    def encode_file(self, zettel_json: dict) -> str:
        """Encode an entire zettel file (header, arc, zettels, tunnels)."""
        lines: List[str] = []

        source = zettel_json.get("source_file", "unknown")
        file_num = source.split("-")[0] if "-" in source else "000"

        # BUGFIX: the previous .get("zettels", [{}])[0] raised IndexError
        # when "zettels" was present but empty.
        zettels = zettel_json.get("zettels") or []
        date = zettels[0].get("date_context", "unknown") if zettels else "unknown"

        all_people = set()
        for z in zettels:
            for p in z.get("people", []):
                code = self.encode_entity(p)
                if code is not None:
                    all_people.add(code)
        if not all_people:
            all_people = {"???"}
        primary = "+".join(sorted(all_people)[:3])

        title = source.replace(".txt", "").split("-", 1)[-1].strip() if "-" in source else source
        lines.append(f"{file_num}|{primary}|{date}|{title}")

        arc = zettel_json.get("emotional_arc", "")
        if arc:
            lines.append(f"ARC:{arc}")

        for z in zettels:
            lines.append(self.encode_zettel(z))

        for t in zettel_json.get("tunnels", []):
            lines.append(self.encode_tunnel(t))

        return "\n".join(lines)

    # === FILE-BASED COMPRESSION ===

    def compress_file(self, zettel_json_path: str, output_path: Optional[str] = None) -> str:
        """Read a zettel JSON file and compress it to AAAK Dialect."""
        with open(zettel_json_path, "r") as f:
            data = json.load(f)
        dialect = self.encode_file(data)
        if output_path:
            with open(output_path, "w") as f:
                f.write(dialect)
        return dialect

    def compress_all(self, zettel_dir: str, output_path: Optional[str] = None) -> str:
        """Compress ALL zettel files into a single AAAK Dialect file.

        Files are joined with '---' separators (including a trailing one,
        which downstream consumers may rely on).
        """
        all_dialect: List[str] = []
        for fname in sorted(os.listdir(zettel_dir)):
            if fname.endswith(".json"):
                fpath = os.path.join(zettel_dir, fname)
                with open(fpath, "r") as f:
                    data = json.load(f)
                dialect = self.encode_file(data)
                all_dialect.append(dialect)
                all_dialect.append("---")
        combined = "\n".join(all_dialect)
        if output_path:
            with open(output_path, "w") as f:
                f.write(combined)
        return combined

    # === LAYER 1 GENERATION ===

    def generate_layer1(
        self,
        zettel_dir: str,
        output_path: Optional[str] = None,
        identity_sections: Optional[Dict[str, List[str]]] = None,
        weight_threshold: float = 0.85,
    ) -> str:
        """
        Auto-generate a Layer 1 wake-up file from all processed zettel files.

        Pulls highest-weight moments (>= threshold) and any with
        ORIGIN/CORE/GENESIS flags. Groups them by date into MOMENTS sections.

        Args:
            zettel_dir: Directory containing file_*.json zettel files.
            output_path: Optional path to also write the result to.
            identity_sections: Optional {section_name: [lines]} blocks
                emitted verbatim before the MOMENTS sections.
            weight_threshold: Minimum emotional_weight for inclusion.
        """
        from datetime import date as date_cls

        essential = []
        all_tunnels = []

        # Single pass over the directory collects both essential zettels
        # and tunnels (previously two identical read-and-parse passes).
        for fname in sorted(os.listdir(zettel_dir)):
            if not fname.endswith(".json"):
                continue
            fpath = os.path.join(zettel_dir, fname)
            with open(fpath, "r") as f:
                data = json.load(f)

            file_num = fname.replace("file_", "").replace(".json", "")
            # BUGFIX: guard against an empty "zettels" list (old code
            # indexed [0] unconditionally).
            zettels = data.get("zettels") or []
            source_date = zettels[0].get("date_context", "unknown") if zettels else "unknown"

            for z in zettels:
                weight = z.get("emotional_weight", 0)
                is_origin = z.get("origin_moment", False)
                flags = self.get_flags(z)
                has_key_flag = (
                    any(f in flags for f in ["ORIGIN", "CORE", "GENESIS"]) if flags else False
                )

                if weight >= weight_threshold or is_origin or has_key_flag:
                    essential.append((z, file_num, source_date))

            all_tunnels.extend(data.get("tunnels", []))

        # Heaviest moments first (stable sort preserves file order on ties).
        essential.sort(key=lambda x: x[0].get("emotional_weight", 0), reverse=True)

        # Group by the leading date fragment ("May 3, 2024" -> "May 3").
        by_date: Dict[str, list] = {}
        for z, fnum, sdate in essential:
            key = sdate.split(",")[0].strip()
            if key not in by_date:
                by_date[key] = []
            by_date[key].append((z, fnum))

        lines: List[str] = []
        lines.append("## LAYER 1 -- ESSENTIAL STORY")
        lines.append(f"## Auto-generated from zettel files. Updated {date_cls.today()}.")
        lines.append("")

        if identity_sections:
            for section_name, section_lines in identity_sections.items():
                lines.append(f"={section_name}=")
                lines.extend(section_lines)
                lines.append("")

        for date_key in sorted(by_date.keys()):
            lines.append(f"=MOMENTS[{date_key}]=")
            for z, fnum in by_date[date_key]:
                entities = []
                for p in z.get("people", []):
                    code = self.encode_entity(p)
                    if code:
                        entities.append(code)
                if not entities:
                    entities = ["???"]
                ent_str = "+".join(sorted(set(entities)))

                quote = self.extract_key_quote(z)
                weight = z.get("emotional_weight", 0.5)
                flags = self.get_flags(z)
                sensitivity = z.get("sensitivity", "")

                parts = [ent_str]
                title = z.get("title", "")
                if " - " in title:
                    hint = title.split(" - ", 1)[1][:30]
                else:
                    hint = "_".join(z.get("topics", [])[:2])
                if hint:
                    parts.append(hint)
                if quote and quote != hint and quote not in (title, hint):
                    parts.append(f'"{quote}"')
                # Any non-empty sensitivity marks the line, unless the
                # flags already carry SENSITIVE.
                if sensitivity and "SENSITIVE" not in (flags or ""):
                    parts.append("SENSITIVE")
                parts.append(str(weight))
                if flags:
                    parts.append(flags)

                lines.append("|".join(parts))
            lines.append("")

        if all_tunnels:
            lines.append("=TUNNELS=")
            for t in all_tunnels[:8]:
                label = t.get("label", "")
                short = label.split(":")[0] if ":" in label else label[:40]
                lines.append(short)
            lines.append("")

        result = "\n".join(lines)

        if output_path:
            with open(output_path, "w") as f:
                f.write(result)

        return result

    # === DECODING ===

    def decode(self, dialect_text: str) -> dict:
        """Parse an AAAK Dialect string back into a readable summary.

        Lines are classified by shape: 'ARC:' prefix, 'T:' prefix,
        zettel lines (first '|'-field contains ':'), else header.
        """
        lines = dialect_text.strip().split("\n")
        result = {"header": {}, "arc": "", "zettels": [], "tunnels": []}

        for line in lines:
            if line.startswith("ARC:"):
                result["arc"] = line[4:]
            elif line.startswith("T:"):
                result["tunnels"].append(line)
            elif "|" in line and ":" in line.split("|")[0]:
                result["zettels"].append(line)
            elif "|" in line:
                parts = line.split("|")
                result["header"] = {
                    "file": parts[0] if len(parts) > 0 else "",
                    "entities": parts[1] if len(parts) > 1 else "",
                    "date": parts[2] if len(parts) > 2 else "",
                    "title": parts[3] if len(parts) > 3 else "",
                }

        return result

    # === STATS ===

    @staticmethod
    def count_tokens(text: str) -> int:
        """Estimate token count using word-based heuristic (~1.3 tokens per word).

        This is an approximation. For accurate counts, use a real tokenizer
        like tiktoken. The old len(text)//3 heuristic was wildly inaccurate
        and made AAAK compression ratios look much better than reality.
        """
        words = text.split()
        # Most English words tokenize to 1-2 tokens; punctuation and
        # special chars in AAAK (|, +, :) each cost a token.
        # ~1.3 tokens/word is a conservative average.
        return max(1, int(len(words) * 1.3))

    def compression_stats(self, original_text: str, compressed: str) -> dict:
        """Get size comparison stats for a text->AAAK conversion.

        NOTE: AAAK is lossy summarization, not compression. The "ratio"
        reflects how much shorter the summary is, not a compression ratio
        in the traditional sense — information is lost.
        """
        orig_tokens = self.count_tokens(original_text)
        comp_tokens = self.count_tokens(compressed)
        return {
            "original_tokens_est": orig_tokens,
            "summary_tokens_est": comp_tokens,
            "size_ratio": round(orig_tokens / max(comp_tokens, 1), 1),
            "original_chars": len(original_text),
            "summary_chars": len(compressed),
            "note": "Estimates only. Use tiktoken for accurate counts. AAAK is lossy.",
        }
968
+
969
+
970
# === CLI ===
# Command-line front end: dispatches on the first positional argument.
# Supported modes: --init, --file, --all, --stats, --layer1, or free text.
if __name__ == "__main__":
    import sys

    def usage():
        # Print help and exit with a non-zero status.
        print("AAAK Dialect -- Compressed Symbolic Memory for Any LLM")
        print()
        print("Usage:")
        print(" python dialect.py <text> # Compress text from argument")
        print(" python dialect.py --file <zettel.json> # Compress zettel JSON file")
        print(" python dialect.py --all <zettel_dir> # Compress all zettel files")
        print(" python dialect.py --stats <zettel.json> # Show compression stats")
        print(" python dialect.py --layer1 <zettel_dir> # Generate Layer 1 wake-up file")
        print(" python dialect.py --init # Create example config")
        print()
        print("Options:")
        print(" --config <path> Load entity mappings from JSON config")
        sys.exit(1)

    if len(sys.argv) < 2:
        usage()

    # Parse --config flag: extract its value and remove both tokens from
    # args so the remaining positional dispatch is unaffected.
    # NOTE(review): a trailing `--config` with no value would raise
    # IndexError here — confirm whether that should fall through to usage().
    config_path = None
    args = sys.argv[1:]
    if "--config" in args:
        idx = args.index("--config")
        config_path = args[idx + 1]
        args = args[:idx] + args[idx + 2 :]

    # Create dialect instance (entity mappings from config, if given).
    if config_path:
        dialect = Dialect.from_config(config_path)
    else:
        dialect = Dialect()

    if args[0] == "--init":
        # Write a starter entities.json the user can edit.
        example = {
            "entities": {
                "Alice": "ALC",
                "Bob": "BOB",
                "Dr. Chen": "CHN",
            },
            "skip_names": [],
        }
        out_path = "entities.json"
        with open(out_path, "w") as f:
            json.dump(example, f, indent=2)
        print(f"Created example config: {out_path}")
        print("Edit this file with your own entity mappings, then use --config entities.json")

    elif args[0] == "--file":
        # Compress a single zettel JSON file and print the result.
        result = dialect.compress_file(args[1])
        tokens = Dialect.count_tokens(result)
        print(f"~{tokens} tokens")
        print()
        print(result)

    elif args[0] == "--all":
        # Compress every zettel JSON in the directory into one .aaak file.
        zettel_dir = args[1] if len(args) > 1 else "."
        output = os.path.join(zettel_dir, "COMPRESSED_MEMORY.aaak")
        result = dialect.compress_all(zettel_dir, output)
        tokens = Dialect.count_tokens(result)
        print(f"Compressed to: {output}")
        print(f"Total: ~{tokens} tokens")
        print()
        print(result)

    elif args[0] == "--stats":
        # Compare the pretty-printed JSON size against the AAAK encoding.
        with open(args[1], "r") as f:
            data = json.load(f)
        json_str = json.dumps(data, indent=2)
        encoded = dialect.encode_file(data)
        stats = dialect.compression_stats(json_str, encoded)
        print("=== COMPRESSION STATS ===")
        print(f"JSON: ~{stats['original_tokens_est']:,} tokens (est)")
        print(f"AAAK: ~{stats['summary_tokens_est']:,} tokens (est)")
        print(f"Ratio: {stats['size_ratio']}x (lossy — information is lost)")
        print()
        print("=== AAAK DIALECT OUTPUT ===")
        print(encoded)

    elif args[0] == "--layer1":
        # Generate the Layer 1 wake-up file from a zettel directory.
        zettel_dir = args[1] if len(args) > 1 else "."
        output = os.path.join(zettel_dir, "LAYER1.aaak")
        result = dialect.generate_layer1(zettel_dir, output)
        tokens = Dialect.count_tokens(result)
        print(f"Layer 1: {output}")
        print(f"Total: ~{tokens} tokens")
        print()
        print(result)

    else:
        # Treat remaining args as text to compress
        text = " ".join(args)
        compressed = dialect.compress(text)
        stats = dialect.compression_stats(text, compressed)
        print(
            f"Original: ~{stats['original_tokens_est']} tokens est ({stats['original_chars']} chars)"
        )
        print(
            f"AAAK: ~{stats['summary_tokens_est']} tokens est ({stats['summary_chars']} chars)"
        )
        print(f"Ratio: {stats['size_ratio']}x (lossy summary, not lossless compression)")
        print()
        print(compressed)