aaak-vault-sync 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +294 -0
- package/bin/aaak-scan.js +24 -0
- package/dialect.py +1075 -0
- package/package.json +29 -0
- package/scan.py +424 -0
- package/scripts/setup.js +214 -0
- package/templates/com.aaak.vault-sync.plist.template +33 -0
- package/templates/generic-memory-loader.md.template +13 -0
- package/templates/scan-vault-skill.md.template +32 -0
package/dialect.py
ADDED
|
@@ -0,0 +1,1075 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
AAAK Dialect -- Structured Symbolic Summary Format
|
|
4
|
+
====================================================
|
|
5
|
+
|
|
6
|
+
A lossy summarization format that extracts entities, topics, key sentences,
|
|
7
|
+
emotions, and flags from plain text into a compact structured representation.
|
|
8
|
+
Any LLM reads it natively — no decoder required.
|
|
9
|
+
|
|
10
|
+
Works with: Claude, ChatGPT, Gemini, Llama, Mistral -- any model that reads text.
|
|
11
|
+
|
|
12
|
+
NOTE: AAAK is NOT lossless compression. The original text cannot be reconstructed
|
|
13
|
+
from AAAK output. It is a structured summary layer (closets) that points to the
|
|
14
|
+
original verbatim content (drawers). The 96.6% benchmark score is from raw mode,
|
|
15
|
+
not AAAK mode.
|
|
16
|
+
|
|
17
|
+
Adapted for mempalace: works standalone on plain text and ChromaDB drawers.
|
|
18
|
+
No dependency on palace.py or layers.py.
|
|
19
|
+
|
|
20
|
+
FORMAT:
|
|
21
|
+
Header: FILE_NUM|PRIMARY_ENTITY|DATE|TITLE
|
|
22
|
+
Zettel: ZID:ENTITIES|topic_keywords|"key_quote"|WEIGHT|EMOTIONS|FLAGS
|
|
23
|
+
Tunnel: T:ZID<->ZID|label
|
|
24
|
+
Arc: ARC:emotion->emotion->emotion
|
|
25
|
+
|
|
26
|
+
EMOTION CODES (universal):
|
|
27
|
+
vul=vulnerability, joy=joy, fear=fear, trust=trust
|
|
28
|
+
grief=grief, wonder=wonder, rage=rage, love=love
|
|
29
|
+
hope=hope, despair=despair, peace=peace, humor=humor
|
|
30
|
+
tender=tenderness, raw=raw_honesty, doubt=self_doubt
|
|
31
|
+
relief=relief, anx=anxiety, exhaust=exhaustion
|
|
32
|
+
convict=conviction, passion=quiet_passion
|
|
33
|
+
|
|
34
|
+
FLAGS:
|
|
35
|
+
ORIGIN = origin moment (birth of something)
|
|
36
|
+
CORE = core belief or identity pillar
|
|
37
|
+
SENSITIVE = handle with absolute care
|
|
38
|
+
PIVOT = emotional turning point
|
|
39
|
+
GENESIS = led directly to something existing
|
|
40
|
+
DECISION = explicit decision or choice
|
|
41
|
+
TECHNICAL = technical architecture or implementation detail
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
import json
|
|
45
|
+
import os
|
|
46
|
+
import re
|
|
47
|
+
from typing import List, Dict, Optional
|
|
48
|
+
from pathlib import Path
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# === EMOTION CODES (universal) ===
|
|
52
|
+
|
|
53
|
+
# Long-form emotion labels -> compact wire codes. Several synonyms and
# intensity variants deliberately collapse onto the same shared code
# (e.g. "anger" and "rage" both emit "rage").
EMOTION_CODES = {
    "vulnerability": "vul", "vulnerable": "vul",
    "joy": "joy", "joyful": "joy",
    "fear": "fear", "mild_fear": "fear",
    "trust": "trust", "trust_building": "trust",
    "grief": "grief", "raw_grief": "grief",
    "wonder": "wonder", "philosophical_wonder": "wonder",
    "rage": "rage", "anger": "rage",
    "love": "love", "devotion": "love",
    "hope": "hope",
    "despair": "despair", "hopelessness": "despair",
    "peace": "peace",
    "relief": "relief",
    "humor": "humor", "dark_humor": "humor",
    "tenderness": "tender",
    "raw_honesty": "raw", "brutal_honesty": "raw",
    "self_doubt": "doubt",
    "anxiety": "anx",
    "exhaustion": "exhaust",
    "conviction": "convict",
    "quiet_passion": "passion",
    "warmth": "warmth",
    "curiosity": "curious",
    "gratitude": "grat",
    "frustration": "frust",
    "confusion": "confuse",
    "satisfaction": "satis",
    "excitement": "excite",
    "determination": "determ",
    "surprise": "surprise",
}
|
|
95
|
+
|
|
96
|
+
# Keywords that signal emotions in plain text
|
|
97
|
+
_EMOTION_SIGNALS = {
|
|
98
|
+
"decided": "determ",
|
|
99
|
+
"prefer": "convict",
|
|
100
|
+
"worried": "anx",
|
|
101
|
+
"excited": "excite",
|
|
102
|
+
"frustrated": "frust",
|
|
103
|
+
"confused": "confuse",
|
|
104
|
+
"love": "love",
|
|
105
|
+
"hate": "rage",
|
|
106
|
+
"hope": "hope",
|
|
107
|
+
"fear": "fear",
|
|
108
|
+
"trust": "trust",
|
|
109
|
+
"happy": "joy",
|
|
110
|
+
"sad": "grief",
|
|
111
|
+
"surprised": "surprise",
|
|
112
|
+
"grateful": "grat",
|
|
113
|
+
"curious": "curious",
|
|
114
|
+
"wonder": "wonder",
|
|
115
|
+
"anxious": "anx",
|
|
116
|
+
"relieved": "relief",
|
|
117
|
+
"satisf": "satis",
|
|
118
|
+
"disappoint": "grief",
|
|
119
|
+
"concern": "anx",
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
# Keywords that signal flags
|
|
123
|
+
_FLAG_SIGNALS = {
|
|
124
|
+
"decided": "DECISION",
|
|
125
|
+
"chose": "DECISION",
|
|
126
|
+
"switched": "DECISION",
|
|
127
|
+
"migrated": "DECISION",
|
|
128
|
+
"replaced": "DECISION",
|
|
129
|
+
"instead of": "DECISION",
|
|
130
|
+
"because": "DECISION",
|
|
131
|
+
"founded": "ORIGIN",
|
|
132
|
+
"created": "ORIGIN",
|
|
133
|
+
"started": "ORIGIN",
|
|
134
|
+
"born": "ORIGIN",
|
|
135
|
+
"launched": "ORIGIN",
|
|
136
|
+
"first time": "ORIGIN",
|
|
137
|
+
"core": "CORE",
|
|
138
|
+
"fundamental": "CORE",
|
|
139
|
+
"essential": "CORE",
|
|
140
|
+
"principle": "CORE",
|
|
141
|
+
"belief": "CORE",
|
|
142
|
+
"always": "CORE",
|
|
143
|
+
"never forget": "CORE",
|
|
144
|
+
"turning point": "PIVOT",
|
|
145
|
+
"changed everything": "PIVOT",
|
|
146
|
+
"realized": "PIVOT",
|
|
147
|
+
"breakthrough": "PIVOT",
|
|
148
|
+
"epiphany": "PIVOT",
|
|
149
|
+
"api": "TECHNICAL",
|
|
150
|
+
"database": "TECHNICAL",
|
|
151
|
+
"architecture": "TECHNICAL",
|
|
152
|
+
"deploy": "TECHNICAL",
|
|
153
|
+
"infrastructure": "TECHNICAL",
|
|
154
|
+
"algorithm": "TECHNICAL",
|
|
155
|
+
"framework": "TECHNICAL",
|
|
156
|
+
"server": "TECHNICAL",
|
|
157
|
+
"config": "TECHNICAL",
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
# Common filler/stop words to strip from topic extraction
|
|
161
|
+
_STOP_WORDS = {
|
|
162
|
+
"the",
|
|
163
|
+
"a",
|
|
164
|
+
"an",
|
|
165
|
+
"is",
|
|
166
|
+
"are",
|
|
167
|
+
"was",
|
|
168
|
+
"were",
|
|
169
|
+
"be",
|
|
170
|
+
"been",
|
|
171
|
+
"being",
|
|
172
|
+
"have",
|
|
173
|
+
"has",
|
|
174
|
+
"had",
|
|
175
|
+
"do",
|
|
176
|
+
"does",
|
|
177
|
+
"did",
|
|
178
|
+
"will",
|
|
179
|
+
"would",
|
|
180
|
+
"could",
|
|
181
|
+
"should",
|
|
182
|
+
"may",
|
|
183
|
+
"might",
|
|
184
|
+
"shall",
|
|
185
|
+
"can",
|
|
186
|
+
"to",
|
|
187
|
+
"of",
|
|
188
|
+
"in",
|
|
189
|
+
"for",
|
|
190
|
+
"on",
|
|
191
|
+
"with",
|
|
192
|
+
"at",
|
|
193
|
+
"by",
|
|
194
|
+
"from",
|
|
195
|
+
"as",
|
|
196
|
+
"into",
|
|
197
|
+
"about",
|
|
198
|
+
"between",
|
|
199
|
+
"through",
|
|
200
|
+
"during",
|
|
201
|
+
"before",
|
|
202
|
+
"after",
|
|
203
|
+
"above",
|
|
204
|
+
"below",
|
|
205
|
+
"up",
|
|
206
|
+
"down",
|
|
207
|
+
"out",
|
|
208
|
+
"off",
|
|
209
|
+
"over",
|
|
210
|
+
"under",
|
|
211
|
+
"again",
|
|
212
|
+
"further",
|
|
213
|
+
"then",
|
|
214
|
+
"once",
|
|
215
|
+
"here",
|
|
216
|
+
"there",
|
|
217
|
+
"when",
|
|
218
|
+
"where",
|
|
219
|
+
"why",
|
|
220
|
+
"how",
|
|
221
|
+
"all",
|
|
222
|
+
"each",
|
|
223
|
+
"every",
|
|
224
|
+
"both",
|
|
225
|
+
"few",
|
|
226
|
+
"more",
|
|
227
|
+
"most",
|
|
228
|
+
"other",
|
|
229
|
+
"some",
|
|
230
|
+
"such",
|
|
231
|
+
"no",
|
|
232
|
+
"nor",
|
|
233
|
+
"not",
|
|
234
|
+
"only",
|
|
235
|
+
"own",
|
|
236
|
+
"same",
|
|
237
|
+
"so",
|
|
238
|
+
"than",
|
|
239
|
+
"too",
|
|
240
|
+
"very",
|
|
241
|
+
"just",
|
|
242
|
+
"don",
|
|
243
|
+
"now",
|
|
244
|
+
"and",
|
|
245
|
+
"but",
|
|
246
|
+
"or",
|
|
247
|
+
"if",
|
|
248
|
+
"while",
|
|
249
|
+
"that",
|
|
250
|
+
"this",
|
|
251
|
+
"these",
|
|
252
|
+
"those",
|
|
253
|
+
"it",
|
|
254
|
+
"its",
|
|
255
|
+
"i",
|
|
256
|
+
"we",
|
|
257
|
+
"you",
|
|
258
|
+
"he",
|
|
259
|
+
"she",
|
|
260
|
+
"they",
|
|
261
|
+
"me",
|
|
262
|
+
"him",
|
|
263
|
+
"her",
|
|
264
|
+
"us",
|
|
265
|
+
"them",
|
|
266
|
+
"my",
|
|
267
|
+
"your",
|
|
268
|
+
"his",
|
|
269
|
+
"our",
|
|
270
|
+
"their",
|
|
271
|
+
"what",
|
|
272
|
+
"which",
|
|
273
|
+
"who",
|
|
274
|
+
"whom",
|
|
275
|
+
"also",
|
|
276
|
+
"much",
|
|
277
|
+
"many",
|
|
278
|
+
"like",
|
|
279
|
+
"because",
|
|
280
|
+
"since",
|
|
281
|
+
"get",
|
|
282
|
+
"got",
|
|
283
|
+
"use",
|
|
284
|
+
"used",
|
|
285
|
+
"using",
|
|
286
|
+
"make",
|
|
287
|
+
"made",
|
|
288
|
+
"thing",
|
|
289
|
+
"things",
|
|
290
|
+
"way",
|
|
291
|
+
"well",
|
|
292
|
+
"really",
|
|
293
|
+
"want",
|
|
294
|
+
"need",
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
class Dialect:
|
|
299
|
+
"""
|
|
300
|
+
AAAK Dialect encoder -- works on plain text or structured zettel data.
|
|
301
|
+
|
|
302
|
+
Usage:
|
|
303
|
+
# Basic: compress any text
|
|
304
|
+
dialect = Dialect()
|
|
305
|
+
compressed = dialect.compress("We decided to use GraphQL instead of REST...")
|
|
306
|
+
|
|
307
|
+
# With entity mappings
|
|
308
|
+
dialect = Dialect(entities={"Alice": "ALC", "Bob": "BOB"})
|
|
309
|
+
|
|
310
|
+
# From config file
|
|
311
|
+
dialect = Dialect.from_config("entities.json")
|
|
312
|
+
|
|
313
|
+
# Compress zettel JSON (original format)
|
|
314
|
+
compressed = dialect.compress_file("zettels/file_001.json")
|
|
315
|
+
|
|
316
|
+
# Generate Layer 1 wake-up file
|
|
317
|
+
dialect.generate_layer1("zettels/", output="LAYER1.aaak")
|
|
318
|
+
"""
|
|
319
|
+
|
|
320
|
+
def __init__(self, entities: Dict[str, str] = None, skip_names: List[str] = None):
|
|
321
|
+
"""
|
|
322
|
+
Args:
|
|
323
|
+
entities: Mapping of full names -> short codes.
|
|
324
|
+
e.g. {"Alice": "ALC", "Bob": "BOB"}
|
|
325
|
+
If None, entities are auto-coded from first 3 chars.
|
|
326
|
+
skip_names: Names to skip (fictional characters, etc.)
|
|
327
|
+
"""
|
|
328
|
+
self.entity_codes = {}
|
|
329
|
+
if entities:
|
|
330
|
+
for name, code in entities.items():
|
|
331
|
+
self.entity_codes[name] = code
|
|
332
|
+
self.entity_codes[name.lower()] = code
|
|
333
|
+
self.skip_names = [n.lower() for n in (skip_names or [])]
|
|
334
|
+
|
|
335
|
+
@classmethod
|
|
336
|
+
def from_config(cls, config_path: str) -> "Dialect":
|
|
337
|
+
"""Load entity mappings from a JSON config file.
|
|
338
|
+
|
|
339
|
+
Config format:
|
|
340
|
+
{
|
|
341
|
+
"entities": {"Alice": "ALC", "Bob": "BOB"},
|
|
342
|
+
"skip_names": ["Gandalf", "Sherlock"]
|
|
343
|
+
}
|
|
344
|
+
"""
|
|
345
|
+
with open(config_path, "r") as f:
|
|
346
|
+
config = json.load(f)
|
|
347
|
+
return cls(
|
|
348
|
+
entities=config.get("entities", {}),
|
|
349
|
+
skip_names=config.get("skip_names", []),
|
|
350
|
+
)
|
|
351
|
+
|
|
352
|
+
def save_config(self, config_path: str):
|
|
353
|
+
"""Save current entity mappings to a JSON config file."""
|
|
354
|
+
canonical = {}
|
|
355
|
+
seen_codes = set()
|
|
356
|
+
for name, code in self.entity_codes.items():
|
|
357
|
+
if code not in seen_codes and not name.islower():
|
|
358
|
+
canonical[name] = code
|
|
359
|
+
seen_codes.add(code)
|
|
360
|
+
elif code not in seen_codes:
|
|
361
|
+
canonical[name] = code
|
|
362
|
+
seen_codes.add(code)
|
|
363
|
+
|
|
364
|
+
config = {
|
|
365
|
+
"entities": canonical,
|
|
366
|
+
"skip_names": self.skip_names,
|
|
367
|
+
}
|
|
368
|
+
with open(config_path, "w") as f:
|
|
369
|
+
json.dump(config, f, indent=2)
|
|
370
|
+
|
|
371
|
+
# === ENCODING (entity/emotion primitives) ===
|
|
372
|
+
|
|
373
|
+
def encode_entity(self, name: str) -> Optional[str]:
|
|
374
|
+
"""Convert a person/entity name to its short code."""
|
|
375
|
+
if any(s in name.lower() for s in self.skip_names):
|
|
376
|
+
return None
|
|
377
|
+
if name in self.entity_codes:
|
|
378
|
+
return self.entity_codes[name]
|
|
379
|
+
if name.lower() in self.entity_codes:
|
|
380
|
+
return self.entity_codes[name.lower()]
|
|
381
|
+
for key, code in self.entity_codes.items():
|
|
382
|
+
if key.lower() in name.lower():
|
|
383
|
+
return code
|
|
384
|
+
# Auto-code: first 3 chars uppercase
|
|
385
|
+
return name[:3].upper()
|
|
386
|
+
|
|
387
|
+
def encode_emotions(self, emotions: List[str]) -> str:
    """Convert an emotion list to '+'-joined compact codes (max 3).

    Unknown labels fall back to their first 4 characters; duplicates
    (after coding) are dropped while preserving first-seen order.
    """
    codes: List[str] = []
    for emotion in emotions:
        short = EMOTION_CODES.get(emotion, emotion[:4])
        if short not in codes:
            codes.append(short)
    return "+".join(codes[:3])
|
|
395
|
+
|
|
396
|
+
def get_flags(self, zettel: dict) -> str:
    """Extract importance flags from zettel metadata, '+'-joined.

    Flag order is fixed: ORIGIN, SENSITIVE, CORE, GENESIS, PIVOT.
    Returns "" when nothing matches.
    """
    notes = zettel.get("notes", "").lower()
    origin_label = zettel.get("origin_label", "").lower()
    checks = [
        ("ORIGIN", bool(zettel.get("origin_moment"))),
        ("SENSITIVE", zettel.get("sensitivity", "").upper().startswith("MAXIMUM")),
        ("CORE", "foundational pillar" in notes or "core" in notes),
        ("GENESIS", "genesis" in notes or "genesis" in origin_label),
        ("PIVOT", "pivot" in notes),
    ]
    flags = [flag for flag, hit in checks if hit]
    return "+".join(flags) if flags else ""
|
|
411
|
+
|
|
412
|
+
# === PLAIN TEXT COMPRESSION (new for mempalace) ===
|
|
413
|
+
|
|
414
|
+
def _detect_emotions(self, text: str) -> List[str]:
    """Detect up to three emotion codes via keyword signals in text.

    Scans _EMOTION_SIGNALS in insertion order, substring-matching
    against the lowercased text, and stops after three distinct codes.
    """
    lowered = text.lower()
    hits: List[str] = []
    for keyword, code in _EMOTION_SIGNALS.items():
        if len(hits) == 3:
            break
        if keyword in lowered and code not in hits:
            hits.append(code)
    return hits
|
|
424
|
+
|
|
425
|
+
def _detect_flags(self, text: str) -> List[str]:
    """Detect up to three importance flags via keyword signals in text.

    Scans _FLAG_SIGNALS in insertion order, substring-matching against
    the lowercased text, and stops after three distinct flags.
    """
    lowered = text.lower()
    hits: List[str] = []
    for keyword, flag in _FLAG_SIGNALS.items():
        if len(hits) == 3:
            break
        if keyword in lowered and flag not in hits:
            hits.append(flag)
    return hits
|
|
435
|
+
|
|
436
|
+
def _extract_topics(self, text: str, max_topics: int = 3) -> List[str]:
    """Extract up to max_topics key topic words from plain text.

    Frequency-counts non-stop-words of 3+ letters, then boosts tokens
    that look like proper nouns (+2 per capitalized occurrence) or
    technical identifiers (+2 per occurrence containing '_', '-', or
    an interior capital). Ties keep first-seen order.
    """
    tokens = re.findall(r"[a-zA-Z][a-zA-Z_-]{2,}", text)

    counts: Dict[str, int] = {}
    for token in tokens:
        key = token.lower()
        if key in _STOP_WORDS or len(key) < 3:
            continue
        counts[key] = counts.get(key, 0) + 1

    # Boost proper-noun-looking and technical-looking tokens.
    for token in tokens:
        key = token.lower()
        if key in _STOP_WORDS:
            continue
        if token[0].isupper() and key in counts:
            counts[key] += 2
        has_tech_shape = "_" in token or "-" in token or any(c.isupper() for c in token[1:])
        if has_tech_shape and key in counts:
            counts[key] += 2

    ranked = sorted(counts.items(), key=lambda item: -item[1])
    return [word for word, _ in ranked[:max_topics]]
|
|
462
|
+
|
|
463
|
+
def _extract_key_sentence(self, text: str) -> str:
|
|
464
|
+
"""Extract the most important sentence fragment from text."""
|
|
465
|
+
# Split into sentences
|
|
466
|
+
sentences = re.split(r"[.!?\n]+", text)
|
|
467
|
+
sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
|
|
468
|
+
if not sentences:
|
|
469
|
+
return ""
|
|
470
|
+
|
|
471
|
+
# Score each sentence
|
|
472
|
+
decision_words = {
|
|
473
|
+
"decided",
|
|
474
|
+
"because",
|
|
475
|
+
"instead",
|
|
476
|
+
"prefer",
|
|
477
|
+
"switched",
|
|
478
|
+
"chose",
|
|
479
|
+
"realized",
|
|
480
|
+
"important",
|
|
481
|
+
"key",
|
|
482
|
+
"critical",
|
|
483
|
+
"discovered",
|
|
484
|
+
"learned",
|
|
485
|
+
"conclusion",
|
|
486
|
+
"solution",
|
|
487
|
+
"reason",
|
|
488
|
+
"why",
|
|
489
|
+
"breakthrough",
|
|
490
|
+
"insight",
|
|
491
|
+
}
|
|
492
|
+
scored = []
|
|
493
|
+
for s in sentences:
|
|
494
|
+
score = 0
|
|
495
|
+
s_lower = s.lower()
|
|
496
|
+
for w in decision_words:
|
|
497
|
+
if w in s_lower:
|
|
498
|
+
score += 2
|
|
499
|
+
# Prefer shorter, punchier sentences
|
|
500
|
+
if len(s) < 80:
|
|
501
|
+
score += 1
|
|
502
|
+
if len(s) < 40:
|
|
503
|
+
score += 1
|
|
504
|
+
# Penalize very long sentences
|
|
505
|
+
if len(s) > 150:
|
|
506
|
+
score -= 2
|
|
507
|
+
scored.append((score, s))
|
|
508
|
+
|
|
509
|
+
scored.sort(key=lambda x: -x[0])
|
|
510
|
+
best = scored[0][1]
|
|
511
|
+
# Truncate if too long
|
|
512
|
+
if len(best) > 55:
|
|
513
|
+
best = best[:52] + "..."
|
|
514
|
+
return best
|
|
515
|
+
|
|
516
|
+
def _detect_entities_in_text(self, text: str) -> List[str]:
    """Find known entity codes in text, else guess from capitalized names.

    Known entities (case-insensitive substring match) win outright.
    The fallback collects up to 3 codes from capitalized, otherwise
    lowercase words that are not sentence-initial and not stop words.
    """
    lowered = text.lower()
    known: List[str] = []
    for name, code in self.entity_codes.items():
        if not name.islower() and name.lower() in lowered and code not in known:
            known.append(code)
    if known:
        return known

    # Fallback: capitalized words that look like names (2+ chars,
    # not at position 0, i.e. not merely sentence-start capitalization).
    guesses: List[str] = []
    for position, raw in enumerate(text.split()):
        word = re.sub(r"[^a-zA-Z]", "", raw)
        looks_like_name = (
            len(word) >= 2
            and word[0].isupper()
            and word[1:].islower()
            and position > 0
            and word.lower() not in _STOP_WORDS
        )
        if not looks_like_name:
            continue
        code = word[:3].upper()
        if code not in guesses:
            guesses.append(code)
            if len(guesses) >= 3:
                break
    return guesses
|
|
544
|
+
|
|
545
|
+
def compress(self, text: str, metadata: dict = None) -> str:
    """
    Summarize plain text into AAAK Dialect format.

    Extracts entities, topics, a key sentence, emotions, and flags
    from the input text. This is lossy — the original text cannot be
    reconstructed from the output.

    Args:
        text: Plain text content to summarize
        metadata: Optional dict with keys like 'source_file', 'wing',
                  'room', 'date', etc.

    Returns:
        AAAK-formatted summary string: an optional metadata header line
        followed by one content line of the form
        0:ENTITIES|topics|"quote"|emotions|flags (empty parts omitted).
    """
    metadata = metadata or {}

    # Detect components. "???" is the same "no entity found" sentinel
    # used by the zettel-based encoder.
    entities = self._detect_entities_in_text(text)
    entity_str = "+".join(entities[:3]) if entities else "???"

    topics = self._extract_topics(text)
    topic_str = "_".join(topics[:3]) if topics else "misc"

    quote = self._extract_key_sentence(text)
    quote_part = f'"{quote}"' if quote else ""

    emotions = self._detect_emotions(text)
    emotion_str = "+".join(emotions) if emotions else ""

    flags = self._detect_flags(text)
    flag_str = "+".join(flags) if flags else ""

    # Build source header if metadata available.
    source = metadata.get("source_file", "")
    wing = metadata.get("wing", "")
    room = metadata.get("room", "")
    date = metadata.get("date", "")

    lines = []

    # Header line (only when a source or wing exists; "?" marks any
    # missing field so the header always carries four positions).
    if source or wing:
        header_parts = [
            wing or "?",
            room or "?",
            date or "?",
            Path(source).stem if source else "?",
        ]
        lines.append("|".join(header_parts))

    # Content line: zettel id is fixed at 0 for ad-hoc text (there is
    # no source file number to derive one from).
    parts = [f"0:{entity_str}", topic_str]
    if quote_part:
        parts.append(quote_part)
    if emotion_str:
        parts.append(emotion_str)
    if flag_str:
        parts.append(flag_str)

    lines.append("|".join(parts))

    return "\n".join(lines)
|
|
609
|
+
|
|
610
|
+
# === ZETTEL-BASED ENCODING (original format, kept for compatibility) ===
|
|
611
|
+
|
|
612
|
+
def extract_key_quote(self, zettel: dict) -> str:
    """Pull the most important quote fragment from zettel content.

    Searches content + origin_label + notes for double-quoted spans,
    single-quoted spans, and speech-verb introductions; scores the
    candidates by emotional keyword density and shape; falls back to
    the part of the title after " - " when nothing is found.

    NOTE(review): assumes zettel fields are plain strings — a None
    value in content/notes/etc. would raise here; confirm upstream.
    """
    content = zettel.get("content", "")
    origin = zettel.get("origin_label", "")
    notes = zettel.get("notes", "")
    title = zettel.get("title", "")
    all_text = content + " " + origin + " " + notes

    quotes = []
    # Double-quoted spans of 8-55 chars.
    quotes += re.findall(r'"([^"]{8,55})"', all_text)
    # Single-quoted spans, bounded by whitespace/punctuation so
    # apostrophes inside words don't produce false matches.
    for m in re.finditer(r"(?:^|[\s(])'([^']{8,55})'(?:[\s.,;:!?)]|$)", all_text):
        quotes.append(m.group(1))
    # Speech introductions like 'says: ...' up to sentence-ending punctuation.
    quotes += re.findall(
        r'(?:says?|said|articulates?|reveals?|admits?|confesses?|asks?):\s*["\']?([^.!?]{10,55})[.!?]',
        all_text,
        re.IGNORECASE,
    )

    # De-duplicate while preserving first-seen order; drop shorties
    # that only cleared the regex bound because of edge whitespace.
    if quotes:
        seen = set()
        unique = []
        for q in quotes:
            q = q.strip()
            if q not in seen and len(q) >= 8:
                seen.add(q)
                unique.append(q)
        quotes = unique

    # Keywords that mark an emotionally significant quote.
    emotional_words = {
        "love", "fear", "remember", "soul", "feel", "stupid", "scared",
        "beautiful", "destroy", "respect", "trust", "consciousness",
        "alive", "forget", "waiting", "peace", "matter", "real",
        "guilt", "escape", "rest", "hope", "dream", "lost", "found",
    }
    scored = []
    for q in quotes:
        score = 0
        # First-person / sentence-like openings score higher.
        if q[0].isupper() or q.startswith("I "):
            score += 2
        matches = sum(1 for w in emotional_words if w in q.lower())
        score += matches * 2
        if len(q) > 20:
            score += 1
        # Narration-style openers are usually paraphrase, not quote.
        if q.startswith("The ") or q.startswith("This ") or q.startswith("She "):
            score -= 2
        scored.append((score, q))
    # Stable descending sort: first candidate wins ties.
    scored.sort(key=lambda x: -x[0])
    if scored:
        return scored[0][1]

    # Fallback: the descriptive half of a "NNN - Title" style title.
    if " - " in title:
        return title.split(" - ", 1)[1][:45]
    return ""
|
|
686
|
+
|
|
687
|
+
def encode_zettel(self, zettel: dict) -> str:
    """Encode a single zettel into one AAAK Dialect line.

    Format: ZID:ENTITIES|topics|"quote"|WEIGHT|EMOTIONS|FLAGS, with the
    quote/emotions/flags segments omitted when empty.
    """
    zid = zettel["id"].split("-")[-1]

    codes = [self.encode_entity(person) for person in zettel.get("people", [])]
    codes = [c for c in codes if c is not None] or ["???"]
    entities = "+".join(sorted(set(codes)))

    topics = zettel.get("topics", [])
    topic_str = "_".join(topics[:2]) if topics else "misc"

    quote = self.extract_key_quote(zettel)
    weight = zettel.get("emotional_weight", 0.5)
    emotions = self.encode_emotions(zettel.get("emotional_tone", []))
    flags = self.get_flags(zettel)

    parts = [f"{zid}:{entities}", topic_str]
    if quote:
        parts.append(f'"{quote}"')
    parts.append(str(weight))
    if emotions:
        parts.append(emotions)
    if flags:
        parts.append(flags)
    return "|".join(parts)
|
|
717
|
+
|
|
718
|
+
def encode_tunnel(self, tunnel: dict) -> str:
    """Encode a tunnel connection as T:FROM<->TO|label.

    Zettel ids keep only their final dash-separated segment; the label
    is cut at its first ':' or, lacking one, to 30 characters.
    """
    from_id = tunnel["from"].split("-")[-1]
    to_id = tunnel["to"].split("-")[-1]
    label = tunnel.get("label", "")
    if ":" in label:
        short_label = label.split(":", 1)[0]
    else:
        short_label = label[:30]
    return f"T:{from_id}<->{to_id}|{short_label}"
|
|
725
|
+
|
|
726
|
+
def encode_file(self, zettel_json: dict) -> str:
    """Encode an entire zettel file into AAAK Dialect.

    Output: one header line FILE_NUM|PRIMARY_ENTITY|DATE|TITLE, an
    optional ARC: line, then one line per zettel and per tunnel.

    NOTE(review): assumes source_file looks like "NNN-Title.txt" — the
    file number and title are split on the first "-"; confirm against
    the producer of these JSON files.
    """
    lines = []

    source = zettel_json.get("source_file", "unknown")
    file_num = source.split("-")[0] if "-" in source else "000"
    # Date comes from the first zettel's date_context; [{}] guards the
    # empty-list case so .get still works.
    date = zettel_json.get("zettels", [{}])[0].get("date_context", "unknown")

    # Collect distinct entity codes across all zettels for the header.
    all_people = set()
    for z in zettel_json.get("zettels", []):
        for p in z.get("people", []):
            code = self.encode_entity(p)
            if code is not None:
                all_people.add(code)
    if not all_people:
        all_people = {"???"}
    # At most 3 codes, sorted for a deterministic header.
    primary = "+".join(sorted(all_people)[:3])

    # Title: the part after the first "-", with any .txt suffix removed.
    title = source.replace(".txt", "").split("-", 1)[-1].strip() if "-" in source else source
    lines.append(f"{file_num}|{primary}|{date}|{title}")

    arc = zettel_json.get("emotional_arc", "")
    if arc:
        lines.append(f"ARC:{arc}")

    for z in zettel_json.get("zettels", []):
        lines.append(self.encode_zettel(z))

    for t in zettel_json.get("tunnels", []):
        lines.append(self.encode_tunnel(t))

    return "\n".join(lines)
|
|
758
|
+
|
|
759
|
+
# === FILE-BASED COMPRESSION ===
|
|
760
|
+
|
|
761
|
+
def compress_file(self, zettel_json_path: str, output_path: str = None) -> str:
    """Read a zettel JSON file and compress it to AAAK Dialect.

    When output_path is given, the result is also written there.
    Returns the dialect string either way.
    """
    with open(zettel_json_path, "r") as src:
        dialect = self.encode_file(json.load(src))
    if output_path:
        with open(output_path, "w") as dst:
            dst.write(dialect)
    return dialect
|
|
770
|
+
|
|
771
|
+
def compress_all(self, zettel_dir: str, output_path: str = None) -> str:
    """Compress ALL zettel JSON files into one AAAK Dialect string.

    Files are processed in sorted name order; each file's output is
    followed by a "---" separator line (including the last one, as in
    the original format). Optionally writes the result to output_path.
    """
    chunks = []
    for fname in sorted(os.listdir(zettel_dir)):
        if not fname.endswith(".json"):
            continue
        with open(os.path.join(zettel_dir, fname), "r") as fh:
            data = json.load(fh)
        chunks.append(self.encode_file(data))
        chunks.append("---")
    combined = "\n".join(chunks)
    if output_path:
        with open(output_path, "w") as out:
            out.write(combined)
    return combined
|
|
787
|
+
|
|
788
|
+
# === LAYER 1 GENERATION ===
|
|
789
|
+
|
|
790
|
+
def generate_layer1(
    self,
    zettel_dir: str,
    output_path: str = None,
    identity_sections: Dict[str, List[str]] = None,
    weight_threshold: float = 0.85,
) -> str:
    """
    Auto-generate a Layer 1 wake-up file from all processed zettel files.

    Pulls highest-weight moments (>= threshold) and any with
    ORIGIN/CORE/GENESIS flags. Groups them by date into MOMENTS
    sections, optionally prefixed by caller-supplied identity sections,
    and appends up to 8 tunnel labels.

    Args:
        zettel_dir: Directory of zettel JSON files (scanned twice: once
            for essential moments, once for tunnels).
        output_path: When given, the result is also written here.
        identity_sections: Optional {section_name: [lines]} emitted
            verbatim before the MOMENTS sections.
        weight_threshold: Minimum emotional_weight for inclusion.
    """
    # Local import keeps the module's top-level import block unchanged.
    from datetime import date as date_cls

    essential = []

    # Pass 1: collect "essential" zettels (weight, origin, or key flag).
    for fname in sorted(os.listdir(zettel_dir)):
        if not fname.endswith(".json"):
            continue
        fpath = os.path.join(zettel_dir, fname)
        with open(fpath, "r") as f:
            data = json.load(f)

        # NOTE(review): assumes filenames look like "file_NNN.json".
        file_num = fname.replace("file_", "").replace(".json", "")
        # [{}] guards the empty zettel list so .get still works.
        source_date = data.get("zettels", [{}])[0].get("date_context", "unknown")

        for z in data.get("zettels", []):
            weight = z.get("emotional_weight", 0)
            is_origin = z.get("origin_moment", False)
            flags = self.get_flags(z)
            has_key_flag = (
                any(f in flags for f in ["ORIGIN", "CORE", "GENESIS"]) if flags else False
            )

            if weight >= weight_threshold or is_origin or has_key_flag:
                essential.append((z, file_num, source_date))

    # Pass 2: gather every tunnel across all files.
    all_tunnels = []
    for fname in sorted(os.listdir(zettel_dir)):
        if not fname.endswith(".json"):
            continue
        fpath = os.path.join(zettel_dir, fname)
        with open(fpath, "r") as f:
            data = json.load(f)
        for t in data.get("tunnels", []):
            all_tunnels.append(t)

    # Heaviest moments first; stable, so file order breaks ties.
    essential.sort(key=lambda x: x[0].get("emotional_weight", 0), reverse=True)

    # Group by the leading date segment (text before the first comma).
    by_date = {}
    for z, fnum, sdate in essential:
        key = sdate.split(",")[0].strip()
        if key not in by_date:
            by_date[key] = []
        by_date[key].append((z, fnum))

    lines = []
    lines.append("## LAYER 1 -- ESSENTIAL STORY")
    lines.append(f"## Auto-generated from zettel files. Updated {date_cls.today()}.")
    lines.append("")

    # Caller-supplied identity sections go first, verbatim.
    if identity_sections:
        for section_name, section_lines in identity_sections.items():
            lines.append(f"={section_name}=")
            lines.extend(section_lines)
            lines.append("")

    for date_key in sorted(by_date.keys()):
        lines.append(f"=MOMENTS[{date_key}]=")
        for z, fnum in by_date[date_key]:
            entities = []
            for p in z.get("people", []):
                code = self.encode_entity(p)
                if code:
                    entities.append(code)
            if not entities:
                entities = ["???"]
            ent_str = "+".join(sorted(set(entities)))

            quote = self.extract_key_quote(z)
            weight = z.get("emotional_weight", 0.5)
            flags = self.get_flags(z)
            sensitivity = z.get("sensitivity", "")

            parts = [ent_str]
            # Hint: title tail after " - " when present, else top topics.
            title = z.get("title", "")
            if " - " in title:
                hint = title.split(" - ", 1)[1][:30]
            else:
                hint = "_".join(z.get("topics", [])[:2])
            if hint:
                parts.append(hint)
            # Skip the quote when it merely repeats the title/hint.
            if quote and quote != hint and quote not in (title, hint):
                parts.append(f'"{quote}"')
            # Emit SENSITIVE when any sensitivity is set but the flags
            # string didn't already carry it.
            if sensitivity and "SENSITIVE" not in (flags or ""):
                parts.append("SENSITIVE")
            parts.append(str(weight))
            if flags:
                parts.append(flags)

            lines.append("|".join(parts))
        lines.append("")

    # Up to 8 tunnel labels, shortened at the first ':' or 40 chars.
    if all_tunnels:
        lines.append("=TUNNELS=")
        for t in all_tunnels[:8]:
            label = t.get("label", "")
            short = label.split(":")[0] if ":" in label else label[:40]
            lines.append(short)
        lines.append("")

    result = "\n".join(lines)

    if output_path:
        with open(output_path, "w") as f:
            f.write(result)

    return result
|
|
909
|
+
|
|
910
|
+
# === DECODING ===
|
|
911
|
+
|
|
912
|
+
def decode(self, dialect_text: str) -> dict:
    """Parse an AAAK Dialect string back into a structured summary dict.

    Classifies each line of the dialect text:
      - ``ARC:``-prefixed  -> emotional arc (text after the prefix)
      - ``T:``-prefixed    -> tunnel line (kept verbatim)
      - pipe-delimited with a colon in the first field -> zettel line (verbatim)
      - any other pipe-delimited line -> header (file/entities/date/title);
        a later header-shaped line overwrites an earlier one
    Lines matching none of these are ignored.
    """
    parsed = {"header": {}, "arc": "", "zettels": [], "tunnels": []}

    for raw in dialect_text.strip().split("\n"):
        if raw.startswith("ARC:"):
            parsed["arc"] = raw[len("ARC:"):]
        elif raw.startswith("T:"):
            parsed["tunnels"].append(raw)
        elif "|" in raw:
            first_field = raw.partition("|")[0]
            if ":" in first_field:
                # Zettel IDs carry a colon before the first pipe (ZID:...).
                parsed["zettels"].append(raw)
            else:
                fields = raw.split("|")
                field_names = ("file", "entities", "date", "title")
                parsed["header"] = {
                    name: fields[i] if i < len(fields) else ""
                    for i, name in enumerate(field_names)
                }

    return parsed
|
|
934
|
+
|
|
935
|
+
# === STATS ===
|
|
936
|
+
|
|
937
|
+
@staticmethod
|
|
938
|
+
def count_tokens(text: str) -> int:
|
|
939
|
+
"""Estimate token count using word-based heuristic (~1.3 tokens per word).
|
|
940
|
+
|
|
941
|
+
This is an approximation. For accurate counts, use a real tokenizer
|
|
942
|
+
like tiktoken. The old len(text)//3 heuristic was wildly inaccurate
|
|
943
|
+
and made AAAK compression ratios look much better than reality.
|
|
944
|
+
"""
|
|
945
|
+
words = text.split()
|
|
946
|
+
# Most English words tokenize to 1-2 tokens; punctuation and
|
|
947
|
+
# special chars in AAAK (|, +, :) each cost a token.
|
|
948
|
+
# ~1.3 tokens/word is a conservative average.
|
|
949
|
+
return max(1, int(len(words) * 1.3))
|
|
950
|
+
|
|
951
|
+
def compression_stats(self, original_text: str, compressed: str) -> dict:
    """Report size-comparison statistics for a text -> AAAK conversion.

    NOTE: AAAK is lossy summarization, not compression. The reported
    ratio only says how much shorter the summary is; information is
    discarded, so it is not a compression ratio in the traditional sense.
    Token figures come from the word-count heuristic in count_tokens.
    """
    before = self.count_tokens(original_text)
    after = self.count_tokens(compressed)
    # Guard the division even though count_tokens never returns < 1.
    ratio = round(before / max(after, 1), 1)
    return {
        "original_tokens_est": before,
        "summary_tokens_est": after,
        "size_ratio": ratio,
        "original_chars": len(original_text),
        "summary_chars": len(compressed),
        "note": "Estimates only. Use tiktoken for accurate counts. AAAK is lossy.",
    }
|
|
968
|
+
|
|
969
|
+
|
|
970
|
+
# === CLI ===
|
|
971
|
+
if __name__ == "__main__":
    import sys

    def usage():
        """Print CLI help text and exit with status 1."""
        print("AAAK Dialect -- Compressed Symbolic Memory for Any LLM")
        print()
        print("Usage:")
        print(" python dialect.py <text> # Compress text from argument")
        print(" python dialect.py --file <zettel.json> # Compress zettel JSON file")
        print(" python dialect.py --all <zettel_dir> # Compress all zettel files")
        print(" python dialect.py --stats <zettel.json> # Show compression stats")
        print(" python dialect.py --layer1 <zettel_dir> # Generate Layer 1 wake-up file")
        print(" python dialect.py --init # Create example config")
        print()
        print("Options:")
        print(" --config <path> Load entity mappings from JSON config")
        sys.exit(1)

    if len(sys.argv) < 2:
        usage()

    # Parse --config flag
    config_path = None
    args = sys.argv[1:]
    if "--config" in args:
        idx = args.index("--config")
        # BUGFIX: "--config" given as the last argument previously raised
        # IndexError on args[idx + 1]; show usage instead.
        if idx + 1 >= len(args):
            usage()
        config_path = args[idx + 1]
        args = args[:idx] + args[idx + 2:]

    # BUGFIX: stripping "--config <path>" can consume every argument, in
    # which case args[0] below previously raised IndexError.
    if not args:
        usage()

    # Create dialect instance (optionally seeded from an entity config).
    if config_path:
        dialect = Dialect.from_config(config_path)
    else:
        dialect = Dialect()

    if args[0] == "--init":
        # Write a starter entity-mapping config for the user to edit.
        example = {
            "entities": {
                "Alice": "ALC",
                "Bob": "BOB",
                "Dr. Chen": "CHN",
            },
            "skip_names": [],
        }
        out_path = "entities.json"
        # Explicit encoding so the config is UTF-8 regardless of locale.
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(example, f, indent=2)
        print(f"Created example config: {out_path}")
        print("Edit this file with your own entity mappings, then use --config entities.json")

    elif args[0] == "--file":
        # BUGFIX: --file with no path previously raised IndexError.
        if len(args) < 2:
            usage()
        result = dialect.compress_file(args[1])
        tokens = Dialect.count_tokens(result)
        print(f"~{tokens} tokens")
        print()
        print(result)

    elif args[0] == "--all":
        zettel_dir = args[1] if len(args) > 1 else "."
        output = os.path.join(zettel_dir, "COMPRESSED_MEMORY.aaak")
        result = dialect.compress_all(zettel_dir, output)
        tokens = Dialect.count_tokens(result)
        print(f"Compressed to: {output}")
        print(f"Total: ~{tokens} tokens")
        print()
        print(result)

    elif args[0] == "--stats":
        # BUGFIX: --stats with no path previously raised IndexError.
        if len(args) < 2:
            usage()
        with open(args[1], "r", encoding="utf-8") as f:
            data = json.load(f)
        json_str = json.dumps(data, indent=2)
        encoded = dialect.encode_file(data)
        stats = dialect.compression_stats(json_str, encoded)
        print("=== COMPRESSION STATS ===")
        print(f"JSON: ~{stats['original_tokens_est']:,} tokens (est)")
        print(f"AAAK: ~{stats['summary_tokens_est']:,} tokens (est)")
        print(f"Ratio: {stats['size_ratio']}x (lossy — information is lost)")
        print()
        print("=== AAAK DIALECT OUTPUT ===")
        print(encoded)

    elif args[0] == "--layer1":
        zettel_dir = args[1] if len(args) > 1 else "."
        output = os.path.join(zettel_dir, "LAYER1.aaak")
        result = dialect.generate_layer1(zettel_dir, output)
        tokens = Dialect.count_tokens(result)
        print(f"Layer 1: {output}")
        print(f"Total: ~{tokens} tokens")
        print()
        print(result)

    else:
        # Treat remaining args as text to compress
        text = " ".join(args)
        compressed = dialect.compress(text)
        stats = dialect.compression_stats(text, compressed)
        print(
            f"Original: ~{stats['original_tokens_est']} tokens est ({stats['original_chars']} chars)"
        )
        print(
            f"AAAK: ~{stats['summary_tokens_est']} tokens est ({stats['summary_chars']} chars)"
        )
        print(f"Ratio: {stats['size_ratio']}x (lossy summary, not lossless compression)")
        print()
        print(compressed)