mempalace-code 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,521 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ general_extractor.py — Extract 5 types of memories from text.
4
+
5
+ Types:
6
+ 1. DECISIONS — "we went with X because Y", choices made
7
+ 2. PREFERENCES — "always use X", "never do Y", "I prefer Z"
8
+ 3. MILESTONES — breakthroughs, things that finally worked
9
+ 4. PROBLEMS — what broke, what fixed it, root causes
10
+ 5. EMOTIONAL — feelings, vulnerability, relationships
11
+
12
+ No LLM required. Pure keyword/pattern heuristics.
13
+ No external dependencies on palace.py, dialect.py, or layers.py.
14
+
15
+ Usage:
16
+ from general_extractor import extract_memories
17
+
18
+ chunks = extract_memories(text)
19
+ # [{"content": "...", "memory_type": "decision", "chunk_index": 0}, ...]
20
+ """
21
+
22
+ import re
23
+ from typing import List, Dict, Tuple
24
+
25
+
26
+ # =============================================================================
27
+ # MARKER SETS — One per memory type
28
+ # =============================================================================
29
+
30
# Regexes signalling that a choice was made, or the vocabulary of weighing one.
DECISION_MARKERS = [
    # Explicit decision phrasing.
    r"\blet'?s (use|go with|try|pick|choose|switch to)\b",
    r"\bwe (should|decided|chose|went with|picked|settled on)\b",
    r"\bi'?m going (to|with)\b",
    # Comparison / justification language.
    r"\bbetter (to|than|approach|option|choice)\b",
    r"\binstead of\b",
    r"\brather than\b",
    r"\bthe reason (is|was|being)\b",
    r"\bbecause\b",
    r"\btrade-?off\b",
    r"\bpros and cons\b",
    r"\bover\b.*\bbecause\b",
    # Technical nouns that usually accompany a design decision.
    r"\barchitecture\b",
    r"\bapproach\b",
    r"\bstrategy\b",
    r"\bpattern\b",
    r"\bstack\b",
    r"\bframework\b",
    r"\binfrastructure\b",
    # Configuration-choice phrasing.
    r"\bset (it |this )?to\b",
    r"\bconfigure\b",
    r"\bdefault\b",
]

# Regexes signalling a stated personal/team preference or convention.
PREFERENCE_MARKERS = [
    r"\bi prefer\b",
    r"\balways use\b",
    r"\bnever use\b",
    r"\bdon'?t (ever |like to )?(use|do|mock|stub|import)\b",
    r"\bi like (to|when|how)\b",
    r"\bi hate (when|how|it when)\b",
    r"\bplease (always|never|don'?t)\b",
    r"\bmy (rule|preference|style|convention) is\b",
    r"\bwe (always|never)\b",
    # Style-war vocabulary (paradigms, naming, whitespace).
    r"\bfunctional\b.*\bstyle\b",
    r"\bimperative\b",
    r"\bsnake_?case\b",
    r"\bcamel_?case\b",
    r"\btabs\b.*\bspaces\b",
    r"\bspaces\b.*\btabs\b",
    r"\buse\b.*\binstead of\b",
]

# Regexes signalling breakthroughs, firsts, ships, and measurable wins.
MILESTONE_MARKERS = [
    # Something finally worked.
    r"\bit works\b",
    r"\bit worked\b",
    r"\bgot it working\b",
    r"\bfixed\b",
    r"\bsolved\b",
    r"\bbreakthrough\b",
    r"\bfigured (it )?out\b",
    r"\bnailed it\b",
    r"\bcracked (it|the)\b",
    r"\bfinally\b",
    # Firsts and discoveries.
    r"\bfirst time\b",
    r"\bfirst ever\b",
    r"\bnever (done|been|had) before\b",
    r"\bdiscovered\b",
    r"\brealized\b",
    r"\bfound (out|that)\b",
    r"\bturns out\b",
    r"\bthe key (is|was|insight)\b",
    r"\bthe trick (is|was)\b",
    r"\bnow i (understand|see|get it)\b",
    # Shipping / delivery verbs.
    r"\bbuilt\b",
    r"\bcreated\b",
    r"\bimplemented\b",
    r"\bshipped\b",
    r"\blaunched\b",
    r"\bdeployed\b",
    r"\breleased\b",
    r"\bprototype\b",
    r"\bproof of concept\b",
    r"\bdemo\b",
    # Version numbers and quantified improvements.
    r"\bversion \d",
    r"\bv\d+\.\d+",
    r"\d+x (compression|faster|slower|better|improvement|reduction)",
    r"\d+% (reduction|improvement|faster|better|smaller)",
]

# Regexes signalling failures, root causes, and their fixes.
PROBLEM_MARKERS = [
    r"\b(bug|error|crash|fail|broke|broken|issue|problem)\b",
    r"\bdoesn'?t work\b",
    r"\bnot working\b",
    r"\bwon'?t\b.*\bwork\b",
    r"\bkeeps? (failing|crashing|breaking|erroring)\b",
    # Diagnosis language.
    r"\broot cause\b",
    r"\bthe (problem|issue|bug) (is|was)\b",
    r"\bturns out\b.*\b(was|because|due to)\b",
    # Resolution language.
    r"\bthe fix (is|was)\b",
    r"\bworkaround\b",
    r"\bthat'?s why\b",
    r"\bthe reason it\b",
    r"\bfixed (it |the |by )\b",
    r"\bsolution (is|was)\b",
    r"\bresolved\b",
    r"\bpatched\b",
    r"\bthe answer (is|was)\b",
    r"\b(had|need) to\b.*\binstead\b",
]

# Regexes signalling emotional content, vulnerability, or relationship talk.
EMOTION_MARKERS = [
    # Single feeling words (word-bounded).
    r"\blove\b",
    r"\bscared\b",
    r"\bafraid\b",
    r"\bproud\b",
    r"\bhurt\b",
    r"\bhappy\b",
    r"\bsad\b",
    r"\bcry\b",
    r"\bcrying\b",
    r"\bmiss\b",
    r"\bsorry\b",
    r"\bgrateful\b",
    r"\bangry\b",
    r"\bworried\b",
    r"\blonely\b",
    r"\bbeautiful\b",
    r"\bamazing\b",
    r"\bwonderful\b",
    # First-person emotional phrases (substring match, no \b anchors).
    r"i feel",
    r"i'm scared",
    r"i love you",
    r"i'm sorry",
    r"i can't",
    r"i wish",
    r"i miss",
    r"i need",
    r"never told anyone",
    r"nobody knows",
    # *asterisk-wrapped action/emote* markup, common in chat logs.
    r"\*[^*]+\*",
]

# Registry mapping each memory type to its marker set; iterated by the scorer.
ALL_MARKERS = {
    "decision": DECISION_MARKERS,
    "preference": PREFERENCE_MARKERS,
    "milestone": MILESTONE_MARKERS,
    "problem": PROBLEM_MARKERS,
    "emotional": EMOTION_MARKERS,
}
170
+
171
+
172
+ # =============================================================================
173
+ # SENTIMENT — for disambiguation
174
+ # =============================================================================
175
+
176
# Lexicon of positive-signal words used by the sentiment heuristic.
POSITIVE_WORDS = set(
    """
    pride proud joy happy love loving beautiful amazing wonderful incredible
    fantastic brilliant perfect excited thrilled grateful warm breakthrough
    success works working solved fixed nailed heart hug precious adore
    """.split()
)

# Lexicon of negative-signal words used by the sentiment heuristic.
NEGATIVE_WORDS = set(
    """
    bug error crash crashing crashed fail failed failing failure broken broke
    breaking breaks issue problem wrong stuck blocked unable impossible missing
    terrible horrible awful worse worst panic disaster mess
    """.split()
)
238
+
239
+
240
def _get_sentiment(text: str) -> str:
    """Classify *text* as 'positive', 'negative', or 'neutral'.

    Counts how many distinct words from POSITIVE_WORDS vs NEGATIVE_WORDS
    appear in the text; ties (including zero hits on both sides) are neutral.
    """
    tokens = {word.lower() for word in re.findall(r"\b\w+\b", text)}
    positive_hits = len(tokens & POSITIVE_WORDS)
    negative_hits = len(tokens & NEGATIVE_WORDS)
    if positive_hits == negative_hits:
        return "neutral"
    return "positive" if positive_hits > negative_hits else "negative"
250
+
251
+
252
+ def _has_resolution(text: str) -> bool:
253
+ """Check if text describes a RESOLVED problem."""
254
+ text_lower = text.lower()
255
+ patterns = [
256
+ r"\bfixed\b",
257
+ r"\bsolved\b",
258
+ r"\bresolved\b",
259
+ r"\bpatched\b",
260
+ r"\bgot it working\b",
261
+ r"\bit works\b",
262
+ r"\bnailed it\b",
263
+ r"\bfigured (it )?out\b",
264
+ r"\bthe (fix|answer|solution)\b",
265
+ ]
266
+ return any(re.search(p, text_lower) for p in patterns)
267
+
268
+
269
def _disambiguate(memory_type: str, text: str, scores: Dict[str, float]) -> str:
    """Correct common misclassifications using sentiment and resolution cues.

    Only 'problem' classifications are ever reassigned:
      * a resolved problem is really a milestone (or an emotional moment
        when the text also scored emotional and reads positive), and
      * an unresolved but positive-sounding problem is promoted to
        milestone or emotional when those types also scored.
    All other types pass through unchanged.
    """
    if memory_type != "problem":
        return memory_type

    is_positive = _get_sentiment(text) == "positive"
    scored_emotional = scores.get("emotional", 0) > 0

    if _has_resolution(text):
        # A problem that got fixed is a win, not a problem.
        return "emotional" if (scored_emotional and is_positive) else "milestone"

    if is_positive:
        if scores.get("milestone", 0) > 0:
            return "milestone"
        if scored_emotional:
            return "emotional"

    return memory_type
287
+
288
+
289
+ # =============================================================================
290
+ # CODE LINE FILTERING
291
+ # =============================================================================
292
+
293
# Heuristic patterns for spotting lines that are code/shell/markup, not prose.
# A line matching any of these is excluded from classification scoring.
_CODE_LINE_PATTERNS = [
    # Shell prompt ($) or comment marker (#) followed by whitespace.
    re.compile(r"^\s*[\$#]\s"),
    # Common shell commands at the start of a line.
    re.compile(
        r"^\s*(cd|source|echo|export|pip|npm|git|python|bash|curl|wget|mkdir|rm|cp|mv|ls|cat|grep|find|chmod|sudo|brew|docker)\s"
    ),
    # Markdown fenced-code delimiter.
    re.compile(r"^\s*```"),
    # Python/JavaScript keywords opening a definition or import.
    re.compile(r"^\s*(import|from|def|class|function|const|let|var|return)\s"),
    # UPPER_SNAKE_CASE environment-variable style assignment.
    re.compile(r"^\s*[A-Z_]{2,}="),
    # Markdown table row.
    re.compile(r"^\s*\|"),
    # Horizontal rule / comment banner (two or more dashes).
    re.compile(r"^\s*[-]{2,}"),
    # A lone bracket or brace on its own line.
    re.compile(r"^\s*[{}\[\]]\s*$"),
    # Python control-flow keywords.
    re.compile(r"^\s*(if|for|while|try|except|elif|else:)\b"),
    # Method call such as obj.method(...).
    re.compile(r"^\s*\w+\.\w+\("),
    # Assignment from an attribute access, e.g. x = obj.attr.
    re.compile(r"^\s*\w+ = \w+\.\w+"),
]
308
+
309
+
310
def _is_code_line(line: str) -> bool:
    """Heuristically decide whether a single line looks like code, not prose."""
    stripped = line.strip()
    if not stripped:
        return False
    if any(pattern.match(stripped) for pattern in _CODE_LINE_PATTERNS):
        return True
    # Long lines that are mostly symbols (few letters) also count as code.
    letter_count = sum(ch.isalpha() for ch in stripped)
    return len(stripped) > 10 and letter_count / len(stripped) < 0.4
321
+
322
+
323
def _extract_prose(text: str) -> str:
    """Return only the prose lines of *text* for classification scoring.

    Skips fenced code blocks (``` ... ```) and any line that _is_code_line
    flags. Falls back to the original text when nothing survives the filter.
    """
    kept = []
    inside_fence = False
    for raw_line in text.split("\n"):
        if raw_line.strip().startswith("```"):
            inside_fence = not inside_fence
            continue
        if inside_fence or _is_code_line(raw_line):
            continue
        kept.append(raw_line)
    prose_text = "\n".join(kept).strip()
    return prose_text or text
338
+
339
+
340
+ # =============================================================================
341
+ # SCORING
342
+ # =============================================================================
343
+
344
+
345
+ def _score_markers(text: str, markers: List[str]) -> Tuple[float, List[str]]:
346
+ """Score text against regex markers. Returns (score, matched_keywords)."""
347
+ text_lower = text.lower()
348
+ score = 0.0
349
+ keywords = []
350
+ for marker in markers:
351
+ matches = re.findall(marker, text_lower)
352
+ if matches:
353
+ score += len(matches)
354
+ keywords.extend(m if isinstance(m, str) else m[0] if m else marker for m in matches)
355
+ return score, list(set(keywords))
356
+
357
+
358
+ # =============================================================================
359
+ # MAIN EXTRACTION
360
+ # =============================================================================
361
+
362
+
363
def extract_memories(text: str, min_confidence: float = 0.3) -> List[Dict]:
    """
    Extract memories from a text string.

    Args:
        text: The text to extract from (any format).
        min_confidence: Minimum confidence threshold (0.0-1.0).

    Returns:
        List of dicts: {"content": str, "memory_type": str, "chunk_index": int}
    """
    memories: List[Dict] = []

    # Segments come from speaker turns, paragraphs, or line groups.
    for segment in _split_into_segments(text):
        if len(segment.strip()) < 20:
            continue  # too short to carry a memory

        prose = _extract_prose(segment)

        # Score the prose against every marker set; keep non-zero types only.
        scores = {}
        for mem_type, markers in ALL_MARKERS.items():
            marker_score, _matched = _score_markers(prose, markers)
            if marker_score > 0:
                scores[mem_type] = marker_score
        if not scores:
            continue

        # Longer segments earn a small score bonus (length of the raw segment).
        segment_length = len(segment)
        length_bonus = 2 if segment_length > 500 else (1 if segment_length > 200 else 0)

        # Winner is the highest-scoring type (first one on ties).
        best_type, best_score = max(scores.items(), key=lambda item: item[1])
        total_score = best_score + length_bonus

        # Re-check the winner against sentiment/resolution heuristics.
        best_type = _disambiguate(best_type, prose, scores)

        # Score saturates into [0, 1] confidence at 5 points.
        confidence = min(1.0, total_score / 5.0)
        if confidence < min_confidence:
            continue

        memories.append(
            {
                "content": segment.strip(),
                "memory_type": best_type,
                "chunk_index": len(memories),
            }
        )

    return memories
422
+
423
+
424
def _split_into_segments(text: str) -> List[str]:
    """
    Split text into segments suitable for memory extraction.

    Strategy, in order:
      1. Speaker-turn splitting when at least three turn markers are
         present ("> ", "Human:", "Assistant:", ...).
      2. Blank-line paragraph splitting.
      3. Fixed-size 25-line groups when the text is one giant block.
    """
    lines = text.split("\n")

    turn_patterns = [
        re.compile(r"^>\s"),  # > quoted user turn
        re.compile(r"^(Human|User|Q)\s*:", re.I),  # human side
        re.compile(r"^(Assistant|AI|A|Claude|ChatGPT)\s*:", re.I),  # model side
    ]

    # Count lines that open a speaker turn.
    turn_count = sum(
        1
        for line in lines
        if any(pat.match(line.strip()) for pat in turn_patterns)
    )

    if turn_count >= 3:
        return _split_by_turns(lines, turn_patterns)

    paragraphs = [chunk.strip() for chunk in text.split("\n\n") if chunk.strip()]

    # A single giant block gets chunked into 25-line groups instead.
    if len(paragraphs) <= 1 and len(lines) > 20:
        groups = (
            "\n".join(lines[start : start + 25]).strip()
            for start in range(0, len(lines), 25)
        )
        return [group for group in groups if group]

    return paragraphs
465
+
466
+
467
+ def _split_by_turns(lines: List[str], turn_patterns: List[re.Pattern]) -> List[str]:
468
+ """Split lines into segments at each speaker turn boundary."""
469
+ segments = []
470
+ current = []
471
+
472
+ for line in lines:
473
+ stripped = line.strip()
474
+ is_turn = any(pat.match(stripped) for pat in turn_patterns)
475
+
476
+ if is_turn and current:
477
+ segments.append("\n".join(current))
478
+ current = [line]
479
+ else:
480
+ current.append(line)
481
+
482
+ if current:
483
+ segments.append("\n".join(current))
484
+
485
+ return segments
486
+
487
+
488
+ # =============================================================================
489
+ # CLI
490
+ # =============================================================================
491
+
492
if __name__ == "__main__":
    import sys
    from collections import Counter

    args = sys.argv[1:]
    if not args:
        # No input file given: show usage and exit non-zero.
        print("Usage: python general_extractor.py <file>")
        print()
        print("Extracts decisions, preferences, milestones, problems, and")
        print("emotional moments from any text file.")
        sys.exit(1)

    # Undecodable bytes are replaced rather than raising.
    with open(args[0], "r", encoding="utf-8", errors="replace") as handle:
        memories = extract_memories(handle.read())

    # Per-type summary, in a fixed display order.
    type_counts = Counter(m["memory_type"] for m in memories)
    print(f"Extracted {len(memories)} memories:")
    for mtype in ["decision", "preference", "milestone", "problem", "emotional"]:
        if type_counts.get(mtype, 0):
            print(f"  {mtype:12} {type_counts[mtype]}")

    print()
    # One-line preview of the first ten memories.
    for memory in memories[:10]:
        preview = memory["content"][:80].replace("\n", " ")
        print(f"  [{memory['memory_type']:10}] {preview}...")